From ade0c4932a31f9f6b5bcb21d5dc1f40231ff3dc0 Mon Sep 17 00:00:00 2001 From: Ed Santiago Date: Thu, 14 Sep 2023 10:34:20 -0600 Subject: [PATCH] CI: trace setup and runner scripts Every few months we get a new CI failure that requires scanning through logs that give no indication of what is happening or where. Tracking down the error can cost many hours. Solution: pepper cirrus scripts with showrun(), which echoes the command _and_ displays the source filename + lineno. Signed-off-by: Ed Santiago --- contrib/cirrus/lib.sh | 26 +++++----- contrib/cirrus/runner.sh | 68 +++++++++++++------------ contrib/cirrus/setup_environment.sh | 77 ++++++++++++++++++----------- 3 files changed, 98 insertions(+), 73 deletions(-) diff --git a/contrib/cirrus/lib.sh b/contrib/cirrus/lib.sh index 70bfb74309..131b3084f0 100644 --- a/contrib/cirrus/lib.sh +++ b/contrib/cirrus/lib.sh @@ -79,9 +79,11 @@ CIRRUS_REPO_NAME=${CIRRUS_REPO_NAME:-podman} # shellcheck disable=SC2154 if [[ -z "$CIRRUS_BASE_SHA" ]] && [[ -z "$CIRRUS_TAG" ]] then # Operating on a branch, or under `get_ci_vm.sh` + showrun echo "branch or get_ci_vm (CIRRUS_BASE_SHA and CIRRUS_TAG are unset)" CIRRUS_BASE_SHA=$(git rev-parse ${UPSTREAM_REMOTE:-origin}/$DEST_BRANCH) elif [[ -z "$CIRRUS_BASE_SHA" ]] then # Operating on a tag + showrun echo "operating on tag" CIRRUS_BASE_SHA=$(git rev-parse HEAD) fi # The starting place for linting and code validation @@ -164,8 +166,8 @@ setup_rootless() { ROOTLESS_UID=$rootless_uid rootless_gid=$((1500 + RANDOM % 5000)) msg "creating $rootless_uid:$rootless_gid $ROOTLESS_USER user" - groupadd -g $rootless_gid $ROOTLESS_USER - useradd -g $rootless_gid -u $rootless_uid --no-user-group --create-home $ROOTLESS_USER + showrun groupadd -g $rootless_gid $ROOTLESS_USER + showrun useradd -g $rootless_gid -u $rootless_uid --no-user-group --create-home $ROOTLESS_USER echo "$ROOTLESS_USER ALL=(root) NOPASSWD: ALL" > /etc/sudoers.d/ci-rootless @@ -174,8 +176,8 @@ setup_rootless() { msg "Creating ssh key 
pairs" [[ -r "$HOME/.ssh/id_rsa" ]] || \ ssh-keygen -t rsa -P "" -f "$HOME/.ssh/id_rsa" - ssh-keygen -t ed25519 -P "" -f "/home/$ROOTLESS_USER/.ssh/id_ed25519" - ssh-keygen -t rsa -P "" -f "/home/$ROOTLESS_USER/.ssh/id_rsa" + showrun ssh-keygen -t ed25519 -P "" -f "/home/$ROOTLESS_USER/.ssh/id_ed25519" + showrun ssh-keygen -t rsa -P "" -f "/home/$ROOTLESS_USER/.ssh/id_rsa" msg "Set up authorized_keys" cat $HOME/.ssh/*.pub /home/$ROOTLESS_USER/.ssh/*.pub >> $HOME/.ssh/authorized_keys @@ -231,17 +233,17 @@ use_cni() { [ -z "$(rpm -qa | grep $pkg)" ] && echo "$pkg not installed" || rpm -e --nodeps $pkg done msg "Installing default CNI configuration" - dnf install -y $PACKAGE_DOWNLOAD_DIR/podman-plugins* + showrun dnf install -y $PACKAGE_DOWNLOAD_DIR/podman-plugins* cd $GOSRC || exit 1 rm -rvf /etc/cni/net.d mkdir -p /etc/cni/net.d - install -v -D -m 644 ./cni/87-podman-bridge.conflist \ + showrun install -v -D -m 644 ./cni/87-podman-bridge.conflist \ /etc/cni/net.d/ # This config must always sort last in the list of networks (podman picks # first one as the default). This config prevents allocation of network # address space used by default in google cloud. # https://cloud.google.com/vpc/docs/vpc#ip-ranges - install -v -D -m 644 $SCRIPT_BASE/99-do-not-use-google-subnets.conflist \ + showrun install -v -D -m 644 $SCRIPT_BASE/99-do-not-use-google-subnets.conflist \ /etc/cni/net.d/ } @@ -252,7 +254,7 @@ use_netavark() { echo "NETWORK_BACKEND=netavark" >> /etc/ci_environment export NETWORK_BACKEND=netavark # needed for install_test_configs() msg "Removing any/all CNI configuration" - rm -rvf /etc/cni/net.d/* + showrun rm -rvf /etc/cni/net.d/* # N/B: The CNI packages are still installed and available. This is # on purpose, since CI needs to verify the selection mechanisms are # functional when both are available. 
@@ -276,8 +278,8 @@ remove_packaged_podman_files() { if systemctl --quiet is-$state $unit then echo "Warning: $unit found $state prior to packaged-file removal" - systemctl --quiet disable $unit || true - systemctl --quiet stop $unit || true + showrun systemctl --quiet disable $unit || true + showrun systemctl --quiet stop $unit || true fi done done @@ -299,9 +301,11 @@ remove_packaged_podman_files() { do # Sub-directories may contain unrelated/valuable stuff if [[ -d "$fullpath" ]]; then continue; fi - ooe.sh rm -vf "$fullpath" + showrun ooe.sh rm -vf "$fullpath" done # Be super extra sure and careful vs performant and completely safe sync && echo 3 > /proc/sys/vm/drop_caches || true } + +showrun echo "finished" diff --git a/contrib/cirrus/runner.sh b/contrib/cirrus/runner.sh index bc98244183..0c9191b6d7 100755 --- a/contrib/cirrus/runner.sh +++ b/contrib/cirrus/runner.sh @@ -19,15 +19,17 @@ set -eo pipefail # shellcheck source=contrib/cirrus/lib.sh source $(dirname $0)/lib.sh +showrun echo "starting" + function _run_validate() { # TODO: aarch64 images need python3-devel installed # https://github.com/containers/automation_images/issues/159 - bigto ooe.sh dnf install -y python3-devel + showrun bigto ooe.sh dnf install -y python3-devel # git-validation tool fails if $EPOCH_TEST_COMMIT is empty # shellcheck disable=SC2154 if [[ -n "$EPOCH_TEST_COMMIT" ]]; then - make validate + showrun make validate else warn "Skipping git-validation since \$EPOCH_TEST_COMMIT is empty" fi @@ -42,29 +44,29 @@ function _run_unit() { # shellcheck disable=SC2154 die "$TEST_FLAVOR: Unsupported PODBIN_NAME='$PODBIN_NAME'" fi - make localunit + showrun make localunit } function _run_apiv2() { _bail_if_test_can_be_skipped test/apiv2 ( - make localapiv2-bash + showrun make localapiv2-bash source .venv/requests/bin/activate - make localapiv2-python + showrun make localapiv2-python ) |& logformatter } function _run_compose() { _bail_if_test_can_be_skipped test/compose - 
./test/compose/test-compose |& logformatter + showrun ./test/compose/test-compose |& logformatter } function _run_compose_v2() { _bail_if_test_can_be_skipped test/compose - ./test/compose/test-compose |& logformatter + showrun ./test/compose/test-compose |& logformatter } function _run_int() { @@ -82,18 +84,18 @@ function _run_sys() { function _run_upgrade_test() { _bail_if_test_can_be_skipped test/system test/upgrade - bats test/upgrade |& logformatter + showrun bats test/upgrade |& logformatter } function _run_bud() { _bail_if_test_can_be_skipped test/buildah-bud - ./test/buildah-bud/run-buildah-bud-tests |& logformatter + showrun ./test/buildah-bud/run-buildah-bud-tests |& logformatter } function _run_bindings() { # install ginkgo - make .install.ginkgo + showrun make .install.ginkgo # shellcheck disable=SC2155 export PATH=$PATH:$GOSRC/hack:$GOSRC/test/tools/build @@ -105,23 +107,23 @@ function _run_bindings() { fi (echo "$gitcommit_magic" && \ - make testbindings) |& logformatter + showrun make testbindings) |& logformatter } function _run_docker-py() { source .venv/docker-py/bin/activate - make run-docker-py-tests + showrun make run-docker-py-tests } function _run_endpoint() { - make test-binaries - make endpoint + showrun make test-binaries + showrun make endpoint } function _run_minikube() { _bail_if_test_can_be_skipped test/minikube msg "Testing minikube." - bats test/minikube |& logformatter + showrun bats test/minikube |& logformatter } exec_container() { @@ -187,10 +189,10 @@ function _run_swagger() { # Swagger validation takes a significant amount of time msg "Pulling \$CTR_FQIN '$CTR_FQIN' (background process)" - bin/podman pull --quiet $CTR_FQIN & + showrun bin/podman pull --quiet $CTR_FQIN & cd $GOSRC - make swagger + showrun make swagger # Cirrus-CI Artifact instruction expects file here cp -v $GOSRC/pkg/api/swagger.yaml ./ @@ -209,7 +211,7 @@ eof msg "Waiting for backgrounded podman pull to complete..." 
wait %% - bin/podman run -it --rm --security-opt label=disable \ + showrun bin/podman run -it --rm --security-opt label=disable \ --env-file=$envvarsfile \ -v $GOSRC:$GOSRC:ro \ --workdir $GOSRC \ @@ -219,9 +221,9 @@ eof function _run_build() { # Ensure always start from clean-slate with all vendor modules downloaded - make clean - make vendor - make podman-release # includes podman, podman-remote, and docs + showrun make clean + showrun make vendor + showrun make podman-release # includes podman, podman-remote, and docs # Last-minute confirmation that we're testing the desired runtime. # This Can't Possibly Fail™ in regular CI; only when updating VMs. @@ -252,7 +254,7 @@ function _run_altbuild() { cd $GOSRC case "$ALT_NAME" in *Each*) - git fetch origin + showrun git fetch origin # The make-and-check-size script, introduced 2022-03-22 in #13518, # runs 'make' (the original purpose of this check) against # each commit, then checks image sizes to make sure that @@ -264,19 +266,19 @@ function _run_altbuild() { savedhead=$(git rev-parse HEAD) # Push to PR base. First run of the script will write size files pr_base=$(git merge-base origin/$DEST_BRANCH HEAD) - git checkout $pr_base - hack/make-and-check-size $context_dir + showrun git checkout $pr_base + showrun hack/make-and-check-size $context_dir # pop back to PR, and run incremental makes. Subsequent script # invocations will compare against original size. 
- git checkout $savedhead - git rebase $pr_base -x "hack/make-and-check-size $context_dir" + showrun git checkout $savedhead + showrun git rebase $pr_base -x "hack/make-and-check-size $context_dir" rm -rf $context_dir ;; *Windows*) - make podman-remote-release-windows_amd64.zip + showrun make podman-remote-release-windows_amd64.zip ;; *RPM*) - make package + showrun make package ;; Alt*x86*Cross) arches=(\ @@ -316,7 +318,7 @@ function _run_altbuild() { function _build_altbuild_archs() { for arch in "$@"; do msg "Building release archive for $arch" - make podman-release-${arch}.tar.gz GOARCH=$arch + showrun make podman-release-${arch}.tar.gz GOARCH=$arch done } @@ -418,13 +420,13 @@ dotest() { die "Found fallback podman '$fallback_podman' in \$PATH; tests require none, as a guarantee that we're testing the right binary." fi - make ${localremote}${testsuite} PODMAN_SERVER_LOG=$PODMAN_SERVER_LOG \ + showrun make ${localremote}${testsuite} PODMAN_SERVER_LOG=$PODMAN_SERVER_LOG \ |& logformatter } _run_machine() { # N/B: Can't use _bail_if_test_can_be_skipped here b/c content isn't under test/ - make localmachine |& logformatter + showrun make localmachine |& logformatter } # Optimization: will exit if the only PR diffs are under docs/ or tests/ @@ -543,4 +545,6 @@ if [ "$(type -t $handler)" != "function" ]; then die "Unknown/Unsupported \$TEST_FLAVOR=$TEST_FLAVOR" fi -$handler +showrun $handler + +showrun echo "finished" diff --git a/contrib/cirrus/setup_environment.sh b/contrib/cirrus/setup_environment.sh index 5dd1f5dc36..596eaac996 100755 --- a/contrib/cirrus/setup_environment.sh +++ b/contrib/cirrus/setup_environment.sh @@ -12,6 +12,8 @@ set -e # shellcheck source=./contrib/cirrus/lib.sh source $(dirname $0)/lib.sh +showrun echo "starting" + die_unknown() { local var_name="$1" req_env_vars var_name @@ -40,10 +42,10 @@ cp hack/podman-registry /bin # Some test operations & checks require a git "identity" _gc='git config --file /root/.gitconfig' -$_gc user.email 
"TMcTestFace@example.com" -$_gc user.name "Testy McTestface" +showrun $_gc user.email "TMcTestFace@example.com" +showrun $_gc user.name "Testy McTestface" # Bypass git safety/security checks when operating in a throwaway environment -git config --system --add safe.directory $GOSRC +showrun git config --system --add safe.directory $GOSRC # Ensure that all lower-level contexts and child-processes have # ready access to higher level orchestration (e.g Cirrus-CI) @@ -82,6 +84,7 @@ mkdir -p /etc/containers/containers.conf.d # respectively. # **IMPORTANT**: $OCI_RUNTIME is a fakeout! It is used only in e2e tests. # For actual podman, as in system tests, we force runtime in containers.conf +showrun echo "conditional check: CG_FS_TYPE [=$CG_FS_TYPE]" case "$CG_FS_TYPE" in tmpfs) if ((CONTAINER==0)); then @@ -107,6 +110,7 @@ printf "[engine]\ndatabase_backend=\"$CI_DESIRED_DATABASE\"\n" > /etc/containers # does not defaults to using `vfs` as storage driver) # shellcheck disable=SC2154 if [[ "$OS_RELEASE_ID" == "debian" ]]; then + showrun echo "conditional setup for debian" conf=/etc/containers/storage.conf if [[ -e $conf ]]; then die "FATAL! INTERNAL ERROR! Cannot override $conf" @@ -116,6 +120,7 @@ if [[ "$OS_RELEASE_ID" == "debian" ]]; then fi if ((CONTAINER==0)); then # Not yet running inside a container + showrun echo "conditional setup for CONTAINER == 0" # Discovered reemergence of BFQ scheduler bug in kernel 5.8.12-200 # which causes a kernel panic when system is under heavy I/O load. # Disable the I/O scheduler (a.k.a. elevator) for all environments, @@ -145,22 +150,24 @@ fi # Which distribution are we testing on. case "$OS_RELEASE_ID" in debian) + showrun echo "more conditional setup for debian" # FIXME 2023-04-11: workaround for runc regression causing failure # in system tests: "skipping device /dev/char/10:200 for systemd" # (Checked on 2023-08-08 and it's still too old: 1.1.5) # FIXME: please remove this once runc >= 1.2 makes it into debian. 
- modprobe tun + showrun modprobe tun # TODO: move this into image build process # We need the "en_US.UTF-8" locale for the "podman logs with non ASCII log tag" tests - sed -i '/en_US.UTF-8/s/^#//g' /etc/locale.gen - locale-gen + showrun sed -i '/en_US.UTF-8/s/^#//g' /etc/locale.gen + showrun locale-gen ;; fedora) + showrun echo "conditional setup for fedora" if ((CONTAINER==0)); then # All SELinux distros need this for systemd-in-a-container msg "Enabling container_manage_cgroup" - setsebool container_manage_cgroup true + showrun setsebool container_manage_cgroup true fi ;; *) die_unknown OS_RELEASE_ID @@ -169,6 +176,7 @@ esac # Networking: force CNI or Netavark as requested in .cirrus.yml # (this variable is mandatory). # shellcheck disable=SC2154 +showrun echo "about to set up for CI_DESIRED_NETWORK [=$CI_DESIRED_NETWORK]" case "$CI_DESIRED_NETWORK" in netavark) use_netavark ;; cni) use_cni ;; @@ -178,6 +186,7 @@ esac # Database: force SQLite or BoltDB as requested in .cirrus.yml. # If unset, will default to BoltDB. 
# shellcheck disable=SC2154 +showrun echo "about to set up for CI_DESIRED_DATABASE [=$CI_DESIRED_DATABASE]" case "$CI_DESIRED_DATABASE" in sqlite) warn "Forcing PODMAN_DB=sqlite" @@ -197,6 +206,7 @@ esac # Required to be defined by caller: The environment where primary testing happens # shellcheck disable=SC2154 +showrun echo "about to set up for TEST_ENVIRON [=$TEST_ENVIRON]" case "$TEST_ENVIRON" in host) # The e2e tests wrongly guess `--cgroup-manager` option @@ -244,6 +254,7 @@ case "$TEST_ENVIRON" in esac # Required to be defined by caller: Are we testing as root or a regular user +showrun echo "about to set up for PRIV_NAME [=$PRIV_NAME]" case "$PRIV_NAME" in root) # shellcheck disable=SC2154 @@ -265,6 +276,7 @@ esac # shellcheck disable=SC2154 if [[ -n "$ROOTLESS_USER" ]]; then + showrun echo "conditional setup for ROOTLESS_USER [=$ROOTLESS_USER]" echo "ROOTLESS_USER=$ROOTLESS_USER" >> /etc/ci_environment echo "ROOTLESS_UID=$ROOTLESS_UID" >> /etc/ci_environment fi @@ -285,7 +297,7 @@ if ((CONTAINER==0)); then nsswitch=/etc/authselect/nsswitch.conf if [[ -e $nsswitch ]]; then if grep -q -E 'hosts:.*resolve' $nsswitch; then - msg "Disabling systemd-resolved" + showrun echo "Disabling systemd-resolved" sed -i -e 's/^\(hosts: *\).*/\1files dns myhostname/' $nsswitch systemctl stop systemd-resolved rm -f /etc/resolv.conf @@ -322,46 +334,47 @@ esac # Required to be defined by caller: The primary type of testing that will be performed # shellcheck disable=SC2154 +showrun echo "about to set up for TEST_FLAVOR [=$TEST_FLAVOR]" case "$TEST_FLAVOR" in validate) - dnf install -y $PACKAGE_DOWNLOAD_DIR/python3*.rpm + showrun dnf install -y $PACKAGE_DOWNLOAD_DIR/python3*.rpm # For some reason, this is also needed for validation - make .install.pre-commit .install.gitvalidation + showrun make .install.pre-commit .install.gitvalidation ;; altbuild) # Defined in .cirrus.yml # shellcheck disable=SC2154 if [[ "$ALT_NAME" =~ RPM ]]; then - bigto dnf install -y 
glibc-minimal-langpack go-rpm-macros rpkg rpm-build shadow-utils-subid-devel + showrun bigto dnf install -y glibc-minimal-langpack go-rpm-macros rpkg rpm-build shadow-utils-subid-devel fi ;; docker-py) remove_packaged_podman_files - make install PREFIX=/usr ETCDIR=/etc + showrun make install PREFIX=/usr ETCDIR=/etc msg "Installing previously downloaded/cached packages" - dnf install -y $PACKAGE_DOWNLOAD_DIR/python3*.rpm + showrun dnf install -y $PACKAGE_DOWNLOAD_DIR/python3*.rpm virtualenv .venv/docker-py source .venv/docker-py/bin/activate - pip install --upgrade pip - pip install --requirement $GOSRC/test/python/requirements.txt + showrun pip install --upgrade pip + showrun pip install --requirement $GOSRC/test/python/requirements.txt ;; build) make clean ;; unit) - make .install.ginkgo + showrun make .install.ginkgo ;; compose_v2) - dnf -y remove docker-compose - curl -SL https://github.com/docker/compose/releases/download/v2.2.3/docker-compose-linux-x86_64 -o /usr/local/bin/docker-compose - chmod +x /usr/local/bin/docker-compose + showrun dnf -y remove docker-compose + showrun curl -SL https://github.com/docker/compose/releases/download/v2.2.3/docker-compose-linux-x86_64 -o /usr/local/bin/docker-compose + showrun chmod +x /usr/local/bin/docker-compose ;& # Continue with next item apiv2) msg "Installing previously downloaded/cached packages" - dnf install -y $PACKAGE_DOWNLOAD_DIR/python3*.rpm + showrun dnf install -y $PACKAGE_DOWNLOAD_DIR/python3*.rpm virtualenv .venv/requests source .venv/requests/bin/activate - pip install --upgrade pip - pip install --requirement $GOSRC/test/apiv2/python/requirements.txt + showrun pip install --upgrade pip + showrun pip install --requirement $GOSRC/test/apiv2/python/requirements.txt ;& # continue with next item compose) showrun make install.tools @@ -369,13 +382,14 @@ case "$TEST_FLAVOR" in showrun dnf install -y podman-docker* ;& # continue with next item int) - make .install.ginkgo + showrun make .install.ginkgo ;& sys) ;& 
upgrade_test) ;& bud) ;& bindings) ;& endpoint) + showrun echo "Entering shared endpoint setup" # Use existing host bits when testing is to happen inside a container # since this script will run again in that environment. # shellcheck disable=SC2154 @@ -397,21 +411,21 @@ case "$TEST_FLAVOR" in install_test_configs ;; minikube) - dnf install -y $PACKAGE_DOWNLOAD_DIR/minikube-latest* + showrun dnf install -y $PACKAGE_DOWNLOAD_DIR/minikube-latest* remove_packaged_podman_files - make install.tools - make install PREFIX=/usr ETCDIR=/etc - minikube config set driver podman + showrun make install.tools + showrun make install PREFIX=/usr ETCDIR=/etc + showrun minikube config set driver podman install_test_configs ;; machine) - dnf install -y podman-gvproxy* + showrun dnf install -y podman-gvproxy* remove_packaged_podman_files - make install PREFIX=/usr ETCDIR=/etc + showrun make install PREFIX=/usr ETCDIR=/etc install_test_configs ;; swagger) - make .install.swagger + showrun make .install.swagger ;; #fcos_image_build) # ;; @@ -425,6 +439,7 @@ esac if [[ ! "$OS_RELEASE_ID" =~ "debian" ]] && \ [[ "$CIRRUS_CHANGE_TITLE" =~ CI:NEXT ]] then + showrun echo "Entering setup for CI:NEXT" # shellcheck disable=SC2154 if [[ "$CIRRUS_PR_DRAFT" != "true" ]]; then die "Magic 'CI:NEXT' string can only be used on DRAFT PRs" @@ -448,3 +463,5 @@ echo -e "\n# End of global variable definitions" \ msg "Global CI Environment vars.:" grep -Ev '^#' /etc/ci_environment | sort | indent + +showrun echo "finished"