system test parallelization: enable two-pass approach

For the past two months we've been splitting system tests
into two categories: those that CAN be run in parallel,
and those that CANNOT. Much work has been done to replace
hardcoded names (mycontainer, mypod) with safename().
Hundreds of test runs, in CI and on Ed's laptop, have
proven this approach viable.

make {local,remote}system now runs in two steps: first
the serial ones, then the parallel ones. hack/bats will
now recognize the 'ci:parallel' tag and add --jobs (nprocs).

This requires some tweaking of leak_check, because there
can be umpteen tests running (affecting image/container/pod/etc
state) when any given test completes.

Rules for enabling parallelization in tests:

   * use unique container/pod/volume/network names (safename)
   * do not run 'podman rm -a' or 'rmi -a'
   * never use the -l (--latest) option
   * do not run 'podman ps/images' and expect precise output

Signed-off-by: Ed Santiago <santiago@redhat.com>
This commit is contained in:
Ed Santiago 2024-09-17 09:19:52 -06:00
parent f4a08f46b7
commit d571ca6536
4 changed files with 47 additions and 25 deletions

View File

@ -688,7 +688,8 @@ localmachine:
localsystem: localsystem:
# Wipe existing config, database, and cache: start with clean slate. # Wipe existing config, database, and cache: start with clean slate.
$(RM) -rf ${HOME}/.local/share/containers ${HOME}/.config/containers $(RM) -rf ${HOME}/.local/share/containers ${HOME}/.config/containers
if timeout -v 1 true; then PODMAN=$(CURDIR)/bin/podman QUADLET=$(CURDIR)/bin/quadlet bats -T test/system/; else echo "Skipping $@: 'timeout -v' unavailable'"; fi PODMAN=$(CURDIR)/bin/podman QUADLET=$(CURDIR)/bin/quadlet bats -T --filter-tags '!ci:parallel' test/system/
PODMAN=$(CURDIR)/bin/podman QUADLET=$(CURDIR)/bin/quadlet bats -T --filter-tags ci:parallel -j $$(nproc) test/system/
.PHONY: remotesystem .PHONY: remotesystem
remotesystem: remotesystem:
@ -717,7 +718,8 @@ remotesystem:
echo "Error: ./bin/podman system service did not come up" >&2;\ echo "Error: ./bin/podman system service did not come up" >&2;\
exit 1;\ exit 1;\
fi;\ fi;\
env PODMAN="$(CURDIR)/bin/podman-remote" bats -T test/system/ ;\ env PODMAN="$(CURDIR)/bin/podman-remote" bats -T --filter-tags '!ci:parallel' test/system/ ;\
env PODMAN="$(CURDIR)/bin/podman-remote" bats -T --filter-tags ci:parallel -j $$(nproc) test/system/ ;\
rc=$$?;\ rc=$$?;\
kill %1;\ kill %1;\
else \ else \

View File

@ -83,7 +83,10 @@ for i;do
--rootless) TEST_ROOT= ;; --rootless) TEST_ROOT= ;;
--remote) REMOTE=remote ;; --remote) REMOTE=remote ;;
--ts|-T) bats_opts+=("-T") ;; --ts|-T) bats_opts+=("-T") ;;
--tag=*) bats_filter=("--filter-tags" "$value") ;; --tag=*) bats_filter=("--filter-tags" "$value")
if [[ "$value" = "ci:parallel" ]]; then
bats_opts+=("--jobs" $(nproc))
fi;;
*/*.bats) TESTS=$i ;; */*.bats) TESTS=$i ;;
*) *)
if [[ $i =~ : ]]; then if [[ $i =~ : ]]; then
@ -130,7 +133,7 @@ export PODMAN_BATS_LEAK_CHECK=1
# Root # Root
if [[ "$TEST_ROOT" ]]; then if [[ "$TEST_ROOT" ]]; then
echo "# bats ${bats_filter[*]} $TESTS" echo "# bats ${bats_opts[*]} ${bats_filter[*]} $TESTS"
sudo --preserve-env=PODMAN \ sudo --preserve-env=PODMAN \
--preserve-env=QUADLET \ --preserve-env=QUADLET \
--preserve-env=PODMAN_TEST_DEBUG \ --preserve-env=PODMAN_TEST_DEBUG \
@ -144,7 +147,7 @@ fi
# Rootless. (Only if we're not already root) # Rootless. (Only if we're not already root)
if [[ "$TEST_ROOTLESS" && "$(id -u)" != 0 ]]; then if [[ "$TEST_ROOTLESS" && "$(id -u)" != 0 ]]; then
echo "--------------------------------------------------" echo "--------------------------------------------------"
echo "\$ bats ${bats_filter[*]} $TESTS" echo "\$ bats ${bats_opts[*]} ${bats_filter[*]} $TESTS"
bats "${bats_opts[@]}" "${bats_filter[@]}" $TESTS bats "${bats_opts[@]}" "${bats_filter[@]}" $TESTS
rc=$((rc | $?)) rc=$((rc | $?))
fi fi

View File

@ -222,22 +222,34 @@ function basic_teardown() {
# This must be directly after immediate-assertion-failures to capture the error code # This must be directly after immediate-assertion-failures to capture the error code
local exit_code=$? local exit_code=$?
# Only checks for leaks on a successful run (BATS_TEST_COMPLETED is set 1), # Leak check and state reset. Only run these when running tests serially!
# immediate-assertion-failures didn't fail (exit_code -eq 0) # (For parallel tests, we run a leak check only at the very end of all tests)
# and PODMAN_BATS_LEAK_CHECK is set. if [[ -z "$PARALLEL_JOBSLOT" ]]; then
# As these podman commands are slow we do not want to do this by default # Check for leaks, but only if:
# and only provide this as opt in option. (#22909) # 1) test was successful (BATS_TEST_COMPLETED is set 1); and
if [[ "$BATS_TEST_COMPLETED" -eq 1 ]] && [ $exit_code -eq 0 ] && [ -n "$PODMAN_BATS_LEAK_CHECK" ]; then # 2) immediate-assertion-failures didn't fail (exit_code -eq 0); and
leak_check # 3) PODMAN_BATS_LEAK_CHECK is set (usually only in cron).
exit_code=$((exit_code + $?)) # As these podman commands are slow we do not want to do this by default
# and only provide this as opt-in option. (#22909)
if [[ "$BATS_TEST_COMPLETED" -eq 1 ]] && [[ $exit_code -eq 0 ]] && [[ -n "$PODMAN_BATS_LEAK_CHECK" ]]; then
leak_check
exit_code=$((exit_code + $?))
fi
# Some error happened (either in teardown itself or the actual test failed)
# so do a full cleanup to ensure following tests start with a clean env.
if [ $exit_code -gt 0 ] || [ -z "$BATS_TEST_COMPLETED" ]; then
clean_setup
exit_code=$((exit_code + $?))
fi
fi fi
# Some error happened (either in teardown itself or the actual test failed) # Status file used in teardown_suite() to decide whether or not
# so do a full cleanup to ensure following tests start with a clean env. # to check for leaks
if [ $exit_code -gt 0 ] || [ -z "$BATS_TEST_COMPLETED" ]; then if [[ "$BATS_TEST_COMPLETED" -ne 1 ]]; then
clean_setup rm -f "$BATS_SUITE_TMPDIR/all-tests-passed"
exit_code=$((exit_code + $?))
fi fi
command rm -rf $PODMAN_TMPDIR command rm -rf $PODMAN_TMPDIR
exit_code=$((exit_code + $?)) exit_code=$((exit_code + $?))
return $exit_code return $exit_code
@ -426,12 +438,12 @@ function clean_setup() {
else else
# Always remove image that doesn't match by name # Always remove image that doesn't match by name
echo "# setup(): removing stray image $1" >&3 echo "# setup(): removing stray image $1" >&3
run_podman rmi --force "$1" >/dev/null 2>&1 || true _run_podman_quiet rmi --force "$1"
# Tagged image will have same IID as our test image; don't rmi it. # Tagged image will have same IID as our test image; don't rmi it.
if [[ $2 != "$PODMAN_TEST_IMAGE_ID" ]]; then if [[ $2 != "$PODMAN_TEST_IMAGE_ID" ]]; then
echo "# setup(): removing stray image $2" >&3 echo "# setup(): removing stray image $2" >&3
run_podman rmi --force "$2" >/dev/null 2>&1 || true _run_podman_quiet rmi --force "$2"
fi fi
fi fi
done done

View File

@ -32,6 +32,9 @@ function setup_suite() {
"Unable to set PODMAN_LOGIN_REGISTRY_PORT" "Unable to set PODMAN_LOGIN_REGISTRY_PORT"
clean_setup clean_setup
# Canary file. Will be removed if any individual test fails.
touch "$BATS_SUITE_TMPDIR/all-tests-passed"
} }
# Run at the very end of all tests. Useful for cleanup of non-BATS tmpdirs. # Run at the very end of all tests. Useful for cleanup of non-BATS tmpdirs.
@ -39,11 +42,13 @@ function teardown_suite() {
stop_registry stop_registry
local exit_code=$? local exit_code=$?
# After all tests make sure there are no leaks and cleanup if there are # At end, if all tests have passed, check for leaks.
leak_check # Don't do this if there were errors: failing tests may not clean up.
if [ $? -gt 0 ]; then if [[ -e "$BATS_SUITE_TMPDIR/all-tests-passed" ]]; then
exit_code=$((exit_code + 1)) leak_check
clean_setup if [ $? -gt 0 ]; then
exit_code=$((exit_code + 1))
fi
fi fi
return $exit_code return $exit_code