healthcheck: do not leak statup service

The startup service is special because we have to transition from
startup to the normal unit. And in order to do so we kill ourselves (as
we are run as part of the service). This means we always exited 1 which
causes systemd to keep us failure and not remove the transient unit
unless "reset-failed" is called. As there is no process around to do
that we cannot really do this, thus make us exit(0) which makes more
sense.

Of course we could try to reset-failed the unit later but the code for
that seems more complicated than that.

Add a new test from Ed that ensures we check for all healthcheck units
not just the timer to avoid leaks. I slightly modified it to provide a
better error on leaks.

Fixes: 0bbef4b830 ("libpod: rework shutdown handler flow")
Fixes: #24351

Signed-off-by: Paul Holzinger <pholzing@redhat.com>
This commit is contained in:
Paul Holzinger 2024-10-24 12:11:00 +02:00
parent 2da21d1524
commit 6069cdda00
No known key found for this signature in database
GPG Key ID: EB145DD938A3CAF2
2 changed files with 19 additions and 2 deletions

View File

@ -15,6 +15,7 @@ import (
"time"
"github.com/containers/podman/v5/libpod/define"
"github.com/containers/podman/v5/libpod/shutdown"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
@ -273,6 +274,13 @@ func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) {
// Which happens to be us.
// So this has to be last - after this, systemd serves us a
// SIGTERM and we exit.
// Special case, via SIGTERM we exit(1) which means systemd logs a failure in the unit.
// We do not want this as the unit will be leaked on failure states unless "reset-failed"
// is called. Fundamentally this is expected so switch it to exit 0.
// NOTE: This is only safe while being called from "podman healthcheck run" which we know
// is the case here as we should not alter the exit code of another process that just
// happened to call this.
shutdown.SetExitCode(0)
if err := c.removeTransientFiles(ctx, true, oldUnit); err != nil {
logrus.Errorf("Error removing container %s healthcheck: %v", c.ID(), err)
return

View File

@ -98,7 +98,11 @@ Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\\\n\"
# Check that we now we do have valid podman units with this
# name so that the leak check below does not turn into a NOP without noticing.
assert "$(systemctl list-units --type timer | grep $cid)" =~ "podman" "Healthcheck systemd unit exists"
run -0 systemctl list-units
cidmatch=$(grep "$cid" <<<"$output")
echo "$cidmatch"
assert "$cidmatch" =~ " $cid-[0-9a-f]+\.timer *.*/podman healthcheck run $cid" \
"Healthcheck systemd unit exists"
current_time=$(date --iso-8601=ns)
# After three successive failures, container should no longer be healthy
@ -117,7 +121,12 @@ Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\\\n\"
# Important check for https://github.com/containers/podman/issues/22884
# We never should leak the unit files, healthcheck uses the cid in name so just grep that.
assert "$(systemctl list-units --type timer | grep $cid)" == "" "Healthcheck systemd unit cleanup"
# (Ignore .scope units, those are conmon and can linger for 5 minutes)
# (Ignore .mount, too. They are created/removed by systemd based on the actual real mounts
# on the host and that is async and might be slow enough in CI to cause failures.)
run -0 systemctl list-units --quiet "*$cid*"
except_scope_mount=$(grep -vF ".scope " <<<"$output" | { grep -vF ".mount" || true; } )
assert "$except_scope_mount" == "" "Healthcheck systemd unit cleanup: no units leaked"
}
@test "podman healthcheck - restart cleans up old state" {