healthcheck: do not leak statup service

The startup service is special because we have to transition from startup to the normal unit. And in order to do so we kill ourselves (as we are run as part of the service). This means we always exited 1 which causes systemd to keep us failure and not remove the transient unit unless "reset-failed" is called. As there is no process around to do that we cannot really do this, thus make us exit(0) which makes more sense. Of course we could try to reset-failed the unit later but the code for that seems more complicated than that. Add a new test from Ed that ensures we check for all healthcheck units not just the timer to avoid leaks. I slightly modified it to provide a better error on leaks. Fixes: 0bbef4b830 ("libpod: rework shutdown handler flow") Fixes: #24351 Signed-off-by: Paul Holzinger <pholzing@redhat.com>
2024-10-24 12:11:00 +02:00 · 2024-10-24 12:11:00 +02:00 · 6069cdda00
parent 2da21d1524
commit 6069cdda00
2 changed files with 19 additions and 2 deletions
--- a/libpod/healthcheck.go
+++ b/libpod/healthcheck.go
@ -15,6 +15,7 @@ import (
 	"time"

 	"github.com/containers/podman/v5/libpod/define"
+	"github.com/containers/podman/v5/libpod/shutdown"
 	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 )
@ -273,6 +274,13 @@ func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) {
 		// Which happens to be us.
 		// So this has to be last - after this, systemd serves us a
 		// SIGTERM and we exit.
+		// Special case, via SIGTERM we exit(1) which means systemd logs a failure in the unit.
+		// We do not want this as the unit will be leaked on failure states unless "reset-failed"
+		// is called. Fundamentally this is expected so switch it to exit 0.
+		// NOTE: This is only safe while being called from "podman healthcheck run" which we know
+		// is the case here as we should not alter the exit code of another process that just
+		// happened to call this.
+		shutdown.SetExitCode(0)
 		if err := c.removeTransientFiles(ctx, true, oldUnit); err != nil {
 			logrus.Errorf("Error removing container %s healthcheck: %v", c.ID(), err)
 			return
--- a/test/system/220-healthcheck.bats
+++ b/test/system/220-healthcheck.bats
@ -98,7 +98,11 @@ Log[-1].Output   | \"Uh-oh on stdout!\\\nUh-oh on stderr!\\\n\"

    # Check that we now we do have valid podman units with this
    # name so that the leak check below does not turn into a NOP without noticing.
-    assert "$(systemctl list-units --type timer | grep $cid)" =~ "podman" "Healthcheck systemd unit exists"
+    run -0 systemctl list-units
+    cidmatch=$(grep "$cid" <<<"$output")
+    echo "$cidmatch"
+    assert "$cidmatch" =~ " $cid-[0-9a-f]+\.timer  *.*/podman healthcheck run $cid" \
+           "Healthcheck systemd unit exists"

    current_time=$(date --iso-8601=ns)
    # After three successive failures, container should no longer be healthy
@ -117,7 +121,12 @@ Log[-1].Output   | \"Uh-oh on stdout!\\\nUh-oh on stderr!\\\n\"

    # Important check for https://github.com/containers/podman/issues/22884
    # We never should leak the unit files, healthcheck uses the cid in name so just grep that.
-    assert "$(systemctl list-units --type timer | grep $cid)" == "" "Healthcheck systemd unit cleanup"
+    # (Ignore .scope units, those are conmon and can linger for 5 minutes)
+    # (Ignore .mount, too. They are created/removed by systemd based on the actual real mounts
+    #  on the host and that is async and might be slow enough in CI to cause failures.)
+    run -0 systemctl list-units --quiet "*$cid*"
+    except_scope_mount=$(grep -vF ".scope " <<<"$output" | { grep -vF ".mount" || true; } )
+    assert "$except_scope_mount" == "" "Healthcheck systemd unit cleanup: no units leaked"
 }

@test "podman healthcheck - restart cleans up old state" {