mirror of https://github.com/containers/podman.git
healthcheck: do not leak startup service
The startup service is special because we have to transition from
startup to the normal unit. And in order to do so we kill ourselves (as
we are run as part of the service). This means we always exited 1 which
causes systemd to keep us in the failed state and not remove the transient unit
unless "reset-failed" is called. As there is no process around to do
that we cannot really do this, thus make us exit(0) which makes more
sense.
Of course we could try to reset-failed the unit later, but the code for
that would be more complicated than this simple exit-code change.
Add a new test from Ed that ensures we check for all healthcheck units
not just the timer to avoid leaks. I slightly modified it to provide a
better error on leaks.
Fixes: 0bbef4b830
("libpod: rework shutdown handler flow")
Fixes: #24351
Signed-off-by: Paul Holzinger <pholzing@redhat.com>
This commit is contained in:
parent
2da21d1524
commit
6069cdda00
|
@ -15,6 +15,7 @@ import (
|
|||
"time"
|
||||
|
||||
"github.com/containers/podman/v5/libpod/define"
|
||||
"github.com/containers/podman/v5/libpod/shutdown"
|
||||
"github.com/sirupsen/logrus"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
@ -273,6 +274,13 @@ func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) {
|
|||
// Which happens to be us.
|
||||
// So this has to be last - after this, systemd serves us a
|
||||
// SIGTERM and we exit.
|
||||
// Special case, via SIGTERM we exit(1) which means systemd logs a failure in the unit.
|
||||
// We do not want this as the unit will be leaked on failure states unless "reset-failed"
|
||||
// is called. Fundamentally this is expected so switch it to exit 0.
|
||||
// NOTE: This is only safe while being called from "podman healthcheck run" which we know
|
||||
// is the case here as we should not alter the exit code of another process that just
|
||||
// happened to call this.
|
||||
shutdown.SetExitCode(0)
|
||||
if err := c.removeTransientFiles(ctx, true, oldUnit); err != nil {
|
||||
logrus.Errorf("Error removing container %s healthcheck: %v", c.ID(), err)
|
||||
return
|
||||
|
|
|
@ -98,7 +98,11 @@ Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\\\n\"
|
|||
|
||||
# Check that we now we do have valid podman units with this
|
||||
# name so that the leak check below does not turn into a NOP without noticing.
|
||||
assert "$(systemctl list-units --type timer | grep $cid)" =~ "podman" "Healthcheck systemd unit exists"
|
||||
run -0 systemctl list-units
|
||||
cidmatch=$(grep "$cid" <<<"$output")
|
||||
echo "$cidmatch"
|
||||
assert "$cidmatch" =~ " $cid-[0-9a-f]+\.timer *.*/podman healthcheck run $cid" \
|
||||
"Healthcheck systemd unit exists"
|
||||
|
||||
current_time=$(date --iso-8601=ns)
|
||||
# After three successive failures, container should no longer be healthy
|
||||
|
@ -117,7 +121,12 @@ Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\\\n\"
|
|||
|
||||
# Important check for https://github.com/containers/podman/issues/22884
|
||||
# We never should leak the unit files, healthcheck uses the cid in name so just grep that.
|
||||
assert "$(systemctl list-units --type timer | grep $cid)" == "" "Healthcheck systemd unit cleanup"
|
||||
# (Ignore .scope units, those are conmon and can linger for 5 minutes)
|
||||
# (Ignore .mount, too. They are created/removed by systemd based on the actual real mounts
|
||||
# on the host and that is async and might be slow enough in CI to cause failures.)
|
||||
run -0 systemctl list-units --quiet "*$cid*"
|
||||
except_scope_mount=$(grep -vF ".scope " <<<"$output" | { grep -vF ".mount" || true; } )
|
||||
assert "$except_scope_mount" == "" "Healthcheck systemd unit cleanup: no units leaked"
|
||||
}
|
||||
|
||||
@test "podman healthcheck - restart cleans up old state" {
|
||||
|
|
Loading…
Reference in New Issue