mirror of https://github.com/containers/podman.git
Merge pull request #24358 from Luap99/healthcheck-startup-leak
healthcheck: do not leak startup service
This commit is contained in:
commit
2f6fca6edc
|
|
@ -15,6 +15,7 @@ import (
|
|||
"time"
|
||||
|
||||
"github.com/containers/podman/v5/libpod/define"
|
||||
"github.com/containers/podman/v5/libpod/shutdown"
|
||||
"github.com/sirupsen/logrus"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
|
@ -273,6 +274,13 @@ func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) {
|
|||
// Which happens to be us.
|
||||
// So this has to be last - after this, systemd serves us a
|
||||
// SIGTERM and we exit.
|
||||
// Special case, via SIGTERM we exit(1) which means systemd logs a failure in the unit.
|
||||
// We do not want this as the unit will be leaked on failure states unless "reset-failed"
|
||||
// is called. Fundamentally this is expected so switch it to exit 0.
|
||||
// NOTE: This is only safe while being called from "podman healthcheck run" which we know
|
||||
// is the case here as we should not alter the exit code of another process that just
|
||||
// happened to call this.
|
||||
shutdown.SetExitCode(0)
|
||||
if err := c.removeTransientFiles(ctx, true, oldUnit); err != nil {
|
||||
logrus.Errorf("Error removing container %s healthcheck: %v", c.ID(), err)
|
||||
return
|
||||
|
|
|
|||
|
|
@ -137,13 +137,8 @@ func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool, un
|
|||
stopErrors = append(stopErrors, fmt.Errorf("stopping systemd health-check timer %q: %w", timerFile, err))
|
||||
}
|
||||
|
||||
// Reset the service before stopping it to make sure it's being removed
|
||||
// on stop.
|
||||
serviceChan := make(chan string)
|
||||
serviceFile := fmt.Sprintf("%s.service", unitName)
|
||||
if err := conn.ResetFailedUnitContext(ctx, serviceFile); err != nil {
|
||||
logrus.Debugf("Failed to reset unit file: %q", err)
|
||||
}
|
||||
if _, err := conn.StopUnitContext(ctx, serviceFile, "ignore-dependencies", serviceChan); err != nil {
|
||||
if !strings.HasSuffix(err.Error(), ".service not loaded.") {
|
||||
stopErrors = append(stopErrors, fmt.Errorf("removing health-check service %q: %w", serviceFile, err))
|
||||
|
|
@ -151,6 +146,12 @@ func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool, un
|
|||
} else if err := systemdOpSuccessful(serviceChan); err != nil {
|
||||
stopErrors = append(stopErrors, fmt.Errorf("stopping systemd health-check service %q: %w", serviceFile, err))
|
||||
}
|
||||
// Reset the service after stopping it to make sure it's being removed, systemd keep failed transient services
|
||||
// around in its state. We do not care about the error and we need to ensure to reset the state so we do not
|
||||
// leak resources forever.
|
||||
if err := conn.ResetFailedUnitContext(ctx, serviceFile); err != nil {
|
||||
logrus.Debugf("Failed to reset unit file: %q", err)
|
||||
}
|
||||
|
||||
return errorhandling.JoinErrors(stopErrors)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -98,7 +98,11 @@ Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\\\n\"
|
|||
|
||||
# Check that we now we do have valid podman units with this
|
||||
# name so that the leak check below does not turn into a NOP without noticing.
|
||||
assert "$(systemctl list-units --type timer | grep $cid)" =~ "podman" "Healthcheck systemd unit exists"
|
||||
run -0 systemctl list-units
|
||||
cidmatch=$(grep "$cid" <<<"$output")
|
||||
echo "$cidmatch"
|
||||
assert "$cidmatch" =~ " $cid-[0-9a-f]+\.timer *.*/podman healthcheck run $cid" \
|
||||
"Healthcheck systemd unit exists"
|
||||
|
||||
current_time=$(date --iso-8601=ns)
|
||||
# After three successive failures, container should no longer be healthy
|
||||
|
|
@ -117,7 +121,12 @@ Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\\\n\"
|
|||
|
||||
# Important check for https://github.com/containers/podman/issues/22884
|
||||
# We never should leak the unit files, healthcheck uses the cid in name so just grep that.
|
||||
assert "$(systemctl list-units --type timer | grep $cid)" == "" "Healthcheck systemd unit cleanup"
|
||||
# (Ignore .scope units, those are conmon and can linger for 5 minutes)
|
||||
# (Ignore .mount, too. They are created/removed by systemd based on the actual real mounts
|
||||
# on the host and that is async and might be slow enough in CI to cause failures.)
|
||||
run -0 systemctl list-units --quiet "*$cid*"
|
||||
except_scope_mount=$(grep -vF ".scope " <<<"$output" | { grep -vF ".mount" || true; } )
|
||||
assert "$except_scope_mount" == "" "Healthcheck systemd unit cleanup: no units leaked"
|
||||
}
|
||||
|
||||
@test "podman healthcheck - restart cleans up old state" {
|
||||
|
|
|
|||
Loading…
Reference in New Issue