mirror of https://github.com/containers/podman.git
fix --health-on-failure=restart in transient unit
As described in #17777, the `restart` on-failure action did not behave correctly when the health check was run by a transient systemd unit. It worked fine when executed outside such a unit, for instance manually or, as done in the system tests, in a scripted fashion.

There were two issues causing the `restart` on-failure action to misbehave:

1) The transient systemd units used the default `KillMode=cgroup`, which nukes all processes in the unit's cgroup, including the freshly restarted container/conmon, once the main `podman healthcheck run` process exits.

2) Podman attempted to remove the transient systemd unit and timer during the restart. That is perfectly fine when restarting the container manually, but not when the restart itself is executed inside such a transient unit. Ultimately, Podman tried to shoot itself in the foot.

Fix both issues by moving the restart logic into the cleanup process. Instead of restarting the container, `healthcheck run` will only stop the container, and the cleanup process will restart the container once it has turned unhealthy.

Fixes: #17777

Signed-off-by: Valentin Rothberg <vrothberg@redhat.com>
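For reference, the fixed behavior can be exercised directly from the command line, much like the new system test further below. This is only a sketch: the image is an arbitrary example (any image providing `top` works), and the sleep merely gives the transient systemd timer time to run the failing check.

    # assumes a systemd-based host where podman drives health checks via transient units
    $ podman run -d --name hc --health-cmd /bin/false --health-retries=1 \
          --health-on-failure=restart --health-interval=2s \
          docker.io/library/alpine top
    $ sleep 5                                          # let the timer fire and the check fail
    $ podman wait --condition=running hc               # the container must transition back to running
    $ podman inspect hc --format '{{.RestartCount}}'   # non-zero once the restart has happened
    $ podman rm -f -t0 hc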
parent 1ddf6fafcf
commit 9563415430
--- a/libpod/container_internal.go
+++ b/libpod/container_internal.go
@@ -229,6 +229,15 @@ func (c *Container) handleExitFile(exitFile string, fi os.FileInfo) error {
 }
 
 func (c *Container) shouldRestart() bool {
+    if c.config.HealthCheckOnFailureAction == define.HealthCheckOnFailureActionRestart {
+        isUnhealthy, err := c.isUnhealthy()
+        if err != nil {
+            logrus.Errorf("Checking if container is unhealthy: %v", err)
+        } else if isUnhealthy {
+            return true
+        }
+    }
+
     // If we did not get a restart policy match, return false
     // Do the same if we're not a policy that restarts.
     if !c.state.RestartPolicyMatch ||
@@ -268,6 +277,12 @@ func (c *Container) handleRestartPolicy(ctx context.Context) (_ bool, retErr err
         return false, err
     }
 
+    if c.config.HealthCheckConfig != nil {
+        if err := c.removeTransientFiles(ctx, c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil {
+            return false, err
+        }
+    }
+
     // Is the container running again?
     // If so, we don't have to do anything
     if c.ensureState(define.ContainerStateRunning, define.ContainerStatePaused) {
@@ -1429,6 +1444,7 @@ func (c *Container) restartWithTimeout(ctx context.Context, timeout uint) (retEr
         if err := c.stop(timeout); err != nil {
             return err
         }
+
         if c.config.HealthCheckConfig != nil {
             if err := c.removeTransientFiles(context.Background(), c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil {
                 logrus.Error(err.Error())
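With this change the restart no longer happens inside the health-check process itself. Roughly, and leaving the exact invocation details to conmon and Podman internals, the flow now looks like this (annotated sketch, not literal output):

    # inside the transient systemd unit (default KillMode=cgroup):
    $ podman healthcheck run <container>     # marks the container unhealthy, stops it, exits 1
    # outside the unit, the cleanup process runs for the now-stopped container:
    $ podman container cleanup <container>   # shouldRestart() sees the unhealthy status and restarts

This way the restarted container and conmon no longer live in the cgroup of the transient unit, so they survive the unit's exit.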
--- a/libpod/healthcheck.go
+++ b/libpod/healthcheck.go
@@ -184,8 +184,12 @@ func (c *Container) processHealthCheckStatus(status string) error {
         }
 
     case define.HealthCheckOnFailureActionRestart:
-        if err := c.RestartWithTimeout(context.Background(), c.config.StopTimeout); err != nil {
-            return fmt.Errorf("restarting container after health-check turned unhealthy: %w", err)
+        // We let the cleanup process handle the restart. Otherwise
+        // the container would be restarted in the context of a
+        // transient systemd unit which may cause undesired side
+        // effects.
+        if err := c.Stop(); err != nil {
+            return fmt.Errorf("restarting/stopping container after health-check turned unhealthy: %w", err)
         }
 
     case define.HealthCheckOnFailureActionStop:
@@ -346,6 +350,18 @@ func (c *Container) updateHealthStatus(status string) error {
     return os.WriteFile(c.healthCheckLogPath(), newResults, 0700)
 }
 
+// isUnhealthy returns if the current health check status in unhealthy.
+func (c *Container) isUnhealthy() (bool, error) {
+    if !c.HasHealthCheck() {
+        return false, nil
+    }
+    healthCheck, err := c.getHealthCheckLog()
+    if err != nil {
+        return false, err
+    }
+    return healthCheck.Status == define.HealthCheckUnhealthy, nil
+}
+
 // UpdateHealthCheckLog parses the health check results and writes the log
 func (c *Container) updateHealthCheckLog(hcl define.HealthCheckLog, inStartPeriod bool) (string, error) {
     c.lock.Lock()
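The new isUnhealthy() helper reads the same health state that podman inspect exposes, so the result of the on-failure handling can also be checked from the CLI. FailingStreak is the field the system test below checks; Status is the value the helper compares against "unhealthy":

    $ podman inspect <container> --format '{{.State.Healthcheck.Status}} {{.State.Healthcheck.FailingStreak}}'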
--- a/test/system/220-healthcheck.bats
+++ b/test/system/220-healthcheck.bats
@@ -139,19 +139,22 @@ Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\"
         run_podman 1 healthcheck run $ctr
         is "$output" "unhealthy" "output from 'podman healthcheck run' (policy: $policy)"
 
-        run_podman inspect $ctr --format "{{.State.Status}} {{.Config.HealthcheckOnFailureAction}}"
         if [[ $policy == "restart" ]];then
-            # Container has been restarted and health check works again
-            is "$output" "running $policy" "container has been restarted"
+            # Make sure the container transitions back to running
+            run_podman wait --condition=running $ctr
+            run_podman inspect $ctr --format "{{.RestartCount}}"
+            assert "${#lines[@]}" != 0 "Container has been restarted at least once"
             run_podman container inspect $ctr --format "{{.State.Healthcheck.FailingStreak}}"
             is "$output" "0" "Failing streak of restarted container should be 0 again"
             run_podman healthcheck run $ctr
         elif [[ $policy == "none" ]];then
+            run_podman inspect $ctr --format "{{.State.Status}} {{.Config.HealthcheckOnFailureAction}}"
             # Container is still running and health check still broken
             is "$output" "running $policy" "container continued running"
             run_podman 1 healthcheck run $ctr
             is "$output" "unhealthy" "output from 'podman healthcheck run' (policy: $policy)"
         else
+            run_podman inspect $ctr --format "{{.State.Status}} {{.Config.HealthcheckOnFailureAction}}"
             # kill and stop yield the container into a non-running state
             is "$output" ".* $policy" "container was stopped/killed (policy: $policy)"
             assert "$output" != "running $policy"
@@ -164,4 +167,39 @@ Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\"
     done
 }
 
+@test "podman healthcheck --health-on-failure with interval" {
+    ctr="healthcheck_c"
+
+    for policy in stop kill restart ;do
+        t0=$(date --iso-8601=seconds)
+        run_podman run -d --name $ctr          \
+                   --health-cmd /bin/false     \
+                   --health-retries=1          \
+                   --health-on-failure=$policy \
+                   --health-interval=1s        \
+                   $IMAGE top
+
+        if [[ $policy == "restart" ]];then
+            # Sleeping for 2 seconds makes the test much faster than using
+            # podman-wait which would compete with the container getting
+            # restarted.
+            sleep 2
+            # Make sure the container transitions back to running
+            run_podman wait --condition=running $ctr
+            run_podman inspect $ctr --format "{{.RestartCount}}"
+            assert "${#lines[@]}" != 0 "Container has been restarted at least once"
+        else
+            # kill and stop yield the container into a non-running state
+            run_podman wait $ctr
+            run_podman inspect $ctr --format "{{.State.Status}} {{.Config.HealthcheckOnFailureAction}}"
+            is "$output" ".* $policy" "container was stopped/killed (policy: $policy)"
+            assert "$output" != "running $policy"
+            # also make sure that it's not stuck in the stopping state
+            assert "$output" != "stopping $policy"
+        fi
+
+        run_podman rm -f -t0 $ctr
+    done
+}
+
 # vim: filetype=sh
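To run just this test file from a podman source checkout (a sketch assuming bats is installed and a podman binary is in PATH; the repository's hack/bats wrapper provides the same environment setup):

    $ bats test/system/220-healthcheck.bats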