report healthcheck start errors
When starting a container, consider healthcheck errors fatal. That way users know when systemd-run failed to set up the timer to run the healthcheck, and we don't get into a state where the container is running but the healthcheck is not.

This also fixes the broken error reporting from the systemd-run exec: if the binary could not be run, the output was just empty, leaving users with no idea what failed.

Fixes #25034

Signed-off-by: Paul Holzinger <pholzing@redhat.com>
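In short, the patch (shown in the diff below) turns healthcheck setup failures that were previously only logged into returned, wrapped errors, and wraps the systemd-run output so the user never sees an empty message. A minimal, self-contained sketch of that pattern, for illustration only; setupTimer and the messages are placeholders, not Podman's actual API:

package main

import (
	"errors"
	"fmt"
	"log"
)

// setupTimer stands in for the systemd-run timer setup (hypothetical).
func setupTimer() error {
	return errors.New("systemd-run: command not found")
}

// Before the patch: the failure was only logged, so the container kept
// starting without a working healthcheck timer.
func startOld() {
	if err := setupTimer(); err != nil {
		log.Print(err)
	}
}

// After the patch: the failure is wrapped and returned, so the start fails
// fast and the user sees what went wrong.
func startNew() error {
	if err := setupTimer(); err != nil {
		return fmt.Errorf("create healthcheck: %w", err)
	}
	return nil
}

func main() {
	startOld()
	if err := startNew(); err != nil {
		fmt.Println("start failed:", err)
	}
}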
parent 4ac061f383
commit 47a743bba2
@@ -1144,7 +1144,7 @@ func (c *Container) init(ctx context.Context, retainRetries bool) error {
 			timer = c.config.StartupHealthCheckConfig.Interval.String()
 		}
 		if err := c.createTimer(timer, c.config.StartupHealthCheckConfig != nil); err != nil {
-			logrus.Error(err)
+			return fmt.Errorf("create healthcheck: %w", err)
 		}
 	}
 
@@ -1333,10 +1333,10 @@ func (c *Container) start() error {
 	// to update status in such case.
 	if c.config.HealthCheckConfig != nil && !(len(c.config.HealthCheckConfig.Test) == 1 && c.config.HealthCheckConfig.Test[0] == "NONE") {
 		if err := c.updateHealthStatus(define.HealthCheckStarting); err != nil {
-			logrus.Error(err)
+			return fmt.Errorf("update healthcheck status: %w", err)
 		}
 		if err := c.startTimer(c.config.StartupHealthCheckConfig != nil); err != nil {
-			logrus.Error(err)
+			return fmt.Errorf("start healthcheck: %w", err)
 		}
 	}
 
@@ -1627,7 +1627,7 @@ func (c *Container) unpause() error {
 			timer = c.config.StartupHealthCheckConfig.Interval.String()
 		}
 		if err := c.createTimer(timer, isStartupHealthCheck); err != nil {
-			return err
+			return fmt.Errorf("create healthcheck: %w", err)
 		}
 	}
 
@@ -2895,7 +2895,9 @@ func (c *Container) resetHealthCheckTimers(noHealthCheck bool, changedTimer bool
 
 	if !isStartup {
 		if c.state.StartupHCPassed || c.config.StartupHealthCheckConfig == nil {
-			c.recreateHealthCheckTimer(context.Background(), false, false)
+			if err := c.recreateHealthCheckTimer(context.Background(), false, false); err != nil {
+				return err
+			}
 		}
 		return nil
 	}
@@ -2908,7 +2910,9 @@ func (c *Container) resetHealthCheckTimers(noHealthCheck bool, changedTimer bool
 			return err
 		}
 		if wasEnabledHealthCheck {
-			c.recreateHealthCheckTimer(context.Background(), true, true)
+			if err := c.recreateHealthCheckTimer(context.Background(), true, true); err != nil {
+				return err
+			}
 		}
 		return nil
 	}

@@ -126,9 +126,13 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
 		inStartPeriod = true
 		if hcErr != nil || exitCode != 0 {
 			hcResult = define.HealthCheckStartup
-			c.incrementStartupHCFailureCounter(ctx)
+			if err := c.incrementStartupHCFailureCounter(ctx); err != nil {
+				return define.HealthCheckInternalError, "", err
+			}
 		} else {
-			c.incrementStartupHCSuccessCounter(ctx)
+			if err := c.incrementStartupHCSuccessCounter(ctx); err != nil {
+				return define.HealthCheckInternalError, "", err
+			}
 		}
 	}
 
@@ -229,15 +233,15 @@ func checkHealthCheckCanBeRun(c *Container) (define.HealthCheckStatus, error) {
 // Can stop the startup HC and start the regular HC if the startup HC has enough
 // consecutive successes.
 // NOTE: The caller must lock and sync the container.
-func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) {
+func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) error {
 	// We don't have a startup HC, can't do anything
 	if c.config.StartupHealthCheckConfig == nil {
-		return
+		return nil
 	}
 
 	// Race: someone else got here first
 	if c.state.StartupHCPassed {
-		return
+		return nil
 	}
 
 	// Increment the success counter
@@ -256,27 +260,27 @@ func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) {
 	}
 
 	if err := c.save(); err != nil {
-		logrus.Errorf("Error saving container %s state: %v", c.ID(), err)
-		return
+		return err
 	}
 
-	if recreateTimer {
-		// This kills the process the healthcheck is running.
-		// Which happens to be us.
-		// So this has to be last - after this, systemd serves us a
-		// SIGTERM and we exit.
-		// Special case, via SIGTERM we exit(1) which means systemd logs a failure in the unit.
-		// We do not want this as the unit will be leaked on failure states unless "reset-failed"
-		// is called. Fundamentally this is expected so switch it to exit 0.
-		// NOTE: This is only safe while being called from "podman healthcheck run" which we know
-		// is the case here as we should not alter the exit code of another process that just
-		// happened to call this.
-		shutdown.SetExitCode(0)
-		c.recreateHealthCheckTimer(ctx, false, true)
+	if !recreateTimer {
+		return nil
 	}
+	// This kills the process the healthcheck is running.
+	// Which happens to be us.
+	// So this has to be last - after this, systemd serves us a
+	// SIGTERM and we exit.
+	// Special case, via SIGTERM we exit(1) which means systemd logs a failure in the unit.
+	// We do not want this as the unit will be leaked on failure states unless "reset-failed"
+	// is called. Fundamentally this is expected so switch it to exit 0.
+	// NOTE: This is only safe while being called from "podman healthcheck run" which we know
+	// is the case here as we should not alter the exit code of another process that just
+	// happened to call this.
+	shutdown.SetExitCode(0)
+	return c.recreateHealthCheckTimer(ctx, false, true)
 }
 
-func (c *Container) recreateHealthCheckTimer(ctx context.Context, isStartup bool, isStartupRemoved bool) {
+func (c *Container) recreateHealthCheckTimer(ctx context.Context, isStartup bool, isStartupRemoved bool) error {
 	logrus.Infof("Startup healthcheck for container %s passed, recreating timer", c.ID())
 
 	oldUnit := c.state.HCUnitName
@@ -287,31 +291,30 @@ func (c *Container) recreateHealthCheckTimer(ctx context.Context, isStartup bool
 	}
 
 	if err := c.createTimer(interval, isStartup); err != nil {
-		logrus.Errorf("Error recreating container %s (isStartup: %t) healthcheck: %v", c.ID(), isStartup, err)
-		return
+		return fmt.Errorf("recreating container %s (isStartup: %t) healthcheck: %v", c.ID(), isStartup, err)
 	}
 	if err := c.startTimer(isStartup); err != nil {
-		logrus.Errorf("Error restarting container %s (isStartup: %t) healthcheck timer: %v", c.ID(), isStartup, err)
+		return fmt.Errorf("restarting container %s (isStartup: %t) healthcheck timer: %v", c.ID(), isStartup, err)
 	}
 
 	if err := c.removeTransientFiles(ctx, isStartupRemoved, oldUnit); err != nil {
-		logrus.Errorf("Error removing container %s healthcheck: %v", c.ID(), err)
-		return
+		return fmt.Errorf("removing container %s healthcheck: %v", c.ID(), err)
 	}
+	return nil
 }
 
 // Increment the current startup healthcheck failure counter.
 // Can restart the container if the HC fails enough times consecutively.
 // NOTE: The caller must lock and sync the container.
-func (c *Container) incrementStartupHCFailureCounter(ctx context.Context) {
+func (c *Container) incrementStartupHCFailureCounter(ctx context.Context) error {
 	// We don't have a startup HC, can't do anything
 	if c.config.StartupHealthCheckConfig == nil {
-		return
+		return nil
 	}
 
 	// Race: someone else got here first
 	if c.state.StartupHCPassed {
-		return
+		return nil
 	}
 
 	c.state.StartupHCFailureCount++
@@ -322,14 +325,12 @@ func (c *Container) incrementStartupHCFailureCounter(ctx context.Context) {
 		logrus.Infof("Restarting container %s as startup healthcheck failed", c.ID())
 		// Restart the container
 		if err := c.restartWithTimeout(ctx, c.config.StopTimeout); err != nil {
-			logrus.Errorf("Error restarting container %s after healthcheck failure: %v", c.ID(), err)
+			return fmt.Errorf("restarting container %s after healthcheck failure: %v", c.ID(), err)
 		}
-		return
+		return nil
 	}
 
-	if err := c.save(); err != nil {
-		logrus.Errorf("Error saving container %s state: %v", c.ID(), err)
-	}
+	return c.save()
 }
 
 func newHealthCheckLog(start, end time.Time, exitCode int, log string) define.HealthCheckLog {

@@ -4,6 +4,7 @@ package libpod
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"math/rand"
 	"os"
@@ -55,7 +56,11 @@ func (c *Container) createTimer(interval string, isStartup bool) error {
 	logrus.Debugf("creating systemd-transient files: %s %s", "systemd-run", cmd)
 	systemdRun := exec.Command("systemd-run", cmd...)
 	if output, err := systemdRun.CombinedOutput(); err != nil {
-		return fmt.Errorf("%s", output)
+		exitError := &exec.ExitError{}
+		if errors.As(err, &exitError) {
+			return fmt.Errorf("systemd-run failed: %w: output: %s", err, strings.TrimSpace(string(output)))
+		}
+		return fmt.Errorf("failed to execute systemd-run: %w", err)
 	}
 
 	c.state.HCUnitName = hcUnitName

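For reference, the distinction used in the hunk above is between an *exec.ExitError (the command ran and exited non-zero, so CombinedOutput holds useful text) and any other error from os/exec (the binary never started, so the output is empty and the error itself must be reported). A small stand-alone sketch of that technique, using generic helper names rather than Podman's:

package main

import (
	"errors"
	"fmt"
	"os/exec"
	"strings"
)

// runAndReport distinguishes the two failure modes of CombinedOutput:
// *exec.ExitError means the command ran and exited non-zero, so the captured
// output is worth showing; any other error means the binary could not be
// started at all, so the output is empty and the error itself is the message.
func runAndReport(name string, args ...string) error {
	output, err := exec.Command(name, args...).CombinedOutput()
	if err == nil {
		return nil
	}
	var exitErr *exec.ExitError
	if errors.As(err, &exitErr) {
		return fmt.Errorf("%s failed: %w: output: %s", name, err, strings.TrimSpace(string(output)))
	}
	return fmt.Errorf("failed to execute %s: %w", name, err)
}

func main() {
	// "false" exists but exits 1, hitting the ExitError branch.
	fmt.Println(runAndReport("false"))
	// A path that does not exist cannot be started, hitting the other branch.
	fmt.Println(runAndReport("/no/such/binary"))
}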
@@ -499,4 +499,34 @@ function _check_health_log {
     run_podman rm -f -t0 $ctr
 }
 
+# https://github.com/containers/podman/issues/25034
+@test "podman healthcheck - start errors" {
+    skip_if_remote '$PATH overwrite not working via remote'
+    ctr1="c1-h-$(safename)"
+    ctr2="c2-h-$(safename)"
+
+    local systemd_run="$PODMAN_TMPDIR/systemd-run"
+    touch $systemd_run
+    chmod +x $systemd_run
+
+    # Set custom PATH to force our stub to be called instead of the real systemd-run.
+    PATH="$PODMAN_TMPDIR:$PATH" run_podman 126 run -d --name $ctr1 \
+           --health-cmd "true" $IMAGE /home/podman/pause
+    assert "$output" =~ "create healthcheck: failed to execute systemd-run: fork/exec $systemd_run: exec format error" "error on invalid systemd-run"
+
+    local systemd_run="$PODMAN_TMPDIR/systemd-run"
+    cat > $systemd_run <<EOF
+#!/bin/bash
+echo stdout
+echo stderr >&2
+exit 2
+EOF
+    PATH="$PODMAN_TMPDIR:$PATH" run_podman 126 run -d --name $ctr2 \
+           --health-cmd "true" $IMAGE /home/podman/pause
+    assert "$output" =~ "create healthcheck: systemd-run failed: exit status 2: output: stdout
+stderr" "systemd-run error message"
+
+    run_podman rm -f -t0 $ctr1 $ctr2
+}
+
 # vim: filetype=sh