mirror of https://github.com/containers/podman.git

Merge pull request #26086 from Honny1/hc-timeout
Fix: Ensure HealthCheck exec session terminates on timeout
commit df90606d53

@@ -627,7 +627,7 @@ func DefineCreateFlags(cmd *cobra.Command, cf *entities.ContainerCreateOptions,
 		createFlags.StringVar(
 			&cf.HealthTimeout,
 			healthTimeoutFlagName, define.DefaultHealthCheckTimeout,
-			"the maximum time allowed to complete the healthcheck before an interval is considered failed",
+			"the maximum time allowed to complete the healthcheck before an interval is considered failed and SIGKILL is sent to the healthcheck process",
 		)
 		_ = cmd.RegisterFlagCompletionFunc(healthTimeoutFlagName, completion.AutocompleteNone)
 
@@ -7,8 +7,7 @@
 The maximum time allowed to complete the healthcheck before an interval is considered failed. Like start-period, the
 value can be expressed in a time format such as **1m22s**. The default value is **30s**.
 
-Note: A timeout marks the healthcheck as failed but does not terminate the running process.
-This ensures that a slow but eventually successful healthcheck does not disrupt the container
-but is still accounted for in the health status.
+Note: A timeout marks the healthcheck as failed. If the healthcheck command itself runs longer than the specified *timeout*,
+it will be sent a `SIGKILL` signal.
 
 Note: This parameter will overwrite related healthcheck configuration from the image.
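For reference, the duration strings used in these docs (**1m22s**, **30s**, and the **3s** used by the new test below) look like Go durations; a minimal sketch, assuming Go-style duration parsing is what backs these flags:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Both duration strings from the docs above parse as Go durations.
	for _, s := range []string{"1m22s", "30s"} {
		d, err := time.ParseDuration(s)
		if err != nil {
			panic(err)
		}
		fmt.Printf("%s -> %v\n", s, d)
	}
}
```
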
@@ -820,7 +820,7 @@ func (c *Container) ExecResize(sessionID string, newSize resize.TerminalSize) er
 	return c.ociRuntime.ExecAttachResize(c, sessionID, newSize)
 }
 
-func (c *Container) healthCheckExec(config *ExecConfig, streams *define.AttachStreams) (int, error) {
+func (c *Container) healthCheckExec(config *ExecConfig, timeout time.Duration, streams *define.AttachStreams) (int, error) {
 	unlock := true
 	if !c.batched {
 		c.lock.Lock()
@@ -868,15 +868,24 @@ func (c *Container) healthCheckExec(config *ExecConfig, streams *define.AttachSt
 	if err != nil {
 		return -1, err
 	}
+	session.PID = pid
+	session.PIDData = getPidData(pid)
 
 	if !c.batched {
 		c.lock.Unlock()
 		unlock = false
 	}
 
-	err = <-attachErrChan
-	if err != nil {
-		return -1, fmt.Errorf("container %s light exec session with pid: %d error: %v", c.ID(), pid, err)
+	select {
+	case err = <-attachErrChan:
+		if err != nil {
+			return -1, fmt.Errorf("container %s light exec session with pid: %d error: %v", c.ID(), pid, err)
+		}
+	case <-time.After(timeout):
+		if err := c.ociRuntime.ExecStopContainer(c, session.ID(), 0); err != nil {
+			return -1, err
+		}
+		return -1, fmt.Errorf("%v of %s", define.ErrHealthCheckTimeout, c.HealthCheckConfig().Timeout.String())
 	}
 
 	return c.readExecExitCode(session.ID())
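The heart of the fix is the `select` above: rather than blocking on `attachErrChan` indefinitely, the wait now races the attach result against `time.After(timeout)`, and on expiry the exec session is stopped through the OCI runtime (recording `session.PID` beforehand presumably gives that stop path a PID to signal). A minimal standalone sketch of the same race-a-wait-against-a-timer pattern, using a plain `exec.Cmd` and a hypothetical `runWithTimeout` helper rather than podman's exec machinery:

```go
package main

import (
	"fmt"
	"os/exec"
	"time"
)

// runWithTimeout is a hypothetical helper mirroring the pattern above:
// wait for the command on a channel, but give up and kill it after timeout.
func runWithTimeout(cmd *exec.Cmd, timeout time.Duration) error {
	if err := cmd.Start(); err != nil {
		return err
	}
	done := make(chan error, 1)
	go func() { done <- cmd.Wait() }()

	select {
	case err := <-done:
		return err // finished (successfully or not) before the deadline
	case <-time.After(timeout):
		_ = cmd.Process.Kill() // sends SIGKILL, which the process cannot ignore
		<-done                 // reap the killed process so it is not left as a zombie
		return fmt.Errorf("command exceeded timeout of %s", timeout)
	}
}

func main() {
	err := runWithTimeout(exec.Command("sleep", "10"), 2*time.Second)
	fmt.Println("result:", err)
}
```

The buffered `done` channel plus the final `<-done` in the timeout branch ensure the killed process is still reaped rather than left behind.
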
@@ -217,4 +217,7 @@ var (
 	// ErrRemovingCtrs indicates that there was an error removing all
 	// containers from a pod.
 	ErrRemovingCtrs = errors.New("removing pod containers")
+
+	// ErrHealthCheckTimeout indicates that a HealthCheck timed out.
+	ErrHealthCheckTimeout = errors.New("healthcheck command exceeded timeout")
 )
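Declaring the timeout as a package-level sentinel lets callers classify it with `errors.Is`, which is what `runHealthCheck` does in the next hunk; note that the match only succeeds when the sentinel is wrapped with `%w` rather than flattened with `%v`. A small self-contained illustration (the `errTimeout` name is made up for the example):

```go
package main

import (
	"errors"
	"fmt"
)

// A sentinel error, analogous to define.ErrHealthCheckTimeout.
var errTimeout = errors.New("healthcheck command exceeded timeout")

func check() error {
	// %w wraps the sentinel so errors.Is can recognize it later;
	// %v would flatten it into plain text and break the match.
	return fmt.Errorf("%w of %s", errTimeout, "3s")
}

func main() {
	err := check()
	fmt.Println(errors.Is(err, errTimeout)) // true
}
```
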
@@ -93,19 +93,23 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
 	streams.AttachInput = true
 
 	logrus.Debugf("executing health check command %s for %s", strings.Join(newCommand, " "), c.ID())
-	timeStart := time.Now()
 	hcResult := define.HealthCheckSuccess
 	config := new(ExecConfig)
 	config.Command = newCommand
-	exitCode, hcErr := c.healthCheckExec(config, streams)
+	timeStart := time.Now()
+	exitCode, hcErr := c.healthCheckExec(config, c.HealthCheckConfig().Timeout, streams)
+	timeEnd := time.Now()
 	if hcErr != nil {
 		hcResult = define.HealthCheckFailure
-		if errors.Is(hcErr, define.ErrOCIRuntimeNotFound) ||
+		switch {
+		case errors.Is(hcErr, define.ErrOCIRuntimeNotFound) ||
 			errors.Is(hcErr, define.ErrOCIRuntimePermissionDenied) ||
-			errors.Is(hcErr, define.ErrOCIRuntime) {
+			errors.Is(hcErr, define.ErrOCIRuntime):
 			returnCode = 1
 			hcErr = nil
-		} else {
+		case errors.Is(hcErr, define.ErrHealthCheckTimeout):
+			returnCode = -1
+		default:
 			returnCode = 125
 		}
 	} else if exitCode != 0 {
@@ -140,7 +144,6 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
 		hcResult = define.HealthCheckContainerStopped
 	}
 
-	timeEnd := time.Now()
 	if c.HealthCheckConfig().StartPeriod > 0 {
 		// there is a start-period we need to honor; we add startPeriod to container start time
 		startPeriodTime := c.state.StartedTime.Add(c.HealthCheckConfig().StartPeriod)
@@ -156,12 +159,6 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
 		eventLog = eventLog[:c.HealthCheckMaxLogSize()]
 	}
 
-	if timeEnd.Sub(timeStart) > c.HealthCheckConfig().Timeout {
-		returnCode = -1
-		hcResult = define.HealthCheckFailure
-		hcErr = fmt.Errorf("healthcheck command exceeded timeout of %s", c.HealthCheckConfig().Timeout.String())
-	}
-
 	hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog)
 
 	healthCheckResult, err := c.updateHealthCheckLog(hcl, hcResult, inStartPeriod, isStartup)
@@ -258,7 +258,7 @@ func (r *ConmonOCIRuntime) ExecStopContainer(ctr *Container, sessionID string, t
 
 	// SIGTERM did not work. On to SIGKILL.
 	logrus.Debugf("Killing exec session %s (PID %d) of container %s with SIGKILL", sessionID, pid, ctr.ID())
-	if err := pidHandle.Kill(unix.SIGTERM); err != nil {
+	if err := pidHandle.Kill(unix.SIGKILL); err != nil {
 		if err == unix.ESRCH {
 			return nil
 		}
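The one-word change above matters because, per the comment in the diff, the surrounding code has already given the process its SIGTERM grace period; this branch is the escalation, yet it was re-sending SIGTERM instead of SIGKILL. A standalone sketch of that term-then-kill escalation, with a hypothetical `stopWithGrace` helper and `os/exec`/`syscall` standing in for podman's `pidHandle`/`unix` plumbing:

```go
package main

import (
	"fmt"
	"os/exec"
	"syscall"
	"time"
)

// stopWithGrace shows the usual escalation: ask politely with SIGTERM,
// wait a grace period, then force the process down with SIGKILL.
func stopWithGrace(cmd *exec.Cmd, grace time.Duration) error {
	done := make(chan error, 1)
	go func() { done <- cmd.Wait() }()

	if err := cmd.Process.Signal(syscall.SIGTERM); err != nil {
		return err
	}
	select {
	case <-done:
		return nil // exited within the grace period
	case <-time.After(grace):
		// SIGTERM did not work. On to SIGKILL.
		if err := cmd.Process.Kill(); err != nil {
			return err
		}
		<-done // reap the killed process
		return nil
	}
}

func main() {
	// "sleep" exits on SIGTERM, so the SIGKILL branch is only a safety net here.
	cmd := exec.Command("sleep", "60")
	if err := cmd.Start(); err != nil {
		panic(err)
	}
	fmt.Println(stopWithGrace(cmd, 2*time.Second))
}
```

As in the diff, a "no such process" result (`ESRCH`) can be treated as success, since the goal is simply that the process be gone.
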
@@ -404,4 +404,13 @@ HEALTHCHECK CMD ls -l / 2>&1`, ALPINE)
 		Expect(ps.OutputToStringArray()).To(HaveLen(2))
 		Expect(ps.OutputToString()).To(ContainSubstring("hc"))
 	})
+
+	It("podman healthcheck - health timeout", func() {
+		ctrName := "c-h-" + RandomString(6)
+		podmanTest.PodmanExitCleanly("run", "-d", "--name", ctrName, "--health-cmd", "top", "--health-timeout=3s", ALPINE, "top")
+
+		hc := podmanTest.Podman([]string{"healthcheck", "run", ctrName})
+		hc.WaitWithTimeout(10)
+		Expect(hc).Should(ExitWithError(125, "Error: healthcheck command exceeded timeout of 3s"))
+	})
 })