mirror of https://github.com/containers/podman.git

Merge pull request #26086 from Honny1/hc-timeout
Fix: Ensure HealthCheck exec session terminates on timeout
commit df90606d53

@@ -627,7 +627,7 @@ func DefineCreateFlags(cmd *cobra.Command, cf *entities.ContainerCreateOptions,
 		createFlags.StringVar(
 			&cf.HealthTimeout,
 			healthTimeoutFlagName, define.DefaultHealthCheckTimeout,
-			"the maximum time allowed to complete the healthcheck before an interval is considered failed",
+			"the maximum time allowed to complete the healthcheck before an interval is considered failed and SIGKILL is sent to the healthcheck process",
 		)
 		_ = cmd.RegisterFlagCompletionFunc(healthTimeoutFlagName, completion.AutocompleteNone)
 
@@ -7,8 +7,7 @@
 The maximum time allowed to complete the healthcheck before an interval is considered failed. Like start-period, the
 value can be expressed in a time format such as **1m22s**. The default value is **30s**.
 
-Note: A timeout marks the healthcheck as failed but does not terminate the running process.
-This ensures that a slow but eventually successful healthcheck does not disrupt the container
-but is still accounted for in the health status.
+Note: A timeout marks the healthcheck as failed. If the healthcheck command itself runs longer than the specified *timeout*,
+it will be sent a `SIGKILL` signal.
 
 Note: This parameter will overwrite related healthcheck configuration from the image.
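For reference, the duration strings used in these docs (**1m22s**, **30s**, and the **3s** used by the new test below) look like Go durations; a minimal sketch, assuming Go-style duration parsing is what backs these flags:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Both duration strings from the docs above parse as Go durations.
	for _, s := range []string{"1m22s", "30s"} {
		d, err := time.ParseDuration(s)
		if err != nil {
			panic(err)
		}
		fmt.Printf("%s -> %v\n", s, d)
	}
}
```
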
@@ -820,7 +820,7 @@ func (c *Container) ExecResize(sessionID string, newSize resize.TerminalSize) er
 	return c.ociRuntime.ExecAttachResize(c, sessionID, newSize)
 }
 
-func (c *Container) healthCheckExec(config *ExecConfig, streams *define.AttachStreams) (int, error) {
+func (c *Container) healthCheckExec(config *ExecConfig, timeout time.Duration, streams *define.AttachStreams) (int, error) {
 	unlock := true
 	if !c.batched {
 		c.lock.Lock()
@@ -868,15 +868,24 @@ func (c *Container) healthCheckExec(config *ExecConfig, streams *define.AttachSt
 	if err != nil {
 		return -1, err
 	}
+	session.PID = pid
+	session.PIDData = getPidData(pid)
 
 	if !c.batched {
 		c.lock.Unlock()
 		unlock = false
 	}
 
-	err = <-attachErrChan
-	if err != nil {
-		return -1, fmt.Errorf("container %s light exec session with pid: %d error: %v", c.ID(), pid, err)
+	select {
+	case err = <-attachErrChan:
+		if err != nil {
+			return -1, fmt.Errorf("container %s light exec session with pid: %d error: %v", c.ID(), pid, err)
+		}
+	case <-time.After(timeout):
+		if err := c.ociRuntime.ExecStopContainer(c, session.ID(), 0); err != nil {
+			return -1, err
+		}
+		return -1, fmt.Errorf("%v of %s", define.ErrHealthCheckTimeout, c.HealthCheckConfig().Timeout.String())
 	}
 
 	return c.readExecExitCode(session.ID())
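The heart of the fix is the `select` above: rather than blocking on `attachErrChan` indefinitely, the wait now races the attach result against `time.After(timeout)`, and on expiry the exec session is stopped through the OCI runtime (recording `session.PID` beforehand presumably gives that stop path a PID to signal). A minimal standalone sketch of the same race-a-wait-against-a-timer pattern, using a plain `exec.Cmd` and a hypothetical `runWithTimeout` helper rather than podman's exec machinery:

```go
package main

import (
	"fmt"
	"os/exec"
	"time"
)

// runWithTimeout is a hypothetical helper mirroring the pattern above:
// wait for the command on a channel, but give up and kill it after timeout.
func runWithTimeout(cmd *exec.Cmd, timeout time.Duration) error {
	if err := cmd.Start(); err != nil {
		return err
	}
	done := make(chan error, 1)
	go func() { done <- cmd.Wait() }()

	select {
	case err := <-done:
		return err // finished (successfully or not) before the deadline
	case <-time.After(timeout):
		_ = cmd.Process.Kill() // sends SIGKILL, which the process cannot ignore
		<-done                 // reap the killed process so it is not left as a zombie
		return fmt.Errorf("command exceeded timeout of %s", timeout)
	}
}

func main() {
	err := runWithTimeout(exec.Command("sleep", "10"), 2*time.Second)
	fmt.Println("result:", err)
}
```

The buffered `done` channel plus the final `<-done` in the timeout branch ensure the killed process is still reaped rather than left behind.
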
@@ -217,4 +217,7 @@ var (
 	// ErrRemovingCtrs indicates that there was an error removing all
 	// containers from a pod.
 	ErrRemovingCtrs = errors.New("removing pod containers")
+
+	// ErrHealthCheckTimeout indicates that a HealthCheck timed out.
+	ErrHealthCheckTimeout = errors.New("healthcheck command exceeded timeout")
 )
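Declaring the timeout as a package-level sentinel lets callers classify it with `errors.Is`, which is what `runHealthCheck` does in the next hunk; note that the match only succeeds when the sentinel is wrapped with `%w` rather than flattened with `%v`. A small self-contained illustration (the `errTimeout` name is made up for the example):

```go
package main

import (
	"errors"
	"fmt"
)

// A sentinel error, analogous to define.ErrHealthCheckTimeout.
var errTimeout = errors.New("healthcheck command exceeded timeout")

func check() error {
	// %w wraps the sentinel so errors.Is can recognize it later;
	// %v would flatten it into plain text and break the match.
	return fmt.Errorf("%w of %s", errTimeout, "3s")
}

func main() {
	err := check()
	fmt.Println(errors.Is(err, errTimeout)) // true
}
```
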
@@ -93,19 +93,23 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
 	streams.AttachInput = true
 
 	logrus.Debugf("executing health check command %s for %s", strings.Join(newCommand, " "), c.ID())
-	timeStart := time.Now()
 	hcResult := define.HealthCheckSuccess
 	config := new(ExecConfig)
 	config.Command = newCommand
-	exitCode, hcErr := c.healthCheckExec(config, streams)
+	timeStart := time.Now()
+	exitCode, hcErr := c.healthCheckExec(config, c.HealthCheckConfig().Timeout, streams)
+	timeEnd := time.Now()
 	if hcErr != nil {
 		hcResult = define.HealthCheckFailure
-		if errors.Is(hcErr, define.ErrOCIRuntimeNotFound) ||
+		switch {
+		case errors.Is(hcErr, define.ErrOCIRuntimeNotFound) ||
 			errors.Is(hcErr, define.ErrOCIRuntimePermissionDenied) ||
-			errors.Is(hcErr, define.ErrOCIRuntime) {
+			errors.Is(hcErr, define.ErrOCIRuntime):
 			returnCode = 1
 			hcErr = nil
-		} else {
+		case errors.Is(hcErr, define.ErrHealthCheckTimeout):
+			returnCode = -1
+		default:
 			returnCode = 125
 		}
 	} else if exitCode != 0 {
@@ -140,7 +144,6 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
 		hcResult = define.HealthCheckContainerStopped
 	}
 
-	timeEnd := time.Now()
 	if c.HealthCheckConfig().StartPeriod > 0 {
 		// there is a start-period we need to honor; we add startPeriod to container start time
 		startPeriodTime := c.state.StartedTime.Add(c.HealthCheckConfig().StartPeriod)
@@ -156,12 +159,6 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
 		eventLog = eventLog[:c.HealthCheckMaxLogSize()]
 	}
 
-	if timeEnd.Sub(timeStart) > c.HealthCheckConfig().Timeout {
-		returnCode = -1
-		hcResult = define.HealthCheckFailure
-		hcErr = fmt.Errorf("healthcheck command exceeded timeout of %s", c.HealthCheckConfig().Timeout.String())
-	}
-
 	hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog)
 
 	healthCheckResult, err := c.updateHealthCheckLog(hcl, hcResult, inStartPeriod, isStartup)
@@ -258,7 +258,7 @@ func (r *ConmonOCIRuntime) ExecStopContainer(ctr *Container, sessionID string, t
 
 	// SIGTERM did not work. On to SIGKILL.
 	logrus.Debugf("Killing exec session %s (PID %d) of container %s with SIGKILL", sessionID, pid, ctr.ID())
-	if err := pidHandle.Kill(unix.SIGTERM); err != nil {
+	if err := pidHandle.Kill(unix.SIGKILL); err != nil {
 		if err == unix.ESRCH {
 			return nil
 		}
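The one-word change above matters because, per the comment in the diff, the surrounding code has already given the process its SIGTERM grace period; this branch is the escalation, yet it was re-sending SIGTERM instead of SIGKILL. A standalone sketch of that term-then-kill escalation, with a hypothetical `stopWithGrace` helper and `os/exec`/`syscall` standing in for podman's `pidHandle`/`unix` plumbing:

```go
package main

import (
	"fmt"
	"os/exec"
	"syscall"
	"time"
)

// stopWithGrace shows the usual escalation: ask politely with SIGTERM,
// wait a grace period, then force the process down with SIGKILL.
func stopWithGrace(cmd *exec.Cmd, grace time.Duration) error {
	done := make(chan error, 1)
	go func() { done <- cmd.Wait() }()

	if err := cmd.Process.Signal(syscall.SIGTERM); err != nil {
		return err
	}
	select {
	case <-done:
		return nil // exited within the grace period
	case <-time.After(grace):
		// SIGTERM did not work. On to SIGKILL.
		if err := cmd.Process.Kill(); err != nil {
			return err
		}
		<-done // reap the killed process
		return nil
	}
}

func main() {
	// "sleep" exits on SIGTERM, so the SIGKILL branch is only a safety net here.
	cmd := exec.Command("sleep", "60")
	if err := cmd.Start(); err != nil {
		panic(err)
	}
	fmt.Println(stopWithGrace(cmd, 2*time.Second))
}
```

As in the diff, a "no such process" result (`ESRCH`) can be treated as success, since the goal is simply that the process be gone.
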
@@ -404,4 +404,13 @@ HEALTHCHECK CMD ls -l / 2>&1`, ALPINE)
 		Expect(ps.OutputToStringArray()).To(HaveLen(2))
 		Expect(ps.OutputToString()).To(ContainSubstring("hc"))
 	})
+
+	It("podman healthcheck - health timeout", func() {
+		ctrName := "c-h-" + RandomString(6)
+		podmanTest.PodmanExitCleanly("run", "-d", "--name", ctrName, "--health-cmd", "top", "--health-timeout=3s", ALPINE, "top")
+
+		hc := podmanTest.Podman([]string{"healthcheck", "run", ctrName})
+		hc.WaitWithTimeout(10)
+		Expect(hc).Should(ExitWithError(125, "Error: healthcheck command exceeded timeout of 3s"))
+	})
 })