Merge pull request #18581 from vrothberg/fix-18572

wait: look for exit code in stopped state
2023-05-22 11:51:14 -04:00 · 2023-05-22 11:51:14 -04:00 · af8d19dc2e
parent 9252f926e6 1b9272a060
commit af8d19dc2e
3 changed files with 38 additions and 4 deletions
--- a/docs/source/markdown/podman-wait.1.md.in
+++ b/docs/source/markdown/podman-wait.1.md.in
@ -14,6 +14,11 @@ name or ID.  In the case of multiple containers, Podman waits on each consecutiv
 After all specified containers are stopped, the containers' return codes are printed
 separated by newline in the same order as they were given to the command.

+NOTE: there is an inherent race condition when waiting for containers with a
+restart policy of `always` or `on-failure`, such as those created by `podman
+kube play`. Such containers may be repeatedly exiting and restarting, possibly
+with different exit codes, but `podman wait` can only detect and display one of them.
+
 ## OPTIONS

 #### **--condition**=*state*
--- a/libpod/container_api.go
+++ b/libpod/container_api.go
@ -592,13 +592,21 @@ func (c *Container) WaitForExit(ctx context.Context, pollInterval time.Duration)
 			conmonAlive, err := c.ociRuntime.CheckConmonRunning(c)
 			switch {
 			case errors.Is(err, define.ErrNoSuchCtr):
+				// Container has been removed, so we assume the
+				// exit code is present in the DB.
 				containerRemoved = true
 			case err != nil:
 				return false, -1, err
 			case !conmonAlive:
+				// Give the exit code at most 20 seconds to
+				// show up in the DB.  That should be more than
+				// enough for the cleanup process.
 				timerDuration := time.Second * 20
 				conmonTimer = *time.NewTimer(timerDuration)
 				conmonTimerSet = true
+			case conmonAlive:
+				// Continue waiting if conmon's still running.
+				return false, -1, nil
 			}
 		}

@ -609,7 +617,18 @@ func (c *Container) WaitForExit(ctx context.Context, pollInterval time.Duration)
 			case <-conmonTimer.C:
 				logrus.Debugf("Exceeded conmon timeout waiting for container %s to exit", id)
 			default:
-				if !c.ensureState(define.ContainerStateExited, define.ContainerStateConfigured) {
+				switch c.state.State {
+				case define.ContainerStateExited, define.ContainerStateConfigured:
+					// Container exited, so we can look up the exit code.
+				case define.ContainerStateStopped:
+					// Continue looping unless the restart policy is always.
+					// With that policy, the container never transitions to
+					// the exited state, so we need to look up the exit code now.
+					if c.config.RestartPolicy != define.RestartPolicyAlways {
+						return false, -1, nil
+					}
+				default:
+					// Continue looping
 					return false, -1, nil
 				}
 			}
@ -617,9 +636,11 @@ func (c *Container) WaitForExit(ctx context.Context, pollInterval time.Duration)

 		exitCode, err := c.runtime.state.GetContainerExitCode(id)
 		if err != nil {
-			if errors.Is(err, define.ErrNoSuchExitCode) && c.ensureState(define.ContainerStateConfigured, define.ContainerStateCreated) {
-				// The container never ran.
-				return true, 0, nil
+			if errors.Is(err, define.ErrNoSuchExitCode) {
+				// If the container is configured or created, we must assume it never ran.
+				if c.ensureState(define.ContainerStateConfigured, define.ContainerStateCreated) {
+					return true, 0, nil
+				}
 			}
 			return true, -1, fmt.Errorf("%w (container in state %s)", err, c.state.State)
 		}
--- a/test/system/030-run.bats
+++ b/test/system/030-run.bats
@ -1104,5 +1104,13 @@ EOF
    rm -rf $romount
 }

+@test "podman run --restart=always -- wait" {
+    # Regression test for #18572: with --restart=always, `podman wait` must return in less than 20 seconds.
+    ctr=$(random_string)
+    run_podman run -d --restart=always --name=$ctr $IMAGE false
+    PODMAN_TIMEOUT=20 run_podman wait $ctr
+    is "$output" "1" "container should exit 1"
+    run_podman rm -f -t0 $ctr
+}

 # vim: filetype=sh