From f276d53532cc850e49654c1a777660170d9572e4 Mon Sep 17 00:00:00 2001 From: Paul Holzinger Date: Fri, 9 Aug 2024 18:59:22 +0200 Subject: [PATCH 1/3] libpod: fix broken saveContainerError() We cannot unlock then lock again without syncing the state as this will then save a potentially old state causing very bad things, such as double netns cleanup issues. The fix here is simple move the saveContainerError() under the same lock. The comment about the re-lock is just wrong. Not doing this under the same lock would cause us to update the error after something else changed the container alreayd. Most likely this was caused by a misunderstanding on how go defer's work. Given they run Last In - First Out (LIFO) it is safe as long as out defer function is after the defer unlock() call. I think this issue is very bad and might have caused a variety of other weird flakes. As fact I am confident that this fixes the double cleanup errors. Fixes #21569 Also fixes the netns removal ENOENT issues seen in #19721. Signed-off-by: Paul Holzinger --- libpod/container_api.go | 81 +++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 51 deletions(-) diff --git a/libpod/container_api.go b/libpod/container_api.go index 275f365b2f..a6a15d3d91 100644 --- a/libpod/container_api.go +++ b/libpod/container_api.go @@ -84,27 +84,20 @@ func (c *Container) Init(ctx context.Context, recursive bool) error { // running before starting the container. The recursive parameter, if set, will start all // dependencies before starting this container. func (c *Container) Start(ctx context.Context, recursive bool) (finalErr error) { - defer func() { - if finalErr != nil { - // Have to re-lock. - // As this is the first defer, it's the last thing to - // happen in the function - so `defer c.lock.Unlock()` - // below already fired. - if !c.batched { - c.lock.Lock() - defer c.lock.Unlock() - } - - if err := saveContainerError(c, finalErr); err != nil { - logrus.Debug(err) - } - } - }() - if !c.batched { c.lock.Lock() defer c.lock.Unlock() + // defer's are executed LIFO so we are locked here + // as long as we call this after the defer unlock() + defer func() { + if finalErr != nil { + if err := saveContainerError(c, finalErr); err != nil { + logrus.Debug(err) + } + } + }() + if err := c.syncContainer(); err != nil { return err } @@ -147,27 +140,20 @@ func (c *Container) Update(resources *spec.LinuxResources, restartPolicy *string // ordering of the two such that no output from the container is lost (e.g. the // Attach call occurs before Start). func (c *Container) Attach(ctx context.Context, streams *define.AttachStreams, keys string, resize <-chan resize.TerminalSize, start bool) (retChan <-chan error, finalErr error) { - defer func() { - if finalErr != nil { - // Have to re-lock. - // As this is the first defer, it's the last thing to - // happen in the function - so `defer c.lock.Unlock()` - // below already fired. - if !c.batched { - c.lock.Lock() - defer c.lock.Unlock() - } - - if err := saveContainerError(c, finalErr); err != nil { - logrus.Debug(err) - } - } - }() - if !c.batched { c.lock.Lock() defer c.lock.Unlock() + // defer's are executed LIFO so we are locked here + // as long as we call this after the defer unlock() + defer func() { + if finalErr != nil { + if err := saveContainerError(c, finalErr); err != nil { + logrus.Debug(err) + } + } + }() + if err := c.syncContainer(); err != nil { return nil, err } @@ -270,27 +256,20 @@ func (c *Container) Stop() error { // manually. If timeout is 0, SIGKILL will be used immediately to kill the // container. func (c *Container) StopWithTimeout(timeout uint) (finalErr error) { - defer func() { - if finalErr != nil { - // Have to re-lock. - // As this is the first defer, it's the last thing to - // happen in the function - so `defer c.lock.Unlock()` - // below already fired. - if !c.batched { - c.lock.Lock() - defer c.lock.Unlock() - } - - if err := saveContainerError(c, finalErr); err != nil { - logrus.Debug(err) - } - } - }() - if !c.batched { c.lock.Lock() defer c.lock.Unlock() + // defer's are executed LIFO so we are locked here + // as long as we call this after the defer unlock() + defer func() { + if finalErr != nil { + if err := saveContainerError(c, finalErr); err != nil { + logrus.Debug(err) + } + } + }() + if err := c.syncContainer(); err != nil { return err } From 78cb1e28cbc2a7ef246178edc742ecf61d5962b1 Mon Sep 17 00:00:00 2001 From: Paul Holzinger Date: Mon, 12 Aug 2024 11:39:24 +0200 Subject: [PATCH 2/3] libpod: do not save expected stop errors in ctr state If we try to stop a contianer that is not running or paused we get an ErrCtrStateInvalid or ErrCtrStopped error. As podman stop is idempotent this is not a user visable error at all so we should also never log it in the container state. Signed-off-by: Paul Holzinger --- libpod/container_api.go | 6 +++++- test/e2e/inspect_test.go | 9 +++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/libpod/container_api.go b/libpod/container_api.go index a6a15d3d91..300fe5ca20 100644 --- a/libpod/container_api.go +++ b/libpod/container_api.go @@ -263,7 +263,11 @@ func (c *Container) StopWithTimeout(timeout uint) (finalErr error) { // defer's are executed LIFO so we are locked here // as long as we call this after the defer unlock() defer func() { - if finalErr != nil { + // The podman stop command is idempotent while the internal function here is not. + // As such we do not want to log these errors in the state because they are not + // actually user visible errors. + if finalErr != nil && !errors.Is(finalErr, define.ErrCtrStopped) && + !errors.Is(finalErr, define.ErrCtrStateInvalid) { if err := saveContainerError(c, finalErr); err != nil { logrus.Debug(err) } diff --git a/test/e2e/inspect_test.go b/test/e2e/inspect_test.go index 202ca036ae..72b3c8b5cc 100644 --- a/test/e2e/inspect_test.go +++ b/test/e2e/inspect_test.go @@ -583,6 +583,15 @@ var _ = Describe("Podman inspect", func() { Expect(session).Should(ExitCleanly()) Expect(session.OutputToString()).To(BeEmpty()) + // Stopping the container should be a NOP and not log an error in the state here. + session = podmanTest.Podman([]string{"container", "stop", cid}) + session.WaitWithDefaultTimeout() + Expect(session).Should(ExitCleanly()) + session = podmanTest.Podman([]string{"container", "inspect", cid, "-f", "{{ .State.Error }}"}) + session.WaitWithDefaultTimeout() + Expect(session).Should(ExitCleanly()) + Expect(session.OutputToString()).To(BeEmpty(), "state error after stop") + commandNotFound := "OCI runtime attempted to invoke a command that was not found" session = podmanTest.Podman([]string{"start", cid}) session.WaitWithDefaultTimeout() From ecf88f17b6a061fcf6949abba07ba8be8101ee42 Mon Sep 17 00:00:00 2001 From: Paul Holzinger Date: Mon, 12 Aug 2024 11:47:23 +0200 Subject: [PATCH 3/3] libpod: reset state error on init If we manage to init/start a container successfully we should unset any previously stored state errors. Otherwise a user might be confused why there is an error in the state about some old error even though the container works/runs. Signed-off-by: Paul Holzinger --- libpod/container_internal.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libpod/container_internal.go b/libpod/container_internal.go index d12b2809fc..50acaf3e7f 100644 --- a/libpod/container_internal.go +++ b/libpod/container_internal.go @@ -1106,6 +1106,9 @@ func (c *Container) init(ctx context.Context, retainRetries bool) error { c.state.RestoreLog = "" c.state.ExitCode = 0 c.state.Exited = false + // Reset any previous errors as we try to init it again, either it works and we don't + // want to keep an old error around or a new error will be written anyway. + c.state.Error = "" c.state.State = define.ContainerStateCreated c.state.StoppedByUser = false c.state.RestartPolicyMatch = false