From 4c3531a1a4dd23faeb41da5fd9f9b33dee6d9746 Mon Sep 17 00:00:00 2001
From: Paul Holzinger
Date: Wed, 31 Jul 2024 15:33:23 +0200
Subject: [PATCH] fix network cleanup flake in play kube

When using service containers and play kube we create a complicated set
of dependencies.

First, in a pod all conmon/container cgroups are part of one slice;
that slice is removed when the entire pod is stopped, resulting in
systemd killing all processes that were part of it.

Now the issue here is in how stopPodIfNeeded() and
stopIfOnlyInfraRemains() work: once a container is cleaned up, they
check whether the pod should be stopped depending on the pod
ExitPolicy, and if so they stop all containers in that pod. However, in
our flaky test we called podman pod kill, which already killed all
containers. Thus the cleanup logic thinks it must stop the pod and
calls into pod.stopWithTimeout(). There we try to stop the containers,
but because they are already stopped this only returns errors and never
gets to the point where it would call Cleanup(). So the code does not
do cleanup and eventually calls removePodCgroup(), which causes all
conmon and other podman cleanup processes of this pod to be killed.

Thus the podman container cleanup process was likely killed while
actually trying to do the proper cleanup, which leaves us in a bad
state. Subsequent commands such as podman pod rm will try to do the
cleanup again as they see it was not completed, but then fail as they
are unable to recover from the partial cleanup state.

Long term, network cleanup needs to be more robust and ideally should
be idempotent to handle cases where cleanup was killed in the middle.

Fixes #21569

Signed-off-by: Paul Holzinger
---
 libpod/pod_api.go | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/libpod/pod_api.go b/libpod/pod_api.go
index 875d9ef3d5..9b0cd5c32f 100644
--- a/libpod/pod_api.go
+++ b/libpod/pod_api.go
@@ -169,18 +169,21 @@ func (p *Pod) stopWithTimeout(ctx context.Context, cleanup bool, timeout int) (m
 			// Can't batch these without forcing Stop() to hold the
 			// lock for the full duration of the timeout.
 			// We probably don't want to do that.
+			var err error
 			if timeout > -1 {
-				if err := c.StopWithTimeout(uint(timeout)); err != nil {
-					return err
-				}
+				err = c.StopWithTimeout(uint(timeout))
 			} else {
-				if err := c.Stop(); err != nil {
-					return err
-				}
+				err = c.Stop()
+			}
+			if err != nil && !errors.Is(err, define.ErrCtrStateInvalid) && !errors.Is(err, define.ErrCtrStopped) {
+				return err
 			}
 
 			if cleanup {
-				return c.Cleanup(ctx)
+				err := c.Cleanup(ctx)
+				if err != nil && !errors.Is(err, define.ErrCtrStateInvalid) && !errors.Is(err, define.ErrCtrStopped) {
+					return err
+				}
 			}
 
 			return nil
@@ -196,9 +199,6 @@ func (p *Pod) stopWithTimeout(ctx context.Context, cleanup bool, timeout int) (m
 	// Get returned error for every container we worked on
 	for id, channel := range ctrErrChan {
 		if err := <-channel; err != nil {
-			if errors.Is(err, define.ErrCtrStateInvalid) || errors.Is(err, define.ErrCtrStopped) {
-				continue
-			}
 			ctrErrors[id] = err
 		}
 	}
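
Note (not part of the patch): the core pattern here is "treat 'already
stopped' sentinel errors as non-fatal at the Stop()/Cleanup() call
sites, so cleanup still runs for containers that are already down".
Below is a minimal, standalone Go sketch of that pattern. The sentinel
variables are stand-ins for define.ErrCtrStateInvalid and
define.ErrCtrStopped from libpod, and the helper name
ignoreAlreadyStopped is hypothetical, chosen only for illustration.

package main

import (
	"errors"
	"fmt"
)

// Stand-ins for the libpod sentinels define.ErrCtrStateInvalid and
// define.ErrCtrStopped (illustrative only, not the real definitions).
var (
	errCtrStateInvalid = errors.New("container state improper")
	errCtrStopped      = errors.New("container already stopped")
)

// ignoreAlreadyStopped mirrors the filtering this patch adds in
// stopWithTimeout(): errors that only mean "the container is already
// down" are swallowed so the caller can proceed to Cleanup(); anything
// else is still returned as a real failure.
func ignoreAlreadyStopped(err error) error {
	if err != nil && !errors.Is(err, errCtrStateInvalid) && !errors.Is(err, errCtrStopped) {
		return err
	}
	return nil
}

func main() {
	// A stop attempt against a container that was already killed (e.g.
	// by podman pod kill) surfaces a "stopped" sentinel; with the new
	// logic this is not fatal and cleanup continues.
	stopErr := fmt.Errorf("stopping container abc: %w", errCtrStopped)
	fmt.Println("fatal?", ignoreAlreadyStopped(stopErr) != nil) // fatal? false

	// A genuine failure is still propagated.
	realErr := errors.New("sending SIGKILL: permission denied")
	fmt.Println("fatal?", ignoreAlreadyStopped(realErr) != nil) // fatal? true
}

Because errors.Is unwraps wrapped errors, the check still matches when
the sentinel is returned with extra context, which is presumably why
the filtering can live at the call sites instead of in the
error-collection loop that the second hunk removes it from.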