remove service container _after_ pods

Do not allow for removing the service container unless all associated
pods have been removed.  Previously, the service container could be
removed when all pods have exited which can lead to a number of issues.

Now, the service container is treated like an infra container and can
only be removed along with the pods.

Also make sure that a pod is unlinked from the service container once
it's being removed.

Fixes: #16964
Signed-off-by: Valentin Rothberg <vrothberg@redhat.com>
This commit is contained in:
Valentin Rothberg 2023-01-05 14:01:21 +01:00
parent bc6908e761
commit 4a7a45f973
3 changed files with 44 additions and 14 deletions

View File

@ -687,11 +687,13 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, remo
} }
if c.IsService() { if c.IsService() {
canStop, err := c.canStopServiceContainer() for _, id := range c.state.Service.Pods {
if err != nil { if _, err := c.runtime.LookupPod(id); err != nil {
if errors.Is(err, define.ErrNoSuchPod) {
continue
}
return err return err
} }
if !canStop {
return fmt.Errorf("container %s is the service container of pod(s) %s and cannot be removed without removing the pod(s)", c.ID(), strings.Join(c.state.Service.Pods, ",")) return fmt.Errorf("container %s is the service container of pod(s) %s and cannot be removed without removing the pod(s)", c.ID(), strings.Join(c.state.Service.Pods, ","))
} }
} }

View File

@ -89,6 +89,9 @@ func (c *Container) canStopServiceContainer() (bool, error) {
status, err := pod.GetPodStatus() status, err := pod.GetPodStatus()
if err != nil { if err != nil {
if errors.Is(err, define.ErrNoSuchPod) {
continue
}
return false, err return false, err
} }
@ -112,6 +115,9 @@ func (p *Pod) maybeStopServiceContainer() error {
serviceCtr, err := p.serviceContainer() serviceCtr, err := p.serviceContainer()
if err != nil { if err != nil {
if errors.Is(err, define.ErrNoSuchCtr) {
return nil
}
return fmt.Errorf("getting pod's service container: %w", err) return fmt.Errorf("getting pod's service container: %w", err)
} }
// Checking whether the service can be stopped must be done in // Checking whether the service can be stopped must be done in
@ -163,13 +169,7 @@ func (p *Pod) maybeStartServiceContainer(ctx context.Context) error {
// canRemoveServiceContainer returns true if all pods of the service are removed. // canRemoveServiceContainer returns true if all pods of the service are removed.
// Note that the method acquires the container lock. // Note that the method acquires the container lock.
func (c *Container) canRemoveServiceContainerLocked() (bool, error) { func (c *Container) canRemoveServiceContainer() (bool, error) {
c.lock.Lock()
defer c.lock.Unlock()
if err := c.syncContainer(); err != nil {
return false, err
}
if !c.IsService() { if !c.IsService() {
return false, fmt.Errorf("internal error: checking service: container %s is not a service container", c.ID()) return false, fmt.Errorf("internal error: checking service: container %s is not a service container", c.ID())
} }
@ -188,6 +188,7 @@ func (c *Container) canRemoveServiceContainerLocked() (bool, error) {
} }
// Checks whether the service container can be removed and does so. // Checks whether the service container can be removed and does so.
// It also unlinks the pod from the service container.
func (p *Pod) maybeRemoveServiceContainer() error { func (p *Pod) maybeRemoveServiceContainer() error {
if !p.hasServiceContainer() { if !p.hasServiceContainer() {
return nil return nil
@ -195,6 +196,9 @@ func (p *Pod) maybeRemoveServiceContainer() error {
serviceCtr, err := p.serviceContainer() serviceCtr, err := p.serviceContainer()
if err != nil { if err != nil {
if errors.Is(err, define.ErrNoSuchCtr) {
return nil
}
return fmt.Errorf("getting pod's service container: %w", err) return fmt.Errorf("getting pod's service container: %w", err)
} }
// Checking whether the service can be stopped must be done in // Checking whether the service can be stopped must be done in
@ -202,9 +206,31 @@ func (p *Pod) maybeRemoveServiceContainer() error {
// pod->container->servicePods hierarchy. // pod->container->servicePods hierarchy.
p.runtime.queueWork(func() { p.runtime.queueWork(func() {
logrus.Debugf("Pod %s has a service %s: checking if it can be removed", p.ID(), serviceCtr.ID()) logrus.Debugf("Pod %s has a service %s: checking if it can be removed", p.ID(), serviceCtr.ID())
canRemove, err := serviceCtr.canRemoveServiceContainerLocked() canRemove, err := func() (bool, error) { // Anonymous func for easy locking
serviceCtr.lock.Lock()
defer serviceCtr.lock.Unlock()
if err := serviceCtr.syncContainer(); err != nil {
return false, err
}
// Unlink the pod from the service container.
servicePods := make([]string, 0, len(serviceCtr.state.Service.Pods)-1)
for _, id := range serviceCtr.state.Service.Pods {
if id != p.ID() {
servicePods = append(servicePods, id)
}
}
serviceCtr.state.Service.Pods = servicePods
if err := serviceCtr.save(); err != nil {
return false, err
}
return serviceCtr.canRemoveServiceContainer()
}()
if err != nil { if err != nil {
logrus.Errorf("Checking whether service of container %s can be removed: %v", serviceCtr.ID(), err) if !errors.Is(err, define.ErrNoSuchCtr) {
logrus.Errorf("Checking whether service container %s can be removed: %v", serviceCtr.ID(), err)
}
return return
} }
if !canRemove { if !canRemove {

View File

@ -170,6 +170,8 @@ EOF
# Check for an error when trying to remove the service container # Check for an error when trying to remove the service container
run_podman 125 container rm $service_container run_podman 125 container rm $service_container
is "$output" "Error: container .* is the service container of pod(s) .* and cannot be removed without removing the pod(s)" is "$output" "Error: container .* is the service container of pod(s) .* and cannot be removed without removing the pod(s)"
run_podman 125 container rm --force $service_container
is "$output" "Error: container .* is the service container of pod(s) .* and cannot be removed without removing the pod(s)"
# Kill the pod and make sure the service is not running # Kill the pod and make sure the service is not running
run_podman pod kill test_pod run_podman pod kill test_pod