server: handle missing network namespace gracefully during networkStop

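After a host reboot, network namespaces no longer exist, but CRI-O still
attempts to tear them down during pod sandbox destruction, causing CNI plugin
failures and preventing pods from restarting properly. The fix checks that the
network namespace file exists and is valid before running CNI teardown, and skips the teardown otherwise, so that pods can restart normally after host reboots.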

Signed-off-by: Sohan Kunkerkar <sohank2602@gmail.com>
This commit is contained in:
Sohan Kunkerkar 2025-06-30 09:29:34 -04:00
parent 2edb23fd54
commit d17990aff1
5 changed files with 114 additions and 12 deletions

View File

@ -181,23 +181,42 @@ func (s *Server) networkStop(ctx context.Context, sb *sandbox.Sandbox) error {
podNetwork, err := s.newPodNetwork(ctx, sb)
if err != nil {
return err
return fmt.Errorf("failed to create pod network for sandbox %s(%s): %w", sb.Name(), sb.ID(), err)
}
// Check if the network namespace file exists and is valid before attempting CNI teardown.
// If the file doesn't exist or is invalid, skip CNI teardown and mark network as stopped.
if podNetwork.NetNS != "" {
if _, statErr := os.Stat(podNetwork.NetNS); statErr != nil {
// Network namespace file doesn't exist, mark network as stopped and return success
log.Debugf(ctx, "Network namespace file %s does not exist for pod sandbox %s(%s), skipping CNI teardown",
podNetwork.NetNS, sb.Name(), sb.ID())
return sb.SetNetworkStopped(ctx, true)
}
if validateErr := s.validateNetworkNamespace(podNetwork.NetNS); validateErr != nil {
// Network namespace file exists but is invalid (e.g., corrupted or fake file)
log.Warnf(ctx, "Network namespace file %s is invalid for pod sandbox %s(%s): %v, removing and skipping CNI teardown",
podNetwork.NetNS, sb.Name(), sb.ID(), validateErr)
s.cleanupNetns(ctx, podNetwork.NetNS, sb)
return sb.SetNetworkStopped(ctx, true)
}
}
if err := s.config.CNIPlugin().TearDownPodWithContext(stopCtx, podNetwork); err != nil {
retErr := fmt.Errorf("failed to destroy network for pod sandbox %s(%s): %w", sb.Name(), sb.ID(), err)
log.Warnf(ctx, "Failed to destroy network for pod sandbox %s(%s): %v", sb.Name(), sb.ID(), err)
if _, statErr := os.Stat(podNetwork.NetNS); statErr != nil {
return fmt.Errorf("%w: stat netns path %q: %w", retErr, podNetwork.NetNS, statErr)
// If the network namespace exists but CNI teardown failed, try to clean it up.
if podNetwork.NetNS != "" {
if _, statErr := os.Stat(podNetwork.NetNS); statErr == nil {
// Clean up the netns file since CNI teardown failed.
s.cleanupNetns(ctx, podNetwork.NetNS, sb)
}
}
// The netns file may still exists, which means that it's likely
// corrupted. Remove it to allow cleanup of the network namespace:
if rmErr := os.RemoveAll(podNetwork.NetNS); rmErr != nil {
return fmt.Errorf("%w: failed to remove netns path: %w", retErr, rmErr)
}
log.Warnf(ctx, "Removed invalid netns path %s from pod sandbox %s(%s)", podNetwork.NetNS, sb.Name(), sb.ID())
return fmt.Errorf("network teardown failed for pod sandbox %s(%s): %w", sb.Name(), sb.ID(), err)
}
return sb.SetNetworkStopped(ctx, true)

View File

@ -0,0 +1,24 @@
//go:build freebsd
// +build freebsd
package server
import (
"context"
"github.com/cri-o/cri-o/internal/lib/sandbox"
"github.com/cri-o/cri-o/internal/log"
)
// validateNetworkNamespace is the FreeBSD stand-in for the network namespace
// validity check. It does not inspect netnsPath and always returns nil, so
// every path is reported as valid on this platform.
func (s *Server) validateNetworkNamespace(netnsPath string) error {
// Network namespaces are Linux-specific, so on FreeBSD we assume it's valid
return nil
}
// cleanupNetns is the FreeBSD stand-in for network namespace file cleanup.
// It removes nothing: netnsPath and sb are ignored and the only effect is a
// debug log message stating that cleanup is not supported.
func (s *Server) cleanupNetns(ctx context.Context, netnsPath string, sb *sandbox.Sandbox) {
log.Debugf(ctx, "Network namespace cleanup not supported on this platform")
}

View File

@ -0,0 +1,36 @@
//go:build linux
// +build linux
package server
import (
"context"
"fmt"
"os"
"github.com/containernetworking/plugins/pkg/ns"
"github.com/cri-o/cri-o/internal/lib/sandbox"
"github.com/cri-o/cri-o/internal/log"
)
// validateNetworkNamespace probes netnsPath by opening it with ns.GetNS.
// It returns nil when the path is accepted as a network namespace, and an
// error wrapping the ns.GetNS failure otherwise. The handle opened for the
// probe is closed before the function returns.
func (s *Server) validateNetworkNamespace(netnsPath string) error {
	handle, openErr := ns.GetNS(netnsPath)
	if openErr == nil {
		// The handle was only needed for the probe; its close error is
		// intentionally ignored, as there is nothing to act on here.
		_ = handle.Close()

		return nil
	}

	return fmt.Errorf("invalid network namespace: %w", openErr)
}
// cleanupNetns deletes netnsPath with os.RemoveAll on a best-effort basis.
// The outcome is only logged: an info message naming the sandbox on success,
// a warning carrying the removal error on failure. Nothing is returned to the
// caller.
func (s *Server) cleanupNetns(ctx context.Context, netnsPath string, sb *sandbox.Sandbox) {
	err := os.RemoveAll(netnsPath)
	if err == nil {
		log.Infof(ctx, "Removed netns path %s from pod sandbox %s(%s)", netnsPath, sb.Name(), sb.ID())

		return
	}

	log.Warnf(ctx, "Failed to remove netns path %s: %v", netnsPath, err)
}

View File

@ -0,0 +1,23 @@
//go:build !linux && !freebsd
// +build !linux,!freebsd
package server
import (
"context"
"github.com/cri-o/cri-o/internal/lib/sandbox"
"github.com/cri-o/cri-o/internal/log"
)
// validateNetworkNamespace is the stand-in for the network namespace validity
// check on platforms other than Linux and FreeBSD. It does not inspect
// netnsPath and always returns nil, so every path is reported as valid.
func (s *Server) validateNetworkNamespace(netnsPath string) error {
return nil
}
// cleanupNetns is the stand-in for network namespace file cleanup on
// platforms other than Linux and FreeBSD. It removes nothing: netnsPath and
// sb are ignored and the only effect is a debug log message stating that
// cleanup is not supported.
func (s *Server) cleanupNetns(ctx context.Context, netnsPath string, sb *sandbox.Sandbox) {
log.Debugf(ctx, "Network namespace cleanup not supported on this platform")
}

View File

@ -185,5 +185,5 @@ function check_networking() {
# be able to remove the sandbox
crictl rmp -f "$POD"
grep -q "Removed invalid netns path $NETNS_PATH$NS from pod sandbox" "$CRIO_LOG"
grep -q "Removed netns path $NETNS_PATH$NS from pod sandbox" "$CRIO_LOG"
}