Add a new hidden command, podman system locks

This is a general debug command that identifies any lock conflicts that could lead to a deadlock. It's only intended for Libpod developers (while it does tell you if you need to run `podman system renumber`, you should never have to do that anyways, and the next commit will include a lot more technical info in the output that no one except a Libpod dev will want). Hence, hidden command, and only implemented for the local driver (recommend just running it by SSHing into a `podman machine` VM in the unlikely case it's needed by remote Podman). These conflicts should normally never happen, but having a command like this is useful for debugging deadlock conditions when they do occur. Signed-off-by: Matt Heon <mheon@redhat.com>
2023-06-05 14:47:12 -04:00 · 2023-06-05 14:47:12 -04:00 · 0948c078c2
parent 1013696ad2
commit 0948c078c2
6 changed files with 139 additions and 0 deletions
--- a/cmd/podman/system/locks.go
+++ b/cmd/podman/system/locks.go
@ -0,0 +1,48 @@
+package system
+
+import (
+	"fmt"
+
+	"github.com/containers/podman/v4/cmd/podman/registry"
+	"github.com/containers/podman/v4/cmd/podman/validate"
+	"github.com/spf13/cobra"
+)
+
+// locksCommand is the hidden `podman system locks` debugging command. Its
+// arguments are validated with validate.NoArgs and all work is done by
+// runLocks.
+var locksCommand = &cobra.Command{
+	Use:     "locks",
+	Short:   "Debug Libpod's use of locks, identifying any potential conflicts",
+	Example: "podman system locks",
+	Hidden:  true,
+	Args:    validate.NoArgs,
+	RunE: func(_ *cobra.Command, _ []string) error {
+		return runLocks()
+	},
+}
+
+// init adds locksCommand to the CLI registry with systemCmd as its parent.
+func init() {
+	entry := registry.CliCommand{
+		Command: locksCommand,
+		Parent:  systemCmd,
+	}
+	registry.Commands = append(registry.Commands, entry)
+}
+
+// runLocks fetches the lock report from the container engine and prints, for
+// every conflicting lock number, the objects that share it (one per line,
+// tab-indented). If at least one conflict was reported it also prints a
+// recommendation to run `podman system renumber`.
+//
+// The only error returned is the one produced by the engine's Locks call.
+func runLocks() error {
+	report, err := registry.ContainerEngine().Locks(registry.Context())
+	if err != nil {
+		return err
+	}
+
+	for lockNum, objects := range report.LockConflicts {
+		// The colon must precede the newline; otherwise it ends up at the
+		// start of the next line, glued to the first indented object.
+		fmt.Printf("Lock %d is in use by the following:\n", lockNum)
+		for _, obj := range objects {
+			fmt.Printf("\t%s\n", obj)
+		}
+	}
+
+	if len(report.LockConflicts) > 0 {
+		fmt.Printf("\nLock conflicts have been detected. Recommend immediate use of `podman system renumber` to resolve.\n")
+	}
+
+	return nil
+}
--- a/libpod/runtime.go
+++ b/libpod/runtime.go
@ -1188,3 +1188,73 @@ func (r *Runtime) RemoteURI() string {
 func (r *Runtime) SetRemoteURI(uri string) {
 	r.config.Engine.RemoteURI = uri
 }
+
+
+// LockConflicts gets information on potential lock conflicts.
+//
+// It lists every container, pod and volume in the runtime's state and groups
+// them by the number of the lock each one holds.
+//
+// Returns a map of lock number to the object(s) using that lock, formatted as
+// "container <id>", "pod <id>" or "volume <name>". Only locks used by more
+// than one object are included, so an empty map means no conflict was found.
+// If any of the state queries fails, its error is returned with a nil map.
+//
+// If the map returned is not empty, you should immediately renumber locks on
+// the runtime, because you have a deadlock waiting to happen.
+func (r *Runtime) LockConflicts() (map[uint32][]string, error) {
+	// Make an internal map to store what lock is associated with what.
+	// No existence check is needed before appending: indexing a missing key
+	// yields a nil slice, and append treats a nil slice as empty.
+	locksInUse := make(map[uint32][]string)
+
+	ctrs, err := r.state.AllContainers(false)
+	if err != nil {
+		return nil, err
+	}
+	for _, ctr := range ctrs {
+		lockNum := ctr.lock.ID()
+		locksInUse[lockNum] = append(locksInUse[lockNum], fmt.Sprintf("container %s", ctr.ID()))
+	}
+
+	pods, err := r.state.AllPods()
+	if err != nil {
+		return nil, err
+	}
+	for _, pod := range pods {
+		lockNum := pod.lock.ID()
+		locksInUse[lockNum] = append(locksInUse[lockNum], fmt.Sprintf("pod %s", pod.ID()))
+	}
+
+	volumes, err := r.state.AllVolumes()
+	if err != nil {
+		return nil, err
+	}
+	for _, vol := range volumes {
+		// Volumes are reported by name, not by ID.
+		lockNum := vol.lock.ID()
+		locksInUse[lockNum] = append(locksInUse[lockNum], fmt.Sprintf("volume %s", vol.Name()))
+	}
+
+	// Now go through and find any entries with >1 item associated
+	toReturn := make(map[uint32][]string)
+	for lockNum, objects := range locksInUse {
+		// If debug logging is requested, just spit out *every* lock in
+		// use.
+		logrus.Debugf("Lock number %d is in use by %v", lockNum, objects)
+
+		if len(objects) > 1 {
+			toReturn[lockNum] = objects
+		}
+	}
+
+	return toReturn, nil
+}
--- a/pkg/domain/entities/engine_container.go
+++ b/pkg/domain/entities/engine_container.go
@ -62,6 +62,7 @@ type ContainerEngine interface { //nolint:interfacebloat
 	HealthCheckRun(ctx context.Context, nameOrID string, options HealthCheckOptions) (*define.HealthCheckResults, error)
 	Info(ctx context.Context) (*define.Info, error)
 	KubeApply(ctx context.Context, body io.Reader, opts ApplyOptions) error
+	Locks(ctx context.Context) (*LocksReport, error)
 	NetworkConnect(ctx context.Context, networkname string, options NetworkConnectOptions) error
 	NetworkCreate(ctx context.Context, network types.Network, createOptions *types.NetworkCreateOptions) (*types.Network, error)
 	NetworkUpdate(ctx context.Context, networkname string, options NetworkUpdateOptions) error
--- a/pkg/domain/entities/system.go
+++ b/pkg/domain/entities/system.go
@ -120,3 +120,9 @@ type AuthReport struct {
 	IdentityToken string
 	Status        string
 }
+
+// LocksReport describes any conflicts in Libpod's lock allocations that could
+// lead to deadlocks.
+type LocksReport struct {
+	// LockConflicts maps a lock number to the objects sharing that lock.
+	// The local engine fills it with the result of Libpod's LockConflicts.
+	LockConflicts map[uint32][]string
+}
--- a/pkg/domain/infra/abi/system.go
+++ b/pkg/domain/infra/abi/system.go
@ -429,3 +429,13 @@ func (ic ContainerEngine) Version(ctx context.Context) (*entities.SystemVersionR
 	report.Client = &v
 	return &report, err
 }
+
+func (ic ContainerEngine) Locks(ctx context.Context) (*entities.LocksReport, error) {
+	var report entities.LocksReport
+	conflicts, err := ic.Libpod.LockConflicts()
+	if err != nil {
+		return nil, err
+	}
+	report.LockConflicts = conflicts
+	return &report, nil
+}
--- a/pkg/domain/infra/tunnel/system.go
+++ b/pkg/domain/infra/tunnel/system.go
@ -34,3 +34,7 @@ func (ic *ContainerEngine) Unshare(ctx context.Context, args []string, options e
 func (ic ContainerEngine) Version(ctx context.Context) (*entities.SystemVersionReport, error) {
 	return system.Version(ic.ClientCtx, nil)
 }
+
+// Locks is not implemented for the tunnel engine; it always returns a nil
+// report and an error saying that remote clients are unsupported.
+func (ic ContainerEngine) Locks(ctx context.Context) (*entities.LocksReport, error) {
+	errUnsupported := errors.New("locks is not supported on remote clients")
+	return nil, errUnsupported
+}