pkg/machine: make checkExclusiveActiveVM race free

We need to take another lock to prevent concurrent starts from different machines. I manually tested it by starting three VM in parallel with: podman machine start & podman machine start test1 & podman machine start test2 I also added a CI test that seems to work as expected (failed with the old binary, worked with the new) Before this patch I was able to start more than VM, with this patch it now only starts one of them and the other ones will fail to start with a proper error. Signed-off-by: Paul Holzinger <pholzing@redhat.com>
2024-03-13 15:30:32 +01:00 · 2024-03-13 15:30:32 +01:00 · e82d196269
parent b238303a40
commit e82d196269
4 changed files with 100 additions and 21 deletions
--- a/cmd/podman/machine/start.go
+++ b/cmd/podman/machine/start.go
@ -8,7 +8,6 @@ import (
 	"github.com/containers/podman/v5/cmd/podman/registry"
 	"github.com/containers/podman/v5/libpod/events"
 	"github.com/containers/podman/v5/pkg/machine"
-	"github.com/containers/podman/v5/pkg/machine/define"
 	"github.com/containers/podman/v5/pkg/machine/env"
 	"github.com/containers/podman/v5/pkg/machine/shim"
 	"github.com/containers/podman/v5/pkg/machine/vmconfigs"
@ -64,19 +63,6 @@ func start(_ *cobra.Command, args []string) error {
 		return err
 	}

-	state, err := provider.State(mc, false)
-	if err != nil {
-		return err
-	}
-
-	if state == define.Running {
-		return define.ErrVMAlreadyRunning
-	}
-
-	if err := shim.CheckExclusiveActiveVM(provider, mc); err != nil {
-		return err
-	}
-
 	if !startOpts.Quiet {
 		fmt.Printf("Starting machine %q\n", vmName)
 	}
--- a/pkg/machine/e2e/start_test.go
+++ b/pkg/machine/e2e/start_test.go
@ -1,6 +1,7 @@
 package e2e_test

 import (
+	"sync"
 	"time"

 	"github.com/containers/podman/v5/pkg/machine/define"
@ -119,4 +120,58 @@ var _ = Describe("podman machine start", func() {
 		Expect(inspectSession2).To(Exit(0))
 		Expect(inspectSession2.outputToString()).To(Not(Equal(define.Running)))
 	})
+
+	It("start two machines in parallel", func() {
+		i := initMachine{}
+		machine1 := "m1-" + randomString()
+		session, err := mb.setName(machine1).setCmd(i.withImage(mb.imagePath)).run()
+		Expect(err).ToNot(HaveOccurred())
+		Expect(session).To(Exit(0))
+
+		machine2 := "m2-" + randomString()
+		session, err = mb.setName(machine2).setCmd(i.withImage(mb.imagePath)).run()
+		Expect(session).To(Exit(0))
+
+		var startSession1, startSession2 *machineSession
+		wg := sync.WaitGroup{}
+		wg.Add(2)
+		// now start two machine start process in parallel
+		go func() {
+			defer GinkgoRecover()
+			defer wg.Done()
+			s := startMachine{}
+			startSession1, err = mb.setName(machine1).setCmd(s).setTimeout(time.Minute * 10).run()
+			Expect(err).ToNot(HaveOccurred())
+		}()
+		go func() {
+			defer GinkgoRecover()
+			defer wg.Done()
+			s := startMachine{}
+			// ok this is a hack and should not be needed but the way these test are setup they all
+			// share "mb" which stores the name that is used for the VM, thus running two parallel
+			// can overwrite the name from the other, work around that by creating a new mb for the
+			// second run.
+			nmb, err := newMB()
+			Expect(err).ToNot(HaveOccurred())
+			startSession2, err = nmb.setName(machine2).setCmd(s).setTimeout(time.Minute * 10).run()
+			Expect(err).ToNot(HaveOccurred())
+		}()
+		wg.Wait()
+
+		// WSL can start in parallel so just check both command exit 0 there
+		if testProvider.VMType() == define.WSLVirt {
+			Expect(startSession1).To(Exit(0))
+			Expect(startSession2).To(Exit(0))
+			return
+		}
+		// other providers have a check that only one VM can be running at any given time so make sure our check is race free
+		Expect(startSession1).To(Or(Exit(0), Exit(125)), "start command should succeed or fail with 125")
+		if startSession1.ExitCode() == 0 {
+			Expect(startSession2).To(Exit(125), "first start worked, second start must fail")
+			Expect(startSession2.errorToString()).To(ContainSubstring("machine %s: VM already running or starting", machine1))
+		} else {
+			Expect(startSession2).To(Exit(0), "first start failed, second start succeed")
+			Expect(startSession1.errorToString()).To(ContainSubstring("machine %s: VM already running or starting", machine2))
+		}
+	})
 })
--- a/pkg/machine/lock/lock.go
+++ b/pkg/machine/lock/lock.go
@ -4,6 +4,7 @@ import (
 	"fmt"
 	"path/filepath"

+	"github.com/containers/podman/v5/pkg/machine/env"
 	"github.com/containers/storage/pkg/lockfile"
 )

@ -15,3 +16,21 @@ func GetMachineLock(name string, machineConfigDir string) (*lockfile.LockFile, e
 	}
 	return lock, nil
 }
+
+const machineStartLockName = "machine-start.lock"
+
+// GetMachineStartLock is a lock only used to prevent starting different machines at the same time,
+// This is required as most provides support at max 1 running VM and to check this race free we
+// cannot allows starting two machine.
+func GetMachineStartLock() (*lockfile.LockFile, error) {
+	lockDir, err := env.GetGlobalDataDir()
+	if err != nil {
+		return nil, err
+	}
+
+	lock, err := lockfile.GetLockFile(filepath.Join(lockDir, machineStartLockName))
+	if err != nil {
+		return nil, err
+	}
+	return lock, nil
+}
--- a/pkg/machine/shim/host.go
+++ b/pkg/machine/shim/host.go
@ -268,13 +268,8 @@ func VMExists(name string, vmstubbers []vmconfigs.VMProvider) (*vmconfigs.Machin
 	return nil, false, nil
 }

-// CheckExclusiveActiveVM checks if any of the machines are already running
-func CheckExclusiveActiveVM(provider vmconfigs.VMProvider, mc *vmconfigs.MachineConfig) error {
-	// Don't check if provider supports parallel running machines
-	if !provider.RequireExclusiveActive() {
-		return nil
-	}
-
+// checkExclusiveActiveVM checks if any of the machines are already running
+func checkExclusiveActiveVM(provider vmconfigs.VMProvider, mc *vmconfigs.MachineConfig) error {
 	// Check if any other machines are running; if so, we error
 	localMachines, err := getMCsOverProviders([]vmconfigs.VMProvider{provider})
 	if err != nil {
@ -384,6 +379,30 @@ func Start(mc *vmconfigs.MachineConfig, mp vmconfigs.VMProvider, dirs *machineDe
 		return fmt.Errorf("reload config: %w", err)
 	}

+	// Don't check if provider supports parallel running machines
+	if mp.RequireExclusiveActive() {
+		startLock, err := lock.GetMachineStartLock()
+		if err != nil {
+			return err
+		}
+		startLock.Lock()
+		defer startLock.Unlock()
+
+		if err := checkExclusiveActiveVM(mp, mc); err != nil {
+			return err
+		}
+	} else {
+		// still should make sure we do not start the same machine twice
+		state, err := mp.State(mc, false)
+		if err != nil {
+			return err
+		}
+
+		if state == machineDefine.Running || state == machineDefine.Starting {
+			return fmt.Errorf("machine %s: %w", mc.Name, machineDefine.ErrVMAlreadyRunning)
+		}
+	}
+
 	// Set starting to true
 	mc.Starting = true
 	if err := mc.Write(); err != nil {