machine: QEMU: recover from failed start

After a failed start, we can run into (somehow inconsistent) states
where the machine won't start because a previous QEMU process is still
running and the PID file is being used.  Stop didn't resolve the issue
as this state wasn't detected.

Allow to recover from this state by a) detecting it during start and
error out with a more helpful message than the error QEMU would
otherwise spit out, and b) by enabling stop to kill the dangling QEMU
process - even after a failed stop.

With the changes, a recovery may look as follows:
```
_  podman git:(main) _ ./bin/darwin/podman machine start
Starting machine "podman-machine-default"
Error: cannot start VM "podman-machine-default": another instance of "/opt/homebrew/bin/qemu-system-aarch64" is already running with process ID 970: please stop and restart the VM
_  podman git:(main) _ ./bin/darwin/podman machine stop
Machine "podman-machine-default" stopped successfully
_  podman git:(main) _ ./bin/darwin/podman machine start
Starting machine "podman-machine-default"
Waiting for VM ...
```

Please note that this change does not prevent us from running into such
inconsistent states but only allows for recovering from them.

[NO NEW TESTS NEEDED] - there is no reliable reproducer.

Fixes: #16054
Signed-off-by: Valentin Rothberg <vrothberg@redhat.com>
This commit is contained in:
Valentin Rothberg 2023-08-01 13:44:07 +02:00
parent 1656b93b86
commit 8b7701f522
1 changed files with 67 additions and 1 deletions

View File

@ -28,6 +28,7 @@ import (
"github.com/containers/storage/pkg/lockfile"
"github.com/digitalocean/go-qemu/qmp"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
var (
@ -597,6 +598,37 @@ func (v *MachineVM) connectToPodmanSocket(maxBackoffs int, backoff time.Duration
return
}
// qemuPid returns -1 or the PID of the running QEMU instance.
func (v *MachineVM) qemuPid() (int, error) {
pidData, err := os.ReadFile(v.VMPidFilePath.GetPath())
if err != nil {
// The file may not yet exist on start or have already been
// cleaned up after stop, so we need to be defensive.
if errors.Is(err, os.ErrNotExist) {
return -1, nil
}
return -1, err
}
if len(pidData) == 0 {
return -1, nil
}
pid, err := strconv.Atoi(strings.TrimRight(string(pidData), "\n"))
if err != nil {
logrus.Warnf("Reading QEMU pidfile: %v", err)
return -1, nil
}
if err := unix.Kill(pid, 0); err != nil {
if err == unix.ESRCH {
return -1, nil
}
return -1, fmt.Errorf("pinging QEMU process: %w", err)
}
return pid, nil
}
// Start executes the qemu command line and forks it
func (v *MachineVM) Start(name string, opts machine.StartOptions) error {
var (
@ -625,6 +657,16 @@ func (v *MachineVM) Start(name string, opts machine.StartOptions) error {
return fmt.Errorf("cannot start VM %q: %w", v.Name, machine.ErrVMAlreadyRunning)
}
// If QEMU is running already, something went wrong and we cannot
// proceed.
qemuPid, err := v.qemuPid()
if err != nil {
return err
}
if qemuPid != -1 {
return fmt.Errorf("cannot start VM %q: another instance of %q is already running with process ID %d: please stop and restart the VM", v.Name, v.CmdLine[0], qemuPid)
}
v.Starting = true
if err := v.writeConfig(); err != nil {
return fmt.Errorf("writing JSON file: %w", err)
@ -940,7 +982,31 @@ func (v *MachineVM) Stop(_ string, _ machine.StopOptions) error {
if err := v.update(); err != nil {
return err
}
return v.stopLocked()
stopErr := v.stopLocked()
// Make sure that the associated QEMU process gets killed in case it's
// still running (#16054).
qemuPid, err := v.qemuPid()
if err != nil {
if stopErr == nil {
return err
}
return fmt.Errorf("%w: %w", stopErr, err)
}
if qemuPid == -1 {
return stopErr
}
if err := unix.Kill(qemuPid, unix.SIGKILL); err != nil {
if stopErr == nil {
return err
}
return fmt.Errorf("%w: %w", stopErr, err)
}
return stopErr
}
// stopLocked stops the machine and expects the caller to hold the machine's lock.