healthcheck: wait for systemd operations
Make sure to wait for the systemd operations to finish when starting/stopping healtcheck timers and services. Also make sure to stop the timer before the service to avoid a race with the timer. [NO NEW TESTS NEEDED] since it is a non-functional change and existing tests are expected to pass. Signed-off-by: Valentin Rothberg <vrothberg@redhat.com>
This commit is contained in:
parent
800a367d73
commit
f23ae4d660
|
@ -7,6 +7,7 @@ import (
|
|||
"os/exec"
|
||||
"strings"
|
||||
|
||||
"github.com/containers/podman/v4/pkg/errorhandling"
|
||||
"github.com/containers/podman/v4/pkg/rootless"
|
||||
"github.com/containers/podman/v4/pkg/systemd"
|
||||
"github.com/pkg/errors"
|
||||
|
@ -46,6 +47,17 @@ func (c *Container) createTimer() error {
|
|||
return nil
|
||||
}
|
||||
|
||||
// Wait for a message on the channel. Throw an error if the message is not "done".
|
||||
func systemdOpSuccessful(c chan string) error {
|
||||
msg := <-c
|
||||
switch msg {
|
||||
case "done":
|
||||
return nil
|
||||
default:
|
||||
return fmt.Errorf("expected %q but received %q", "done", msg)
|
||||
}
|
||||
}
|
||||
|
||||
// startTimer starts a systemd timer for the healthchecks
|
||||
func (c *Container) startTimer() error {
|
||||
if c.disableHealthCheckSystemd() {
|
||||
|
@ -56,8 +68,17 @@ func (c *Container) startTimer() error {
|
|||
return errors.Wrapf(err, "unable to get systemd connection to start healthchecks")
|
||||
}
|
||||
defer conn.Close()
|
||||
_, err = conn.StartUnitContext(context.Background(), fmt.Sprintf("%s.service", c.ID()), "fail", nil)
|
||||
return err
|
||||
|
||||
startFile := fmt.Sprintf("%s.service", c.ID())
|
||||
startChan := make(chan string)
|
||||
if _, err := conn.StartUnitContext(context.Background(), startFile, "fail", startChan); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := systemdOpSuccessful(startChan); err != nil {
|
||||
return fmt.Errorf("starting systemd health-check timer %q: %w", startFile, err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// removeTransientFiles removes the systemd timer and unit files
|
||||
|
@ -71,30 +92,37 @@ func (c *Container) removeTransientFiles(ctx context.Context) error {
|
|||
return errors.Wrapf(err, "unable to get systemd connection to remove healthchecks")
|
||||
}
|
||||
defer conn.Close()
|
||||
|
||||
// Errors are returned at the very end. Let's make sure to stop and
|
||||
// clean up as much as possible.
|
||||
stopErrors := []error{}
|
||||
|
||||
// Stop the timer before the service to make sure the timer does not
|
||||
// fire after the service is stopped.
|
||||
timerChan := make(chan string)
|
||||
timerFile := fmt.Sprintf("%s.timer", c.ID())
|
||||
if _, err := conn.StopUnitContext(ctx, timerFile, "fail", timerChan); err != nil {
|
||||
if !strings.HasSuffix(err.Error(), ".timer not loaded.") {
|
||||
stopErrors = append(stopErrors, fmt.Errorf("removing health-check timer %q: %w", timerFile, err))
|
||||
}
|
||||
} else if err := systemdOpSuccessful(timerChan); err != nil {
|
||||
stopErrors = append(stopErrors, fmt.Errorf("stopping systemd health-check timer %q: %w", timerFile, err))
|
||||
}
|
||||
|
||||
// Reset the service before stopping it to make sure it's being removed
|
||||
// on stop.
|
||||
serviceChan := make(chan string)
|
||||
serviceFile := fmt.Sprintf("%s.service", c.ID())
|
||||
|
||||
// If the service has failed (the healthcheck has failed), then
|
||||
// the .service file is not removed on stopping the unit file. If
|
||||
// we check the properties of the service, it will automatically
|
||||
// reset the state. But checking the state takes msecs vs usecs to
|
||||
// blindly call reset.
|
||||
if err := conn.ResetFailedUnitContext(ctx, serviceFile); err != nil {
|
||||
logrus.Debugf("failed to reset unit file: %q", err)
|
||||
logrus.Debugf("Failed to reset unit file: %q", err)
|
||||
}
|
||||
if _, err := conn.StopUnitContext(ctx, serviceFile, "fail", serviceChan); err != nil {
|
||||
if !strings.HasSuffix(err.Error(), ".service not loaded.") {
|
||||
stopErrors = append(stopErrors, fmt.Errorf("removing health-check service %q: %w", serviceFile, err))
|
||||
}
|
||||
} else if err := systemdOpSuccessful(serviceChan); err != nil {
|
||||
stopErrors = append(stopErrors, fmt.Errorf("stopping systemd health-check service %q: %w", serviceFile, err))
|
||||
}
|
||||
|
||||
// We want to ignore errors where the timer unit and/or service unit has already
|
||||
// been removed. The error return is generic so we have to check against the
|
||||
// string in the error
|
||||
if _, err = conn.StopUnitContext(ctx, serviceFile, "fail", nil); err != nil {
|
||||
if !strings.HasSuffix(err.Error(), ".service not loaded.") {
|
||||
return errors.Wrapf(err, "unable to remove service file")
|
||||
}
|
||||
}
|
||||
if _, err = conn.StopUnitContext(ctx, timerFile, "fail", nil); err != nil {
|
||||
if strings.HasSuffix(err.Error(), ".timer not loaded.") {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
return err
|
||||
return errorhandling.JoinErrors(stopErrors)
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue