Improve robustness of `podman system reset`

Firstly, reset is now managed by the runtime itself as a part of
initialization. This ensures that it can be used even with
runtimes that would otherwise fail to be created - most notably,
when the user has changed a core path
(runroot/root/tmpdir/staticdir).

Secondly, we now attempt a best-effort removal even if the store
completely fails to be configured.

Thirdly, we now hold the alive lock for the entire reset operation.
This ensures that no other Podman process can start while we are
running a system reset, and removes any possibility of a race
where a user tries to create containers or pull images while we
are trying to perform a reset.

[NO NEW TESTS NEEDED] As far as I know, reset is not currently covered by tests.

Fixes #9075

Signed-off-by: Matthew Heon <mheon@redhat.com>
This commit is contained in:
Matthew Heon 2022-06-02 14:15:06 -04:00
parent 2039445763
commit 259c79963f
7 changed files with 170 additions and 16 deletions

View File

@ -91,20 +91,12 @@ func reset(cmd *cobra.Command, args []string) {
registry.ContainerEngine().Shutdown(registry.Context())
registry.ImageEngine().Shutdown(registry.Context())
engine, err := infra.NewSystemEngine(entities.ResetMode, registry.PodmanConfig())
if err != nil {
// Do not try to shut the engine down, as a Reset engine is not valid
// after its creation.
if _, err := infra.NewSystemEngine(entities.ResetMode, registry.PodmanConfig()); err != nil {
logrus.Error(err)
os.Exit(define.ExecErrorCodeGeneric)
}
defer engine.Shutdown(registry.Context())
if err := engine.Reset(registry.Context()); err != nil {
logrus.Error(err)
// FIXME change this to return the error like other commands
// defer will never run on os.Exit()
//nolint:gocritic
os.Exit(define.ExecErrorCodeGeneric)
}
// Shutdown podman-machine and delete all machine files
if err := resetMachine(); err != nil {

View File

@ -435,6 +435,21 @@ func WithDefaultInfraCommand(cmd string) RuntimeOption {
}
}
// WithReset returns a RuntimeOption that flags the runtime for a full system
// reset, restoring storage to factory defaults: all containers, pods, volumes,
// images, and networks are removed, along with all directories created by
// Libpod.
//
// The option only sets the runtime's doReset flag. It must be applied before
// the runtime is marked valid; applying it afterwards yields
// define.ErrRuntimeFinalized.
func WithReset() RuntimeOption {
	markForReset := func(runtime *Runtime) error {
		// Only a runtime that has not yet been finalized may be reconfigured.
		if !runtime.valid {
			runtime.doReset = true
			return nil
		}

		return define.ErrRuntimeFinalized
	}

	return markForReset
}
// WithRenumber instructs libpod to perform a lock renumbering while
// initializing. This will handle migrations from early versions of libpod with
// file locks to newer versions with SHM locking, as well as changes in the

View File

@ -17,8 +17,78 @@ import (
"github.com/sirupsen/logrus"
)
// removeAllDirs is the fallback cleanup path for a system reset: it deletes
// every Podman storage directory straight from the filesystem. It exists for
// the case where reset() cannot be used because libpod failed to initialize.
//
// The configured directories are not expected to match what Podman is really
// using (a common reason `system reset` fails in the first place), and no
// working c/storage store is assumed. Containers, pods, and volumes therefore
// cannot be removed properly; the directories are simply deleted in the hope
// that nothing is still mounted beneath them. This is likely to fail, but it
// is the best that can be done.
//
// Every directory is attempted even when an earlier removal failed. The error
// from the last failed removal is returned; earlier failures are only logged.
func (r *Runtime) removeAllDirs() error {
	// Take the runtime alive lock for the duration of the cleanup. Holding it
	// is meant to keep any other Podman process from running during the
	// reset, so containers/pods/etc cannot be created while storage is being
	// torn down. Failing to obtain the lock is logged but is not fatal.
	// TODO: maybe want a helper for getting the path? This is duped from
	// runtime.go
	lockPath := filepath.Join(r.config.Engine.TmpDir, "alive.lck")
	if lock, lockErr := storage.GetLockfile(lockPath); lockErr != nil {
		logrus.Errorf("Lock runtime alive lock %s: %v", lockPath, lockErr)
	} else {
		lock.Lock()
		defer lock.Unlock()
	}

	// pending holds the most recent removal failure. When a newer failure
	// arrives, the older one is logged so that it is not lost.
	var pending error
	purge := func(dir, what string) {
		rmErr := os.RemoveAll(dir)
		if rmErr == nil {
			return
		}
		if pending != nil {
			logrus.Errorf("Reset: %v", pending)
		}
		pending = errors.Wrap(rmErr, what)
	}

	purge(r.config.Engine.VolumePath, "removing volume path")
	purge(r.config.Engine.TmpDir, "removing tmp dir")
	purge(r.storageConfig.RunRoot, "removing run root")
	purge(r.config.Engine.StaticDir, "removing static dir")
	purge(r.storageConfig.GraphRoot, "removing graph root")

	return pending
}
// Reset removes all storage
func (r *Runtime) Reset(ctx context.Context) error {
func (r *Runtime) reset(ctx context.Context) error {
var timeout *uint
pods, err := r.GetAllPods()
if err != nil {

View File

@ -96,6 +96,10 @@ type Runtime struct {
// This bool is just needed so that we can set it for netavark interface.
syslog bool
// doReset indicates that the runtime should perform a system reset.
// All Podman files will be removed.
doReset bool
// doRenumber indicates that the runtime should perform a lock renumber
// during initialization.
// Once the runtime has been initialized and returned, this variable is
@ -235,6 +239,11 @@ func newRuntimeFromConfig(conf *config.Config, options ...RuntimeOption) (*Runti
runtime.config.CheckCgroupsAndAdjustConfig()
// If resetting storage, do *not* return a runtime.
if runtime.doReset {
return nil, nil
}
return runtime, nil
}
@ -305,6 +314,13 @@ func makeRuntime(runtime *Runtime) (retErr error) {
}
runtime.conmonPath = cPath
if runtime.noStore && runtime.doReset {
return errors.Wrapf(define.ErrInvalidArg, "cannot perform system reset if runtime is not creating a store")
}
if runtime.doReset && runtime.doRenumber {
return errors.Wrapf(define.ErrInvalidArg, "cannot perform system reset while renumbering locks")
}
// Make the static files directory if it does not exist
if err := os.MkdirAll(runtime.config.Engine.StaticDir, 0700); err != nil {
// The directory is allowed to exist
@ -339,6 +355,20 @@ func makeRuntime(runtime *Runtime) (retErr error) {
// Grab config from the database so we can reset some defaults
dbConfig, err := runtime.state.GetDBConfig()
if err != nil {
if runtime.doReset {
// We can at least delete the DB and the static files
// directory.
// Can't safely touch anything else because we aren't
// sure of other directories.
if err := runtime.state.Close(); err != nil {
logrus.Errorf("Closing database connection: %v", err)
} else {
if err := os.RemoveAll(runtime.config.Engine.StaticDir); err != nil {
logrus.Errorf("Removing static files directory %v: %v", runtime.config.Engine.StaticDir, err)
}
}
}
return errors.Wrapf(err, "error retrieving runtime configuration from database")
}
@ -372,8 +402,14 @@ func makeRuntime(runtime *Runtime) (retErr error) {
// Validate our config against the database, now that we've set our
// final storage configuration
if err := runtime.state.ValidateDBConfig(runtime); err != nil {
// If we are performing a storage reset: continue on with a
// warning. Otherwise we can't `system reset` after a change to
// the core paths.
if !runtime.doReset {
return err
}
logrus.Errorf("Runtime paths differ from those stored in database, storage reset may not remove all files")
}
if err := runtime.state.SetNamespace(runtime.config.Engine.Namespace); err != nil {
return errors.Wrapf(err, "error setting libpod namespace in state")
@ -394,6 +430,14 @@ func makeRuntime(runtime *Runtime) (retErr error) {
} else if runtime.noStore {
logrus.Debug("No store required. Not opening container store.")
} else if err := runtime.configureStore(); err != nil {
// Make a best-effort attempt to clean up if performing a
// storage reset.
if runtime.doReset {
if err := runtime.removeAllDirs(); err != nil {
logrus.Errorf("Removing libpod directories: %v", err)
}
}
return err
}
defer func() {
@ -575,6 +619,18 @@ func makeRuntime(runtime *Runtime) (retErr error) {
return err
}
// If we're resetting storage, do it now.
// We will not return a valid runtime.
// TODO: Plumb this context out so it can be set.
if runtime.doReset {
// Mark the runtime as valid, so normal functionality "mostly"
// works and we can use regular functions to remove
// ctrs/pods/etc
runtime.valid = true
return runtime.reset(context.Background())
}
// If we're renumbering locks, do it now.
// It breaks out of normal runtime init, and will not return a valid
// runtime.
@ -818,7 +874,7 @@ func (r *Runtime) DeferredShutdown(force bool) {
// still containers running or mounted
func (r *Runtime) Shutdown(force bool) error {
if !r.valid {
return define.ErrRuntimeStopped
return nil
}
if r.workerChannel != nil {

View File

@ -328,7 +328,7 @@ func (ic *ContainerEngine) SystemDf(ctx context.Context, options entities.System
}
func (se *SystemEngine) Reset(ctx context.Context) error {
return se.Libpod.Reset(ctx)
return nil
}
func (se *SystemEngine) Renumber(ctx context.Context, flags *pflag.FlagSet, config *entities.PodmanConfig) error {

View File

@ -53,7 +53,7 @@ func NewSystemEngine(setup entities.EngineSetup, facts *entities.PodmanConfig) (
case entities.RenumberMode:
r, err = GetRuntimeRenumber(context.Background(), facts.FlagSet, facts)
case entities.ResetMode:
r, err = GetRuntimeRenumber(context.Background(), facts.FlagSet, facts)
r, err = GetRuntimeReset(context.Background(), facts.FlagSet, facts)
case entities.MigrateMode:
name, flagErr := facts.FlagSet.GetString("new-runtime")
if flagErr != nil {

View File

@ -37,6 +37,7 @@ type engineOpts struct {
migrate bool
noStore bool
withFDS bool
reset bool
config *entities.PodmanConfig
}
@ -48,6 +49,7 @@ func GetRuntimeMigrate(ctx context.Context, fs *flag.FlagSet, cfg *entities.Podm
migrate: true,
noStore: false,
withFDS: true,
reset: false,
config: cfg,
})
}
@ -59,6 +61,7 @@ func GetRuntimeDisableFDs(ctx context.Context, fs *flag.FlagSet, cfg *entities.P
migrate: false,
noStore: false,
withFDS: false,
reset: false,
config: cfg,
})
}
@ -70,6 +73,7 @@ func GetRuntimeRenumber(ctx context.Context, fs *flag.FlagSet, cfg *entities.Pod
migrate: false,
noStore: false,
withFDS: true,
reset: false,
config: cfg,
})
}
@ -82,6 +86,7 @@ func GetRuntime(ctx context.Context, flags *flag.FlagSet, cfg *entities.PodmanCo
migrate: false,
noStore: false,
withFDS: true,
reset: false,
config: cfg,
})
})
@ -95,6 +100,18 @@ func GetRuntimeNoStore(ctx context.Context, fs *flag.FlagSet, cfg *entities.Podm
migrate: false,
noStore: true,
withFDS: true,
reset: false,
config: cfg,
})
}
// GetRuntimeReset calls getRuntime with the reset option enabled and file
// descriptor handling left on; renumbering, migration, and no-store mode are
// all disabled.
// NOTE(review): the runtime initialization hunks suggest no usable runtime is
// handed back when a reset is requested, so callers should presumably rely
// only on the returned error - confirm against newRuntimeFromConfig.
func GetRuntimeReset(ctx context.Context, fs *flag.FlagSet, cfg *entities.PodmanConfig) (*libpod.Runtime, error) {
	resetOpts := engineOpts{
		renumber: false,
		migrate:  false,
		noStore:  false,
		withFDS:  true,
		reset:    true,
		config:   cfg,
	}

	return getRuntime(ctx, fs, &resetOpts)
}
@ -161,6 +178,10 @@ func getRuntime(ctx context.Context, fs *flag.FlagSet, opts *engineOpts) (*libpo
}
}
if opts.reset {
options = append(options, libpod.WithReset())
}
if opts.renumber {
options = append(options, libpod.WithRenumber())
}