Added optional container checkpointing statistics

This adds the parameter '--print-stats' to 'podman container checkpoint'.
With '--print-stats' Podman will measure how long Podman itself, the OCI
runtime and CRIU require to create a checkpoint and print out this
information. CRIU already creates checkpointing statistics which are
just read in addition to the added measurements. In contrast to just
printing out the ID of the checkpointed container, Podman will now print
out JSON:

 # podman container checkpoint --latest --print-stats
 {
     "podman_checkpoint_duration": 360749,
     "container_statistics": [
         {
             "Id": "25244244bf2efbef30fb6857ddea8cb2e5489f07eb6659e20dda117f0c466808",
             "runtime_checkpoint_duration": 177222,
             "criu_statistics": {
                 "freezing_time": 100657,
                 "frozen_time": 60700,
                 "memdump_time": 8162,
                 "memwrite_time": 4224,
                 "pages_scanned": 20561,
                 "pages_written": 2129
             }
         }
     ]
 }

The output contains 'podman_checkpoint_duration', which is the number of
microseconds Podman required to create the checkpoint. The output also
includes 'runtime_checkpoint_duration', which is the time (also in
microseconds) the runtime needed to checkpoint that specific container.
Each container also includes 'criu_statistics', which displays the
timing information collected by CRIU.

Signed-off-by: Adrian Reber <areber@redhat.com>
This commit is contained in:
Adrian Reber 2021-11-09 09:15:50 +00:00
parent cca6df428c
commit 6202e8102b
No known key found for this signature in database
GPG Key ID: 82C9378ED3C4906A
11 changed files with 166 additions and 33 deletions

View File

@ -4,6 +4,7 @@ import (
"context"
"fmt"
"strings"
"time"
"github.com/containers/common/pkg/completion"
"github.com/containers/podman/v3/cmd/podman/common"
@ -40,6 +41,11 @@ var (
var checkpointOptions entities.CheckpointOptions
// checkpointStatistics is the JSON document printed by the checkpoint
// command when checkpointOptions.PrintStats is set.
type checkpointStatistics struct {
// PodmanDuration is the time in microseconds between entering the
// checkpoint command and receiving the reports from the container engine.
PodmanDuration int64 `json:"podman_checkpoint_duration"`
// ContainerStatistics holds one report for every container that was
// checkpointed without an error.
ContainerStatistics []*entities.CheckpointReport `json:"container_statistics"`
}
func init() {
registry.Commands = append(registry.Commands, registry.CliCommand{
Command: checkpointCommand,
@ -63,11 +69,19 @@ func init() {
flags.StringP("compress", "c", "zstd", "Select compression algorithm (gzip, none, zstd) for checkpoint archive.")
_ = checkpointCommand.RegisterFlagCompletionFunc("compress", common.AutocompleteCheckpointCompressType)
flags.BoolVar(
&checkpointOptions.PrintStats,
"print-stats",
false,
"Display checkpoint statistics",
)
validate.AddLatestFlag(checkpointCommand, &checkpointOptions.Latest)
}
func checkpoint(cmd *cobra.Command, args []string) error {
var errs utils.OutputErrors
podmanStart := time.Now()
if cmd.Flags().Changed("compress") {
if checkpointOptions.Export == "" {
return errors.Errorf("--compress can only be used with --export")
@ -102,12 +116,30 @@ func checkpoint(cmd *cobra.Command, args []string) error {
if err != nil {
return err
}
podmanFinished := time.Now()
var statistics checkpointStatistics
for _, r := range responses {
if r.Err == nil {
fmt.Println(r.Id)
if checkpointOptions.PrintStats {
statistics.ContainerStatistics = append(statistics.ContainerStatistics, r)
} else {
fmt.Println(r.Id)
}
} else {
errs = append(errs, r.Err)
}
}
if checkpointOptions.PrintStats {
statistics.PodmanDuration = podmanFinished.Sub(podmanStart).Microseconds()
j, err := json.MarshalIndent(statistics, "", " ")
if err != nil {
return err
}
fmt.Println(string(j))
}
return errs.PrintErrors()
}

View File

@ -794,21 +794,29 @@ type ContainerCheckpointOptions struct {
// container no PID 1 will be in the namespace and that is not
// possible.
Pod string
// PrintStats tells the API to fill out the statistics about
// how much time each component in the stack requires to
// checkpoint a container.
PrintStats bool
}
// Checkpoint checkpoints a container
func (c *Container) Checkpoint(ctx context.Context, options ContainerCheckpointOptions) error {
// The return values *define.CRIUCheckpointRestoreStatistics and int64 (time
// the runtime needs to checkpoint the container) are only set if
// options.PrintStats is set to true. Not setting options.PrintStats to true
// will return nil and 0.
func (c *Container) Checkpoint(ctx context.Context, options ContainerCheckpointOptions) (*define.CRIUCheckpointRestoreStatistics, int64, error) {
logrus.Debugf("Trying to checkpoint container %s", c.ID())
if options.TargetFile != "" {
if err := c.prepareCheckpointExport(); err != nil {
return err
return nil, 0, err
}
}
if options.WithPrevious {
if err := c.canWithPrevious(); err != nil {
return err
return nil, 0, err
}
}
@ -817,7 +825,7 @@ func (c *Container) Checkpoint(ctx context.Context, options ContainerCheckpointO
defer c.lock.Unlock()
if err := c.syncContainer(); err != nil {
return err
return nil, 0, err
}
}
return c.checkpoint(ctx, options)

View File

@ -1129,25 +1129,26 @@ func (c *Container) checkpointRestoreSupported(version int) error {
return nil
}
func (c *Container) checkpoint(ctx context.Context, options ContainerCheckpointOptions) error {
func (c *Container) checkpoint(ctx context.Context, options ContainerCheckpointOptions) (*define.CRIUCheckpointRestoreStatistics, int64, error) {
if err := c.checkpointRestoreSupported(criu.MinCriuVersion); err != nil {
return err
return nil, 0, err
}
if c.state.State != define.ContainerStateRunning {
return errors.Wrapf(define.ErrCtrStateInvalid, "%q is not running, cannot checkpoint", c.state.State)
return nil, 0, errors.Wrapf(define.ErrCtrStateInvalid, "%q is not running, cannot checkpoint", c.state.State)
}
if c.AutoRemove() && options.TargetFile == "" {
return errors.Errorf("cannot checkpoint containers that have been started with '--rm' unless '--export' is used")
return nil, 0, errors.Errorf("cannot checkpoint containers that have been started with '--rm' unless '--export' is used")
}
if err := crutils.CRCreateFileWithLabel(c.bundlePath(), "dump.log", c.MountLabel()); err != nil {
return err
return nil, 0, err
}
if err := c.ociRuntime.CheckpointContainer(c, options); err != nil {
return err
runtimeCheckpointDuration, err := c.ociRuntime.CheckpointContainer(c, options)
if err != nil {
return nil, 0, err
}
// Save network.status. This is needed to restore the container with
@ -1155,7 +1156,7 @@ func (c *Container) checkpoint(ctx context.Context, options ContainerCheckpointO
// with one interface.
// FIXME: will this break something?
if _, err := metadata.WriteJSONFile(c.getNetworkStatus(), c.bundlePath(), metadata.NetworkStatusFile); err != nil {
return err
return nil, 0, err
}
defer c.newContainerEvent(events.Checkpoint)
@ -1165,13 +1166,13 @@ func (c *Container) checkpoint(ctx context.Context, options ContainerCheckpointO
if options.WithPrevious {
os.Remove(path.Join(c.CheckpointPath(), "parent"))
if err := os.Symlink("../pre-checkpoint", path.Join(c.CheckpointPath(), "parent")); err != nil {
return err
return nil, 0, err
}
}
if options.TargetFile != "" {
if err := c.exportCheckpoint(options); err != nil {
return err
return nil, 0, err
}
}
@ -1183,10 +1184,37 @@ func (c *Container) checkpoint(ctx context.Context, options ContainerCheckpointO
// Cleanup Storage and Network
if err := c.cleanup(ctx); err != nil {
return err
return nil, 0, err
}
}
criuStatistics, err := func() (*define.CRIUCheckpointRestoreStatistics, error) {
if !options.PrintStats {
return nil, nil
}
statsDirectory, err := os.Open(c.bundlePath())
if err != nil {
return nil, errors.Wrapf(err, "Not able to open %q", c.bundlePath())
}
dumpStatistics, err := stats.CriuGetDumpStats(statsDirectory)
if err != nil {
return nil, errors.Wrap(err, "Displaying checkpointing statistics not possible")
}
return &define.CRIUCheckpointRestoreStatistics{
FreezingTime: dumpStatistics.GetFreezingTime(),
FrozenTime: dumpStatistics.GetFrozenTime(),
MemdumpTime: dumpStatistics.GetMemdumpTime(),
MemwriteTime: dumpStatistics.GetMemwriteTime(),
PagesScanned: dumpStatistics.GetPagesScanned(),
PagesWritten: dumpStatistics.GetPagesWritten(),
}, nil
}()
if err != nil {
return nil, 0, err
}
if !options.Keep && !options.PreCheckPoint {
cleanup := []string{
"dump.log",
@ -1203,7 +1231,7 @@ func (c *Container) checkpoint(ctx context.Context, options ContainerCheckpointO
}
c.state.FinishedTime = time.Now()
return c.save()
return criuStatistics, runtimeCheckpointDuration, c.save()
}
func (c *Container) importCheckpoint(input string) error {

View File

@ -0,0 +1,32 @@
package define
// CRIUCheckpointRestoreStatistics contains values reported by CRIU during
// checkpointing or restoring.
// All names are the same as reported by CRIU.
// Every field is tagged omitempty, so fields left at zero are omitted from
// the JSON output.
// NOTE(review): the time fields are presumably in microseconds, as reported
// by CRIU — confirm against the CRIU statistics documentation.
type CRIUCheckpointRestoreStatistics struct {
// Checkpoint values
// Time required to freeze/pause/quiesce the processes
FreezingTime uint32 `json:"freezing_time,omitempty"`
// Time the processes are actually not running during checkpointing
FrozenTime uint32 `json:"frozen_time,omitempty"`
// Time required to extract memory pages from the processes
MemdumpTime uint32 `json:"memdump_time,omitempty"`
// Time required to write memory pages to disk
MemwriteTime uint32 `json:"memwrite_time,omitempty"`
// Number of memory pages CRIU analyzed
PagesScanned uint64 `json:"pages_scanned,omitempty"`
// Number of memory pages written
PagesWritten uint64 `json:"pages_written,omitempty"`
// Restore values
// Number of pages compared during restore
PagesCompared uint64 `json:"pages_compared,omitempty"`
// Number of COW pages skipped during restore
PagesSkippedCow uint64 `json:"pages_skipped_cow,omitempty"`
// Time required to fork processes
ForkingTime uint32 `json:"forking_time,omitempty"`
// Time required to restore
RestoreTime uint32 `json:"restore_time,omitempty"`
// Number of memory pages restored
PagesRestored uint64 `json:"pages_restored,omitempty"`
}

View File

@ -101,8 +101,10 @@ type OCIRuntime interface {
// CheckpointContainer checkpoints the given container.
// Some OCI runtimes may not support this - if SupportsCheckpoint()
// returns false, this is not implemented, and will always return an
// error.
CheckpointContainer(ctr *Container, options ContainerCheckpointOptions) error
// error. If options.PrintStats is true the first return value
// contains the number of microseconds the runtime needed to checkpoint
// the given container.
CheckpointContainer(ctr *Container, options ContainerCheckpointOptions) (int64, error)
// CheckConmonRunning verifies that the given container's Conmon
// instance is still running. Runtimes without Conmon, or systems where

View File

@ -760,9 +760,9 @@ func (r *ConmonOCIRuntime) AttachResize(ctr *Container, newSize define.TerminalS
}
// CheckpointContainer checkpoints the given container.
func (r *ConmonOCIRuntime) CheckpointContainer(ctr *Container, options ContainerCheckpointOptions) error {
func (r *ConmonOCIRuntime) CheckpointContainer(ctr *Container, options ContainerCheckpointOptions) (int64, error) {
if err := label.SetSocketLabel(ctr.ProcessLabel()); err != nil {
return err
return 0, err
}
// imagePath is used by CRIU to store the actual checkpoint files
imagePath := ctr.CheckpointPath()
@ -802,14 +802,25 @@ func (r *ConmonOCIRuntime) CheckpointContainer(ctr *Container, options Container
}
runtimeDir, err := util.GetRuntimeDir()
if err != nil {
return err
return 0, err
}
if err = os.Setenv("XDG_RUNTIME_DIR", runtimeDir); err != nil {
return errors.Wrapf(err, "cannot set XDG_RUNTIME_DIR")
return 0, errors.Wrapf(err, "cannot set XDG_RUNTIME_DIR")
}
args = append(args, ctr.ID())
logrus.Debugf("the args to checkpoint: %s %s", r.path, strings.Join(args, " "))
return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, nil, r.path, args...)
runtimeCheckpointStarted := time.Now()
err = utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, nil, r.path, args...)
runtimeCheckpointDuration := func() int64 {
if options.PrintStats {
return time.Since(runtimeCheckpointStarted).Microseconds()
}
return 0
}()
return runtimeCheckpointDuration, err
}
func (r *ConmonOCIRuntime) CheckConmonRunning(ctr *Container) (bool, error) {

View File

@ -153,8 +153,8 @@ func (r *MissingRuntime) ExecUpdateStatus(ctr *Container, sessionID string) (boo
}
// CheckpointContainer is not available as the runtime is missing.
// It never checkpoints anything: the reported duration is always 0 and the
// error is whatever r.printError() returns.
func (r *MissingRuntime) CheckpointContainer(ctr *Container, options ContainerCheckpointOptions) error {
return r.printError()
func (r *MissingRuntime) CheckpointContainer(ctr *Container, options ContainerCheckpointOptions) (int64, error) {
return 0, r.printError()
}
// CheckConmonRunning is not available as the runtime is missing

View File

@ -214,6 +214,7 @@ func Checkpoint(w http.ResponseWriter, r *http.Request) {
TCPEstablished bool `schema:"tcpEstablished"`
Export bool `schema:"export"`
IgnoreRootFS bool `schema:"ignoreRootFS"`
PrintStats bool `schema:"printStats"`
}{
// override any golang type defaults
}
@ -248,11 +249,12 @@ func Checkpoint(w http.ResponseWriter, r *http.Request) {
KeepRunning: query.LeaveRunning,
TCPEstablished: query.TCPEstablished,
IgnoreRootfs: query.IgnoreRootFS,
PrintStats: query.PrintStats,
}
if query.Export {
options.TargetFile = targetFile
}
err = ctr.Checkpoint(r.Context(), options)
criuStatistics, runtimeCheckpointDuration, err := ctr.Checkpoint(r.Context(), options)
if err != nil {
utils.InternalServerError(w, err)
return
@ -267,7 +269,15 @@ func Checkpoint(w http.ResponseWriter, r *http.Request) {
utils.WriteResponse(w, http.StatusOK, f)
return
}
utils.WriteResponse(w, http.StatusOK, entities.CheckpointReport{Id: ctr.ID()})
utils.WriteResponse(
w,
http.StatusOK,
entities.CheckpointReport{
Id: ctr.ID(),
RuntimeDuration: runtimeCheckpointDuration,
CRIUStatistics: criuStatistics,
},
)
}
func Restore(w http.ResponseWriter, r *http.Request) {

View File

@ -1441,6 +1441,10 @@ func (s *APIServer) registerContainersHandlers(r *mux.Router) error {
// name: ignoreRootFS
// type: boolean
// description: do not include root file-system changes when exporting
// - in: query
// name: printStats
// type: boolean
// description: add checkpoint statistics to the returned CheckpointReport
// produces:
// - application/json
// responses:

View File

@ -190,11 +190,14 @@ type CheckpointOptions struct {
PreCheckPoint bool
WithPrevious bool
Compression archive.Compression
PrintStats bool
}
type CheckpointReport struct {
Err error
Id string //nolint
Err error `json:"-"`
Id string `json:"Id` //nolint
RuntimeDuration int64 `json:"runtime_checkpoint_duration"`
CRIUStatistics *define.CRIUCheckpointRestoreStatistics `json:"criu_statistics"`
}
type RestoreOptions struct {

View File

@ -515,6 +515,7 @@ func (ic *ContainerEngine) ContainerCheckpoint(ctx context.Context, namesOrIds [
PreCheckPoint: options.PreCheckPoint,
WithPrevious: options.WithPrevious,
Compression: options.Compression,
PrintStats: options.PrintStats,
}
if options.All {
@ -531,10 +532,12 @@ func (ic *ContainerEngine) ContainerCheckpoint(ctx context.Context, namesOrIds [
}
reports := make([]*entities.CheckpointReport, 0, len(cons))
for _, con := range cons {
err = con.Checkpoint(ctx, checkOpts)
criuStatistics, runtimeCheckpointDuration, err := con.Checkpoint(ctx, checkOpts)
reports = append(reports, &entities.CheckpointReport{
Err: err,
Id: con.ID(),
Err: err,
Id: con.ID(),
RuntimeDuration: runtimeCheckpointDuration,
CRIUStatistics: criuStatistics,
})
}
return reports, nil