From 860c13b788944410a98a6ad5b5cfb74de0a8405b Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Mon, 4 Aug 2014 18:20:53 -0700 Subject: [PATCH] Add documentation and update restart rules. Implement time backed backoff for restarting and fix failure count when the maximum is 0 Signed-off-by: Michael Crosby --- daemon/monitor.go | 89 +++++++++++++++++------ docs/sources/reference/commandline/cli.md | 26 +++++++ runconfig/parse.go | 29 +++++--- 3 files changed, 111 insertions(+), 33 deletions(-) diff --git a/daemon/monitor.go b/daemon/monitor.go index 94d7f2b004..3eb68791d6 100644 --- a/daemon/monitor.go +++ b/daemon/monitor.go @@ -11,6 +11,8 @@ import ( "github.com/docker/docker/utils" ) +const defaultTimeIncrement = 100 + // containerMonitor monitors the execution of a container's main process. // If a restart policy is specified for the cotnainer the monitor will ensure that the // process is restarted based on the rules of the policy. When the container is finally stopped @@ -19,16 +21,30 @@ import ( type containerMonitor struct { mux sync.Mutex - container *Container + // container is the container being monitored + container *Container + + // restartPolicy is the policy being applied to the container monitor restartPolicy runconfig.RestartPolicy - failureCount int - shouldStop bool + + // failureCount is the number of times the container has failed to + // start in a row + failureCount int + + // shouldStop signals the monitor that the next time the container exits it is + // either because docker or the user asked for the container to be stopped + shouldStop bool + + // timeIncrement is the amount of time to wait between restarts + // this is in milliseconds + timeIncrement int } func newContainerMonitor(container *Container, policy runconfig.RestartPolicy) *containerMonitor { return &containerMonitor{ container: container, restartPolicy: policy, + timeIncrement: defaultTimeIncrement, } } @@ -62,7 +78,7 @@ func (m *containerMonitor) Close() error { // reset resets
the container's IO and ensures that the command is able to be executed again // by copying the data into a new struct -func (m *containerMonitor) reset() { +func (m *containerMonitor) reset(successful bool) { container := m.container if container.Config.OpenStdin { @@ -107,14 +123,29 @@ func (m *containerMonitor) reset() { Dir: c.Dir, SysProcAttr: c.SysProcAttr, } + + // the container exited successfully so we need to reset the failure counter + // and the timeIncrement back to the default values + if successful { + m.failureCount = 0 + m.timeIncrement = defaultTimeIncrement + } else { + // otherwise we need to increment the amount of time we wait before restarting + // the process. We will build up by multiplying the increment by 2 + + m.failureCount++ + m.timeIncrement *= 2 + } } // Start starts the containers process and monitors it according to the restart policy func (m *containerMonitor) Start() error { var ( - err error - exitCode int + err error + exitStatus int ) + + // ensure that when the monitor finally exits we release the networking and unmount the rootfs defer m.Close() // reset the restart count @@ -122,31 +153,26 @@ func (m *containerMonitor) Start() error { for !m.shouldStop { m.container.RestartCount++ + if err := m.container.startLoggingToDisk(); err != nil { - m.reset() + m.reset(false) return err } pipes := execdriver.NewPipes(m.container.stdin, m.container.stdout, m.container.stderr, m.container.Config.OpenStdin) - if exitCode, err = m.container.daemon.Run(m.container, pipes, m.callback); err != nil { - m.failureCount++ - - if m.failureCount == m.restartPolicy.MaximumRetryCount { - m.ExitOnNext() - } - + if exitStatus, err = m.container.daemon.Run(m.container, pipes, m.callback); err != nil { utils.Errorf("Error running container: %s", err) } // We still wait to set the state as stopped and ensure that the locks were released - m.container.State.SetStopped(exitCode) + m.container.State.SetStopped(exitStatus) - m.reset() + m.reset(err == nil 
&& exitStatus == 0) - if m.shouldRestart(exitCode) { - time.Sleep(1 * time.Second) + if m.shouldRestart(exitStatus) { + time.Sleep(time.Duration(m.timeIncrement) * time.Millisecond) continue } @@ -157,16 +183,31 @@ func (m *containerMonitor) Start() error { return err } -func (m *containerMonitor) shouldRestart(exitCode int) bool { +// shouldRestart checks the restart policy and applies the rules to determine if +// the container's process should be restarted +func (m *containerMonitor) shouldRestart(exitStatus int) bool { m.mux.Lock() + defer m.mux.Unlock() - shouldRestart := (m.restartPolicy.Name == "always" || - (m.restartPolicy.Name == "on-failure" && exitCode != 0)) && - !m.shouldStop + // do not restart if the user or docker has requested that this container be stopped + if m.shouldStop { + return false + } - m.mux.Unlock() + switch m.restartPolicy.Name { + case "always": + return true + case "on-failure": + // the default value of 0 for MaximumRetryCount means that we will not enforce a maximum count + if max := m.restartPolicy.MaximumRetryCount; max != 0 && m.failureCount >= max { + utils.Debugf("stopping restart of container %s because maximum failure could of %d has been reached", max) + return false + } - return shouldRestart + return exitStatus != 0 + } + + return false } // callback ensures that the container's state is properly updated after we diff --git a/docs/sources/reference/commandline/cli.md b/docs/sources/reference/commandline/cli.md index 5e6107cfaa..8f4ed19a27 100644 --- a/docs/sources/reference/commandline/cli.md +++ b/docs/sources/reference/commandline/cli.md @@ -993,6 +993,7 @@ removed before the image is removed. 
format: ip:hostPort:containerPort | ip::containerPort | hostPort:containerPort (use 'docker port' to see the actual mapping) --privileged=false Give extended privileges to this container + --restart="" Restart policy to apply when a container exits (no, on-failure, always) --rm=false Automatically remove the container when it exits (incompatible with -d) --sig-proxy=true Proxy received signals to the process (even in non-TTY mode). SIGCHLD, SIGSTOP, and SIGKILL are not proxied. -t, --tty=false Allocate a pseudo-TTY @@ -1220,6 +1221,31 @@ application change: `--rm` option means that when the container exits, the container's layer is removed. +#### Restart Policies + +Using the `--restart` flag on `docker run` you can specify a restart policy for +how a container should or should not be restarted on exit. + +**no** - Do not restart the container when it exits. + +**on-failure** - Restart the container only if it exits with a non-zero exit status. + +**always** - Always restart the container regardless of the exit status. + +You can also specify the maximum number of times docker will try to restart the +container when using the **on-failure** policy. The default is that docker will try forever to restart the container. + + $ sudo docker run --restart=always redis + +This will run the redis container with a restart policy of **always** so that if +the container exits, docker will restart it. + + $ sudo docker run --restart=on-failure:10 redis + +This will run the redis container with a restart policy of **on-failure** and a +maximum restart count of 10. If the redis container exits with a non-zero exit +status more than 10 times in a row docker will abort trying to restart the container.
+ ## save Usage: docker save IMAGE diff --git a/runconfig/parse.go b/runconfig/parse.go index ea6e9ebca2..2b4dc632a0 100644 --- a/runconfig/parse.go +++ b/runconfig/parse.go @@ -17,11 +17,12 @@ import ( ) var ( - ErrInvalidWorkingDirectory = fmt.Errorf("The working directory is invalid. It needs to be an absolute path.") - ErrConflictAttachDetach = fmt.Errorf("Conflicting options: -a and -d") - ErrConflictDetachAutoRemove = fmt.Errorf("Conflicting options: --rm and -d") - ErrConflictNetworkHostname = fmt.Errorf("Conflicting options: -h and the network mode (--net)") - ErrConflictHostNetworkAndLinks = fmt.Errorf("Conflicting options: --net=host can't be used with links. This would result in undefined behavior.") + ErrInvalidWorkingDirectory = fmt.Errorf("The working directory is invalid. It needs to be an absolute path.") + ErrConflictAttachDetach = fmt.Errorf("Conflicting options: -a and -d") + ErrConflictDetachAutoRemove = fmt.Errorf("Conflicting options: --rm and -d") + ErrConflictNetworkHostname = fmt.Errorf("Conflicting options: -h and the network mode (--net)") + ErrConflictHostNetworkAndLinks = fmt.Errorf("Conflicting options: --net=host can't be used with links. This would result in undefined behavior.") + ErrConflictRestartPolicyAndAutoRemove = fmt.Errorf("Conflicting options: --restart and --rm") ) //FIXME Only used in tests @@ -72,7 +73,7 @@ func parseRun(cmd *flag.FlagSet, args []string, sysInfo *sysinfo.SysInfo) (*Conf flCpuShares = cmd.Int64([]string{"c", "-cpu-shares"}, 0, "CPU shares (relative weight)") flCpuset = cmd.String([]string{"-cpuset"}, "", "CPUs in which to allow execution (0-3, 0,1)") flNetMode = cmd.String([]string{"-net"}, "bridge", "Set the Network mode for the container\n'bridge': creates a new network stack for the container on the docker bridge\n'none': no networking for this container\n'container:': reuses another container network stack\n'host': use the host network stack inside the container. 
Note: the host mode gives the container full access to local system services such as D-bus and is therefore considered insecure.") - flRestartPolicy = cmd.String([]string{"-restart"}, "", "Restart policy when the dies") + flRestartPolicy = cmd.String([]string{"-restart"}, "", "Restart policy to apply when a container exits (no, on-failure, always)") // For documentation purpose _ = cmd.Bool([]string{"#sig-proxy", "-sig-proxy"}, true, "Proxy received signals to the process (even in non-TTY mode). SIGCHLD, SIGSTOP, and SIGKILL are not proxied.") _ = cmd.String([]string{"#name", "-name"}, "", "Assign a name to the container") @@ -227,8 +228,6 @@ func parseRun(cmd *flag.FlagSet, args []string, sysInfo *sysinfo.SysInfo) (*Conf } // parse the '-e' and '--env' after, to allow override envVariables = append(envVariables, flEnv.GetAll()...) - // boo, there's no debug output for docker run - //log.Debugf("Environment variables for the container: %#v", envVariables) netMode, err := parseNetMode(*flNetMode) if err != nil { @@ -240,6 +239,10 @@ func parseRun(cmd *flag.FlagSet, args []string, sysInfo *sysinfo.SysInfo) (*Conf return nil, nil, cmd, err } + if *flAutoRemove && (restartPolicy.Name == "always" || restartPolicy.Name == "on-failure") { + return nil, nil, cmd, ErrConflictRestartPolicyAndAutoRemove + } + config := &Config{ Hostname: hostname, Domainname: domainname, @@ -307,7 +310,15 @@ func parseRestartPolicy(policy string) (RestartPolicy, error) { ) switch name { - case "no", "on-failure", "always": + case "always": + p.Name = name + + if len(parts) == 2 { + return p, fmt.Errorf("maximum restart count not valid with restart policy of \"always\"") + } + case "no": + // do nothing + case "on-failure": p.Name = name if len(parts) == 2 {