From f96e04ffc7973e290653044cc86dbc1efb18276d Mon Sep 17 00:00:00 2001 From: Vishnu Kannan Date: Wed, 8 Oct 2014 17:03:57 +0000 Subject: [PATCH] This patch adds the ability in docker to detect out of memory conditions in containers. Since the containers can handle the out of memory kernel kills gracefully, docker will only provide out of memory information as additional metadata as part of container status. Docker-DCO-1.1-Signed-off-by: Vishnu Kannan (github: vishh) --- daemon/daemon.go | 6 +- daemon/execdriver/driver.go | 11 +++- daemon/execdriver/lxc/driver.go | 14 ++--- daemon/execdriver/native/driver.go | 96 +++++++++++++++++++++--------- daemon/monitor.go | 10 ++-- daemon/state.go | 29 ++++++--- daemon/state_test.go | 4 +- integration/runtime_test.go | 3 +- 8 files changed, 119 insertions(+), 54 deletions(-) diff --git a/daemon/daemon.go b/daemon/daemon.go index b0feae917b..e04caa8ffe 100644 --- a/daemon/daemon.go +++ b/daemon/daemon.go @@ -231,7 +231,7 @@ func (daemon *Daemon) register(container *Container, updateSuffixarray bool) err log.Debugf("killing old running container %s", container.ID) existingPid := container.Pid - container.SetStopped(0) + container.SetStopped(&execdriver.ExitStatus{0, false}) // We only have to handle this for lxc because the other drivers will ensure that // no processes are left when docker dies @@ -263,7 +263,7 @@ func (daemon *Daemon) register(container *Container, updateSuffixarray bool) err log.Debugf("Marking as stopped") - container.SetStopped(-127) + container.SetStopped(&execdriver.ExitStatus{-127, false}) if err := container.ToDisk(); err != nil { return err } @@ -991,7 +991,7 @@ func (daemon *Daemon) Diff(container *Container) (archive.Archive, error) { return daemon.driver.Diff(container.ID, initID) } -func (daemon *Daemon) Run(c *Container, pipes *execdriver.Pipes, startCallback execdriver.StartCallback) (int, error) { +func (daemon *Daemon) Run(c *Container, pipes *execdriver.Pipes, startCallback 
execdriver.StartCallback) (*execdriver.ExitStatus, error) { return daemon.execDriver.Run(c.command, pipes, startCallback) } diff --git a/daemon/execdriver/driver.go b/daemon/execdriver/driver.go index bc2eb24eda..6ed98b78ba 100644 --- a/daemon/execdriver/driver.go +++ b/daemon/execdriver/driver.go @@ -40,9 +40,18 @@ type TtyTerminal interface { Master() *os.File } +// ExitStatus provides exit reasons for a container. +type ExitStatus struct { + // The exit code with which the container exited. + ExitCode int + + // Whether the container encountered an OOM. + OOMKilled bool +} + type Driver interface { Run(c *Command, pipes *Pipes, startCallback StartCallback) (int, error) // Run executes the process and blocks until the process exits and returns the exit code - // Exec executes the process in a running container, blocks until the process exits and returns the exit code + // Exec executes the process in an existing container, blocks until the process exits and returns the exit code Exec(c *Command, processConfig *ProcessConfig, pipes *Pipes, startCallback StartCallback) (int, error) Kill(c *Command, sig int) error Pause(c *Command) error diff --git a/daemon/execdriver/lxc/driver.go b/daemon/execdriver/lxc/driver.go index 7583a3e64f..3d8aca0354 100644 --- a/daemon/execdriver/lxc/driver.go +++ b/daemon/execdriver/lxc/driver.go @@ -55,7 +55,7 @@ func (d *driver) Name() string { return fmt.Sprintf("%s-%s", DriverName, version) } -func (d *driver) Run(c *execdriver.Command, pipes *execdriver.Pipes, startCallback execdriver.StartCallback) (int, error) { +func (d *driver) Run(c *execdriver.Command, pipes *execdriver.Pipes, startCallback execdriver.StartCallback) (*execdriver.ExitStatus, error) { var ( term execdriver.Terminal err error @@ -76,11 +76,11 @@ func (d *driver) Run(c *execdriver.Command, pipes *execdriver.Pipes, startCallba }) if err := d.generateEnvConfig(c); err != nil { - return -1, err + return nil, err } configPath, err := d.generateLXCConfig(c) if err != 
nil { - return -1, err + return nil, err } params := []string{ "lxc-start", @@ -155,11 +155,11 @@ func (d *driver) Run(c *execdriver.Command, pipes *execdriver.Pipes, startCallba c.ProcessConfig.Args = append([]string{name}, arg...) if err := nodes.CreateDeviceNodes(c.Rootfs, c.AutoCreatedDevices); err != nil { - return -1, err + return nil, err } if err := c.ProcessConfig.Start(); err != nil { - return -1, err + return nil, err } var ( @@ -183,7 +183,7 @@ func (d *driver) Run(c *execdriver.Command, pipes *execdriver.Pipes, startCallba c.ProcessConfig.Process.Kill() c.ProcessConfig.Wait() } - return -1, err + return nil, err } c.ContainerPid = pid @@ -194,7 +194,7 @@ func (d *driver) Run(c *execdriver.Command, pipes *execdriver.Pipes, startCallba <-waitLock - return getExitCode(c), waitErr + return &execdriver.ExitStatus{getExitCode(c), false}, waitErr } /// Return the exit code of the process diff --git a/daemon/execdriver/native/driver.go b/daemon/execdriver/native/driver.go index 3628d7b575..a37eccbabe 100644 --- a/daemon/execdriver/native/driver.go +++ b/daemon/execdriver/native/driver.go @@ -14,6 +14,7 @@ import ( "sync" "syscall" + log "github.com/Sirupsen/logrus" "github.com/docker/docker/daemon/execdriver" "github.com/docker/docker/pkg/term" "github.com/docker/libcontainer" @@ -60,11 +61,20 @@ func NewDriver(root, initPath string) (*driver, error) { }, nil } -func (d *driver) Run(c *execdriver.Command, pipes *execdriver.Pipes, startCallback execdriver.StartCallback) (int, error) { +func (d *driver) notifyOnOOM(config *libcontainer.Config) (<-chan struct{}, error) { + return fs.NotifyOnOOM(config.Cgroups) +} + +type execOutput struct { + exitCode int + err error +} + +func (d *driver) Run(c *execdriver.Command, pipes *execdriver.Pipes, startCallback execdriver.StartCallback) (*execdriver.ExitStatus, error) { // take the Command and populate the libcontainer.Config from it container, err := d.createContainer(c) if err != nil { - return -1, err + return nil, 
err } var term execdriver.Terminal @@ -75,7 +85,7 @@ func (d *driver) Run(c *execdriver.Command, pipes *execdriver.Pipes, startCallba term, err = execdriver.NewStdConsole(&c.ProcessConfig, pipes) } if err != nil { - return -1, err + return nil, err } c.ProcessConfig.Terminal = term @@ -92,40 +102,70 @@ func (d *driver) Run(c *execdriver.Command, pipes *execdriver.Pipes, startCallba ) if err := d.createContainerRoot(c.ID); err != nil { - return -1, err + return nil, err } defer d.cleanContainer(c.ID) if err := d.writeContainerFile(container, c.ID); err != nil { - return -1, err + return nil, err } - return namespaces.Exec(container, c.ProcessConfig.Stdin, c.ProcessConfig.Stdout, c.ProcessConfig.Stderr, c.ProcessConfig.Console, dataPath, args, func(container *libcontainer.Config, console, dataPath, init string, child *os.File, args []string) *exec.Cmd { - c.ProcessConfig.Path = d.initPath - c.ProcessConfig.Args = append([]string{ - DriverName, - "-console", console, - "-pipe", "3", - "-root", filepath.Join(d.root, c.ID), - "--", - }, args...) + execOutputChan := make(chan execOutput, 0) + waitForStart := make(chan struct{}, 0) - // set this to nil so that when we set the clone flags anything else is reset - c.ProcessConfig.SysProcAttr = &syscall.SysProcAttr{ - Cloneflags: uintptr(namespaces.GetNamespaceFlags(container.Namespaces)), + go func() { + exitCode, err := namespaces.Exec(container, c.ProcessConfig.Stdin, c.ProcessConfig.Stdout, c.ProcessConfig.Stderr, c.ProcessConfig.Console, dataPath, args, func(container *libcontainer.Config, console, dataPath, init string, child *os.File, args []string) *exec.Cmd { + c.ProcessConfig.Path = d.initPath + c.ProcessConfig.Args = append([]string{ + DriverName, + "-console", console, + "-pipe", "3", + "-root", filepath.Join(d.root, c.ID), + "--", + }, args...) 
+ + // set this to nil so that when we set the clone flags anything else is reset + c.ProcessConfig.SysProcAttr = &syscall.SysProcAttr{ + Cloneflags: uintptr(namespaces.GetNamespaceFlags(container.Namespaces)), + } + c.ProcessConfig.ExtraFiles = []*os.File{child} + + c.ProcessConfig.Env = container.Env + c.ProcessConfig.Dir = container.RootFs + + return &c.ProcessConfig.Cmd + }, func() { + close(waitForStart) + if startCallback != nil { + c.ContainerPid = c.ProcessConfig.Process.Pid + startCallback(&c.ProcessConfig, c.ContainerPid) + } + }) + execOutputChan <- execOutput{exitCode, err} + }() + + select { + case execOutput := <-execOutputChan: + return &execdriver.ExitStatus{execOutput.exitCode, false}, execOutput.err + case <-waitForStart: + break + } + + oomKill := false + go func() { + oomKillNotification, err := d.notifyOnOOM(container) + if err == nil { + if _, ok := <-oomKillNotification; ok { + oomKill = true + } + } else { + log.Infof("WARNING: Your kernel does not support OOM notifications: %s", err) } - c.ProcessConfig.ExtraFiles = []*os.File{child} + }() + // wait for the container to exit. 
+ execOutput := <-execOutputChan - c.ProcessConfig.Env = container.Env - c.ProcessConfig.Dir = container.RootFs - - return &c.ProcessConfig.Cmd - }, func() { - if startCallback != nil { - c.ContainerPid = c.ProcessConfig.Process.Pid - startCallback(&c.ProcessConfig, c.ContainerPid) - } - }) + return &execdriver.ExitStatus{execOutput.exitCode, oomKill}, execOutput.err } func (d *driver) Kill(p *execdriver.Command, sig int) error { diff --git a/daemon/monitor.go b/daemon/monitor.go index d0d9d70a99..9ef991eb66 100644 --- a/daemon/monitor.go +++ b/daemon/monitor.go @@ -100,7 +100,7 @@ func (m *containerMonitor) Close() error { func (m *containerMonitor) Start() error { var ( err error - exitStatus int + exitStatus *execdriver.ExitStatus // this variable indicates where we in execution flow: // before Run or after afterRun bool @@ -150,9 +150,9 @@ func (m *containerMonitor) Start() error { // here container.Lock is already lost afterRun = true - m.resetMonitor(err == nil && exitStatus == 0) + m.resetMonitor(err == nil && exitStatus.ExitCode == 0) - if m.shouldRestart(exitStatus) { + if m.shouldRestart(exitStatus.ExitCode) { m.container.SetRestarting(exitStatus) m.container.LogEvent("die") m.resetContainer(true) @@ -209,7 +209,7 @@ func (m *containerMonitor) waitForNextRestart() { // shouldRestart checks the restart policy and applies the rules to determine if // the container's process should be restarted -func (m *containerMonitor) shouldRestart(exitStatus int) bool { +func (m *containerMonitor) shouldRestart(exitCode int) bool { m.mux.Lock() defer m.mux.Unlock() @@ -228,7 +228,7 @@ func (m *containerMonitor) shouldRestart(exitStatus int) bool { return false } - return exitStatus != 0 + return exitCode != 0 } return false diff --git a/daemon/state.go b/daemon/state.go index 2dd57bd94b..282f5da930 100644 --- a/daemon/state.go +++ b/daemon/state.go @@ -5,6 +5,7 @@ import ( "sync" "time" + "github.com/docker/docker/daemon/execdriver" "github.com/docker/docker/pkg/units" 
) @@ -13,6 +14,7 @@ type State struct { Running bool Paused bool Restarting bool + OOMKilled bool Pid int ExitCode int Error string // contains last known error when starting the container @@ -29,12 +31,16 @@ func NewState() *State { // String returns a human-readable description of the state func (s *State) String() string { + oomInfo := "" + if s.OOMKilled { + oomInfo = "possibly due to lack of memory" + } if s.Running { if s.Paused { return fmt.Sprintf("Up %s (Paused)", units.HumanDuration(time.Now().UTC().Sub(s.StartedAt))) } if s.Restarting { - return fmt.Sprintf("Restarting (%d) %s ago", s.ExitCode, units.HumanDuration(time.Now().UTC().Sub(s.FinishedAt))) + return fmt.Sprintf("Restarting (%d) %s ago %s", s.ExitCode, units.HumanDuration(time.Now().UTC().Sub(s.FinishedAt)), oomInfo) } return fmt.Sprintf("Up %s", units.HumanDuration(time.Now().UTC().Sub(s.StartedAt))) @@ -44,7 +50,7 @@ func (s *State) String() string { return "" } - return fmt.Sprintf("Exited (%d) %s ago", s.ExitCode, units.HumanDuration(time.Now().UTC().Sub(s.FinishedAt))) + return fmt.Sprintf("Exited (%d) %s ago %s", s.ExitCode, units.HumanDuration(time.Now().UTC().Sub(s.FinishedAt)), oomInfo) } // StateString returns a single string to describe state @@ -149,25 +155,29 @@ func (s *State) setRunning(pid int) { s.waitChan = make(chan struct{}) } -func (s *State) SetStopped(exitCode int) { +func (s *State) SetStopped(exitStatus *execdriver.ExitStatus) { s.Lock() - s.setStopped(exitCode) + s.setStopped(exitStatus) s.Unlock() } -func (s *State) setStopped(exitCode int) { +func (s *State) setStopped(exitStatus *execdriver.ExitStatus) { s.Running = false s.Restarting = false s.Pid = 0 s.FinishedAt = time.Now().UTC() - s.ExitCode = exitCode + s.ExitCode = exitStatus.ExitCode + s.OOMKilled = false + if exitStatus.OOMKilled { + s.OOMKilled = true + } close(s.waitChan) // fire waiters for stop s.waitChan = make(chan struct{}) } // SetRestarting is when docker hanldes the auto restart of containers when 
they are // in the middle of a stop and being restarted again -func (s *State) SetRestarting(exitCode int) { +func (s *State) SetRestarting(exitStatus *execdriver.ExitStatus) { s.Lock() // we should consider the container running when it is restarting because of // all the checks in docker around rm/stop/etc @@ -175,7 +185,10 @@ func (s *State) SetRestarting(exitCode int) { s.Restarting = true s.Pid = 0 s.FinishedAt = time.Now().UTC() - s.ExitCode = exitCode + s.ExitCode = exitStatus.ExitCode + if exitStatus.OOMKilled { + s.OOMKilled = true + } close(s.waitChan) // fire waiters for stop s.waitChan = make(chan struct{}) s.Unlock() diff --git a/daemon/state_test.go b/daemon/state_test.go index 35524356a3..32c005cf2e 100644 --- a/daemon/state_test.go +++ b/daemon/state_test.go @@ -4,6 +4,8 @@ import ( "sync/atomic" "testing" "time" + + "github.com/docker/docker/daemon/execdriver" ) func TestStateRunStop(t *testing.T) { @@ -47,7 +49,7 @@ func TestStateRunStop(t *testing.T) { atomic.StoreInt64(&exit, int64(exitCode)) close(stopped) }() - s.SetStopped(i) + s.SetStopped(&execdriver.ExitStatus{i, false}) if s.IsRunning() { t.Fatal("State is running") } diff --git a/integration/runtime_test.go b/integration/runtime_test.go index 01097b156e..75f68d5c1b 100644 --- a/integration/runtime_test.go +++ b/integration/runtime_test.go @@ -18,6 +18,7 @@ import ( log "github.com/Sirupsen/logrus" "github.com/docker/docker/daemon" + "github.com/docker/docker/daemon/execdriver" "github.com/docker/docker/engine" "github.com/docker/docker/image" "github.com/docker/docker/nat" @@ -652,7 +653,7 @@ func TestRestore(t *testing.T) { if err := container3.Run(); err != nil { t.Fatal(err) } - container2.SetStopped(0) + container2.SetStopped(&execdriver.ExitStatus{0, false}) } func TestDefaultContainerName(t *testing.T) {