docs/cluster/watchdog.go

72 lines
1.7 KiB
Go

package cluster
import (
"sync"
log "github.com/Sirupsen/logrus"
)
// Watchdog reacts to cluster events (see Handle) and recreates the containers
// of a disconnected engine through the Cluster it was constructed with.
type Watchdog struct {
// l is held by rescheduleContainers for the whole of each pass, so passes
// triggered by different events run one at a time.
l sync.Mutex
// cluster is the Cluster given to NewWatchdog; rescheduleContainers calls
// its CreateContainer to build the replacement containers.
cluster Cluster
}
// Handle is the cluster event callback. A "swarm" event whose status is
// "engine_disconnect" starts a background rescheduling pass for the event's
// engine; every other event is ignored. It always returns nil.
func (w *Watchdog) Handle(e *Event) error {
	fromSwarm := e.From == "swarm"
	if fromSwarm && e.Status == "engine_disconnect" {
		// Run in the background so the event callback returns right away.
		go w.rescheduleContainers(e.Engine)
	}
	return nil
}
// rescheduleContainers walks the containers of engine e and, for each one
// whose reschedule policy is "always", removes it from its engine and
// recreates it through w.cluster.CreateContainer under the same name. The
// replacement is started only when the original was recorded as running.
//
// Handle runs this in its own goroutine on an "engine_disconnect" event. The
// watchdog mutex is held for the whole pass, so passes are serialized.
// Failures are logged per container and do not stop the loop.
func (w *Watchdog) rescheduleContainers(e *Engine) {
	// This runs in a goroutine of its own, so a nil dereference below would
	// take the whole process down; there is nothing to reschedule anyway.
	if e == nil {
		return
	}

	w.l.Lock()
	defer w.l.Unlock()

	log.Infof("Node %s failed - rescheduling containers", e.ID)

	for _, c := range e.Containers() {
		// Skip containers which don't have an "always" reschedule policy.
		if c.Config.ReschedulePolicy() != "always" {
			log.Debugf("Skipping rescheduling of %s based on rescheduling policy", c.Id)
			continue
		}

		// Remove the container from the dead engine. If we don't, then both
		// the old and new one will show up in docker ps.
		// We have to do this before calling `CreateContainer`, otherwise it
		// will abort because the name is already taken.
		// NOTE(review): if CreateContainer fails below, the container has
		// already been removed here and is not put back - confirm whether it
		// should be restored so that a later pass can retry it.
		c.Engine.removeContainer(c)

		newContainer, err := w.cluster.CreateContainer(c.Config, c.Info.Name)
		if err != nil {
			log.Errorf("Failed to reschedule container %s (Swarm ID: %s): %v", c.Id, c.Config.SwarmID(), err)
			continue
		}

		log.Infof("Rescheduled container %s from %s to %s as %s (Swarm ID: %s)", c.Id, c.Engine.ID, newContainer.Engine.ID, newContainer.Id, c.Config.SwarmID())

		// Only bring the replacement up if the original was running.
		if c.Info.State.Running {
			if err := newContainer.Start(); err != nil {
				// Include the error itself; without it the log gives no
				// indication of why the start failed.
				log.Errorf("Failed to start rescheduled container %s: %v", newContainer.Id, err)
			}
		}
	}
}
// NewWatchdog builds a Watchdog bound to cluster, registers it with the
// cluster as an event handler, and returns it.
func NewWatchdog(cluster Cluster) *Watchdog {
	log.Debugf("Watchdog enabled")

	watchdog := new(Watchdog)
	watchdog.cluster = cluster

	// From here on the cluster delivers its events to watchdog.Handle.
	cluster.RegisterEventHandler(watchdog)
	return watchdog
}