Add support for container rescheduling on node failure.

Add rescheduling integration tests.

Signed-off-by: Andrea Luzzardi <aluzzardi@gmail.com>
Andrea Luzzardi 2015-12-03 02:30:15 -08:00 committed by Victor Vieux
parent 56941d02a8
commit 13f60212f5
5 changed files with 149 additions and 0 deletions


@@ -321,5 +321,6 @@ func manage(c *cli.Context) {
server.SetHandler(api.NewPrimary(cl, tlsConfig, &statusHandler{cl, nil, nil}, c.GlobalBool("debug"), c.Bool("cors")))
}
cluster.NewWatchdog(cl)
log.Fatal(server.ListenAndServe())
}


@@ -186,3 +186,12 @@ func (c *ContainerConfig) HaveNodeConstraint() bool {
}
return false
}
// ReschedulePolicy returns the container's reschedule policy from its labels, defaulting to "off"
func (c *ContainerConfig) ReschedulePolicy() string {
policy, ok := c.Labels[SwarmLabelNamespace+".reschedule-policy"]
if !ok {
return "no"
}
return policy
}
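
For illustration only, here is a standalone sketch of the label lookup above, not part of the commit; the lower-case swarmLabelNamespace constant, the reschedulePolicy helper, and the example labels map are hypothetical names invented for this example (in Swarm the lookup lives on ContainerConfig, as in the hunk above).

package main

import "fmt"

const swarmLabelNamespace = "com.docker.swarm"

// reschedulePolicy mirrors the lookup above: read the reschedule-policy label
// and fall back to "off" when it is not set.
func reschedulePolicy(labels map[string]string) string {
	if policy, ok := labels[swarmLabelNamespace+".reschedule-policy"]; ok {
		return policy
	}
	return "off"
}

func main() {
	withLabel := map[string]string{"com.docker.swarm.reschedule-policy": "on-node-failure"}
	fmt.Println(reschedulePolicy(withLabel)) // on-node-failure
	fmt.Println(reschedulePolicy(nil))       // off (the default)
}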


@@ -21,6 +21,11 @@ func (c *Container) Refresh() (*Container, error) {
return c.Engine.refreshContainer(c.Id, true)
}
// Start starts the container on its engine
func (c *Container) Start() error {
return c.Engine.client.StartContainer(c.Id, nil)
}
// Containers represents a list of containers
type Containers []*Container

cluster/watchdog.go (new file)

@@ -0,0 +1,71 @@
package cluster
import (
"sync"
log "github.com/Sirupsen/logrus"
)
// Watchdog monitors cluster events and reschedules containers off failed engines.
type Watchdog struct {
l sync.Mutex
cluster Cluster
}
// Handle is the cluster event callback; an engine_disconnect event triggers rescheduling.
func (w *Watchdog) Handle(e *Event) error {
// Skip non-swarm events.
if e.From != "swarm" {
return nil
}
switch e.Status {
case "engine_disconnect":
go w.rescheduleContainers(e.Engine)
}
return nil
}
// rescheduleContainers moves eligible containers from a failed engine onto the rest of the cluster.
func (w *Watchdog) rescheduleContainers(e *Engine) {
w.l.Lock()
defer w.l.Unlock()
log.Infof("Node %s failed - rescheduling containers", e.ID)
for _, c := range e.Containers() {
// Skip containers which don't have an "on-node-failure" reschedule policy.
if c.Config.ReschedulePolicy() != "on-node-failure" {
log.Debugf("Skipping rescheduling of %s based on rescheduling policy", c.Id)
continue
}
// Remove the container from the dead engine. If we don't, then both
// the old and new one will show up in docker ps.
// We have to do this before calling `CreateContainer`, otherwise it
// will abort because the name is already taken.
c.Engine.removeContainer(c)
newContainer, err := w.cluster.CreateContainer(c.Config, c.Info.Name)
if err != nil {
log.Errorf("Failed to reschedule container %s (Swarm ID: %s): %v", c.Id, c.Config.SwarmID(), err)
continue
}
log.Infof("Rescheduled container %s from %s to %s as %s (Swarm ID: %s)", c.Id, c.Engine.ID, newContainer.Engine.ID, newContainer.Id, c.Config.SwarmID())
if c.Info.State.Running {
if err := newContainer.Start(); err != nil {
log.Errorf("Failed to start rescheduled container %s", newContainer.Id)
}
}
}
}
// NewWatchdog creates a watchdog and registers it as a cluster event handler.
func NewWatchdog(cluster Cluster) *Watchdog {
log.Debugf("Watchdog enabled")
w := &Watchdog{
cluster: cluster,
}
cluster.RegisterEventHandler(w)
return w
}
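
For illustration only, a standalone, hypothetical sketch (not part of the commit) of the per-container decision flow rescheduleContainers applies; the fakeContainer type and shouldReschedule helper are invented for this example and do not exist in Swarm.

package main

import "fmt"

// fakeContainer is a hypothetical stand-in for cluster.Container, keeping only
// the fields the rescheduling decision needs.
type fakeContainer struct {
	ID      string
	Policy  string // value of the com.docker.swarm.reschedule-policy label
	Running bool
}

// shouldReschedule mirrors the watchdog's per-container check: only containers
// labeled "on-node-failure" are moved off a failed engine.
func shouldReschedule(c fakeContainer) bool {
	return c.Policy == "on-node-failure"
}

func main() {
	containers := []fakeContainer{
		{ID: "c1", Policy: "on-node-failure", Running: true},
		{ID: "c2", Policy: "off", Running: true},
	}
	for _, c := range containers {
		if !shouldReschedule(c) {
			fmt.Printf("skipping %s\n", c.ID)
			continue
		}
		// The real watchdog would now remove the container from the dead engine,
		// re-create it elsewhere via the cluster, and start it again if it was running.
		fmt.Printf("rescheduling %s (restart: %v)\n", c.ID, c.Running)
	}
}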


@@ -0,0 +1,63 @@
#!/usr/bin/env bats
load helpers
function teardown() {
swarm_manage_cleanup
stop_docker
}
@test "rescheduling" {
start_docker_with_busybox 2
swarm_manage
# Expect 2 nodes
docker_swarm info | grep -q "Nodes: 2"
# c1 on node-0 with reschedule=on-node-failure
run docker_swarm run -dit --name c1 -e constraint:node==~node-0 --label com.docker.swarm.reschedule-policy=on-node-failure busybox sh
[ "$status" -eq 0 ]
# c2 on node-0 with reschedule=off
run docker_swarm run -dit --name c2 -e constraint:node==~node-0 --label com.docker.swarm.reschedule-policy=off busybox sh
[ "$status" -eq 0 ]
# c3 on node-1
run docker_swarm run -dit --name c3 -e constraint:node==~node-1 --label com.docker.swarm.reschedule-policy=on-node-failure busybox sh
[ "$status" -eq 0 ]
run docker_swarm ps -q
[ "${#lines[@]}" -eq 3 ]
# Make sure containers are running where they should.
run docker_swarm inspect c1
[ "$status" -eq 0 ]
[[ "${output}" == *'"Name": "node-0"'* ]]
run docker_swarm inspect c2
[ "$status" -eq 0 ]
[[ "${output}" == *'"Name": "node-0"'* ]]
run docker_swarm inspect c3
[ "$status" -eq 0 ]
[[ "${output}" == *'"Name": "node-1"'* ]]
# Stop node-0
docker_host stop ${DOCKER_CONTAINERS[0]}
# Wait for Swarm to detect the node failure.
#retry 10 1 eval "docker_swarm info | grep -q 'Nodes: 1'"
sleep 5
docker_swarm ps
# c1 should have been rescheduled from node-0 to node-1
run docker_swarm inspect c1
[ "$status" -eq 0 ]
[[ "${output}" == *'"Name": "node-1"'* ]]
# c2 should not have been rescheduled; node-0 is down, so inspect fails.
run docker_swarm inspect c2
[ "$status" -eq 1 ]
# c3 should still be on node-1 since it wasn't affected
run docker_swarm inspect c3
[ "$status" -eq 0 ]
[[ "${output}" == *'"Name": "node-1"'* ]]
}