Add support for container rescheduling on node failure.

Add rescheduling integration tests.

Signed-off-by: Andrea Luzzardi <aluzzardi@gmail.com>
Andrea Luzzardi 2015-12-03 02:30:15 -08:00 committed by Victor Vieux
parent 56941d02a8
commit 13f60212f5
5 changed files with 149 additions and 0 deletions


@@ -321,5 +321,6 @@ func manage(c *cli.Context) {
server.SetHandler(api.NewPrimary(cl, tlsConfig, &statusHandler{cl, nil, nil}, c.GlobalBool("debug"), c.Bool("cors")))
}
cluster.NewWatchdog(cl)
log.Fatal(server.ListenAndServe())
}


@@ -186,3 +186,12 @@ func (c *ContainerConfig) HaveNodeConstraint() bool {
}
return false
}
// ReschedulePolicy returns the container's reschedule policy from its labels, defaulting to "off"
func (c *ContainerConfig) ReschedulePolicy() string {
policy, ok := c.Labels[SwarmLabelNamespace+".reschedule-policy"]
if !ok {
return "no"
}
return policy
}
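
For illustration only, here is a standalone sketch of the label lookup above, not part of the commit; the lower-case swarmLabelNamespace constant, the reschedulePolicy helper, and the example labels map are hypothetical names invented for this example (in Swarm the lookup lives on ContainerConfig, as in the hunk above).

package main

import "fmt"

const swarmLabelNamespace = "com.docker.swarm"

// reschedulePolicy mirrors the lookup above: read the reschedule-policy label
// and fall back to "off" when it is not set.
func reschedulePolicy(labels map[string]string) string {
	if policy, ok := labels[swarmLabelNamespace+".reschedule-policy"]; ok {
		return policy
	}
	return "off"
}

func main() {
	withLabel := map[string]string{"com.docker.swarm.reschedule-policy": "on-node-failure"}
	fmt.Println(reschedulePolicy(withLabel)) // on-node-failure
	fmt.Println(reschedulePolicy(nil))       // off (the default)
}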


@@ -21,6 +21,11 @@ func (c *Container) Refresh() (*Container, error) {
return c.Engine.refreshContainer(c.Id, true)
}
// Start starts the container on its engine
func (c *Container) Start() error {
return c.Engine.client.StartContainer(c.Id, nil)
}
// Containers represents a list of containers
type Containers []*Container

cluster/watchdog.go (new file)

@@ -0,0 +1,71 @@
package cluster
import (
"sync"
log "github.com/Sirupsen/logrus"
)
// Watchdog monitors cluster events and reschedules containers off failed engines.
type Watchdog struct {
l sync.Mutex
cluster Cluster
}
// Handle is the cluster event callback; an engine_disconnect event triggers rescheduling.
func (w *Watchdog) Handle(e *Event) error {
// Skip non-swarm events.
if e.From != "swarm" {
return nil
}
switch e.Status {
case "engine_disconnect":
go w.rescheduleContainers(e.Engine)
}
return nil
}
// rescheduleContainers moves eligible containers from a failed engine onto the rest of the cluster.
func (w *Watchdog) rescheduleContainers(e *Engine) {
w.l.Lock()
defer w.l.Unlock()
log.Infof("Node %s failed - rescheduling containers", e.ID)
for _, c := range e.Containers() {
// Skip containers which don't have an "on-node-failure" reschedule policy.
if c.Config.ReschedulePolicy() != "on-node-failure" {
log.Debugf("Skipping rescheduling of %s based on rescheduling policy", c.Id)
continue
}
// Remove the container from the dead engine. If we don't, then both
// the old and new one will show up in docker ps.
// We have to do this before calling `CreateContainer`, otherwise it
// will abort because the name is already taken.
c.Engine.removeContainer(c)
newContainer, err := w.cluster.CreateContainer(c.Config, c.Info.Name)
if err != nil {
log.Errorf("Failed to reschedule container %s (Swarm ID: %s): %v", c.Id, c.Config.SwarmID(), err)
continue
}
log.Infof("Rescheduled container %s from %s to %s as %s (Swarm ID: %s)", c.Id, c.Engine.ID, newContainer.Engine.ID, newContainer.Id, c.Config.SwarmID())
if c.Info.State.Running {
if err := newContainer.Start(); err != nil {
log.Errorf("Failed to start rescheduled container %s", newContainer.Id)
}
}
}
}
// NewWatchdog creates a watchdog and registers it as a cluster event handler.
func NewWatchdog(cluster Cluster) *Watchdog {
log.Debugf("Watchdog enabled")
w := &Watchdog{
cluster: cluster,
}
cluster.RegisterEventHandler(w)
return w
}
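
For illustration only, a standalone, hypothetical sketch (not part of the commit) of the per-container decision flow rescheduleContainers applies; the fakeContainer type and shouldReschedule helper are invented for this example and do not exist in Swarm.

package main

import "fmt"

// fakeContainer is a hypothetical stand-in for cluster.Container, keeping only
// the fields the rescheduling decision needs.
type fakeContainer struct {
	ID      string
	Policy  string // value of the com.docker.swarm.reschedule-policy label
	Running bool
}

// shouldReschedule mirrors the watchdog's per-container check: only containers
// labeled "on-node-failure" are moved off a failed engine.
func shouldReschedule(c fakeContainer) bool {
	return c.Policy == "on-node-failure"
}

func main() {
	containers := []fakeContainer{
		{ID: "c1", Policy: "on-node-failure", Running: true},
		{ID: "c2", Policy: "off", Running: true},
	}
	for _, c := range containers {
		if !shouldReschedule(c) {
			fmt.Printf("skipping %s\n", c.ID)
			continue
		}
		// The real watchdog would now remove the container from the dead engine,
		// re-create it elsewhere via the cluster, and start it again if it was running.
		fmt.Printf("rescheduling %s (restart: %v)\n", c.ID, c.Running)
	}
}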


@@ -0,0 +1,63 @@
#!/usr/bin/env bats
load helpers
function teardown() {
swarm_manage_cleanup
stop_docker
}
@test "rescheduling" {
start_docker_with_busybox 2
swarm_manage
# Expect 2 nodes
docker_swarm info | grep -q "Nodes: 2"
# c1 on node-0 with reschedule=on-node-failure
run docker_swarm run -dit --name c1 -e constraint:node==~node-0 --label com.docker.swarm.reschedule-policy=on-node-failure busybox sh
[ "$status" -eq 0 ]
# c2 on node-0 with reschedule=off
run docker_swarm run -dit --name c2 -e constraint:node==~node-0 --label com.docker.swarm.reschedule-policy=off busybox sh
[ "$status" -eq 0 ]
# c3 on node-1
run docker_swarm run -dit --name c3 -e constraint:node==~node-1 --label com.docker.swarm.reschedule-policy=on-node-failure busybox sh
[ "$status" -eq 0 ]
run docker_swarm ps -q
[ "${#lines[@]}" -eq 3 ]
# Make sure containers are running where they should.
run docker_swarm inspect c1
[ "$status" -eq 0 ]
[[ "${output}" == *'"Name": "node-0"'* ]]
run docker_swarm inspect c2
[ "$status" -eq 0 ]
[[ "${output}" == *'"Name": "node-0"'* ]]
run docker_swarm inspect c3
[ "$status" -eq 0 ]
[[ "${output}" == *'"Name": "node-1"'* ]]
# Stop node-0
docker_host stop ${DOCKER_CONTAINERS[0]}
# Wait for Swarm to detect the node failure.
#retry 10 1 eval "docker_swarm info | grep -q 'Nodes: 1'"
sleep 5
docker_swarm ps
# c1 should have been rescheduled from node-0 to node-1
run docker_swarm inspect c1
[ "$status" -eq 0 ]
[[ "${output}" == *'"Name": "node-1"'* ]]
# c2 should not have been rescheduled; node-0 is down, so inspect fails.
run docker_swarm inspect c2
[ "$status" -eq 1 ]
# c3 should still be on node-1 since it wasn't affected
run docker_swarm inspect c3
[ "$status" -eq 0 ]
[[ "${output}" == *'"Name": "node-1"'* ]]
}