mirror of https://github.com/docker/docs.git
Add support for container rescheduling on node failure.
Add rescheduling integration tests.

Signed-off-by: Andrea Luzzardi <aluzzardi@gmail.com>
This commit is contained in:
parent 56941d02a8
commit 13f60212f5
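For orientation before the hunks: from the user's side, rescheduling is opted into with a container label. The sketch below mirrors the labels exercised by the new integration test at the end of this commit; the container names, and the assumption that the Docker CLI is pointed at a Swarm manager, are illustrative and not part of the change.

    # Opt a container into rescheduling on node failure
    # (label value as used in the integration test below)
    docker run -dit --name c1 \
        --label com.docker.swarm.reschedule-policy=on-node-failure \
        busybox sh

    # Containers without the label keep the default "no" policy returned by
    # ContainerConfig.ReschedulePolicy() and are left where they are.
    docker run -dit --name c2 busybox sh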
@@ -321,5 +321,6 @@ func manage(c *cli.Context) {
 	server.SetHandler(api.NewPrimary(cl, tlsConfig, &statusHandler{cl, nil, nil}, c.GlobalBool("debug"), c.Bool("cors")))
 	}
 
+	cluster.NewWatchdog(cl)
 	log.Fatal(server.ListenAndServe())
 }
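Note that cluster.NewWatchdog(cl) is called here purely for its side effect: as the new watchdog.go file later in this commit shows, the constructor registers the watchdog as an event handler on the cluster, so the returned *Watchdog can safely be discarded.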
@@ -186,3 +186,12 @@ func (c *ContainerConfig) HaveNodeConstraint() bool {
 	}
 	return false
 }
+
+// ReschedulePolicy returns the reschedule policy from the ContainerConfig
+func (c *ContainerConfig) ReschedulePolicy() string {
+	policy, ok := c.Labels[SwarmLabelNamespace+".reschedule-policy"]
+	if !ok {
+		return "no"
+	}
+	return policy
+}
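To make the new helper's behavior concrete, here is a small self-contained sketch (not part of the commit) of the same lookup-with-default over a container's labels. The stand-in containerConfig type and the main function are purely illustrative; the label key matches the com.docker.swarm namespace used by Swarm and by the integration test below.

    package main

    import "fmt"

    const swarmLabelNamespace = "com.docker.swarm"

    type containerConfig struct {
        Labels map[string]string
    }

    // reschedulePolicy mirrors ContainerConfig.ReschedulePolicy: return the
    // label value if present, otherwise default to "no".
    func (c *containerConfig) reschedulePolicy() string {
        policy, ok := c.Labels[swarmLabelNamespace+".reschedule-policy"]
        if !ok {
            return "no"
        }
        return policy
    }

    func main() {
        labeled := &containerConfig{Labels: map[string]string{
            swarmLabelNamespace + ".reschedule-policy": "on-node-failure",
        }}
        unlabeled := &containerConfig{Labels: map[string]string{}}

        fmt.Println(labeled.reschedulePolicy())   // on-node-failure
        fmt.Println(unlabeled.reschedulePolicy()) // no
    }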
@@ -21,6 +21,11 @@ func (c *Container) Refresh() (*Container, error) {
 	return c.Engine.refreshContainer(c.Id, true)
 }
 
+// Start a container
+func (c *Container) Start() error {
+	return c.Engine.client.StartContainer(c.Id, nil)
+}
+
 // Containers represents a list a containers
 type Containers []*Container
 
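The new Start helper is a thin wrapper over the engine's Docker client; it exists so that the watchdog introduced in the next hunk can restart a rescheduled container when the original one was in a running state.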
@@ -0,0 +1,71 @@
+package cluster
+
+import (
+	"sync"
+
+	log "github.com/Sirupsen/logrus"
+)
+
+type Watchdog struct {
+	l       sync.Mutex
+	cluster Cluster
+}
+
+// Handle cluster callbacks
+func (w *Watchdog) Handle(e *Event) error {
+	// Skip non-swarm events.
+	if e.From != "swarm" {
+		return nil
+	}
+
+	switch e.Status {
+	case "engine_disconnect":
+		go w.rescheduleContainers(e.Engine)
+	}
+
+	return nil
+}
+
+func (w *Watchdog) rescheduleContainers(e *Engine) {
+	w.l.Lock()
+	defer w.l.Unlock()
+
+	log.Infof("Node %s failed - rescheduling containers", e.ID)
+	for _, c := range e.Containers() {
+		// Skip containers which don't have an "always" reschedule policy.
+		if c.Config.ReschedulePolicy() != "always" {
+			log.Debugf("Skipping rescheduling of %s based on rescheduling policy", c.Id)
+			continue
+		}
+
+		// Remove the container from the dead engine. If we don't, then both
+		// the old and new one will show up in docker ps.
+		// We have to do this before calling `CreateContainer`, otherwise it
+		// will abort because the name is already taken.
+		c.Engine.removeContainer(c)
+
+		newContainer, err := w.cluster.CreateContainer(c.Config, c.Info.Name)
+
+		if err != nil {
+			log.Errorf("Failed to reschedule container %s (Swarm ID: %s): %v", c.Id, c.Config.SwarmID(), err)
+			continue
+		}
+
+		log.Infof("Rescheduled container %s from %s to %s as %s (Swarm ID: %s)", c.Id, c.Engine.ID, newContainer.Engine.ID, newContainer.Id, c.Config.SwarmID())
+
+		if c.Info.State.Running {
+			if err := newContainer.Start(); err != nil {
+				log.Errorf("Failed to start rescheduled container %s", newContainer.Id)
+			}
+		}
+	}
+}
+
+func NewWatchdog(cluster Cluster) *Watchdog {
+	log.Debugf("Watchdog enabled")
+	w := &Watchdog{
+		cluster: cluster,
+	}
+	cluster.RegisterEventHandler(w)
+	return w
+}
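Two design points in the watchdog are worth calling out. Handle hands the actual work to a goroutine so that the cluster's event delivery is not blocked while containers are recreated, and rescheduleContainers holds the watchdog's mutex so that only one rescheduling pass runs at a time even if several engines disconnect in quick succession. The container is also removed from the dead engine before CreateContainer is called; as the inline comment explains, the replacement would otherwise be rejected because the name is still taken, and the stale copy would keep showing up in docker ps.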
@@ -0,0 +1,63 @@
+#!/usr/bin/env bats
+
+load helpers
+
+function teardown() {
+	swarm_manage_cleanup
+	stop_docker
+}
+
+@test "rescheduling" {
+	start_docker_with_busybox 2
+	swarm_manage
+
+	# Expect 2 nodes
+	docker_swarm info | grep -q "Nodes: 2"
+
+	# c1 on node-0 with reschedule=on-node-failure
+	run docker_swarm run -dit --name c1 -e constraint:node==~node-0 --label com.docker.swarm.reschedule-policy=on-node-failure busybox sh
+	[ "$status" -eq 0 ]
+	# c2 on node-0 with reschedule=off
+	run docker_swarm run -dit --name c2 -e constraint:node==~node-0 --label com.docker.swarm.reschedule-policy=off busybox sh
+	[ "$status" -eq 0 ]
+	# c3 on node-1
+	run docker_swarm run -dit --name c3 -e constraint:node==~node-1 --label com.docker.swarm.reschedule-policy=on-node-failure busybox sh
+	[ "$status" -eq 0 ]
+
+	run docker_swarm ps -q
+	[ "${#lines[@]}" -eq 3 ]
+
+	# Make sure containers are running where they should.
+	run docker_swarm inspect c1
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-0"'* ]]
+	run docker_swarm inspect c2
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-0"'* ]]
+	run docker_swarm inspect c3
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-1"'* ]]
+
+	# Stop node-0
+	docker_host stop ${DOCKER_CONTAINERS[0]}
+
+	# Wait for Swarm to detect the node failure.
+	#retry 10 1 eval "docker_swarm info | grep -q 'Nodes: 1'"
+
+	sleep 5
+	docker_swarm ps
+
+	# c1 should have been rescheduled from node-0 to node-1
+	run docker_swarm inspect c1
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-1"'* ]]
+
+	# c2 should still be on node-0 since the rescheduling policy was off.
+	run docker_swarm inspect c2
+	[ "$status" -eq 1 ]
+
+	# c3 should still be on node-1 since it wasn't affected
+	run docker_swarm inspect c3
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-1"'* ]]
+}
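The test is a standard bats file, as the shebang indicates. It depends on the repository's integration helpers (load helpers, docker_swarm, swarm_manage, and so on), so it is meant to run from the integration suite rather than standalone; with the bats runner installed, the invocation would look roughly like the following, where the filename is only a placeholder for wherever this new file lives in the repo.

    # filename is illustrative; point bats at the new .bats file in the repo
    bats rescheduling.bats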