From ffba4054dcca9f85f8e39012df4674af47066f85 Mon Sep 17 00:00:00 2001
From: Victor Vieux
Date: Thu, 24 Mar 2016 01:59:42 -0700
Subject: [PATCH] enable rescheduling watchdog only when primary

Signed-off-by: Victor Vieux
---
 cli/manage.go                     |  9 ++--
 test/integration/replication.bats | 77 +++++++++++++++++++++++++++++++
 2 files changed, 83 insertions(+), 3 deletions(-)

diff --git a/cli/manage.go b/cli/manage.go
index 6fa782ae99..5868a09158 100644
--- a/cli/manage.go
+++ b/cli/manage.go
@@ -148,7 +148,7 @@ func setupReplication(c *cli.Context, cluster cluster.Cluster, server *api.Serve
 
 	go func() {
 		for {
-			run(candidate, server, primary, replica)
+			run(cluster, candidate, server, primary, replica)
 			time.Sleep(defaultRecoverTime)
 		}
 	}()
@@ -163,19 +163,22 @@ func setupReplication(c *cli.Context, cluster cluster.Cluster, server *api.Serve
 	server.SetHandler(primary)
 }
 
-func run(candidate *leadership.Candidate, server *api.Server, primary *mux.Router, replica *api.Replica) {
+func run(cl cluster.Cluster, candidate *leadership.Candidate, server *api.Server, primary *mux.Router, replica *api.Replica) {
 	electedCh, errCh, err := candidate.RunForElection()
 	if err != nil {
 		return
 	}
+	var watchdog *cluster.Watchdog
 	for {
 		select {
 		case isElected := <-electedCh:
 			if isElected {
 				log.Info("Leader Election: Cluster leadership acquired")
+				watchdog = cluster.NewWatchdog(cl)
 				server.SetHandler(primary)
 			} else {
 				log.Info("Leader Election: Cluster leadership lost")
+				cl.UnregisterEventHandler(watchdog)
 				server.SetHandler(replica)
 			}
 
@@ -325,8 +328,8 @@ func manage(c *cli.Context) {
 		setupReplication(c, cl, server, discovery, addr, leaderTTL, tlsConfig)
 	} else {
 		server.SetHandler(api.NewPrimary(cl, tlsConfig, &statusHandler{cl, nil, nil}, c.GlobalBool("debug"), c.Bool("cors")))
+		cluster.NewWatchdog(cl)
 	}
-	cluster.NewWatchdog(cl)
 
 	log.Fatal(server.ListenAndServe())
 }
diff --git a/test/integration/replication.bats b/test/integration/replication.bats
index 76ca6393c6..9280858f9d 100644
--- a/test/integration/replication.bats
+++ b/test/integration/replication.bats
@@ -77,6 +77,83 @@ function teardown() {
 	[[ "${output}" == *"Primary: ${SWARM_HOSTS[1]}"* ]]
 }
 
+function containerRunning() {
+	local container="$1"
+	local node="$2"
+	run docker_swarm inspect "$container"
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *"\"Name\": \"$node\""* ]]
+	[[ "${output}" == *"\"Status\": \"running\""* ]]
+}
+
+@test "leader election - rescheduling" {
+	local i=${#SWARM_MANAGE_PID[@]}
+	local port=$(($SWARM_BASE_PORT + $i))
+	local host=127.0.0.1:$port
+
+	start_docker_with_busybox 2
+	swarm_join "$DISCOVERY"
+
+	# Bring up one manager, make sure it becomes primary.
+	swarm_manage --replication --replication-ttl "4s" --advertise 127.0.0.1:$SWARM_BASE_PORT --engine-refresh-min-interval=1s --engine-refresh-max-interval=1s --engine-failure-retry=1 "$DISCOVERY"
+	run docker -H ${SWARM_HOSTS[0]} info
+	[[ "${output}" == *"Role: primary"* ]]
+
+	# Fire up a second manager. Ensure it's a replica forwarding to the right primary.
+	swarm_manage --replication --replication-ttl "4s" --advertise 127.0.0.1:$(($SWARM_BASE_PORT + 1)) --engine-refresh-min-interval=1s --engine-refresh-max-interval=1s --engine-failure-retry=1 "$DISCOVERY"
+	run docker -H ${SWARM_HOSTS[1]} info
+	[[ "${output}" == *"Role: replica"* ]]
+	[[ "${output}" == *"Primary: ${SWARM_HOSTS[0]}"* ]]
+
+	# c1 on node-0 with reschedule=on-node-failure
+	run docker_swarm run -dit --name c1 -e constraint:node==~node-0 --label 'com.docker.swarm.reschedule-policies=["on-node-failure"]' busybox sh
+	[ "$status" -eq 0 ]
+	# c2 on node-0 with reschedule=off
+	run docker_swarm run -dit --name c2 -e constraint:node==~node-0 --label 'com.docker.swarm.reschedule-policies=["off"]' busybox sh
+	[ "$status" -eq 0 ]
+	# c3 on node-1 with reschedule=on-node-failure
+	run docker_swarm run -dit --name c3 -e constraint:node==~node-1 --label 'com.docker.swarm.reschedule-policies=["on-node-failure"]' busybox sh
+	[ "$status" -eq 0 ]
+
+	run docker_swarm ps -q
+	[ "${#lines[@]}" -eq 3 ]
+
+	# Make sure the containers are running where they should be.
+	containerRunning "c1" "node-0"
+	containerRunning "c2" "node-0"
+	containerRunning "c3" "node-1"
+
+	# Record c1's Swarm ID.
+	swarm_id=$(docker_swarm inspect -f '{{ index .Config.Labels "com.docker.swarm.id" }}' c1)
+
+	# Stop node-0.
+	docker_host stop ${DOCKER_CONTAINERS[0]}
+
+	# Wait for Swarm to detect the node failure.
+	retry 5 1 eval "docker_swarm info | grep -q 'Unhealthy'"
+
+	# Wait for the container to be rescheduled:
+	# c1 should move from node-0 to node-1.
+	retry 5 1 containerRunning "c1" "node-1"
+
+	# Make sure c1's Swarm ID did not change across the reschedule.
+	[[ "$swarm_id" == $(docker_swarm inspect -f '{{ index .Config.Labels "com.docker.swarm.id" }}' c1) ]]
+
+	run docker_swarm inspect "$swarm_id"
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-1"'* ]]
+
+	# c2 was not rescheduled because its policy was off; with node-0 down, inspect fails.
+	run docker_swarm inspect c2
+	[ "$status" -eq 1 ]
+
+	# c3 should still be on node-1 since it was not affected by the failure.
+	containerRunning "c3" "node-1"
+
+	run docker_swarm ps -q
+	[ "${#lines[@]}" -eq 2 ]
+}
+
 @test "leader election - store failure" {
 	# Bring up one manager, make sure it becomes primary.
 	swarm_manage --replication --replication-ttl "4s" --advertise 127.0.0.1:$SWARM_BASE_PORT "$DISCOVERY"
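
Note for reviewers: the gist of the change is that the watchdog (which owns container rescheduling) is now wired to cluster events only while this manager is the elected primary. Before the patch, manage() called cluster.NewWatchdog(cl) unconditionally, so a replica also registered the rescheduling watchdog and could race the primary by rescheduling containers on engine failure; after the patch, the primary arms the watchdog on election and disarms it on loss of leadership.

Below is a minimal, self-contained Go sketch of that lifecycle. The types here are simplified stand-ins for Swarm's cluster.Cluster, cluster.Watchdog, and the election channel, not the real types or signatures:

package main

import "fmt"

// EventHandler is a simplified stand-in for Swarm's cluster.EventHandler.
type EventHandler interface {
	Handle(event string)
}

// Cluster fans events out to registered handlers, as cluster.Cluster does.
type Cluster struct {
	handlers map[EventHandler]struct{}
}

func NewCluster() *Cluster {
	return &Cluster{handlers: make(map[EventHandler]struct{})}
}

func (c *Cluster) RegisterEventHandler(h EventHandler)   { c.handlers[h] = struct{}{} }
func (c *Cluster) UnregisterEventHandler(h EventHandler) { delete(c.handlers, h) }

func (c *Cluster) emit(event string) {
	for h := range c.handlers {
		h.Handle(event)
	}
}

// Watchdog stands in for cluster.Watchdog: registering it "arms" rescheduling.
type Watchdog struct{}

func NewWatchdog(c *Cluster) *Watchdog {
	w := &Watchdog{}
	c.RegisterEventHandler(w)
	return w
}

func (w *Watchdog) Handle(event string) {
	fmt.Println("watchdog: rescheduling after", event)
}

func main() {
	cl := NewCluster()
	var watchdog *Watchdog

	// Mirrors the isElected branch in run() after this patch.
	onElection := func(isElected bool) {
		if isElected {
			watchdog = NewWatchdog(cl) // primary: arm rescheduling
		} else if watchdog != nil {
			cl.UnregisterEventHandler(watchdog) // replica: stay passive
			watchdog = nil
		}
	}

	onElection(true)
	cl.emit("engine failure") // handled: we are primary
	onElection(false)
	cl.emit("engine failure") // ignored: we are a replica
}

One deliberate difference from the patch: the sketch nil-checks the watchdog before unregistering, whereas run() passes watchdog (possibly nil, if this manager loses an election it never won) straight to UnregisterEventHandler and relies on that call tolerating a nil handler.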