enable rescheduling watchdog only when primary
Signed-off-by: Victor Vieux <vieux@docker.com>
parent 195b55d06c
commit ffba4054dc

@@ -148,7 +148,7 @@ func setupReplication(c *cli.Context, cluster cluster.Cluster, server *api.Serve
     go func() {
         for {
-            run(candidate, server, primary, replica)
+            run(cluster, candidate, server, primary, replica)
             time.Sleep(defaultRecoverTime)
         }
     }()
 
@@ -163,19 +163,22 @@ func setupReplication(c *cli.Context, cluster cluster.Cluster, server *api.Serve
     server.SetHandler(primary)
 }
 
-func run(candidate *leadership.Candidate, server *api.Server, primary *mux.Router, replica *api.Replica) {
+func run(cl cluster.Cluster, candidate *leadership.Candidate, server *api.Server, primary *mux.Router, replica *api.Replica) {
     electedCh, errCh, err := candidate.RunForElection()
     if err != nil {
         return
     }
+    var watchdog *cluster.Watchdog
     for {
         select {
         case isElected := <-electedCh:
             if isElected {
                 log.Info("Leader Election: Cluster leadership acquired")
+                watchdog = cluster.NewWatchdog(cl)
                 server.SetHandler(primary)
             } else {
                 log.Info("Leader Election: Cluster leadership lost")
+                cl.UnregisterEventHandler(watchdog)
                 server.SetHandler(replica)
             }
 
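
The pairing inside run() is the core of the change: the watchdog is created, and thereby registered as a cluster event handler, only when this manager wins the election, and it is unregistered as soon as leadership is lost, so a demoted manager stops reacting to engine failures. Note that the loop can report a loss before any win has happened, in which case watchdog is still nil when cl.UnregisterEventHandler is called; the diff appears to assume that is harmless. Below is a minimal, self-contained sketch of the same toggle pattern; the handler, registry and watchdog types are hypothetical stand-ins, not Swarm's actual cluster.Cluster or cluster.Watchdog.

package main

import "fmt"

// handler stands in for Swarm's cluster event-handler interface (hypothetical).
type handler interface {
    Handle(event string) error
}

// registry stands in for the cluster's set of registered event handlers.
type registry struct{ handlers map[handler]struct{} }

func newRegistry() *registry {
    return &registry{handlers: make(map[handler]struct{})}
}

func (r *registry) Register(h handler) { r.handlers[h] = struct{}{} }

// Unregister is a no-op for nil or unknown handlers, mirroring the assumption above.
func (r *registry) Unregister(h handler) { delete(r.handlers, h) }

// watchdog stands in for the rescheduling watchdog: while registered,
// it would react to engine events such as a node going down.
type watchdog struct{}

func (w *watchdog) Handle(event string) error {
    fmt.Println("watchdog saw event:", event)
    return nil
}

func main() {
    reg := newRegistry()

    // Simulated leader-election notifications, like electedCh in run().
    electedCh := make(chan bool)
    go func() {
        electedCh <- true  // this manager wins the election
        electedCh <- false // and later loses leadership
        close(electedCh)
    }()

    var w *watchdog
    for isElected := range electedCh {
        if isElected {
            w = &watchdog{}
            reg.Register(w) // primary: start rescheduling on failures
        } else {
            reg.Unregister(w) // no longer primary: stop rescheduling
        }
        fmt.Println("elected:", isElected, "handlers registered:", len(reg.handlers))
    }
}
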
@@ -325,8 +328,8 @@ func manage(c *cli.Context) {
         setupReplication(c, cl, server, discovery, addr, leaderTTL, tlsConfig)
     } else {
         server.SetHandler(api.NewPrimary(cl, tlsConfig, &statusHandler{cl, nil, nil}, c.GlobalBool("debug"), c.Bool("cors")))
+        cluster.NewWatchdog(cl)
     }
 
-    cluster.NewWatchdog(cl)
     log.Fatal(server.ListenAndServe())
 }
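
In manage(), the unconditional cluster.NewWatchdog(cl) at the tail is what previously made every manager, primary or replica, register a rescheduling watchdog. After this change a standalone manager (the else branch, which is always primary) registers it immediately, while a replicated manager defers the decision to the election loop above. The toy model below, with purely hypothetical names, only counts how many managers would react to a single node failure before and after the change; it is an illustration of the motivation, not Swarm code.

package main

import "fmt"

// manager is a toy stand-in: hasWatchdog marks whether this manager
// registered a rescheduling watchdog with the cluster.
type manager struct {
    name        string
    hasWatchdog bool
}

// reschedulers counts how many managers would act on one node failure.
func reschedulers(managers []manager) int {
    n := 0
    for _, m := range managers {
        if m.hasWatchdog {
            n++
        }
    }
    return n
}

func main() {
    // Before: cluster.NewWatchdog(cl) ran unconditionally in manage(),
    // so the primary and both replicas registered a watchdog.
    before := []manager{
        {name: "primary", hasWatchdog: true},
        {name: "replica-1", hasWatchdog: true},
        {name: "replica-2", hasWatchdog: true},
    }
    // After: only the elected primary (or a standalone manager) registers one.
    after := []manager{
        {name: "primary", hasWatchdog: true},
        {name: "replica-1"},
        {name: "replica-2"},
    }
    fmt.Println("managers reacting to one node failure, before:", reschedulers(before)) // 3
    fmt.Println("managers reacting to one node failure, after:", reschedulers(after))   // 1
}

The remaining hunk below adds an integration test that exercises the rescheduling path end to end.
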
@@ -77,6 +77,83 @@ function teardown() {
     [[ "${output}" == *"Primary: ${SWARM_HOSTS[1]}"* ]]
 }
 
+function containerRunning() {
+    local container="$1"
+    local node="$2"
+    run docker_swarm inspect "$container"
+    [ "$status" -eq 0 ]
+    [[ "${output}" == *"\"Name\": \"$node\""* ]]
+    [[ "${output}" == *"\"Status\": \"running\""* ]]
+}
+
+@test "leader election - rescheduling" {
+    local i=${#SWARM_MANAGE_PID[@]}
+    local port=$(($SWARM_BASE_PORT + $i))
+    local host=127.0.0.1:$port
+
+    start_docker_with_busybox 2
+    swarm_join "$DISCOVERY"
+
+    # Bring up one manager, make sure it becomes primary.
+    swarm_manage --replication --replication-ttl "4s" --advertise 127.0.0.1:$SWARM_BASE_PORT --engine-refresh-min-interval=1s --engine-refresh-max-interval=1s --engine-failure-retry=1 "$DISCOVERY"
+    run docker -H ${SWARM_HOSTS[0]} info
+    [[ "${output}" == *"Role: primary"* ]]
+
+    # Fire up a second manager. Ensure it's a replica forwarding to the right primary.
+    swarm_manage --replication --replication-ttl "4s" --advertise 127.0.0.1:$(($SWARM_BASE_PORT + 1)) --engine-refresh-min-interval=1s --engine-refresh-max-interval=1s --engine-failure-retry=1 "$DISCOVERY"
+    run docker -H ${SWARM_HOSTS[1]} info
+    [[ "${output}" == *"Role: replica"* ]]
+    [[ "${output}" == *"Primary: ${SWARM_HOSTS[0]}"* ]]
+
+    # c1 on node-0 with reschedule=on-node-failure
+    run docker_swarm run -dit --name c1 -e constraint:node==~node-0 --label 'com.docker.swarm.reschedule-policies=["on-node-failure"]' busybox sh
+    [ "$status" -eq 0 ]
+    # c2 on node-0 with reschedule=off
+    run docker_swarm run -dit --name c2 -e constraint:node==~node-0 --label 'com.docker.swarm.reschedule-policies=["off"]' busybox sh
+    [ "$status" -eq 0 ]
+    # c3 on node-1
+    run docker_swarm run -dit --name c3 -e constraint:node==~node-1 --label 'com.docker.swarm.reschedule-policies=["on-node-failure"]' busybox sh
+    [ "$status" -eq 0 ]
+
+    run docker_swarm ps -q
+    [ "${#lines[@]}" -eq 3 ]
+
+    # Make sure containers are running where they should.
+    containerRunning "c1" "node-0"
+    containerRunning "c2" "node-0"
+    containerRunning "c3" "node-1"
+
+    # Get c1 swarm id
+    swarm_id=$(docker_swarm inspect -f '{{ index .Config.Labels "com.docker.swarm.id" }}' c1)
+
+    # Stop node-0
+    docker_host stop ${DOCKER_CONTAINERS[0]}
+
+    # Wait for Swarm to detect the node failure.
+    retry 5 1 eval "docker_swarm info | grep -q 'Unhealthy'"
+
+    # Wait for the container to be rescheduled
+    # c1 should have been rescheduled from node-0 to node-1
+    retry 5 1 containerRunning "c1" "node-1"
+
+    # Check swarm id didn't change for c1
+    [[ "$swarm_id" == $(docker_swarm inspect -f '{{ index .Config.Labels "com.docker.swarm.id" }}' c1) ]]
+
+    run docker_swarm inspect "$swarm_id"
+    [ "$status" -eq 0 ]
+    [[ "${output}" == *'"Name": "node-1"'* ]]
+
+    # c2 should still be on node-0 since the rescheduling policy was off.
+    run docker_swarm inspect c2
+    [ "$status" -eq 1 ]
+
+    # c3 should still be on node-1 since it wasn't affected
+    containerRunning "c3" "node-1"
+
+    run docker_swarm ps -q
+    [ "${#lines[@]}" -eq 2 ]
+}
+
 @test "leader election - store failure" {
     # Bring up one manager, make sure it becomes primary.
     swarm_manage --replication --replication-ttl "4s" --advertise 127.0.0.1:$SWARM_BASE_PORT "$DISCOVERY"
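
The new containerRunning helper plus retry is how the test asserts placement: it inspects a container through the Swarm endpoint and greps the output for the node's "Name" and a "running" status. For readers who want the same check programmatically, here is a rough Go equivalent that polls the inspect endpoint; the address is a placeholder for one of the SWARM_HOSTS, and the field layout is inferred from what the test greps for, so treat it as an illustrative sketch rather than part of this commit.

package main

import (
    "encoding/json"
    "fmt"
    "net/http"
    "time"
)

// containerPlacement captures the two pieces of inspect output the bats
// helper checks: the node the container landed on and its state.
type containerPlacement struct {
    Node struct {
        Name string `json:"Name"`
    } `json:"Node"`
    State struct {
        Status string `json:"Status"`
    } `json:"State"`
}

// containerRunningOn asks the Swarm manager whether the container is running
// on the expected node, the programmatic analogue of containerRunning above.
func containerRunningOn(swarmHost, container, node string) (bool, error) {
    resp, err := http.Get(fmt.Sprintf("http://%s/containers/%s/json", swarmHost, container))
    if err != nil {
        return false, err
    }
    defer resp.Body.Close()
    if resp.StatusCode != http.StatusOK {
        return false, fmt.Errorf("inspect %s: unexpected status %s", container, resp.Status)
    }
    var c containerPlacement
    if err := json.NewDecoder(resp.Body).Decode(&c); err != nil {
        return false, err
    }
    return c.Node.Name == node && c.State.Status == "running", nil
}

func main() {
    // Rough equivalent of `retry 5 1 containerRunning "c1" "node-1"` in the test;
    // 127.0.0.1:2375 is a placeholder address for the Swarm manager.
    for attempt := 0; attempt < 5; attempt++ {
        ok, err := containerRunningOn("127.0.0.1:2375", "c1", "node-1")
        if err == nil && ok {
            fmt.Println("c1 is running on node-1")
            return
        }
        time.Sleep(1 * time.Second)
    }
    fmt.Println("c1 never showed up running on node-1")
}
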