diff --git a/cluster/engine.go b/cluster/engine.go index 20e5332707..5b051524c7 100644 --- a/cluster/engine.go +++ b/cluster/engine.go @@ -65,11 +65,12 @@ func newDelayer(rangeMin, rangeMax time.Duration) *delayer { } } -func (d *delayer) Wait() <-chan time.Time { +// Wait returns a timeout channel for a wait period of rangeMin scaled by (1 + backoffFactor), plus a random amount below rangeMax - rangeMin. +func (d *delayer) Wait(backoffFactor int) <-chan time.Time { d.l.Lock() defer d.l.Unlock() - waitPeriod := int64(d.rangeMin) + waitPeriod := int64(d.rangeMin) * int64(1+backoffFactor) if delta := int64(d.rangeMax) - int64(d.rangeMin); delta > 0 { // Int63n panics if the parameter is 0 waitPeriod += d.r.Int63n(delta) @@ -605,13 +606,21 @@ func (e *Engine) updateContainer(c dockerclient.Container, containers map[string // refreshLoop periodically triggers engine refresh. func (e *Engine) refreshLoop() { - + const maxBackoffFactor int = 1000 for { var err error + // Engines that keep failing should back off: the factor is the number of failures beyond FailureRetry, clamped to [0, maxBackoffFactor]. + // e.failureCount and e.opts.FailureRetry are both of type int. + backoffFactor := e.failureCount - e.opts.FailureRetry + if backoffFactor < 0 { + backoffFactor = 0 + } else if backoffFactor > maxBackoffFactor { + backoffFactor = maxBackoffFactor + } // Wait for the delayer or quit if we get stopped. 
select { - case <-e.refreshDelayer.Wait(): + case <-e.refreshDelayer.Wait(backoffFactor): case <-e.stopCh: return } diff --git a/test/integration/nodemanagement/state.bats b/test/integration/nodemanagement/state.bats index da81687bb1..911b182081 100644 --- a/test/integration/nodemanagement/state.bats +++ b/test/integration/nodemanagement/state.bats @@ -41,7 +41,7 @@ function setup_discovery_file() { # Restart node docker_host start ${DOCKER_CONTAINERS[0]} # Wait for swarm to detect node recovery - retry 5 1 eval "docker_swarm info | grep -q -i 'Status: Healthy'" + retry 15 1 eval "docker_swarm info | grep -q -i 'Status: Healthy'" } @test "node pending and recovery" { diff --git a/test/integration/rescheduling.bats b/test/integration/rescheduling.bats index 7de588e1ef..5e3fd8f953 100644 --- a/test/integration/rescheduling.bats +++ b/test/integration/rescheduling.bats @@ -155,7 +155,8 @@ function containerRunning() { # Restart node-0 docker_host start ${DOCKER_CONTAINERS[0]} # Wait for node-0 to be healthy - retry 5 1 eval "test \"$(docker_swarm info | grep \"Status: Unhealthy\" | wc -l)\" = '0'" + # The refresh interval of a failing node increases over time, so allow enough retries here. NOTE(review): the $(...) below is inside double quotes, so it appears to be expanded once before retry runs rather than on each attempt; confirm. + retry 30 1 eval "test \"$(docker_swarm info | grep \"Status: Unhealthy\" | wc -l)\" = '0'" # Stop node-1 docker_host stop ${DOCKER_CONTAINERS[1]}