Merge pull request #1794 from dongluochen/refreshBackoffOnFailure

Implement engine refresh backoff strategy for failing nodes
This commit is contained in:
Chanwit Kaewkasi 2016-02-12 19:58:08 +07:00
commit fce3620235
3 changed files with 16 additions and 6 deletions

View File

@ -66,11 +66,12 @@ func newDelayer(rangeMin, rangeMax time.Duration) *delayer {
}
}
func (d *delayer) Wait() <-chan time.Time {
// Wait returns a channel that delivers a timeout event after a fixed plus randomized duration.
func (d *delayer) Wait(backoffFactor int) <-chan time.Time {
d.l.Lock()
defer d.l.Unlock()
waitPeriod := int64(d.rangeMin)
waitPeriod := int64(d.rangeMin) * int64(1+backoffFactor)
if delta := int64(d.rangeMax) - int64(d.rangeMin); delta > 0 {
// Int63n panics if the parameter is 0
waitPeriod += d.r.Int63n(delta)
@ -637,13 +638,21 @@ func (e *Engine) updateContainer(c dockerclient.Container, containers map[string
// refreshLoop periodically triggers engine refresh.
func (e *Engine) refreshLoop() {
const maxBackoffFactor int = 1000
for {
var err error
// Engines that keep failing should back off.
// e.failureCount and e.opts.FailureRetry are both of type int.
backoffFactor := e.failureCount - e.opts.FailureRetry
if backoffFactor < 0 {
backoffFactor = 0
} else if backoffFactor > maxBackoffFactor {
backoffFactor = maxBackoffFactor
}
// Wait for the delayer or quit if we get stopped.
select {
case <-e.refreshDelayer.Wait():
case <-e.refreshDelayer.Wait(backoffFactor):
case <-e.stopCh:
return
}

View File

@ -41,7 +41,7 @@ function setup_discovery_file() {
# Restart node
docker_host start ${DOCKER_CONTAINERS[0]}
# Wait for swarm to detect node recovery
retry 5 1 eval "docker_swarm info | grep -q -i 'Status: Healthy'"
retry 15 1 eval "docker_swarm info | grep -q -i 'Status: Healthy'"
}
@test "node pending and recovery" {

View File

@ -155,7 +155,8 @@ function containerRunning() {
# Restart node-0
docker_host start ${DOCKER_CONTAINERS[0]}
# Wait for node-0 to be healthy
retry 5 1 eval "test \"$(docker_swarm info | grep \"Status: Unhealthy\" | wc -l)\" = '0'"
# The refresh interval of a failing node increases over time, so allow enough retries here.
retry 30 1 eval "test \"$(docker_swarm info | grep \"Status: Unhealthy\" | wc -l)\" = '0'"
# Stop node-1
docker_host stop ${DOCKER_CONTAINERS[1]}