diff --git a/cluster/engine.go b/cluster/engine.go index 45ed7bf4ac..1bba83fde5 100644 --- a/cluster/engine.go +++ b/cluster/engine.go @@ -24,6 +24,9 @@ const ( // Minimum docker engine version supported by swarm. minSupportedVersion = version.Version("1.6.0") + + // Engine failureCount threshold + engineFailureCountThreshold = 3 ) // delayer offers a simple API to random delay within a given time range. @@ -83,6 +86,7 @@ type Engine struct { client dockerclient.Client eventHandler EventHandler healthy bool + failureCount int64 overcommitRatio int64 opts *EngineOpts } @@ -192,6 +196,27 @@ func (e *Engine) Status() string { return "Unhealthy" } +// IncFailureCount increases engine's failure count, and set engine as unhealthy if threshold is crossed +func (e *Engine) IncFailureCount() { + e.Lock() + e.failureCount++ + if e.healthy && e.failureCount >= engineFailureCountThreshold { + e.healthy = false + } + e.Unlock() +} + +// SetEngineHealth sets engine healthy state +func (e *Engine) SetEngineHealth(state bool) { + e.Lock() + e.healthy = state + // if engine is healthy, clear failureCount + if state { + e.failureCount = 0 + } + e.Unlock() +} + // Gather engine specs (CPU, memory, constraints, ...). func (e *Engine) updateSpecs() error { info, err := e.client.Info() @@ -434,7 +459,7 @@ func (e *Engine) refreshLoop() { failedAttempts++ if failedAttempts >= e.opts.RefreshRetry && e.healthy { e.emitEvent("engine_disconnect") - e.healthy = false + e.SetEngineHealth(false) log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Errorf("Flagging engine as dead. Updated state failed %d times: %v", failedAttempts, err) } } else { @@ -448,7 +473,7 @@ func (e *Engine) refreshLoop() { e.client.StartMonitorEvents(e.handler, nil) e.emitEvent("engine_reconnect") } - e.healthy = true + e.SetEngineHealth(true) failedAttempts = 0 } } diff --git a/cluster/engine_test.go b/cluster/engine_test.go index 730acc78b2..31941142e1 100644 --- a/cluster/engine_test.go +++ b/cluster/engine_test.go @@ -38,6 +38,15 @@ var ( } ) +func TestEngineFailureCount(t *testing.T) { + engine := NewEngine("test", 0, engOpts) + for i := 0; i < engineFailureCountThreshold; i++ { + assert.True(t, engine.IsHealthy()) + engine.IncFailureCount() + } + assert.False(t, engine.IsHealthy()) +} + func TestEngineConnectionFailure(t *testing.T) { engine := NewEngine("test", 0, engOpts) assert.False(t, engine.isConnected())