Reorganize engine failure detection procedure. Change engine option 'RefreshRetry' to 'FailureRetry'.

Signed-off-by: Dong Chen <dongluo.chen@docker.com>
Dong Chen 2015-12-09 17:14:56 -08:00
parent 4d24256c19
commit ec3b00c484
8 changed files with 34 additions and 39 deletions
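
For orientation before the per-file hunks: the renamed option shows up in two places below, as the CLI flag (--engine-refresh-retry becomes --engine-failure-retry) and as the EngineOpts field (RefreshRetry becomes FailureRetry). A minimal sketch of constructing the renamed option, mirroring the test fixtures changed in this commit; the import path and the exact NewEngine signature are assumptions, not verified against the tree:

package main

import (
	"time"

	"github.com/docker/swarm/cluster"
)

func main() {
	// FailureRetry replaces the former RefreshRetry field.
	opts := &cluster.EngineOpts{
		RefreshMinInterval: 30 * time.Second,
		RefreshMaxInterval: 60 * time.Second,
		FailureRetry:       3,
	}
	// The address and overcommit ratio are placeholders.
	_ = cluster.NewEngine("192.168.56.202:4444", 0, opts)
}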


@@ -26,7 +26,7 @@ var (
 	flHosts,
 	flLeaderElection, flLeaderTTL, flManageAdvertise,
 	flTLS, flTLSCaCert, flTLSCert, flTLSKey, flTLSVerify,
-	flRefreshIntervalMin, flRefreshIntervalMax, flRefreshRetry,
+	flRefreshIntervalMin, flRefreshIntervalMax, flFailureRetry,
 	flHeartBeat,
 	flEnableCors,
 	flCluster, flDiscoveryOpt, flClusterOpt},


@@ -71,10 +71,10 @@ var (
 		Value: "60s",
 		Usage: "set engine refresh maximum interval",
 	}
-	flRefreshRetry = cli.IntFlag{
-		Name: "engine-refresh-retry",
+	flFailureRetry = cli.IntFlag{
+		Name: "engine-failure-retry",
 		Value: 3,
-		Usage: "set engine refresh retry count on failure",
+		Usage: "set engine failure retry count",
 	}
 	flEnableCors = cli.BoolFlag{
 		Name: "api-enable-cors, cors",


@@ -240,14 +240,14 @@ func manage(c *cli.Context) {
 	if refreshMaxInterval < refreshMinInterval {
 		log.Fatal("max refresh interval cannot be less than min refresh interval")
 	}
-	refreshRetry := c.Int("engine-refresh-retry")
-	if refreshRetry <= 0 {
-		log.Fatal("invalid refresh retry count")
+	failureRetry := c.Int("engine-failure-retry")
+	if failureRetry <= 0 {
+		log.Fatal("invalid failure retry count")
 	}
 	engineOpts := &cluster.EngineOpts{
 		RefreshMinInterval: refreshMinInterval,
 		RefreshMaxInterval: refreshMaxInterval,
-		RefreshRetry: refreshRetry,
+		FailureRetry: failureRetry,
 	}
 	uri := getDiscovery(c)


@@ -24,9 +24,6 @@ const (
 	// Minimum docker engine version supported by swarm.
 	minSupportedVersion = version.Version("1.6.0")
-	// Engine failureCount threshold
-	engineFailureCountThreshold = 3
 )
 // delayer offers a simple API to random delay within a given time range.
@@ -62,7 +59,7 @@ func (d *delayer) Wait() <-chan time.Time {
 type EngineOpts struct {
 	RefreshMinInterval time.Duration
 	RefreshMaxInterval time.Duration
-	RefreshRetry int
+	FailureRetry int
 }
 // Engine represents a docker engine
@@ -86,7 +83,7 @@ type Engine struct {
 	client dockerclient.Client
 	eventHandler EventHandler
 	healthy bool
-	failureCount int64
+	failureCount int
 	overcommitRatio int64
 	opts *EngineOpts
 }
@@ -188,6 +185,17 @@ func (e *Engine) IsHealthy() bool {
 	return e.healthy
 }
+// SetHealthy sets engine healthy state
+func (e *Engine) SetHealthy(state bool) {
+	e.Lock()
+	e.healthy = state
+	// if engine is healthy, clear failureCount
+	if state {
+		e.failureCount = 0
+	}
+	e.Unlock()
+}
 // Status returns the health status of the Engine: Healthy or Unhealthy
 func (e *Engine) Status() string {
 	if e.healthy {
@@ -200,23 +208,12 @@ func (e *Engine) Status() string {
 func (e *Engine) IncFailureCount() {
 	e.Lock()
 	e.failureCount++
-	if e.healthy && e.failureCount >= engineFailureCountThreshold {
+	if e.healthy && e.failureCount >= e.opts.FailureRetry {
 		e.healthy = false
 	}
 	e.Unlock()
 }
-// SetEngineHealth sets engine healthy state
-func (e *Engine) SetEngineHealth(state bool) {
-	e.Lock()
-	e.healthy = state
-	// if engine is healthy, clear failureCount
-	if state {
-		e.failureCount = 0
-	}
-	e.Unlock()
-}
 // Gather engine specs (CPU, memory, constraints, ...).
 func (e *Engine) updateSpecs() error {
 	info, err := e.client.Info()
@@ -435,7 +432,6 @@ func (e *Engine) updateContainer(c dockerclient.Container, containers map[string
 }
 func (e *Engine) refreshLoop() {
-	failedAttempts := 0
 	for {
 		var err error
@@ -456,15 +452,15 @@ func (e *Engine) refreshLoop() {
 		}
 		if err != nil {
-			failedAttempts++
-			if failedAttempts >= e.opts.RefreshRetry && e.healthy {
+			e.failureCount++
+			if e.failureCount >= e.opts.FailureRetry && e.healthy {
 				e.emitEvent("engine_disconnect")
-				e.SetEngineHealth(false)
-				log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Errorf("Flagging engine as dead. Updated state failed %d times: %v", failedAttempts, err)
+				e.SetHealthy(false)
+				log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Errorf("Flagging engine as dead. Updated state failed %d times: %v", e.failureCount, err)
 			}
 		} else {
 			if !e.healthy {
-				log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Infof("Engine came back to life after %d retries. Hooray!", failedAttempts)
+				log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Infof("Engine came back to life after %d retries. Hooray!", e.failureCount)
 				if err := e.updateSpecs(); err != nil {
 					log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Errorf("Update engine specs failed: %v", err)
 					continue
@@ -473,8 +469,7 @@
 				e.client.StartMonitorEvents(e.handler, nil)
 				e.emitEvent("engine_reconnect")
 			}
-			e.SetEngineHealth(true)
-			failedAttempts = 0
+			e.SetHealthy(true)
 		}
 	}
 }
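
Taken together, the engine hunks above move failure tracking onto the Engine itself: refreshLoop drops its local failedAttempts counter, IncFailureCount flips the healthy flag once opts.FailureRetry consecutive failures accumulate, and SetHealthy (the renamed SetEngineHealth) clears the counter when the engine recovers. A standalone, simplified sketch of that flow, with locking and event emission omitted; this is an illustration, not the repository's code:

package main

import "fmt"

// engine is a stripped-down stand-in for cluster.Engine.
type engine struct {
	healthy      bool
	failureCount int
	failureRetry int // corresponds to EngineOpts.FailureRetry
}

// IncFailureCount records one failed refresh and flags the engine
// unhealthy once the retry threshold is reached.
func (e *engine) IncFailureCount() {
	e.failureCount++
	if e.healthy && e.failureCount >= e.failureRetry {
		e.healthy = false
	}
}

// SetHealthy updates the health state; a healthy engine starts
// counting failures from zero again.
func (e *engine) SetHealthy(state bool) {
	e.healthy = state
	if state {
		e.failureCount = 0
	}
}

func main() {
	e := &engine{healthy: true, failureRetry: 3}
	for i := 0; i < e.failureRetry; i++ {
		e.IncFailureCount()
	}
	fmt.Println(e.healthy, e.failureCount) // false 3: threshold reached
	e.SetHealthy(true)                     // a successful refresh
	fmt.Println(e.healthy, e.failureCount) // true 0: counter cleared
}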


@@ -34,13 +34,13 @@ var (
 	engOpts = &EngineOpts{
 		RefreshMinInterval: time.Duration(30) * time.Second,
 		RefreshMaxInterval: time.Duration(60) * time.Second,
-		RefreshRetry: 3,
+		FailureRetry: 3,
 	}
 )
 func TestEngineFailureCount(t *testing.T) {
 	engine := NewEngine("test", 0, engOpts)
-	for i := 0; i < engineFailureCountThreshold; i++ {
+	for i := 0; i < engine.opts.FailureRetry; i++ {
 		assert.True(t, engine.IsHealthy())
 		engine.IncFailureCount()
 	}
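
The loop above iterates exactly FailureRetry times, so the last IncFailureCount call is expected to flip the engine to unhealthy. A hypothetical sibling test, not part of this commit, showing that transition and the reset on recovery; it assumes the same engOpts fixture, NewEngine helper, and testify import as the file above:

func TestEngineFailureThreshold(t *testing.T) {
	engine := NewEngine("test", 0, engOpts)
	for i := 0; i < engine.opts.FailureRetry; i++ {
		engine.IncFailureCount()
	}
	assert.False(t, engine.IsHealthy()) // threshold reached
	engine.SetHealthy(true)             // simulate a successful refresh
	assert.True(t, engine.IsHealthy())
	assert.Equal(t, 0, engine.failureCount) // recovery clears the counter
}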


@@ -13,7 +13,7 @@ func createAgent(t *testing.T, ID string, containers ...*cluster.Container) *age
 	engOpts := &cluster.EngineOpts{
 		RefreshMinInterval: time.Duration(30) * time.Second,
 		RefreshMaxInterval: time.Duration(60) * time.Second,
-		RefreshRetry: 3,
+		FailureRetry: 3,
 	}
 	engine := cluster.NewEngine(ID, 0, engOpts)
 	engine.Name = ID


@@ -43,7 +43,7 @@ var (
 	engOpts = &cluster.EngineOpts{
 		RefreshMinInterval: time.Duration(30) * time.Second,
 		RefreshMaxInterval: time.Duration(60) * time.Second,
-		RefreshRetry: 3,
+		FailureRetry: 3,
 	}
 )


@@ -14,7 +14,7 @@ load helpers
 	[[ "${output}" == *"max refresh interval cannot be less than min refresh interval"* ]]
 	# engine refresh retry count
-	run swarm manage --engine-refresh-retry 0 --advertise 127.0.0.1:$SWARM_BASE_PORT 192.168.56.202:4444
+	run swarm manage --engine-failure-retry 0 --advertise 127.0.0.1:$SWARM_BASE_PORT 192.168.56.202:4444
 	[ "$status" -ne 0 ]
-	[[ "${output}" == *"invalid refresh retry count"* ]]
+	[[ "${output}" == *"invalid failure retry count"* ]]
 }