mirror of https://github.com/docker/docs.git
Reorganize engine failure detection procedure. Change engine option 'RefreshRetry' to 'FailureRetry'.
Signed-off-by: Dong Chen <dongluo.chen@docker.com>
parent 4d24256c19
commit ec3b00c484
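For orientation, here is a minimal sketch of what the renamed option looks like to a caller. It assumes the cluster.EngineOpts and cluster.NewEngine API shown in the test hunks below; the import path, the address, and the overcommit argument (0) are illustrative assumptions, not part of this commit.

// Sketch only: illustrates the renamed FailureRetry option (formerly
// RefreshRetry). Import path and values are assumptions, not swarm docs.
package main

import (
	"time"

	"github.com/docker/swarm/cluster"
)

func main() {
	opts := &cluster.EngineOpts{
		RefreshMinInterval: 30 * time.Second,
		RefreshMaxInterval: 60 * time.Second,
		FailureRetry:       3, // was RefreshRetry before this commit
	}
	// Address and overcommit ratio (0) follow the test hunks below.
	engine := cluster.NewEngine("192.168.56.202:4444", 0, opts)
	_ = engine
}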
@@ -26,7 +26,7 @@ var (
 		flHosts,
 		flLeaderElection, flLeaderTTL, flManageAdvertise,
 		flTLS, flTLSCaCert, flTLSCert, flTLSKey, flTLSVerify,
-		flRefreshIntervalMin, flRefreshIntervalMax, flRefreshRetry,
+		flRefreshIntervalMin, flRefreshIntervalMax, flFailureRetry,
 		flHeartBeat,
 		flEnableCors,
 		flCluster, flDiscoveryOpt, flClusterOpt},
@@ -71,10 +71,10 @@ var (
 		Value: "60s",
 		Usage: "set engine refresh maximum interval",
 	}
-	flRefreshRetry = cli.IntFlag{
-		Name:  "engine-refresh-retry",
+	flFailureRetry = cli.IntFlag{
+		Name:  "engine-failure-retry",
 		Value: 3,
-		Usage: "set engine refresh retry count on failure",
+		Usage: "set engine failure retry count",
 	}
 	flEnableCors = cli.BoolFlag{
 		Name:  "api-enable-cors, cors",
@@ -240,14 +240,14 @@ func manage(c *cli.Context) {
 	if refreshMaxInterval < refreshMinInterval {
 		log.Fatal("max refresh interval cannot be less than min refresh interval")
 	}
-	refreshRetry := c.Int("engine-refresh-retry")
-	if refreshRetry <= 0 {
-		log.Fatal("invalid refresh retry count")
+	failureRetry := c.Int("engine-failure-retry")
+	if failureRetry <= 0 {
+		log.Fatal("invalid failure retry count")
 	}
 	engineOpts := &cluster.EngineOpts{
 		RefreshMinInterval: refreshMinInterval,
 		RefreshMaxInterval: refreshMaxInterval,
-		RefreshRetry:       refreshRetry,
+		FailureRetry:       failureRetry,
 	}

 	uri := getDiscovery(c)
@@ -24,9 +24,6 @@ const (

 	// Minimum docker engine version supported by swarm.
 	minSupportedVersion = version.Version("1.6.0")
-
-	// Engine failureCount threshold
-	engineFailureCountThreshold = 3
 )

 // delayer offers a simple API to random delay within a given time range.
@@ -62,7 +59,7 @@ func (d *delayer) Wait() <-chan time.Time {
 type EngineOpts struct {
 	RefreshMinInterval time.Duration
 	RefreshMaxInterval time.Duration
-	RefreshRetry       int
+	FailureRetry       int
 }

 // Engine represents a docker engine
@@ -86,7 +83,7 @@ type Engine struct {
 	client          dockerclient.Client
 	eventHandler    EventHandler
 	healthy         bool
-	failureCount    int64
+	failureCount    int
 	overcommitRatio int64
 	opts            *EngineOpts
 }
@@ -188,6 +185,17 @@ func (e *Engine) IsHealthy() bool {
 	return e.healthy
 }

+// SetHealthy sets engine healthy state
+func (e *Engine) SetHealthy(state bool) {
+	e.Lock()
+	e.healthy = state
+	// if engine is healthy, clear failureCount
+	if state {
+		e.failureCount = 0
+	}
+	e.Unlock()
+}
+
 // Status returns the health status of the Engine: Healthy or Unhealthy
 func (e *Engine) Status() string {
 	if e.healthy {
@@ -200,23 +208,12 @@ func (e *Engine) Status() string {
 func (e *Engine) IncFailureCount() {
 	e.Lock()
 	e.failureCount++
-	if e.healthy && e.failureCount >= engineFailureCountThreshold {
+	if e.healthy && e.failureCount >= e.opts.FailureRetry {
 		e.healthy = false
 	}
 	e.Unlock()
 }
-
-// SetEngineHealth sets engine healthy state
-func (e *Engine) SetEngineHealth(state bool) {
-	e.Lock()
-	e.healthy = state
-	// if engine is healthy, clear failureCount
-	if state {
-		e.failureCount = 0
-	}
-	e.Unlock()
-}

 // Gather engine specs (CPU, memory, constraints, ...).
 func (e *Engine) updateSpecs() error {
 	info, err := e.client.Info()
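The interaction between IncFailureCount, the configurable FailureRetry threshold, and the new SetHealthy reset can be seen in a short test-style sketch. This is not part of the commit: the test name and the FailureRetry value of 2 are hypothetical, while NewEngine, IsHealthy, IncFailureCount, and SetHealthy are the identifiers shown in the hunks above and below.

// Sketch (hypothetical test), assuming the cluster package API above:
// the engine turns unhealthy once failureCount reaches opts.FailureRetry,
// and SetHealthy(true) clears the counter again.
package cluster

import (
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
)

func TestFailureRetrySketch(t *testing.T) {
	opts := &EngineOpts{
		RefreshMinInterval: 30 * time.Second,
		RefreshMaxInterval: 60 * time.Second,
		FailureRetry:       2, // illustrative value
	}
	e := NewEngine("test", 0, opts)

	e.IncFailureCount()
	assert.True(t, e.IsHealthy()) // 1 failure < FailureRetry
	e.IncFailureCount()
	assert.False(t, e.IsHealthy()) // threshold reached

	e.SetHealthy(true) // successful refresh path: healthy again, count cleared
	assert.True(t, e.IsHealthy())
}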
@@ -435,7 +432,6 @@ func (e *Engine) updateContainer(c dockerclient.Container, containers map[string
 }

 func (e *Engine) refreshLoop() {
-	failedAttempts := 0

 	for {
 		var err error
@@ -456,15 +452,15 @@ func (e *Engine) refreshLoop() {
 		}

 		if err != nil {
-			failedAttempts++
-			if failedAttempts >= e.opts.RefreshRetry && e.healthy {
+			e.failureCount++
+			if e.failureCount >= e.opts.FailureRetry && e.healthy {
 				e.emitEvent("engine_disconnect")
-				e.SetEngineHealth(false)
-				log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Errorf("Flagging engine as dead. Updated state failed %d times: %v", failedAttempts, err)
+				e.SetHealthy(false)
+				log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Errorf("Flagging engine as dead. Updated state failed %d times: %v", e.failureCount, err)
 			}
 		} else {
 			if !e.healthy {
-				log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Infof("Engine came back to life after %d retries. Hooray!", failedAttempts)
+				log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Infof("Engine came back to life after %d retries. Hooray!", e.failureCount)
 				if err := e.updateSpecs(); err != nil {
 					log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Errorf("Update engine specs failed: %v", err)
 					continue
@@ -473,8 +469,7 @@ func (e *Engine) refreshLoop() {
 				e.client.StartMonitorEvents(e.handler, nil)
 				e.emitEvent("engine_reconnect")
 			}
-			e.SetEngineHealth(true)
-			failedAttempts = 0
+			e.SetHealthy(true)
 		}
 	}
 }
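Taken together, the refreshLoop changes drop the local failedAttempts counter and use the engine's shared failureCount, so refresh failures and explicit IncFailureCount calls feed the same FailureRetry threshold. Below is a self-contained sketch of that detection pattern; all names are hypothetical and it is not the swarm API.

// Standalone sketch of the failure-detection pattern used above:
// count consecutive failures, flag unhealthy at a configured threshold,
// and reset on success. Names here are hypothetical.
package main

import (
	"errors"
	"fmt"
	"math/rand"
)

type monitor struct {
	failureRetry int
	failureCount int
	healthy      bool
}

func (m *monitor) observe(err error) {
	if err != nil {
		m.failureCount++
		if m.healthy && m.failureCount >= m.failureRetry {
			m.healthy = false
			fmt.Printf("flagging as dead after %d failures: %v\n", m.failureCount, err)
		}
		return
	}
	if !m.healthy {
		fmt.Printf("back to life after %d failures\n", m.failureCount)
	}
	// Mirror SetHealthy(true): mark healthy and clear the counter.
	m.healthy = true
	m.failureCount = 0
}

func main() {
	m := &monitor{failureRetry: 3, healthy: true}
	for i := 0; i < 10; i++ {
		var err error
		if rand.Intn(3) == 0 { // simulate an occasional failed refresh
			err = errors.New("refresh failed")
		}
		m.observe(err)
	}
}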
@@ -34,13 +34,13 @@ var (
 	engOpts = &EngineOpts{
 		RefreshMinInterval: time.Duration(30) * time.Second,
 		RefreshMaxInterval: time.Duration(60) * time.Second,
-		RefreshRetry:       3,
+		FailureRetry:       3,
 	}
 )

 func TestEngineFailureCount(t *testing.T) {
 	engine := NewEngine("test", 0, engOpts)
-	for i := 0; i < engineFailureCountThreshold; i++ {
+	for i := 0; i < engine.opts.FailureRetry; i++ {
 		assert.True(t, engine.IsHealthy())
 		engine.IncFailureCount()
 	}
@@ -13,7 +13,7 @@ func createAgent(t *testing.T, ID string, containers ...*cluster.Container) *age
 	engOpts := &cluster.EngineOpts{
 		RefreshMinInterval: time.Duration(30) * time.Second,
 		RefreshMaxInterval: time.Duration(60) * time.Second,
-		RefreshRetry:       3,
+		FailureRetry:       3,
 	}
 	engine := cluster.NewEngine(ID, 0, engOpts)
 	engine.Name = ID
@@ -43,7 +43,7 @@ var (
 	engOpts = &cluster.EngineOpts{
 		RefreshMinInterval: time.Duration(30) * time.Second,
 		RefreshMaxInterval: time.Duration(60) * time.Second,
-		RefreshRetry:       3,
+		FailureRetry:       3,
 	}
 )

@@ -14,7 +14,7 @@ load helpers
 	[[ "${output}" == *"max refresh interval cannot be less than min refresh interval"* ]]

 	# engine refresh retry count
-	run swarm manage --engine-refresh-retry 0 --advertise 127.0.0.1:$SWARM_BASE_PORT 192.168.56.202:4444
+	run swarm manage --engine-failure-retry 0 --advertise 127.0.0.1:$SWARM_BASE_PORT 192.168.56.202:4444
 	[ "$status" -ne 0 ]
-	[[ "${output}" == *"invalid refresh retry count"* ]]
+	[[ "${output}" == *"invalid failure retry count"* ]]
 }