From 4d24256c19c890eb8a87fde4a99f50895c869856 Mon Sep 17 00:00:00 2001 From: Dong Chen Date: Tue, 8 Dec 2015 17:58:36 -0800 Subject: [PATCH 1/6] Use failureCount as a secondary health indicator. Signed-off-by: Dong Chen --- cluster/engine.go | 29 +++++++++++++++++++++++++++-- cluster/engine_test.go | 9 +++++++++ 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/cluster/engine.go b/cluster/engine.go index 45ed7bf4ac..1bba83fde5 100644 --- a/cluster/engine.go +++ b/cluster/engine.go @@ -24,6 +24,9 @@ const ( // Minimum docker engine version supported by swarm. minSupportedVersion = version.Version("1.6.0") + + // Engine failureCount threshold + engineFailureCountThreshold = 3 ) // delayer offers a simple API to random delay within a given time range. @@ -83,6 +86,7 @@ type Engine struct { client dockerclient.Client eventHandler EventHandler healthy bool + failureCount int64 overcommitRatio int64 opts *EngineOpts } @@ -192,6 +196,27 @@ func (e *Engine) Status() string { return "Unhealthy" } +// IncFailureCount increases engine's failure count, and set engine as unhealthy if threshold is crossed +func (e *Engine) IncFailureCount() { + e.Lock() + e.failureCount++ + if e.healthy && e.failureCount >= engineFailureCountThreshold { + e.healthy = false + } + e.Unlock() +} + +// SetEngineHealth sets engine healthy state +func (e *Engine) SetEngineHealth(state bool) { + e.Lock() + e.healthy = state + // if engine is healthy, clear failureCount + if state { + e.failureCount = 0 + } + e.Unlock() +} + // Gather engine specs (CPU, memory, constraints, ...). func (e *Engine) updateSpecs() error { info, err := e.client.Info() @@ -434,7 +459,7 @@ func (e *Engine) refreshLoop() { failedAttempts++ if failedAttempts >= e.opts.RefreshRetry && e.healthy { e.emitEvent("engine_disconnect") - e.healthy = false + e.SetEngineHealth(false) log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Errorf("Flagging engine as dead. Updated state failed %d times: %v", failedAttempts, err) } } else { @@ -448,7 +473,7 @@ func (e *Engine) refreshLoop() { e.client.StartMonitorEvents(e.handler, nil) e.emitEvent("engine_reconnect") } - e.healthy = true + e.SetEngineHealth(true) failedAttempts = 0 } } diff --git a/cluster/engine_test.go b/cluster/engine_test.go index 730acc78b2..31941142e1 100644 --- a/cluster/engine_test.go +++ b/cluster/engine_test.go @@ -38,6 +38,15 @@ var ( } ) +func TestEngineFailureCount(t *testing.T) { + engine := NewEngine("test", 0, engOpts) + for i := 0; i < engineFailureCountThreshold; i++ { + assert.True(t, engine.IsHealthy()) + engine.IncFailureCount() + } + assert.False(t, engine.IsHealthy()) +} + func TestEngineConnectionFailure(t *testing.T) { engine := NewEngine("test", 0, engOpts) assert.False(t, engine.isConnected()) From ec3b00c48426a045339eaccbf5f51e5bea5580c8 Mon Sep 17 00:00:00 2001 From: Dong Chen Date: Wed, 9 Dec 2015 17:14:56 -0800 Subject: [PATCH 2/6] Reorganize engine failure detection procedure. Change engine option 'RefreshRetry' to 'FailureRetry'. 
Signed-off-by: Dong Chen --- cli/commands.go | 2 +- cli/flags.go | 6 ++-- cli/manage.go | 8 ++--- cluster/engine.go | 45 +++++++++++++--------------- cluster/engine_test.go | 4 +-- cluster/mesos/cluster_test.go | 2 +- cluster/swarm/cluster_test.go | 2 +- test/integration/engine_options.bats | 4 +-- 8 files changed, 34 insertions(+), 39 deletions(-) diff --git a/cli/commands.go b/cli/commands.go index 73790935c9..6e9e6bb9f3 100644 --- a/cli/commands.go +++ b/cli/commands.go @@ -26,7 +26,7 @@ var ( flHosts, flLeaderElection, flLeaderTTL, flManageAdvertise, flTLS, flTLSCaCert, flTLSCert, flTLSKey, flTLSVerify, - flRefreshIntervalMin, flRefreshIntervalMax, flRefreshRetry, + flRefreshIntervalMin, flRefreshIntervalMax, flFailureRetry, flHeartBeat, flEnableCors, flCluster, flDiscoveryOpt, flClusterOpt}, diff --git a/cli/flags.go b/cli/flags.go index 6d783189e9..b4f646c935 100644 --- a/cli/flags.go +++ b/cli/flags.go @@ -71,10 +71,10 @@ var ( Value: "60s", Usage: "set engine refresh maximum interval", } - flRefreshRetry = cli.IntFlag{ - Name: "engine-refresh-retry", + flFailureRetry = cli.IntFlag{ + Name: "engine-failure-retry", Value: 3, - Usage: "set engine refresh retry count on failure", + Usage: "set engine failure retry count", } flEnableCors = cli.BoolFlag{ Name: "api-enable-cors, cors", diff --git a/cli/manage.go b/cli/manage.go index 260cc8bc67..a6ed522176 100644 --- a/cli/manage.go +++ b/cli/manage.go @@ -240,14 +240,14 @@ func manage(c *cli.Context) { if refreshMaxInterval < refreshMinInterval { log.Fatal("max refresh interval cannot be less than min refresh interval") } - refreshRetry := c.Int("engine-refresh-retry") - if refreshRetry <= 0 { - log.Fatal("invalid refresh retry count") + failureRetry := c.Int("engine-failure-retry") + if failureRetry <= 0 { + log.Fatal("invalid failure retry count") } engineOpts := &cluster.EngineOpts{ RefreshMinInterval: refreshMinInterval, RefreshMaxInterval: refreshMaxInterval, - RefreshRetry: refreshRetry, + FailureRetry: failureRetry, } uri := getDiscovery(c) diff --git a/cluster/engine.go b/cluster/engine.go index 1bba83fde5..7f8c01cb08 100644 --- a/cluster/engine.go +++ b/cluster/engine.go @@ -24,9 +24,6 @@ const ( // Minimum docker engine version supported by swarm. minSupportedVersion = version.Version("1.6.0") - - // Engine failureCount threshold - engineFailureCountThreshold = 3 ) // delayer offers a simple API to random delay within a given time range. 
@@ -62,7 +59,7 @@ func (d *delayer) Wait() <-chan time.Time { type EngineOpts struct { RefreshMinInterval time.Duration RefreshMaxInterval time.Duration - RefreshRetry int + FailureRetry int } // Engine represents a docker engine @@ -86,7 +83,7 @@ type Engine struct { client dockerclient.Client eventHandler EventHandler healthy bool - failureCount int64 + failureCount int overcommitRatio int64 opts *EngineOpts } @@ -188,6 +185,17 @@ func (e *Engine) IsHealthy() bool { return e.healthy } +// SetHealthy sets engine healthy state +func (e *Engine) SetHealthy(state bool) { + e.Lock() + e.healthy = state + // if engine is healthy, clear failureCount + if state { + e.failureCount = 0 + } + e.Unlock() +} + // Status returns the health status of the Engine: Healthy or Unhealthy func (e *Engine) Status() string { if e.healthy { @@ -200,23 +208,12 @@ func (e *Engine) Status() string { func (e *Engine) IncFailureCount() { e.Lock() e.failureCount++ - if e.healthy && e.failureCount >= engineFailureCountThreshold { + if e.healthy && e.failureCount >= e.opts.FailureRetry { e.healthy = false } e.Unlock() } -// SetEngineHealth sets engine healthy state -func (e *Engine) SetEngineHealth(state bool) { - e.Lock() - e.healthy = state - // if engine is healthy, clear failureCount - if state { - e.failureCount = 0 - } - e.Unlock() -} - // Gather engine specs (CPU, memory, constraints, ...). func (e *Engine) updateSpecs() error { info, err := e.client.Info() @@ -435,7 +432,6 @@ func (e *Engine) updateContainer(c dockerclient.Container, containers map[string } func (e *Engine) refreshLoop() { - failedAttempts := 0 for { var err error @@ -456,15 +452,15 @@ func (e *Engine) refreshLoop() { } if err != nil { - failedAttempts++ - if failedAttempts >= e.opts.RefreshRetry && e.healthy { + e.failureCount++ + if e.failureCount >= e.opts.FailureRetry && e.healthy { e.emitEvent("engine_disconnect") - e.SetEngineHealth(false) - log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Errorf("Flagging engine as dead. Updated state failed %d times: %v", failedAttempts, err) + e.SetHealthy(false) + log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Errorf("Flagging engine as dead. Updated state failed %d times: %v", e.failureCount, err) } } else { if !e.healthy { - log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Infof("Engine came back to life after %d retries. Hooray!", failedAttempts) + log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Infof("Engine came back to life after %d retries. 
Hooray!", e.failureCount) if err := e.updateSpecs(); err != nil { log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Errorf("Update engine specs failed: %v", err) continue @@ -473,8 +469,7 @@ func (e *Engine) refreshLoop() { e.client.StartMonitorEvents(e.handler, nil) e.emitEvent("engine_reconnect") } - e.SetEngineHealth(true) - failedAttempts = 0 + e.SetHealthy(true) } } } diff --git a/cluster/engine_test.go b/cluster/engine_test.go index 31941142e1..6552c6eba7 100644 --- a/cluster/engine_test.go +++ b/cluster/engine_test.go @@ -34,13 +34,13 @@ var ( engOpts = &EngineOpts{ RefreshMinInterval: time.Duration(30) * time.Second, RefreshMaxInterval: time.Duration(60) * time.Second, - RefreshRetry: 3, + FailureRetry: 3, } ) func TestEngineFailureCount(t *testing.T) { engine := NewEngine("test", 0, engOpts) - for i := 0; i < engineFailureCountThreshold; i++ { + for i := 0; i < engine.opts.FailureRetry; i++ { assert.True(t, engine.IsHealthy()) engine.IncFailureCount() } diff --git a/cluster/mesos/cluster_test.go b/cluster/mesos/cluster_test.go index c05bf66d96..1ce8116eb5 100644 --- a/cluster/mesos/cluster_test.go +++ b/cluster/mesos/cluster_test.go @@ -13,7 +13,7 @@ func createAgent(t *testing.T, ID string, containers ...*cluster.Container) *age engOpts := &cluster.EngineOpts{ RefreshMinInterval: time.Duration(30) * time.Second, RefreshMaxInterval: time.Duration(60) * time.Second, - RefreshRetry: 3, + FailureRetry: 3, } engine := cluster.NewEngine(ID, 0, engOpts) engine.Name = ID diff --git a/cluster/swarm/cluster_test.go b/cluster/swarm/cluster_test.go index 1d3c04e19a..bb02d2918d 100644 --- a/cluster/swarm/cluster_test.go +++ b/cluster/swarm/cluster_test.go @@ -43,7 +43,7 @@ var ( engOpts = &cluster.EngineOpts{ RefreshMinInterval: time.Duration(30) * time.Second, RefreshMaxInterval: time.Duration(60) * time.Second, - RefreshRetry: 3, + FailureRetry: 3, } ) diff --git a/test/integration/engine_options.bats b/test/integration/engine_options.bats index c2347489ab..aef39fde29 100644 --- a/test/integration/engine_options.bats +++ b/test/integration/engine_options.bats @@ -14,7 +14,7 @@ load helpers [[ "${output}" == *"max refresh interval cannot be less than min refresh interval"* ]] # engine refresh retry count - run swarm manage --engine-refresh-retry 0 --advertise 127.0.0.1:$SWARM_BASE_PORT 192.168.56.202:4444 + run swarm manage --engine-failure-retry 0 --advertise 127.0.0.1:$SWARM_BASE_PORT 192.168.56.202:4444 [ "$status" -ne 0 ] - [[ "${output}" == *"invalid refresh retry count"* ]] + [[ "${output}" == *"invalid failure retry count"* ]] } From 9bc6c35321110a37f69d22b8bb8450397f818bed Mon Sep 17 00:00:00 2001 From: Dong Chen Date: Thu, 10 Dec 2015 12:21:46 -0800 Subject: [PATCH 3/6] Use engine connection error to fail engine fast. 
Signed-off-by: Dong Chen --- cluster/engine.go | 65 +++++++++++++++++++++++++++++++++--------- cluster/engine_test.go | 2 +- 2 files changed, 53 insertions(+), 14 deletions(-) diff --git a/cluster/engine.go b/cluster/engine.go index 7f8c01cb08..0e3e3920a5 100644 --- a/cluster/engine.go +++ b/cluster/engine.go @@ -185,8 +185,8 @@ func (e *Engine) IsHealthy() bool { return e.healthy } -// SetHealthy sets engine healthy state -func (e *Engine) SetHealthy(state bool) { +// setHealthy sets engine healthy state +func (e *Engine) setHealthy(state bool) { e.Lock() e.healthy = state // if engine is healthy, clear failureCount @@ -204,8 +204,8 @@ func (e *Engine) Status() string { return "Unhealthy" } -// IncFailureCount increases engine's failure count, and set engine as unhealthy if threshold is crossed -func (e *Engine) IncFailureCount() { +// incFailureCount increases engine's failure count, and set engine as unhealthy if threshold is crossed +func (e *Engine) incFailureCount() { e.Lock() e.failureCount++ if e.healthy && e.failureCount >= e.opts.FailureRetry { @@ -214,9 +214,24 @@ func (e *Engine) IncFailureCount() { e.Unlock() } +// checkConnectionErr checks error from dockerclient response and adjust engine healthy indicators +func (e *Engine) checkConnectionErr(err error) { + if err == nil { + e.setHealthy(true) + return + } + // will change to err == dockerclient.ErrConnectionRefused when dockerclient update is merged + if strings.Contains(err.Error(), "connection refused") || + strings.Contains(err.Error(), "Cannot connect to the docker engine endpoint") { + e.incFailureCount() + } + // other errors may be ambiguous. let refresh loop decide healthy or not. +} + // Gather engine specs (CPU, memory, constraints, ...). func (e *Engine) updateSpecs() error { info, err := e.client.Info() + e.checkConnectionErr(err) if err != nil { return err } @@ -226,6 +241,7 @@ func (e *Engine) updateSpecs() error { } v, err := e.client.Version() + e.checkConnectionErr(err) if err != nil { return err } @@ -258,6 +274,7 @@ func (e *Engine) updateSpecs() error { // RemoveImage deletes an image from the engine. func (e *Engine) RemoveImage(image *Image, name string, force bool) ([]*dockerclient.ImageDelete, error) { array, err := e.client.RemoveImage(name, force) + e.checkConnectionErr(err) e.RefreshImages() return array, err @@ -266,13 +283,16 @@ func (e *Engine) RemoveImage(image *Image, name string, force bool) ([]*dockercl // RemoveNetwork deletes a network from the engine. func (e *Engine) RemoveNetwork(network *Network) error { err := e.client.RemoveNetwork(network.ID) + e.checkConnectionErr(err) e.RefreshNetworks() return err } // RemoveVolume deletes a volume from the engine. func (e *Engine) RemoveVolume(name string) error { - if err := e.client.RemoveVolume(name); err != nil { + err := e.client.RemoveVolume(name) + e.checkConnectionErr(err) + if err != nil { return err } @@ -288,6 +308,7 @@ func (e *Engine) RemoveVolume(name string) error { // RefreshImages refreshes the list of images on the engine. func (e *Engine) RefreshImages() error { images, err := e.client.ListImages(true) + e.checkConnectionErr(err) if err != nil { return err } @@ -303,6 +324,7 @@ func (e *Engine) RefreshImages() error { // RefreshNetworks refreshes the list of networks on the engine. 
func (e *Engine) RefreshNetworks() error { networks, err := e.client.ListNetworks("") + e.checkConnectionErr(err) if err != nil { return err } @@ -318,6 +340,7 @@ func (e *Engine) RefreshNetworks() error { // RefreshVolumes refreshes the list of volumes on the engine. func (e *Engine) RefreshVolumes() error { volumes, err := e.client.ListVolumes() + e.checkConnectionErr(err) if err != nil { return err } @@ -335,6 +358,7 @@ func (e *Engine) RefreshVolumes() error { // FIXME: unexport this method after mesos scheduler stops using it directly func (e *Engine) RefreshContainers(full bool) error { containers, err := e.client.ListContainers(true, false, "") + e.checkConnectionErr(err) if err != nil { return err } @@ -361,6 +385,7 @@ func (e *Engine) RefreshContainers(full bool) error { // the container will be inspected. func (e *Engine) refreshContainer(ID string, full bool) (*Container, error) { containers, err := e.client.ListContainers(true, false, fmt.Sprintf("{%q:[%q]}", "id", ID)) + e.checkConnectionErr(err) if err != nil { return nil, err } @@ -407,6 +432,7 @@ func (e *Engine) updateContainer(c dockerclient.Container, containers map[string // Update ContainerInfo. if full { info, err := e.client.InspectContainer(c.Id) + e.checkConnectionErr(err) if err != nil { return nil, err } @@ -455,7 +481,7 @@ func (e *Engine) refreshLoop() { e.failureCount++ if e.failureCount >= e.opts.FailureRetry && e.healthy { e.emitEvent("engine_disconnect") - e.SetHealthy(false) + e.setHealthy(false) log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Errorf("Flagging engine as dead. Updated state failed %d times: %v", e.failureCount, err) } } else { @@ -469,7 +495,7 @@ func (e *Engine) refreshLoop() { e.client.StartMonitorEvents(e.handler, nil) e.emitEvent("engine_reconnect") } - e.SetHealthy(true) + e.setHealthy(true) } } } @@ -574,7 +600,9 @@ func (e *Engine) Create(config *ContainerConfig, name string, pullImage bool, au // RemoveContainer a container from the engine. 
func (e *Engine) RemoveContainer(container *Container, force, volumes bool) error { - if err := e.client.RemoveContainer(container.Id, force, volumes); err != nil { + err := e.client.RemoveContainer(container.Id, force, volumes) + e.checkConnectionErr(err) + if err != nil { return err } @@ -590,6 +618,7 @@ func (e *Engine) RemoveContainer(container *Container, force, volumes bool) erro // CreateNetwork creates a network in the engine func (e *Engine) CreateNetwork(request *dockerclient.NetworkCreate) (*dockerclient.NetworkCreateResponse, error) { response, err := e.client.CreateNetwork(request) + e.checkConnectionErr(err) e.RefreshNetworks() @@ -601,6 +630,7 @@ func (e *Engine) CreateVolume(request *dockerclient.VolumeCreateRequest) (*Volum volume, err := e.client.CreateVolume(request) e.RefreshVolumes() + e.checkConnectionErr(err) if err != nil { return nil, err @@ -614,7 +644,9 @@ func (e *Engine) Pull(image string, authConfig *dockerclient.AuthConfig) error { if !strings.Contains(image, ":") { image = image + ":latest" } - if err := e.client.PullImage(image, authConfig); err != nil { + err := e.client.PullImage(image, authConfig) + e.checkConnectionErr(err) + if err != nil { return err } @@ -626,7 +658,9 @@ func (e *Engine) Pull(image string, authConfig *dockerclient.AuthConfig) error { // Load an image on the engine func (e *Engine) Load(reader io.Reader) error { - if err := e.client.LoadImage(reader); err != nil { + err := e.client.LoadImage(reader) + e.checkConnectionErr(err) + if err != nil { return err } @@ -638,7 +672,9 @@ func (e *Engine) Load(reader io.Reader) error { // Import image func (e *Engine) Import(source string, repository string, tag string, imageReader io.Reader) error { - if _, err := e.client.ImportImage(source, repository, tag, imageReader); err != nil { + _, err := e.client.ImportImage(source, repository, tag, imageReader) + e.checkConnectionErr(err) + if err != nil { return err } @@ -794,6 +830,7 @@ func (e *Engine) cleanupContainers() { func (e *Engine) RenameContainer(container *Container, newName string) error { // send rename request err := e.client.RenameContainer(container.Id, newName) + e.checkConnectionErr(err) if err != nil { return err } @@ -805,14 +842,16 @@ func (e *Engine) RenameContainer(container *Container, newName string) error { // BuildImage build an image func (e *Engine) BuildImage(buildImage *dockerclient.BuildImage) (io.ReadCloser, error) { - - return e.client.BuildImage(buildImage) + reader, err := e.client.BuildImage(buildImage) + e.checkConnectionErr(err) + return reader, err } // TagImage tag an image func (e *Engine) TagImage(IDOrName string, repo string, tag string, force bool) error { // send tag request to docker engine err := e.client.TagImage(IDOrName, repo, tag, force) + e.checkConnectionErr(err) if err != nil { return err } diff --git a/cluster/engine_test.go b/cluster/engine_test.go index 6552c6eba7..07635360be 100644 --- a/cluster/engine_test.go +++ b/cluster/engine_test.go @@ -42,7 +42,7 @@ func TestEngineFailureCount(t *testing.T) { engine := NewEngine("test", 0, engOpts) for i := 0; i < engine.opts.FailureRetry; i++ { assert.True(t, engine.IsHealthy()) - engine.IncFailureCount() + engine.incFailureCount() } assert.False(t, engine.IsHealthy()) } From 82d16226e647c52abb316c55ab98697e80114262 Mon Sep 17 00:00:00 2001 From: Dong Chen Date: Fri, 11 Dec 2015 14:18:59 -0800 Subject: [PATCH 4/6] rebase and update test case. 
Signed-off-by: Dong Chen --- test/integration/api/logs.bats | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/integration/api/logs.bats b/test/integration/api/logs.bats index 38672c8b81..903d37f017 100644 --- a/test/integration/api/logs.bats +++ b/test/integration/api/logs.bats @@ -30,7 +30,7 @@ function teardown() { @test "docker logs unhealthy node" { start_docker_with_busybox 1 - swarm_manage --engine-refresh-min-interval=1s --engine-refresh-max-interval=1s --engine-refresh-retry=1 ${HOSTS[0]} + swarm_manage --engine-refresh-min-interval=1s --engine-refresh-max-interval=1s --engine-failure-retry=1 ${HOSTS[0]} # run a container with echo command docker_swarm run -d --name test_container busybox /bin/sh -c "echo hello world; echo hello docker; echo hello swarm" From d80a32b3df1a0f3db43c3541c2a39d7e9d04c271 Mon Sep 17 00:00:00 2001 From: Dong Chen Date: Fri, 11 Dec 2015 17:43:45 -0800 Subject: [PATCH 5/6] Explicitly deprecate --engine-refresh-retry. Signed-off-by: Dong Chen --- cli/commands.go | 2 +- cli/flags.go | 5 +++++ cli/manage.go | 5 +++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/cli/commands.go b/cli/commands.go index 6e9e6bb9f3..a1e8263e3f 100644 --- a/cli/commands.go +++ b/cli/commands.go @@ -26,7 +26,7 @@ var ( flHosts, flLeaderElection, flLeaderTTL, flManageAdvertise, flTLS, flTLSCaCert, flTLSCert, flTLSKey, flTLSVerify, - flRefreshIntervalMin, flRefreshIntervalMax, flFailureRetry, + flRefreshIntervalMin, flRefreshIntervalMax, flFailureRetry, flRefreshRetry, flHeartBeat, flEnableCors, flCluster, flDiscoveryOpt, flClusterOpt}, diff --git a/cli/flags.go b/cli/flags.go index b4f646c935..b3890ccd8f 100644 --- a/cli/flags.go +++ b/cli/flags.go @@ -71,6 +71,11 @@ var ( Value: "60s", Usage: "set engine refresh maximum interval", } + flRefreshRetry = cli.IntFlag{ + Name: "engine-refresh-retry", + Value: 3, + Usage: "deprecated; replaced by --engine-failure-retry", + } flFailureRetry = cli.IntFlag{ Name: "engine-failure-retry", Value: 3, diff --git a/cli/manage.go b/cli/manage.go index a6ed522176..8bc4bbbdf4 100644 --- a/cli/manage.go +++ b/cli/manage.go @@ -240,6 +240,11 @@ func manage(c *cli.Context) { if refreshMaxInterval < refreshMinInterval { log.Fatal("max refresh interval cannot be less than min refresh interval") } + // engine-refresh-retry is deprecated + refreshRetry := c.Int("engine-refresh-retry") + if refreshRetry != 3 { + log.Fatal("--engine-refresh-retry is deprecated. Use --engine-failure-retry") + } failureRetry := c.Int("engine-failure-retry") if failureRetry <= 0 { log.Fatal("invalid failure retry count") From 02553d07279d9cc644946d812868abb519db8e34 Mon Sep 17 00:00:00 2001 From: Dong Chen Date: Tue, 15 Dec 2015 15:29:16 -0800 Subject: [PATCH 6/6] Cover connection failure error reported by dockerclient and by proxy cases. 
Signed-off-by: Dong Chen --- api/handlers.go | 38 ++++++++++++++++++------- cluster/engine.go | 70 +++++++++++++++++++++++++++++------------------ 2 files changed, 71 insertions(+), 37 deletions(-) diff --git a/api/handlers.go b/api/handlers.go index 3e341b0615..b453472372 100644 --- a/api/handlers.go +++ b/api/handlers.go @@ -386,6 +386,7 @@ func getContainerJSON(c *context, w http.ResponseWriter, r *http.Request) { client, scheme := newClientAndScheme(c.tlsConfig) resp, err := client.Get(scheme + "://" + container.Engine.Addr + "/containers/" + container.Id + "/json") + container.Engine.CheckConnectionErr(err) if err != nil { httpError(w, err.Error(), http.StatusInternalServerError) return @@ -670,6 +671,7 @@ func postContainersExec(c *context, w http.ResponseWriter, r *http.Request) { client, scheme := newClientAndScheme(c.tlsConfig) resp, err := client.Post(scheme+"://"+container.Engine.Addr+"/containers/"+container.Id+"/exec", "application/json", r.Body) + container.Engine.CheckConnectionErr(err) if err != nil { httpError(w, err.Error(), http.StatusInternalServerError) return @@ -796,7 +798,8 @@ func proxyNetwork(c *context, w http.ResponseWriter, r *http.Request) { func proxyVolume(c *context, w http.ResponseWriter, r *http.Request) { var name = mux.Vars(r)["volumename"] if volume := c.cluster.Volume(name); volume != nil { - proxy(c.tlsConfig, volume.Engine.Addr, w, r) + err := proxy(c.tlsConfig, volume.Engine.Addr, w, r) + volume.Engine.CheckConnectionErr(err) return } httpError(w, fmt.Sprintf("No such volume: %s", name), http.StatusNotFound) @@ -838,7 +841,9 @@ func proxyNetworkContainerOperation(c *context, w http.ResponseWriter, r *http.R } // request is forwarded to the container's address - if err := proxyAsync(c.tlsConfig, container.Engine.Addr, w, r, cb); err != nil { + err := proxyAsync(c.tlsConfig, container.Engine.Addr, w, r, cb) + container.Engine.CheckConnectionErr(err) + if err != nil { httpError(w, err.Error(), http.StatusNotFound) } } @@ -856,7 +861,9 @@ func proxyContainer(c *context, w http.ResponseWriter, r *http.Request) { r.URL.Path = strings.Replace(r.URL.Path, name, container.Id, 1) } - if err := proxy(c.tlsConfig, container.Engine.Addr, w, r); err != nil { + err = proxy(c.tlsConfig, container.Engine.Addr, w, r) + container.Engine.CheckConnectionErr(err) + if err != nil { httpError(w, err.Error(), http.StatusInternalServerError) } } @@ -879,7 +886,9 @@ func proxyContainerAndForceRefresh(c *context, w http.ResponseWriter, r *http.Re container.Refresh() } - if err := proxyAsync(c.tlsConfig, container.Engine.Addr, w, r, cb); err != nil { + err = proxyAsync(c.tlsConfig, container.Engine.Addr, w, r, cb) + container.Engine.CheckConnectionErr(err) + if err != nil { httpError(w, err.Error(), http.StatusInternalServerError) } } @@ -889,7 +898,8 @@ func proxyImage(c *context, w http.ResponseWriter, r *http.Request) { name := mux.Vars(r)["name"] if image := c.cluster.Image(name); image != nil { - proxy(c.tlsConfig, image.Engine.Addr, w, r) + err := proxy(c.tlsConfig, image.Engine.Addr, w, r) + image.Engine.CheckConnectionErr(err) return } httpError(w, fmt.Sprintf("No such image: %s", name), http.StatusNotFound) @@ -902,7 +912,8 @@ func proxyImageGet(c *context, w http.ResponseWriter, r *http.Request) { for _, image := range c.cluster.Images() { if len(strings.SplitN(name, ":", 2)) == 2 && image.Match(name, true) || len(strings.SplitN(name, ":", 2)) == 1 && image.Match(name, false) { - proxy(c.tlsConfig, image.Engine.Addr, w, r) + err := proxy(c.tlsConfig, 
image.Engine.Addr, w, r) + image.Engine.CheckConnectionErr(err) return } } @@ -925,7 +936,8 @@ func proxyImagePush(c *context, w http.ResponseWriter, r *http.Request) { for _, image := range c.cluster.Images() { if tag != "" && image.Match(name, true) || tag == "" && image.Match(name, false) { - proxy(c.tlsConfig, image.Engine.Addr, w, r) + err := proxy(c.tlsConfig, image.Engine.Addr, w, r) + image.Engine.CheckConnectionErr(err) return } } @@ -969,7 +981,9 @@ func proxyRandom(c *context, w http.ResponseWriter, r *http.Request) { return } - if err := proxy(c.tlsConfig, engine.Addr, w, r); err != nil { + err = proxy(c.tlsConfig, engine.Addr, w, r) + engine.CheckConnectionErr(err) + if err != nil { httpError(w, err.Error(), http.StatusInternalServerError) } @@ -1003,7 +1017,9 @@ func postCommit(c *context, w http.ResponseWriter, r *http.Request) { } // proxy commit request to the right node - if err := proxyAsync(c.tlsConfig, container.Engine.Addr, w, r, cb); err != nil { + err = proxyAsync(c.tlsConfig, container.Engine.Addr, w, r, cb) + container.Engine.CheckConnectionErr(err) + if err != nil { httpError(w, err.Error(), http.StatusInternalServerError) } } @@ -1093,7 +1109,9 @@ func proxyHijack(c *context, w http.ResponseWriter, r *http.Request) { r.URL.Path = strings.Replace(r.URL.Path, name, container.Id, 1) } - if err := hijack(c.tlsConfig, container.Engine.Addr, w, r); err != nil { + err = hijack(c.tlsConfig, container.Engine.Addr, w, r) + container.Engine.CheckConnectionErr(err) + if err != nil { httpError(w, err.Error(), http.StatusInternalServerError) } } diff --git a/cluster/engine.go b/cluster/engine.go index 0e3e3920a5..b62efa822d 100644 --- a/cluster/engine.go +++ b/cluster/engine.go @@ -214,16 +214,26 @@ func (e *Engine) incFailureCount() { e.Unlock() } -// checkConnectionErr checks error from dockerclient response and adjust engine healthy indicators -func (e *Engine) checkConnectionErr(err error) { +// CheckConnectionErr checks error from client response and adjust engine healthy indicators +func (e *Engine) CheckConnectionErr(err error) { if err == nil { e.setHealthy(true) return } - // will change to err == dockerclient.ErrConnectionRefused when dockerclient update is merged - if strings.Contains(err.Error(), "connection refused") || - strings.Contains(err.Error(), "Cannot connect to the docker engine endpoint") { + + // dockerclient defines ErrConnectionRefused error. but if http client is from swarm, it's not using + // dockerclient. We need string matching for these cases. Remove the first character to deal with + // case sensitive issue + if err == dockerclient.ErrConnectionRefused || + strings.Contains(err.Error(), "onnection refused") || + strings.Contains(err.Error(), "annot connect to the docker engine endpoint") { + // each connection refused instance may increase failure count so + // engine can fail fast. Short engine freeze or network failure may result + // in engine marked as unhealthy. If this causes unnecessary failure, engine + // can track last error time. Only increase failure count if last error is + // not too recent, e.g., last error is at least 1 seconds ago. e.incFailureCount() + return } // other errors may be ambiguous. let refresh loop decide healthy or not. } @@ -231,7 +241,7 @@ func (e *Engine) checkConnectionErr(err error) { // Gather engine specs (CPU, memory, constraints, ...). 
func (e *Engine) updateSpecs() error { info, err := e.client.Info() - e.checkConnectionErr(err) + e.CheckConnectionErr(err) if err != nil { return err } @@ -241,7 +251,7 @@ func (e *Engine) updateSpecs() error { } v, err := e.client.Version() - e.checkConnectionErr(err) + e.CheckConnectionErr(err) if err != nil { return err } @@ -274,7 +284,7 @@ func (e *Engine) updateSpecs() error { // RemoveImage deletes an image from the engine. func (e *Engine) RemoveImage(image *Image, name string, force bool) ([]*dockerclient.ImageDelete, error) { array, err := e.client.RemoveImage(name, force) - e.checkConnectionErr(err) + e.CheckConnectionErr(err) e.RefreshImages() return array, err @@ -283,7 +293,7 @@ func (e *Engine) RemoveImage(image *Image, name string, force bool) ([]*dockercl // RemoveNetwork deletes a network from the engine. func (e *Engine) RemoveNetwork(network *Network) error { err := e.client.RemoveNetwork(network.ID) - e.checkConnectionErr(err) + e.CheckConnectionErr(err) e.RefreshNetworks() return err } @@ -291,7 +301,7 @@ func (e *Engine) RemoveNetwork(network *Network) error { // RemoveVolume deletes a volume from the engine. func (e *Engine) RemoveVolume(name string) error { err := e.client.RemoveVolume(name) - e.checkConnectionErr(err) + e.CheckConnectionErr(err) if err != nil { return err } @@ -308,7 +318,7 @@ func (e *Engine) RemoveVolume(name string) error { // RefreshImages refreshes the list of images on the engine. func (e *Engine) RefreshImages() error { images, err := e.client.ListImages(true) - e.checkConnectionErr(err) + e.CheckConnectionErr(err) if err != nil { return err } @@ -324,7 +334,7 @@ func (e *Engine) RefreshImages() error { // RefreshNetworks refreshes the list of networks on the engine. func (e *Engine) RefreshNetworks() error { networks, err := e.client.ListNetworks("") - e.checkConnectionErr(err) + e.CheckConnectionErr(err) if err != nil { return err } @@ -340,7 +350,7 @@ func (e *Engine) RefreshNetworks() error { // RefreshVolumes refreshes the list of volumes on the engine. func (e *Engine) RefreshVolumes() error { volumes, err := e.client.ListVolumes() - e.checkConnectionErr(err) + e.CheckConnectionErr(err) if err != nil { return err } @@ -358,7 +368,9 @@ func (e *Engine) RefreshVolumes() error { // FIXME: unexport this method after mesos scheduler stops using it directly func (e *Engine) RefreshContainers(full bool) error { containers, err := e.client.ListContainers(true, false, "") - e.checkConnectionErr(err) + // e.CheckConnectionErr(err) is not appropriate here because refresh loop uses + // RefreshContainers function to fail/recover an engine. Adding CheckConnectionErr + // here would result in double count if err != nil { return err } @@ -385,7 +397,7 @@ func (e *Engine) RefreshContainers(full bool) error { // the container will be inspected. func (e *Engine) refreshContainer(ID string, full bool) (*Container, error) { containers, err := e.client.ListContainers(true, false, fmt.Sprintf("{%q:[%q]}", "id", ID)) - e.checkConnectionErr(err) + e.CheckConnectionErr(err) if err != nil { return nil, err } @@ -432,7 +444,7 @@ func (e *Engine) updateContainer(c dockerclient.Container, containers map[string // Update ContainerInfo. 
if full { info, err := e.client.InspectContainer(c.Id) - e.checkConnectionErr(err) + e.CheckConnectionErr(err) if err != nil { return nil, err } @@ -567,7 +579,9 @@ func (e *Engine) Create(config *ContainerConfig, name string, pullImage bool, au dockerConfig.CpuShares = int64(math.Ceil(float64(config.CpuShares*1024) / float64(e.Cpus))) dockerConfig.HostConfig.CpuShares = dockerConfig.CpuShares - if id, err = client.CreateContainer(&dockerConfig, name, nil); err != nil { + id, err = client.CreateContainer(&dockerConfig, name, nil) + e.CheckConnectionErr(err) + if err != nil { // If the error is other than not found, abort immediately. if err != dockerclient.ErrImageNotFound || !pullImage { return nil, err @@ -577,7 +591,9 @@ func (e *Engine) Create(config *ContainerConfig, name string, pullImage bool, au return nil, err } // ...And try again. - if id, err = client.CreateContainer(&dockerConfig, name, nil); err != nil { + id, err = client.CreateContainer(&dockerConfig, name, nil) + e.CheckConnectionErr(err) + if err != nil { return nil, err } } @@ -601,7 +617,7 @@ func (e *Engine) Create(config *ContainerConfig, name string, pullImage bool, au // RemoveContainer a container from the engine. func (e *Engine) RemoveContainer(container *Container, force, volumes bool) error { err := e.client.RemoveContainer(container.Id, force, volumes) - e.checkConnectionErr(err) + e.CheckConnectionErr(err) if err != nil { return err } @@ -618,7 +634,7 @@ func (e *Engine) RemoveContainer(container *Container, force, volumes bool) erro // CreateNetwork creates a network in the engine func (e *Engine) CreateNetwork(request *dockerclient.NetworkCreate) (*dockerclient.NetworkCreateResponse, error) { response, err := e.client.CreateNetwork(request) - e.checkConnectionErr(err) + e.CheckConnectionErr(err) e.RefreshNetworks() @@ -630,7 +646,7 @@ func (e *Engine) CreateVolume(request *dockerclient.VolumeCreateRequest) (*Volum volume, err := e.client.CreateVolume(request) e.RefreshVolumes() - e.checkConnectionErr(err) + e.CheckConnectionErr(err) if err != nil { return nil, err @@ -645,7 +661,7 @@ func (e *Engine) Pull(image string, authConfig *dockerclient.AuthConfig) error { image = image + ":latest" } err := e.client.PullImage(image, authConfig) - e.checkConnectionErr(err) + e.CheckConnectionErr(err) if err != nil { return err } @@ -659,7 +675,7 @@ func (e *Engine) Pull(image string, authConfig *dockerclient.AuthConfig) error { // Load an image on the engine func (e *Engine) Load(reader io.Reader) error { err := e.client.LoadImage(reader) - e.checkConnectionErr(err) + e.CheckConnectionErr(err) if err != nil { return err } @@ -673,7 +689,7 @@ func (e *Engine) Load(reader io.Reader) error { // Import image func (e *Engine) Import(source string, repository string, tag string, imageReader io.Reader) error { _, err := e.client.ImportImage(source, repository, tag, imageReader) - e.checkConnectionErr(err) + e.CheckConnectionErr(err) if err != nil { return err } @@ -830,7 +846,7 @@ func (e *Engine) cleanupContainers() { func (e *Engine) RenameContainer(container *Container, newName string) error { // send rename request err := e.client.RenameContainer(container.Id, newName) - e.checkConnectionErr(err) + e.CheckConnectionErr(err) if err != nil { return err } @@ -843,7 +859,7 @@ func (e *Engine) RenameContainer(container *Container, newName string) error { // BuildImage build an image func (e *Engine) BuildImage(buildImage *dockerclient.BuildImage) (io.ReadCloser, error) { reader, err := e.client.BuildImage(buildImage) - 
e.checkConnectionErr(err) + e.CheckConnectionErr(err) return reader, err } @@ -851,7 +867,7 @@ func (e *Engine) BuildImage(buildImage *dockerclient.BuildImage) (io.ReadCloser, func (e *Engine) TagImage(IDOrName string, repo string, tag string, force bool) error { // send tag request to docker engine err := e.client.TagImage(IDOrName, repo, tag, force) - e.checkConnectionErr(err) + e.CheckConnectionErr(err) if err != nil { return err }
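
For readers following the series, here is a minimal, self-contained Go sketch of the health-tracking pattern these six patches build up in cluster/engine.go: a mutex-guarded failure counter that flips the engine to unhealthy once the --engine-failure-retry threshold is crossed, is cleared again by any successful call, and is bumped immediately ("fail fast") when a client call returns a connection-refused error. This is an illustration under assumptions, not the swarm code itself: the names engineMonitor and newEngineMonitor are hypothetical stand-ins for Engine/NewEngine, and the errors.Is(err, syscall.ECONNREFUSED) check approximates the real CheckConnectionErr, which also compares against dockerclient.ErrConnectionRefused and matches case-tolerant substrings ("onnection refused").

    // Hypothetical sketch of the failure-count health pattern from this series.
    package main

    import (
    	"errors"
    	"fmt"
    	"strings"
    	"sync"
    	"syscall"
    )

    // engineMonitor mimics the Engine fields the series adds: a failure
    // counter guarded by a mutex, flipped to unhealthy once a threshold
    // (--engine-failure-retry in the patches) is crossed.
    type engineMonitor struct {
    	sync.Mutex
    	healthy      bool
    	failureCount int
    	failureRetry int
    }

    func newEngineMonitor(failureRetry int) *engineMonitor {
    	return &engineMonitor{healthy: true, failureRetry: failureRetry}
    }

    // incFailureCount mirrors Engine.incFailureCount: bump the counter and
    // mark the engine unhealthy once the retry threshold is reached.
    func (m *engineMonitor) incFailureCount() {
    	m.Lock()
    	defer m.Unlock()
    	m.failureCount++
    	if m.healthy && m.failureCount >= m.failureRetry {
    		m.healthy = false
    	}
    }

    // setHealthy mirrors Engine.setHealthy: a successful call clears the
    // counter so transient failures do not accumulate forever.
    func (m *engineMonitor) setHealthy(state bool) {
    	m.Lock()
    	defer m.Unlock()
    	m.healthy = state
    	if state {
    		m.failureCount = 0
    	}
    }

    // checkConnectionErr approximates Engine.CheckConnectionErr: a nil error
    // marks the engine healthy; a connection-refused error counts against it
    // immediately; any other error is left for the refresh loop to judge.
    func (m *engineMonitor) checkConnectionErr(err error) {
    	if err == nil {
    		m.setHealthy(true)
    		return
    	}
    	if errors.Is(err, syscall.ECONNREFUSED) ||
    		strings.Contains(err.Error(), "connection refused") {
    		m.incFailureCount()
    	}
    }

    func main() {
    	m := newEngineMonitor(3) // matches the --engine-failure-retry default
    	for i := 0; i < 3; i++ {
    		m.checkConnectionErr(fmt.Errorf("dial tcp: connection refused"))
    	}
    	fmt.Println("healthy after 3 refused connections:", m.healthy) // false
    	m.checkConnectionErr(nil)
    	fmt.Println("healthy after one success:", m.healthy) // true
    }

Run as-is, the sketch prints false after three refused connections, the same healthy-to-unhealthy transition TestEngineFailureCount asserts in patch 1, and true again after a single successful call, mirroring the setHealthy(true) recovery path in the refresh loop.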