diff --git a/api/handlers.go b/api/handlers.go index 3e341b0615..b453472372 100644 --- a/api/handlers.go +++ b/api/handlers.go @@ -386,6 +386,7 @@ func getContainerJSON(c *context, w http.ResponseWriter, r *http.Request) { client, scheme := newClientAndScheme(c.tlsConfig) resp, err := client.Get(scheme + "://" + container.Engine.Addr + "/containers/" + container.Id + "/json") + container.Engine.CheckConnectionErr(err) if err != nil { httpError(w, err.Error(), http.StatusInternalServerError) return @@ -670,6 +671,7 @@ func postContainersExec(c *context, w http.ResponseWriter, r *http.Request) { client, scheme := newClientAndScheme(c.tlsConfig) resp, err := client.Post(scheme+"://"+container.Engine.Addr+"/containers/"+container.Id+"/exec", "application/json", r.Body) + container.Engine.CheckConnectionErr(err) if err != nil { httpError(w, err.Error(), http.StatusInternalServerError) return @@ -796,7 +798,8 @@ func proxyNetwork(c *context, w http.ResponseWriter, r *http.Request) { func proxyVolume(c *context, w http.ResponseWriter, r *http.Request) { var name = mux.Vars(r)["volumename"] if volume := c.cluster.Volume(name); volume != nil { - proxy(c.tlsConfig, volume.Engine.Addr, w, r) + err := proxy(c.tlsConfig, volume.Engine.Addr, w, r) + volume.Engine.CheckConnectionErr(err) return } httpError(w, fmt.Sprintf("No such volume: %s", name), http.StatusNotFound) @@ -838,7 +841,9 @@ func proxyNetworkContainerOperation(c *context, w http.ResponseWriter, r *http.R } // request is forwarded to the container's address - if err := proxyAsync(c.tlsConfig, container.Engine.Addr, w, r, cb); err != nil { + err := proxyAsync(c.tlsConfig, container.Engine.Addr, w, r, cb) + container.Engine.CheckConnectionErr(err) + if err != nil { httpError(w, err.Error(), http.StatusNotFound) } } @@ -856,7 +861,9 @@ func proxyContainer(c *context, w http.ResponseWriter, r *http.Request) { r.URL.Path = strings.Replace(r.URL.Path, name, container.Id, 1) } - if err := proxy(c.tlsConfig, 
container.Engine.Addr, w, r); err != nil { + err = proxy(c.tlsConfig, container.Engine.Addr, w, r) + container.Engine.CheckConnectionErr(err) + if err != nil { httpError(w, err.Error(), http.StatusInternalServerError) } } @@ -879,7 +886,9 @@ func proxyContainerAndForceRefresh(c *context, w http.ResponseWriter, r *http.Re container.Refresh() } - if err := proxyAsync(c.tlsConfig, container.Engine.Addr, w, r, cb); err != nil { + err = proxyAsync(c.tlsConfig, container.Engine.Addr, w, r, cb) + container.Engine.CheckConnectionErr(err) + if err != nil { httpError(w, err.Error(), http.StatusInternalServerError) } } @@ -889,7 +898,8 @@ func proxyImage(c *context, w http.ResponseWriter, r *http.Request) { name := mux.Vars(r)["name"] if image := c.cluster.Image(name); image != nil { - proxy(c.tlsConfig, image.Engine.Addr, w, r) + err := proxy(c.tlsConfig, image.Engine.Addr, w, r) + image.Engine.CheckConnectionErr(err) return } httpError(w, fmt.Sprintf("No such image: %s", name), http.StatusNotFound) @@ -902,7 +912,8 @@ func proxyImageGet(c *context, w http.ResponseWriter, r *http.Request) { for _, image := range c.cluster.Images() { if len(strings.SplitN(name, ":", 2)) == 2 && image.Match(name, true) || len(strings.SplitN(name, ":", 2)) == 1 && image.Match(name, false) { - proxy(c.tlsConfig, image.Engine.Addr, w, r) + err := proxy(c.tlsConfig, image.Engine.Addr, w, r) + image.Engine.CheckConnectionErr(err) return } } @@ -925,7 +936,8 @@ func proxyImagePush(c *context, w http.ResponseWriter, r *http.Request) { for _, image := range c.cluster.Images() { if tag != "" && image.Match(name, true) || tag == "" && image.Match(name, false) { - proxy(c.tlsConfig, image.Engine.Addr, w, r) + err := proxy(c.tlsConfig, image.Engine.Addr, w, r) + image.Engine.CheckConnectionErr(err) return } } @@ -969,7 +981,9 @@ func proxyRandom(c *context, w http.ResponseWriter, r *http.Request) { return } - if err := proxy(c.tlsConfig, engine.Addr, w, r); err != nil { + err = proxy(c.tlsConfig, 
engine.Addr, w, r) + engine.CheckConnectionErr(err) + if err != nil { httpError(w, err.Error(), http.StatusInternalServerError) } @@ -1003,7 +1017,9 @@ func postCommit(c *context, w http.ResponseWriter, r *http.Request) { } // proxy commit request to the right node - if err := proxyAsync(c.tlsConfig, container.Engine.Addr, w, r, cb); err != nil { + err = proxyAsync(c.tlsConfig, container.Engine.Addr, w, r, cb) + container.Engine.CheckConnectionErr(err) + if err != nil { httpError(w, err.Error(), http.StatusInternalServerError) } } @@ -1093,7 +1109,9 @@ func proxyHijack(c *context, w http.ResponseWriter, r *http.Request) { r.URL.Path = strings.Replace(r.URL.Path, name, container.Id, 1) } - if err := hijack(c.tlsConfig, container.Engine.Addr, w, r); err != nil { + err = hijack(c.tlsConfig, container.Engine.Addr, w, r) + container.Engine.CheckConnectionErr(err) + if err != nil { httpError(w, err.Error(), http.StatusInternalServerError) } } diff --git a/cli/commands.go b/cli/commands.go index 73790935c9..a1e8263e3f 100644 --- a/cli/commands.go +++ b/cli/commands.go @@ -26,7 +26,7 @@ var ( flHosts, flLeaderElection, flLeaderTTL, flManageAdvertise, flTLS, flTLSCaCert, flTLSCert, flTLSKey, flTLSVerify, - flRefreshIntervalMin, flRefreshIntervalMax, flRefreshRetry, + flRefreshIntervalMin, flRefreshIntervalMax, flFailureRetry, flRefreshRetry, flHeartBeat, flEnableCors, flCluster, flDiscoveryOpt, flClusterOpt}, diff --git a/cli/flags.go b/cli/flags.go index b464d95691..9553a93635 100644 --- a/cli/flags.go +++ b/cli/flags.go @@ -74,7 +74,12 @@ var ( flRefreshRetry = cli.IntFlag{ Name: "engine-refresh-retry", Value: 3, - Usage: "set engine refresh retry count on failure", + Usage: "deprecated; replaced by --engine-failure-retry", + } + flFailureRetry = cli.IntFlag{ + Name: "engine-failure-retry", + Value: 3, + Usage: "set engine failure retry count", } flEnableCors = cli.BoolFlag{ Name: "api-enable-cors, cors", diff --git a/cli/manage.go b/cli/manage.go index 
ccd94639fd..f3203681f6 100644 --- a/cli/manage.go +++ b/cli/manage.go @@ -246,14 +246,19 @@ func manage(c *cli.Context) { if refreshMaxInterval < refreshMinInterval { log.Fatal("max refresh interval cannot be less than min refresh interval") } + // engine-refresh-retry is deprecated refreshRetry := c.Int("engine-refresh-retry") - if refreshRetry <= 0 { - log.Fatal("invalid refresh retry count") + if refreshRetry != 3 { + log.Fatal("--engine-refresh-retry is deprecated. Use --engine-failure-retry") + } + failureRetry := c.Int("engine-failure-retry") + if failureRetry <= 0 { + log.Fatal("invalid failure retry count") } engineOpts := &cluster.EngineOpts{ RefreshMinInterval: refreshMinInterval, RefreshMaxInterval: refreshMaxInterval, - RefreshRetry: refreshRetry, + FailureRetry: failureRetry, } uri := getDiscovery(c) diff --git a/cluster/engine.go b/cluster/engine.go index 45ed7bf4ac..b62efa822d 100644 --- a/cluster/engine.go +++ b/cluster/engine.go @@ -59,7 +59,7 @@ func (d *delayer) Wait() <-chan time.Time { type EngineOpts struct { RefreshMinInterval time.Duration RefreshMaxInterval time.Duration - RefreshRetry int + FailureRetry int } // Engine represents a docker engine @@ -83,6 +83,7 @@ type Engine struct { client dockerclient.Client eventHandler EventHandler healthy bool + failureCount int overcommitRatio int64 opts *EngineOpts } @@ -184,6 +185,17 @@ func (e *Engine) IsHealthy() bool { return e.healthy } +// setHealthy sets engine healthy state +func (e *Engine) setHealthy(state bool) { + e.Lock() + e.healthy = state + // if engine is healthy, clear failureCount + if state { + e.failureCount = 0 + } + e.Unlock() +} + // Status returns the health status of the Engine: Healthy or Unhealthy func (e *Engine) Status() string { if e.healthy { @@ -192,9 +204,44 @@ return "Unhealthy" } +// incFailureCount increases engine's failure count, and sets engine as unhealthy if threshold is crossed +func (e *Engine) incFailureCount() { + 
e.Lock() + e.failureCount++ + if e.healthy && e.failureCount >= e.opts.FailureRetry { + e.healthy = false + } + e.Unlock() +} + +// CheckConnectionErr checks error from client response and adjusts engine health indicators +func (e *Engine) CheckConnectionErr(err error) { + if err == nil { + e.setHealthy(true) + return + } + + // dockerclient defines ErrConnectionRefused error. But if http client is from swarm, it's not using + // dockerclient. We need string matching for these cases. Remove the first character to deal with + // the case sensitivity issue + if err == dockerclient.ErrConnectionRefused || + strings.Contains(err.Error(), "onnection refused") || + strings.Contains(err.Error(), "annot connect to the docker engine endpoint") { + // each connection refused instance may increase failure count so + // engine can fail fast. Short engine freeze or network failure may result + // in engine marked as unhealthy. If this causes unnecessary failure, engine + // can track last error time. Only increase failure count if last error is + // not too recent, e.g., last error is at least 1 second ago. + e.incFailureCount() + return + } + // other errors may be ambiguous. Let refresh loop decide healthy or not. +} + // Gather engine specs (CPU, memory, constraints, ...). func (e *Engine) updateSpecs() error { info, err := e.client.Info() + e.CheckConnectionErr(err) if err != nil { return err } @@ -204,6 +251,7 @@ } v, err := e.client.Version() + e.CheckConnectionErr(err) if err != nil { return err } @@ -236,6 +284,7 @@ // RemoveImage deletes an image from the engine. 
func (e *Engine) RemoveImage(image *Image, name string, force bool) ([]*dockerclient.ImageDelete, error) { array, err := e.client.RemoveImage(name, force) + e.CheckConnectionErr(err) e.RefreshImages() return array, err @@ -244,13 +293,16 @@ func (e *Engine) RemoveImage(image *Image, name string, force bool) ([]*dockercl // RemoveNetwork deletes a network from the engine. func (e *Engine) RemoveNetwork(network *Network) error { err := e.client.RemoveNetwork(network.ID) + e.CheckConnectionErr(err) e.RefreshNetworks() return err } // RemoveVolume deletes a volume from the engine. func (e *Engine) RemoveVolume(name string) error { - if err := e.client.RemoveVolume(name); err != nil { + err := e.client.RemoveVolume(name) + e.CheckConnectionErr(err) + if err != nil { return err } @@ -266,6 +318,7 @@ func (e *Engine) RemoveVolume(name string) error { // RefreshImages refreshes the list of images on the engine. func (e *Engine) RefreshImages() error { images, err := e.client.ListImages(true) + e.CheckConnectionErr(err) if err != nil { return err } @@ -281,6 +334,7 @@ func (e *Engine) RefreshImages() error { // RefreshNetworks refreshes the list of networks on the engine. func (e *Engine) RefreshNetworks() error { networks, err := e.client.ListNetworks("") + e.CheckConnectionErr(err) if err != nil { return err } @@ -296,6 +350,7 @@ func (e *Engine) RefreshNetworks() error { // RefreshVolumes refreshes the list of volumes on the engine. func (e *Engine) RefreshVolumes() error { volumes, err := e.client.ListVolumes() + e.CheckConnectionErr(err) if err != nil { return err } @@ -313,6 +368,9 @@ func (e *Engine) RefreshVolumes() error { // FIXME: unexport this method after mesos scheduler stops using it directly func (e *Engine) RefreshContainers(full bool) error { containers, err := e.client.ListContainers(true, false, "") + // e.CheckConnectionErr(err) is not appropriate here because refresh loop uses + // RefreshContainers function to fail/recover an engine. 
Adding CheckConnectionErr + // here would result in double count if err != nil { return err } @@ -339,6 +397,7 @@ func (e *Engine) RefreshContainers(full bool) error { // the container will be inspected. func (e *Engine) refreshContainer(ID string, full bool) (*Container, error) { containers, err := e.client.ListContainers(true, false, fmt.Sprintf("{%q:[%q]}", "id", ID)) + e.CheckConnectionErr(err) if err != nil { return nil, err } @@ -385,6 +444,7 @@ func (e *Engine) updateContainer(c dockerclient.Container, containers map[string // Update ContainerInfo. if full { info, err := e.client.InspectContainer(c.Id) + e.CheckConnectionErr(err) if err != nil { return nil, err } @@ -410,7 +470,6 @@ func (e *Engine) updateContainer(c dockerclient.Container, containers map[string } func (e *Engine) refreshLoop() { - failedAttempts := 0 for { var err error @@ -431,15 +490,15 @@ func (e *Engine) refreshLoop() { } if err != nil { - failedAttempts++ - if failedAttempts >= e.opts.RefreshRetry && e.healthy { + e.failureCount++ + if e.failureCount >= e.opts.FailureRetry && e.healthy { e.emitEvent("engine_disconnect") - e.healthy = false - log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Errorf("Flagging engine as dead. Updated state failed %d times: %v", failedAttempts, err) + e.setHealthy(false) + log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Errorf("Flagging engine as dead. Updated state failed %d times: %v", e.failureCount, err) } } else { if !e.healthy { - log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Infof("Engine came back to life after %d retries. Hooray!", failedAttempts) + log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Infof("Engine came back to life after %d retries. 
Hooray!", e.failureCount) if err := e.updateSpecs(); err != nil { log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Errorf("Update engine specs failed: %v", err) continue @@ -448,8 +507,7 @@ func (e *Engine) refreshLoop() { e.client.StartMonitorEvents(e.handler, nil) e.emitEvent("engine_reconnect") } - e.healthy = true - failedAttempts = 0 + e.setHealthy(true) } } } @@ -521,7 +579,9 @@ func (e *Engine) Create(config *ContainerConfig, name string, pullImage bool, au dockerConfig.CpuShares = int64(math.Ceil(float64(config.CpuShares*1024) / float64(e.Cpus))) dockerConfig.HostConfig.CpuShares = dockerConfig.CpuShares - if id, err = client.CreateContainer(&dockerConfig, name, nil); err != nil { + id, err = client.CreateContainer(&dockerConfig, name, nil) + e.CheckConnectionErr(err) + if err != nil { // If the error is other than not found, abort immediately. if err != dockerclient.ErrImageNotFound || !pullImage { return nil, err @@ -531,7 +591,9 @@ func (e *Engine) Create(config *ContainerConfig, name string, pullImage bool, au return nil, err } // ...And try again. - if id, err = client.CreateContainer(&dockerConfig, name, nil); err != nil { + id, err = client.CreateContainer(&dockerConfig, name, nil) + e.CheckConnectionErr(err) + if err != nil { return nil, err } } @@ -554,7 +616,9 @@ func (e *Engine) Create(config *ContainerConfig, name string, pullImage bool, au // RemoveContainer a container from the engine. 
func (e *Engine) RemoveContainer(container *Container, force, volumes bool) error { - if err := e.client.RemoveContainer(container.Id, force, volumes); err != nil { + err := e.client.RemoveContainer(container.Id, force, volumes) + e.CheckConnectionErr(err) + if err != nil { return err } @@ -570,6 +634,7 @@ func (e *Engine) RemoveContainer(container *Container, force, volumes bool) erro // CreateNetwork creates a network in the engine func (e *Engine) CreateNetwork(request *dockerclient.NetworkCreate) (*dockerclient.NetworkCreateResponse, error) { response, err := e.client.CreateNetwork(request) + e.CheckConnectionErr(err) e.RefreshNetworks() @@ -581,6 +646,7 @@ func (e *Engine) CreateVolume(request *dockerclient.VolumeCreateRequest) (*Volum volume, err := e.client.CreateVolume(request) e.RefreshVolumes() + e.CheckConnectionErr(err) if err != nil { return nil, err @@ -594,7 +660,9 @@ func (e *Engine) Pull(image string, authConfig *dockerclient.AuthConfig) error { if !strings.Contains(image, ":") { image = image + ":latest" } - if err := e.client.PullImage(image, authConfig); err != nil { + err := e.client.PullImage(image, authConfig) + e.CheckConnectionErr(err) + if err != nil { return err } @@ -606,7 +674,9 @@ func (e *Engine) Pull(image string, authConfig *dockerclient.AuthConfig) error { // Load an image on the engine func (e *Engine) Load(reader io.Reader) error { - if err := e.client.LoadImage(reader); err != nil { + err := e.client.LoadImage(reader) + e.CheckConnectionErr(err) + if err != nil { return err } @@ -618,7 +688,9 @@ func (e *Engine) Load(reader io.Reader) error { // Import image func (e *Engine) Import(source string, repository string, tag string, imageReader io.Reader) error { - if _, err := e.client.ImportImage(source, repository, tag, imageReader); err != nil { + _, err := e.client.ImportImage(source, repository, tag, imageReader) + e.CheckConnectionErr(err) + if err != nil { return err } @@ -774,6 +846,7 @@ func (e *Engine) cleanupContainers() { 
func (e *Engine) RenameContainer(container *Container, newName string) error { // send rename request err := e.client.RenameContainer(container.Id, newName) + e.CheckConnectionErr(err) if err != nil { return err } @@ -785,14 +858,16 @@ func (e *Engine) RenameContainer(container *Container, newName string) error { // BuildImage build an image func (e *Engine) BuildImage(buildImage *dockerclient.BuildImage) (io.ReadCloser, error) { - - return e.client.BuildImage(buildImage) + reader, err := e.client.BuildImage(buildImage) + e.CheckConnectionErr(err) + return reader, err } // TagImage tag an image func (e *Engine) TagImage(IDOrName string, repo string, tag string, force bool) error { // send tag request to docker engine err := e.client.TagImage(IDOrName, repo, tag, force) + e.CheckConnectionErr(err) if err != nil { return err } diff --git a/cluster/engine_test.go b/cluster/engine_test.go index 730acc78b2..07635360be 100644 --- a/cluster/engine_test.go +++ b/cluster/engine_test.go @@ -34,10 +34,19 @@ var ( engOpts = &EngineOpts{ RefreshMinInterval: time.Duration(30) * time.Second, RefreshMaxInterval: time.Duration(60) * time.Second, - RefreshRetry: 3, + FailureRetry: 3, } ) +func TestEngineFailureCount(t *testing.T) { + engine := NewEngine("test", 0, engOpts) + for i := 0; i < engine.opts.FailureRetry; i++ { + assert.True(t, engine.IsHealthy()) + engine.incFailureCount() + } + assert.False(t, engine.IsHealthy()) +} + func TestEngineConnectionFailure(t *testing.T) { engine := NewEngine("test", 0, engOpts) assert.False(t, engine.isConnected()) diff --git a/cluster/mesos/cluster_test.go b/cluster/mesos/cluster_test.go index c05bf66d96..1ce8116eb5 100644 --- a/cluster/mesos/cluster_test.go +++ b/cluster/mesos/cluster_test.go @@ -13,7 +13,7 @@ func createAgent(t *testing.T, ID string, containers ...*cluster.Container) *age engOpts := &cluster.EngineOpts{ RefreshMinInterval: time.Duration(30) * time.Second, RefreshMaxInterval: time.Duration(60) * time.Second, - RefreshRetry: 
3, + FailureRetry: 3, } engine := cluster.NewEngine(ID, 0, engOpts) engine.Name = ID diff --git a/cluster/swarm/cluster_test.go b/cluster/swarm/cluster_test.go index 1d3c04e19a..bb02d2918d 100644 --- a/cluster/swarm/cluster_test.go +++ b/cluster/swarm/cluster_test.go @@ -43,7 +43,7 @@ var ( engOpts = &cluster.EngineOpts{ RefreshMinInterval: time.Duration(30) * time.Second, RefreshMaxInterval: time.Duration(60) * time.Second, - RefreshRetry: 3, + FailureRetry: 3, } ) diff --git a/test/integration/api/logs.bats b/test/integration/api/logs.bats index 38672c8b81..903d37f017 100644 --- a/test/integration/api/logs.bats +++ b/test/integration/api/logs.bats @@ -30,7 +30,7 @@ function teardown() { @test "docker logs unhealthy node" { start_docker_with_busybox 1 - swarm_manage --engine-refresh-min-interval=1s --engine-refresh-max-interval=1s --engine-refresh-retry=1 ${HOSTS[0]} + swarm_manage --engine-refresh-min-interval=1s --engine-refresh-max-interval=1s --engine-failure-retry=1 ${HOSTS[0]} # run a container with echo command docker_swarm run -d --name test_container busybox /bin/sh -c "echo hello world; echo hello docker; echo hello swarm" diff --git a/test/integration/engine_options.bats b/test/integration/engine_options.bats index c2347489ab..aef39fde29 100644 --- a/test/integration/engine_options.bats +++ b/test/integration/engine_options.bats @@ -14,7 +14,7 @@ load helpers [[ "${output}" == *"max refresh interval cannot be less than min refresh interval"* ]] # engine refresh retry count - run swarm manage --engine-refresh-retry 0 --advertise 127.0.0.1:$SWARM_BASE_PORT 192.168.56.202:4444 + run swarm manage --engine-failure-retry 0 --advertise 127.0.0.1:$SWARM_BASE_PORT 192.168.56.202:4444 [ "$status" -ne 0 ] - [[ "${output}" == *"invalid refresh retry count"* ]] + [[ "${output}" == *"invalid failure retry count"* ]] }