Merge pull request #1517 from dongluochen/EngineFastFailure

Use failureCount as a secondary health indicator.
This commit is contained in:
Victor Vieux 2015-12-17 16:35:29 -08:00
commit 53cf899e31
10 changed files with 151 additions and 39 deletions

View File

@ -386,6 +386,7 @@ func getContainerJSON(c *context, w http.ResponseWriter, r *http.Request) {
client, scheme := newClientAndScheme(c.tlsConfig)
resp, err := client.Get(scheme + "://" + container.Engine.Addr + "/containers/" + container.Id + "/json")
container.Engine.CheckConnectionErr(err)
if err != nil {
httpError(w, err.Error(), http.StatusInternalServerError)
return
@ -670,6 +671,7 @@ func postContainersExec(c *context, w http.ResponseWriter, r *http.Request) {
client, scheme := newClientAndScheme(c.tlsConfig)
resp, err := client.Post(scheme+"://"+container.Engine.Addr+"/containers/"+container.Id+"/exec", "application/json", r.Body)
container.Engine.CheckConnectionErr(err)
if err != nil {
httpError(w, err.Error(), http.StatusInternalServerError)
return
@ -796,7 +798,8 @@ func proxyNetwork(c *context, w http.ResponseWriter, r *http.Request) {
func proxyVolume(c *context, w http.ResponseWriter, r *http.Request) {
var name = mux.Vars(r)["volumename"]
if volume := c.cluster.Volume(name); volume != nil {
proxy(c.tlsConfig, volume.Engine.Addr, w, r)
err := proxy(c.tlsConfig, volume.Engine.Addr, w, r)
volume.Engine.CheckConnectionErr(err)
return
}
httpError(w, fmt.Sprintf("No such volume: %s", name), http.StatusNotFound)
@ -838,7 +841,9 @@ func proxyNetworkContainerOperation(c *context, w http.ResponseWriter, r *http.R
}
// request is forwarded to the container's address
if err := proxyAsync(c.tlsConfig, container.Engine.Addr, w, r, cb); err != nil {
err := proxyAsync(c.tlsConfig, container.Engine.Addr, w, r, cb)
container.Engine.CheckConnectionErr(err)
if err != nil {
httpError(w, err.Error(), http.StatusNotFound)
}
}
@ -856,7 +861,9 @@ func proxyContainer(c *context, w http.ResponseWriter, r *http.Request) {
r.URL.Path = strings.Replace(r.URL.Path, name, container.Id, 1)
}
if err := proxy(c.tlsConfig, container.Engine.Addr, w, r); err != nil {
err = proxy(c.tlsConfig, container.Engine.Addr, w, r)
container.Engine.CheckConnectionErr(err)
if err != nil {
httpError(w, err.Error(), http.StatusInternalServerError)
}
}
@ -879,7 +886,9 @@ func proxyContainerAndForceRefresh(c *context, w http.ResponseWriter, r *http.Re
container.Refresh()
}
if err := proxyAsync(c.tlsConfig, container.Engine.Addr, w, r, cb); err != nil {
err = proxyAsync(c.tlsConfig, container.Engine.Addr, w, r, cb)
container.Engine.CheckConnectionErr(err)
if err != nil {
httpError(w, err.Error(), http.StatusInternalServerError)
}
}
@ -889,7 +898,8 @@ func proxyImage(c *context, w http.ResponseWriter, r *http.Request) {
name := mux.Vars(r)["name"]
if image := c.cluster.Image(name); image != nil {
proxy(c.tlsConfig, image.Engine.Addr, w, r)
err := proxy(c.tlsConfig, image.Engine.Addr, w, r)
image.Engine.CheckConnectionErr(err)
return
}
httpError(w, fmt.Sprintf("No such image: %s", name), http.StatusNotFound)
@ -902,7 +912,8 @@ func proxyImageGet(c *context, w http.ResponseWriter, r *http.Request) {
for _, image := range c.cluster.Images() {
if len(strings.SplitN(name, ":", 2)) == 2 && image.Match(name, true) ||
len(strings.SplitN(name, ":", 2)) == 1 && image.Match(name, false) {
proxy(c.tlsConfig, image.Engine.Addr, w, r)
err := proxy(c.tlsConfig, image.Engine.Addr, w, r)
image.Engine.CheckConnectionErr(err)
return
}
}
@ -925,7 +936,8 @@ func proxyImagePush(c *context, w http.ResponseWriter, r *http.Request) {
for _, image := range c.cluster.Images() {
if tag != "" && image.Match(name, true) ||
tag == "" && image.Match(name, false) {
proxy(c.tlsConfig, image.Engine.Addr, w, r)
err := proxy(c.tlsConfig, image.Engine.Addr, w, r)
image.Engine.CheckConnectionErr(err)
return
}
}
@ -969,7 +981,9 @@ func proxyRandom(c *context, w http.ResponseWriter, r *http.Request) {
return
}
if err := proxy(c.tlsConfig, engine.Addr, w, r); err != nil {
err = proxy(c.tlsConfig, engine.Addr, w, r)
engine.CheckConnectionErr(err)
if err != nil {
httpError(w, err.Error(), http.StatusInternalServerError)
}
@ -1003,7 +1017,9 @@ func postCommit(c *context, w http.ResponseWriter, r *http.Request) {
}
// proxy commit request to the right node
if err := proxyAsync(c.tlsConfig, container.Engine.Addr, w, r, cb); err != nil {
err = proxyAsync(c.tlsConfig, container.Engine.Addr, w, r, cb)
container.Engine.CheckConnectionErr(err)
if err != nil {
httpError(w, err.Error(), http.StatusInternalServerError)
}
}
@ -1093,7 +1109,9 @@ func proxyHijack(c *context, w http.ResponseWriter, r *http.Request) {
r.URL.Path = strings.Replace(r.URL.Path, name, container.Id, 1)
}
if err := hijack(c.tlsConfig, container.Engine.Addr, w, r); err != nil {
err = hijack(c.tlsConfig, container.Engine.Addr, w, r)
container.Engine.CheckConnectionErr(err)
if err != nil {
httpError(w, err.Error(), http.StatusInternalServerError)
}
}

View File

@ -26,7 +26,7 @@ var (
flHosts,
flLeaderElection, flLeaderTTL, flManageAdvertise,
flTLS, flTLSCaCert, flTLSCert, flTLSKey, flTLSVerify,
flRefreshIntervalMin, flRefreshIntervalMax, flRefreshRetry,
flRefreshIntervalMin, flRefreshIntervalMax, flFailureRetry, flRefreshRetry,
flHeartBeat,
flEnableCors,
flCluster, flDiscoveryOpt, flClusterOpt},

View File

@ -74,7 +74,12 @@ var (
flRefreshRetry = cli.IntFlag{
Name: "engine-refresh-retry",
Value: 3,
Usage: "set engine refresh retry count on failure",
Usage: "deprecated; replaced by --engine-failure-retry",
}
flFailureRetry = cli.IntFlag{
Name: "engine-failure-retry",
Value: 3,
Usage: "set engine failure retry count",
}
flEnableCors = cli.BoolFlag{
Name: "api-enable-cors, cors",

View File

@ -246,14 +246,19 @@ func manage(c *cli.Context) {
if refreshMaxInterval < refreshMinInterval {
log.Fatal("max refresh interval cannot be less than min refresh interval")
}
// engine-refresh-retry is deprecated
refreshRetry := c.Int("engine-refresh-retry")
if refreshRetry <= 0 {
log.Fatal("invalid refresh retry count")
if refreshRetry != 3 {
log.Fatal("--engine-refresh-retry is deprecated. Use --engine-failure-retry")
}
failureRetry := c.Int("engine-failure-retry")
if failureRetry <= 0 {
log.Fatal("invalid failure retry count")
}
engineOpts := &cluster.EngineOpts{
RefreshMinInterval: refreshMinInterval,
RefreshMaxInterval: refreshMaxInterval,
RefreshRetry: refreshRetry,
FailureRetry: failureRetry,
}
uri := getDiscovery(c)

View File

@ -59,7 +59,7 @@ func (d *delayer) Wait() <-chan time.Time {
type EngineOpts struct {
RefreshMinInterval time.Duration
RefreshMaxInterval time.Duration
RefreshRetry int
FailureRetry int
}
// Engine represents a docker engine
@ -83,6 +83,7 @@ type Engine struct {
client dockerclient.Client
eventHandler EventHandler
healthy bool
failureCount int
overcommitRatio int64
opts *EngineOpts
}
@ -184,6 +185,17 @@ func (e *Engine) IsHealthy() bool {
return e.healthy
}
// setHealthy records the engine's health state under the engine lock.
// Transitioning to healthy also clears the accumulated failure count so
// a recovered engine starts with a clean slate.
func (e *Engine) setHealthy(state bool) {
	e.Lock()
	defer e.Unlock()
	e.healthy = state
	if state {
		e.failureCount = 0
	}
}
// Status returns the health status of the Engine: Healthy or Unhealthy
func (e *Engine) Status() string {
if e.healthy {
@ -192,9 +204,44 @@ func (e *Engine) Status() string {
return "Unhealthy"
}
// incFailureCount bumps the engine's failure counter under the engine
// lock; once the counter reaches the configured FailureRetry threshold,
// a currently healthy engine is demoted to unhealthy.
func (e *Engine) incFailureCount() {
	e.Lock()
	defer e.Unlock()
	e.failureCount++
	if e.healthy && e.failureCount >= e.opts.FailureRetry {
		e.healthy = false
	}
}
// CheckConnectionErr inspects an error returned from a client call and
// adjusts the engine's health indicators: a nil error marks the engine
// healthy (resetting its failure count), while a connection-refused style
// error increments the failure count, which may flag the engine unhealthy
// once the FailureRetry threshold is crossed. Any other error is left for
// the refresh loop to judge.
func (e *Engine) CheckConnectionErr(err error) {
	if err == nil {
		e.setHealthy(true)
		return
	}
	// dockerclient defines ErrConnectionRefused, but HTTP clients created by
	// swarm itself do not go through dockerclient, so we also fall back to
	// string matching. The first character is omitted from the match strings
	// to sidestep case differences ("Connection"/"connection", "Cannot"/"cannot").
	if err == dockerclient.ErrConnectionRefused ||
		strings.Contains(err.Error(), "onnection refused") ||
		strings.Contains(err.Error(), "annot connect to the docker engine endpoint") {
		// Each connection-refused instance increases the failure count so the
		// engine can fail fast. A short engine freeze or transient network
		// failure may therefore get the engine marked unhealthy. If this causes
		// unnecessary failures, the engine could track the last error time and
		// only count failures that are not too recent (e.g. at least 1 second
		// since the previous one).
		e.incFailureCount()
		return
	}
	// Other errors may be ambiguous; let the refresh loop decide healthy or not.
}
// Gather engine specs (CPU, memory, constraints, ...).
func (e *Engine) updateSpecs() error {
info, err := e.client.Info()
e.CheckConnectionErr(err)
if err != nil {
return err
}
@ -204,6 +251,7 @@ func (e *Engine) updateSpecs() error {
}
v, err := e.client.Version()
e.CheckConnectionErr(err)
if err != nil {
return err
}
@ -236,6 +284,7 @@ func (e *Engine) updateSpecs() error {
// RemoveImage deletes an image from the engine.
func (e *Engine) RemoveImage(image *Image, name string, force bool) ([]*dockerclient.ImageDelete, error) {
array, err := e.client.RemoveImage(name, force)
e.CheckConnectionErr(err)
e.RefreshImages()
return array, err
@ -244,13 +293,16 @@ func (e *Engine) RemoveImage(image *Image, name string, force bool) ([]*dockercl
// RemoveNetwork deletes a network from the engine.
func (e *Engine) RemoveNetwork(network *Network) error {
err := e.client.RemoveNetwork(network.ID)
e.CheckConnectionErr(err)
e.RefreshNetworks()
return err
}
// RemoveVolume deletes a volume from the engine.
func (e *Engine) RemoveVolume(name string) error {
if err := e.client.RemoveVolume(name); err != nil {
err := e.client.RemoveVolume(name)
e.CheckConnectionErr(err)
if err != nil {
return err
}
@ -266,6 +318,7 @@ func (e *Engine) RemoveVolume(name string) error {
// RefreshImages refreshes the list of images on the engine.
func (e *Engine) RefreshImages() error {
images, err := e.client.ListImages(true)
e.CheckConnectionErr(err)
if err != nil {
return err
}
@ -281,6 +334,7 @@ func (e *Engine) RefreshImages() error {
// RefreshNetworks refreshes the list of networks on the engine.
func (e *Engine) RefreshNetworks() error {
networks, err := e.client.ListNetworks("")
e.CheckConnectionErr(err)
if err != nil {
return err
}
@ -296,6 +350,7 @@ func (e *Engine) RefreshNetworks() error {
// RefreshVolumes refreshes the list of volumes on the engine.
func (e *Engine) RefreshVolumes() error {
volumes, err := e.client.ListVolumes()
e.CheckConnectionErr(err)
if err != nil {
return err
}
@ -313,6 +368,9 @@ func (e *Engine) RefreshVolumes() error {
// FIXME: unexport this method after mesos scheduler stops using it directly
func (e *Engine) RefreshContainers(full bool) error {
containers, err := e.client.ListContainers(true, false, "")
// e.CheckConnectionErr(err) is not appropriate here because refresh loop uses
// RefreshContainers function to fail/recover an engine. Adding CheckConnectionErr
// here would result in double count
if err != nil {
return err
}
@ -339,6 +397,7 @@ func (e *Engine) RefreshContainers(full bool) error {
// the container will be inspected.
func (e *Engine) refreshContainer(ID string, full bool) (*Container, error) {
containers, err := e.client.ListContainers(true, false, fmt.Sprintf("{%q:[%q]}", "id", ID))
e.CheckConnectionErr(err)
if err != nil {
return nil, err
}
@ -385,6 +444,7 @@ func (e *Engine) updateContainer(c dockerclient.Container, containers map[string
// Update ContainerInfo.
if full {
info, err := e.client.InspectContainer(c.Id)
e.CheckConnectionErr(err)
if err != nil {
return nil, err
}
@ -410,7 +470,6 @@ func (e *Engine) updateContainer(c dockerclient.Container, containers map[string
}
func (e *Engine) refreshLoop() {
failedAttempts := 0
for {
var err error
@ -431,15 +490,15 @@ func (e *Engine) refreshLoop() {
}
if err != nil {
failedAttempts++
if failedAttempts >= e.opts.RefreshRetry && e.healthy {
e.failureCount++
if e.failureCount >= e.opts.FailureRetry && e.healthy {
e.emitEvent("engine_disconnect")
e.healthy = false
log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Errorf("Flagging engine as dead. Updated state failed %d times: %v", failedAttempts, err)
e.setHealthy(false)
log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Errorf("Flagging engine as dead. Updated state failed %d times: %v", e.failureCount, err)
}
} else {
if !e.healthy {
log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Infof("Engine came back to life after %d retries. Hooray!", failedAttempts)
log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Infof("Engine came back to life after %d retries. Hooray!", e.failureCount)
if err := e.updateSpecs(); err != nil {
log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Errorf("Update engine specs failed: %v", err)
continue
@ -448,8 +507,7 @@ func (e *Engine) refreshLoop() {
e.client.StartMonitorEvents(e.handler, nil)
e.emitEvent("engine_reconnect")
}
e.healthy = true
failedAttempts = 0
e.setHealthy(true)
}
}
}
@ -521,7 +579,9 @@ func (e *Engine) Create(config *ContainerConfig, name string, pullImage bool, au
dockerConfig.CpuShares = int64(math.Ceil(float64(config.CpuShares*1024) / float64(e.Cpus)))
dockerConfig.HostConfig.CpuShares = dockerConfig.CpuShares
if id, err = client.CreateContainer(&dockerConfig, name, nil); err != nil {
id, err = client.CreateContainer(&dockerConfig, name, nil)
e.CheckConnectionErr(err)
if err != nil {
// If the error is other than not found, abort immediately.
if err != dockerclient.ErrImageNotFound || !pullImage {
return nil, err
@ -531,7 +591,9 @@ func (e *Engine) Create(config *ContainerConfig, name string, pullImage bool, au
return nil, err
}
// ...And try again.
if id, err = client.CreateContainer(&dockerConfig, name, nil); err != nil {
id, err = client.CreateContainer(&dockerConfig, name, nil)
e.CheckConnectionErr(err)
if err != nil {
return nil, err
}
}
@ -554,7 +616,9 @@ func (e *Engine) Create(config *ContainerConfig, name string, pullImage bool, au
// RemoveContainer a container from the engine.
func (e *Engine) RemoveContainer(container *Container, force, volumes bool) error {
if err := e.client.RemoveContainer(container.Id, force, volumes); err != nil {
err := e.client.RemoveContainer(container.Id, force, volumes)
e.CheckConnectionErr(err)
if err != nil {
return err
}
@ -570,6 +634,7 @@ func (e *Engine) RemoveContainer(container *Container, force, volumes bool) erro
// CreateNetwork creates a network in the engine
func (e *Engine) CreateNetwork(request *dockerclient.NetworkCreate) (*dockerclient.NetworkCreateResponse, error) {
response, err := e.client.CreateNetwork(request)
e.CheckConnectionErr(err)
e.RefreshNetworks()
@ -581,6 +646,7 @@ func (e *Engine) CreateVolume(request *dockerclient.VolumeCreateRequest) (*Volum
volume, err := e.client.CreateVolume(request)
e.RefreshVolumes()
e.CheckConnectionErr(err)
if err != nil {
return nil, err
@ -594,7 +660,9 @@ func (e *Engine) Pull(image string, authConfig *dockerclient.AuthConfig) error {
if !strings.Contains(image, ":") {
image = image + ":latest"
}
if err := e.client.PullImage(image, authConfig); err != nil {
err := e.client.PullImage(image, authConfig)
e.CheckConnectionErr(err)
if err != nil {
return err
}
@ -606,7 +674,9 @@ func (e *Engine) Pull(image string, authConfig *dockerclient.AuthConfig) error {
// Load an image on the engine
func (e *Engine) Load(reader io.Reader) error {
if err := e.client.LoadImage(reader); err != nil {
err := e.client.LoadImage(reader)
e.CheckConnectionErr(err)
if err != nil {
return err
}
@ -618,7 +688,9 @@ func (e *Engine) Load(reader io.Reader) error {
// Import image
func (e *Engine) Import(source string, repository string, tag string, imageReader io.Reader) error {
if _, err := e.client.ImportImage(source, repository, tag, imageReader); err != nil {
_, err := e.client.ImportImage(source, repository, tag, imageReader)
e.CheckConnectionErr(err)
if err != nil {
return err
}
@ -774,6 +846,7 @@ func (e *Engine) cleanupContainers() {
func (e *Engine) RenameContainer(container *Container, newName string) error {
// send rename request
err := e.client.RenameContainer(container.Id, newName)
e.CheckConnectionErr(err)
if err != nil {
return err
}
@ -785,14 +858,16 @@ func (e *Engine) RenameContainer(container *Container, newName string) error {
// BuildImage build an image
func (e *Engine) BuildImage(buildImage *dockerclient.BuildImage) (io.ReadCloser, error) {
return e.client.BuildImage(buildImage)
reader, err := e.client.BuildImage(buildImage)
e.CheckConnectionErr(err)
return reader, err
}
// TagImage tag an image
func (e *Engine) TagImage(IDOrName string, repo string, tag string, force bool) error {
// send tag request to docker engine
err := e.client.TagImage(IDOrName, repo, tag, force)
e.CheckConnectionErr(err)
if err != nil {
return err
}

View File

@ -34,10 +34,19 @@ var (
engOpts = &EngineOpts{
RefreshMinInterval: time.Duration(30) * time.Second,
RefreshMaxInterval: time.Duration(60) * time.Second,
RefreshRetry: 3,
FailureRetry: 3,
}
)
// TestEngineFailureCount verifies that an engine remains healthy while its
// failure count is below the configured FailureRetry threshold, and is
// flagged unhealthy once the threshold is reached.
func TestEngineFailureCount(t *testing.T) {
	engine := NewEngine("test", 0, engOpts)
	threshold := engine.opts.FailureRetry
	for attempt := 0; attempt < threshold; attempt++ {
		assert.True(t, engine.IsHealthy())
		engine.incFailureCount()
	}
	assert.False(t, engine.IsHealthy())
}
func TestEngineConnectionFailure(t *testing.T) {
engine := NewEngine("test", 0, engOpts)
assert.False(t, engine.isConnected())

View File

@ -13,7 +13,7 @@ func createAgent(t *testing.T, ID string, containers ...*cluster.Container) *age
engOpts := &cluster.EngineOpts{
RefreshMinInterval: time.Duration(30) * time.Second,
RefreshMaxInterval: time.Duration(60) * time.Second,
RefreshRetry: 3,
FailureRetry: 3,
}
engine := cluster.NewEngine(ID, 0, engOpts)
engine.Name = ID

View File

@ -43,7 +43,7 @@ var (
engOpts = &cluster.EngineOpts{
RefreshMinInterval: time.Duration(30) * time.Second,
RefreshMaxInterval: time.Duration(60) * time.Second,
RefreshRetry: 3,
FailureRetry: 3,
}
)

View File

@ -30,7 +30,7 @@ function teardown() {
@test "docker logs unhealthy node" {
start_docker_with_busybox 1
swarm_manage --engine-refresh-min-interval=1s --engine-refresh-max-interval=1s --engine-refresh-retry=1 ${HOSTS[0]}
swarm_manage --engine-refresh-min-interval=1s --engine-refresh-max-interval=1s --engine-failure-retry=1 ${HOSTS[0]}
# run a container with echo command
docker_swarm run -d --name test_container busybox /bin/sh -c "echo hello world; echo hello docker; echo hello swarm"

View File

@ -14,7 +14,7 @@ load helpers
[[ "${output}" == *"max refresh interval cannot be less than min refresh interval"* ]]
# engine refresh retry count
run swarm manage --engine-refresh-retry 0 --advertise 127.0.0.1:$SWARM_BASE_PORT 192.168.56.202:4444
run swarm manage --engine-failure-retry 0 --advertise 127.0.0.1:$SWARM_BASE_PORT 192.168.56.202:4444
[ "$status" -ne 0 ]
[[ "${output}" == *"invalid refresh retry count"* ]]
[[ "${output}" == *"invalid failure retry count"* ]]
}