From a3639e6ac6336492d0c3c843891f291639806a11 Mon Sep 17 00:00:00 2001 From: Aaron Lehmann Date: Fri, 22 Jul 2016 11:35:51 -0700 Subject: [PATCH] Add failure action for rolling updates This changes the default behavior so that rolling updates will not proceed once an updated task fails to start, or stops running during the update. Users can use docker service inspect --pretty servicename to see the update status, and if it pauses due to a failure, it will explain that the update is paused, and show the task ID that caused it to pause. It also shows the time since the update started. A new --update-failure-action=(pause|continue) flag selects the behavior. Pause means the update stops once a task fails, continue means the old behavior of continuing the update anyway. In the future this will be extended with additional behaviors like automatic rollback, and flags controlling parameters like how many tasks need to fail for the update to stop proceeding. This is a minimal solution for 1.12. Signed-off-by: Aaron Lehmann (cherry picked from commit 57ae29aa74e77ade3c91b1c77ba766512dae9ab4) Signed-off-by: Tibor Vass --- api/client/service/inspect.go | 13 +++ api/client/service/opts.go | 82 ++++++++++--------- api/client/service/update.go | 3 +- contrib/completion/bash/docker | 1 + contrib/completion/zsh/_docker | 1 + daemon/cluster/convert/service.go | 40 ++++++++- docs/reference/api/docker_remote_api_v1.24.md | 5 +- docs/reference/commandline/service_create.md | 55 +++++++------ docs/reference/commandline/service_update.md | 69 ++++++++-------- integration-cli/docker_api_swarm_test.go | 5 +- 10 files changed, 167 insertions(+), 107 deletions(-) diff --git a/api/client/service/inspect.go b/api/client/service/inspect.go index 80ca268e2c..31c300be8c 100644 --- a/api/client/service/inspect.go +++ b/api/client/service/inspect.go @@ -4,6 +4,7 @@ import ( "fmt" "io" "strings" + "time" "golang.org/x/net/context" @@ -101,6 +102,17 @@ func printService(out io.Writer, service 
swarm.Service) { fmt.Fprintf(out, " Replicas:\t%d\n", *service.Spec.Mode.Replicated.Replicas) } } + + if service.UpdateStatus.State != "" { + fmt.Fprintln(out, "Update status:") + fmt.Fprintf(out, " State:\t\t%s\n", service.UpdateStatus.State) + fmt.Fprintf(out, " Started:\t%s ago\n", strings.ToLower(units.HumanDuration(time.Since(service.UpdateStatus.StartedAt)))) + if service.UpdateStatus.State == swarm.UpdateStateCompleted { + fmt.Fprintf(out, " Completed:\t%s ago\n", strings.ToLower(units.HumanDuration(time.Since(service.UpdateStatus.CompletedAt)))) + } + fmt.Fprintf(out, " Message:\t%s\n", service.UpdateStatus.Message) + } + fmt.Fprintln(out, "Placement:") if service.Spec.TaskTemplate.Placement != nil && len(service.Spec.TaskTemplate.Placement.Constraints) > 0 { ioutils.FprintfIfNotEmpty(out, " Constraints\t: %s\n", strings.Join(service.Spec.TaskTemplate.Placement.Constraints, ", ")) @@ -110,6 +122,7 @@ func printService(out io.Writer, service swarm.Service) { if service.Spec.UpdateConfig.Delay.Nanoseconds() > 0 { fmt.Fprintf(out, " Delay:\t\t%s\n", service.Spec.UpdateConfig.Delay) } + fmt.Fprintf(out, " On failure:\t%s\n", service.Spec.UpdateConfig.FailureAction) fmt.Fprintf(out, "ContainerSpec:\n") printContainerSpec(out, service.Spec.TaskTemplate.ContainerSpec) diff --git a/api/client/service/opts.go b/api/client/service/opts.go index d832a748c2..6e20b9e83c 100644 --- a/api/client/service/opts.go +++ b/api/client/service/opts.go @@ -274,6 +274,7 @@ func (m *MountOpt) Value() []swarm.Mount { type updateOptions struct { parallelism uint64 delay time.Duration + onFailure string } type resourceOptions struct { @@ -455,8 +456,9 @@ func (opts *serviceOptions) ToService() (swarm.ServiceSpec, error) { }, Mode: swarm.ServiceMode{}, UpdateConfig: &swarm.UpdateConfig{ - Parallelism: opts.update.parallelism, - Delay: opts.update.delay, + Parallelism: opts.update.parallelism, + Delay: opts.update.delay, + FailureAction: opts.update.onFailure, }, Networks: 
convertNetworks(opts.networks), EndpointSpec: opts.endpoint.ToEndpointSpec(), @@ -503,6 +505,7 @@ func addServiceFlags(cmd *cobra.Command, opts *serviceOptions) { flags.Uint64Var(&opts.update.parallelism, flagUpdateParallelism, 1, "Maximum number of tasks updated simultaneously (0 to update all at once)") flags.DurationVar(&opts.update.delay, flagUpdateDelay, time.Duration(0), "Delay between updates") + flags.StringVar(&opts.update.onFailure, flagUpdateFailureAction, "pause", "Action on update failure (pause|continue)") flags.StringVar(&opts.endpoint.mode, flagEndpointMode, "", "Endpoint mode (vip or dnsrr)") @@ -513,41 +516,42 @@ func addServiceFlags(cmd *cobra.Command, opts *serviceOptions) { } const ( - flagConstraint = "constraint" - flagConstraintRemove = "constraint-rm" - flagConstraintAdd = "constraint-add" - flagEndpointMode = "endpoint-mode" - flagEnv = "env" - flagEnvRemove = "env-rm" - flagEnvAdd = "env-add" - flagLabel = "label" - flagLabelRemove = "label-rm" - flagLabelAdd = "label-add" - flagLimitCPU = "limit-cpu" - flagLimitMemory = "limit-memory" - flagMode = "mode" - flagMount = "mount" - flagMountRemove = "mount-rm" - flagMountAdd = "mount-add" - flagName = "name" - flagNetwork = "network" - flagNetworkRemove = "network-rm" - flagNetworkAdd = "network-add" - flagPublish = "publish" - flagPublishRemove = "publish-rm" - flagPublishAdd = "publish-add" - flagReplicas = "replicas" - flagReserveCPU = "reserve-cpu" - flagReserveMemory = "reserve-memory" - flagRestartCondition = "restart-condition" - flagRestartDelay = "restart-delay" - flagRestartMaxAttempts = "restart-max-attempts" - flagRestartWindow = "restart-window" - flagStopGracePeriod = "stop-grace-period" - flagUpdateDelay = "update-delay" - flagUpdateParallelism = "update-parallelism" - flagUser = "user" - flagRegistryAuth = "with-registry-auth" - flagLogDriver = "log-driver" - flagLogOpt = "log-opt" + flagConstraint = "constraint" + flagConstraintRemove = "constraint-rm" + flagConstraintAdd = 
"constraint-add" + flagEndpointMode = "endpoint-mode" + flagEnv = "env" + flagEnvRemove = "env-rm" + flagEnvAdd = "env-add" + flagLabel = "label" + flagLabelRemove = "label-rm" + flagLabelAdd = "label-add" + flagLimitCPU = "limit-cpu" + flagLimitMemory = "limit-memory" + flagMode = "mode" + flagMount = "mount" + flagMountRemove = "mount-rm" + flagMountAdd = "mount-add" + flagName = "name" + flagNetwork = "network" + flagNetworkRemove = "network-rm" + flagNetworkAdd = "network-add" + flagPublish = "publish" + flagPublishRemove = "publish-rm" + flagPublishAdd = "publish-add" + flagReplicas = "replicas" + flagReserveCPU = "reserve-cpu" + flagReserveMemory = "reserve-memory" + flagRestartCondition = "restart-condition" + flagRestartDelay = "restart-delay" + flagRestartMaxAttempts = "restart-max-attempts" + flagRestartWindow = "restart-window" + flagStopGracePeriod = "stop-grace-period" + flagUpdateDelay = "update-delay" + flagUpdateFailureAction = "update-failure-action" + flagUpdateParallelism = "update-parallelism" + flagUser = "user" + flagRegistryAuth = "with-registry-auth" + flagLogDriver = "log-driver" + flagLogOpt = "log-opt" ) diff --git a/api/client/service/update.go b/api/client/service/update.go index c8538fbd2f..53f22cd15f 100644 --- a/api/client/service/update.go +++ b/api/client/service/update.go @@ -191,12 +191,13 @@ func updateService(flags *pflag.FlagSet, spec *swarm.ServiceSpec) error { return err } - if anyChanged(flags, flagUpdateParallelism, flagUpdateDelay) { + if anyChanged(flags, flagUpdateParallelism, flagUpdateDelay, flagUpdateFailureAction) { if spec.UpdateConfig == nil { spec.UpdateConfig = &swarm.UpdateConfig{} } updateUint64(flagUpdateParallelism, &spec.UpdateConfig.Parallelism) updateDuration(flagUpdateDelay, &spec.UpdateConfig.Delay) + updateString(flagUpdateFailureAction, &spec.UpdateConfig.FailureAction) } updateNetworks(flags, &spec.Networks) diff --git a/contrib/completion/bash/docker b/contrib/completion/bash/docker index 
248a95eb3b..afa11ccfc2 100644 --- a/contrib/completion/bash/docker +++ b/contrib/completion/bash/docker @@ -1726,6 +1726,7 @@ _docker_service_update() { --restart-window --stop-grace-period --update-delay + --update-failure-action --update-parallelism --user -u --workdir -w diff --git a/contrib/completion/zsh/_docker b/contrib/completion/zsh/_docker index e98e70225b..455059169c 100644 --- a/contrib/completion/zsh/_docker +++ b/contrib/completion/zsh/_docker @@ -1095,6 +1095,7 @@ __docker_service_subcommand() { "($help)--restart-window=[Window used to evaluate the restart policy]:window: " "($help)--stop-grace-period=[Time to wait before force killing a container]:grace period: " "($help)--update-delay=[Delay between updates]:delay: " + "($help)--update-failure-action=[Action on update failure]:mode:(pause continue)" "($help)--update-parallelism=[Maximum number of tasks updated simultaneously]:number: " "($help -u --user)"{-u=,--user=}"[Username or UID]:user:_users" "($help)--with-registry-auth[Send registry authentication details to swarm agents]" diff --git a/daemon/cluster/convert/service.go b/daemon/cluster/convert/service.go index 2872b10ab8..75e7c3bcfa 100644 --- a/daemon/cluster/convert/service.go +++ b/daemon/cluster/convert/service.go @@ -53,9 +53,16 @@ func ServiceFromGRPC(s swarmapi.Service) types.Service { } service.Spec.UpdateConfig.Delay, _ = ptypes.Duration(&s.Spec.Update.Delay) + + switch s.Spec.Update.FailureAction { + case swarmapi.UpdateConfig_PAUSE: + service.Spec.UpdateConfig.FailureAction = types.UpdateFailureActionPause + case swarmapi.UpdateConfig_CONTINUE: + service.Spec.UpdateConfig.FailureAction = types.UpdateFailureActionContinue + } } - //Mode + // Mode switch t := s.Spec.GetMode().(type) { case *swarmapi.ServiceSpec_Global: service.Spec.Mode.Global = &types.GlobalService{} @@ -65,6 +72,23 @@ func ServiceFromGRPC(s swarmapi.Service) types.Service { } } + // UpdateStatus + service.UpdateStatus = types.UpdateStatus{} + if s.UpdateStatus != 
nil { + switch s.UpdateStatus.State { + case swarmapi.UpdateStatus_UPDATING: + service.UpdateStatus.State = types.UpdateStateUpdating + case swarmapi.UpdateStatus_PAUSED: + service.UpdateStatus.State = types.UpdateStatePaused + case swarmapi.UpdateStatus_COMPLETED: + service.UpdateStatus.State = types.UpdateStateCompleted + } + + service.UpdateStatus.StartedAt, _ = ptypes.Timestamp(s.UpdateStatus.StartedAt) + service.UpdateStatus.CompletedAt, _ = ptypes.Timestamp(s.UpdateStatus.CompletedAt) + service.UpdateStatus.Message = s.UpdateStatus.Message + } + return service } @@ -111,9 +135,19 @@ func ServiceSpecToGRPC(s types.ServiceSpec) (swarmapi.ServiceSpec, error) { } if s.UpdateConfig != nil { + var failureAction swarmapi.UpdateConfig_FailureAction + switch s.UpdateConfig.FailureAction { + case types.UpdateFailureActionPause, "": + failureAction = swarmapi.UpdateConfig_PAUSE + case types.UpdateFailureActionContinue: + failureAction = swarmapi.UpdateConfig_CONTINUE + default: + return swarmapi.ServiceSpec{}, fmt.Errorf("unrecongized update failure action %s", s.UpdateConfig.FailureAction) + } spec.Update = &swarmapi.UpdateConfig{ - Parallelism: s.UpdateConfig.Parallelism, - Delay: *ptypes.DurationProto(s.UpdateConfig.Delay), + Parallelism: s.UpdateConfig.Parallelism, + Delay: *ptypes.DurationProto(s.UpdateConfig.Delay), + FailureAction: failureAction, } } diff --git a/docs/reference/api/docker_remote_api_v1.24.md b/docs/reference/api/docker_remote_api_v1.24.md index b9c47b4419..6df83f3fe7 100644 --- a/docs/reference/api/docker_remote_api_v1.24.md +++ b/docs/reference/api/docker_remote_api_v1.24.md @@ -3967,7 +3967,8 @@ Create a service }, "UpdateConfig": { "Delay": 30000000000.0, - "Parallelism": 2 + "Parallelism": 2, + "FailureAction": "pause" }, "EndpointSpec": { "Ports": [ @@ -4057,6 +4058,8 @@ JSON Parameters: - **Parallelism** – Maximum number of tasks to be updated in one iteration (0 means unlimited parallelism). - **Delay** – Amount of time between updates. 
+ - **FailureAction** - Action to take if an updated task fails to run, or stops running during the + update. Values are `continue` and `pause`. - **Networks** – Array of network names or IDs to attach the service to. - **Endpoint** – Properties that can be configured to access and load balance a service. - **Spec** – diff --git a/docs/reference/commandline/service_create.md b/docs/reference/commandline/service_create.md index 3f3cffa313..da6060a96d 100644 --- a/docs/reference/commandline/service_create.md +++ b/docs/reference/commandline/service_create.md @@ -19,33 +19,34 @@ Usage: docker service create [OPTIONS] IMAGE [COMMAND] [ARG...] Create a new service Options: - --constraint value Placement constraints (default []) - --endpoint-mode string Endpoint mode (vip or dnsrr) - -e, --env value Set environment variables (default []) - --help Print usage - -l, --label value Service labels (default []) - --limit-cpu value Limit CPUs (default 0.000) - --limit-memory value Limit Memory (default 0 B) - --log-driver string Logging driver for service - --log-opt value Logging driver options (default []) - --mode string Service mode (replicated or global) (default "replicated") - --mount value Attach a mount to the service - --name string Service name - --network value Network attachments (default []) - -p, --publish value Publish a port as a node port (default []) - --replicas value Number of tasks (default none) - --reserve-cpu value Reserve CPUs (default 0.000) - --reserve-memory value Reserve Memory (default 0 B) - --restart-condition string Restart when condition is met (none, on-failure, or any) - --restart-delay value Delay between restart attempts (default none) - --restart-max-attempts value Maximum number of restarts before giving up (default none) - --restart-window value Window used to evaluate the restart policy (default none) - --stop-grace-period value Time to wait before force killing a container (default none) - --update-delay duration Delay between updates 
- --update-parallelism uint Maximum number of tasks updated simultaneously (0 to update all at once) (default 1) - -u, --user string Username or UID - --with-registry-auth Send registry authentication details to Swarm agents - -w, --workdir string Working directory inside the container + --constraint value Placement constraints (default []) + --endpoint-mode string Endpoint mode (vip or dnsrr) + -e, --env value Set environment variables (default []) + --help Print usage + -l, --label value Service labels (default []) + --limit-cpu value Limit CPUs (default 0.000) + --limit-memory value Limit Memory (default 0 B) + --log-driver string Logging driver for service + --log-opt value Logging driver options (default []) + --mode string Service mode (replicated or global) (default "replicated") + --mount value Attach a mount to the service + --name string Service name + --network value Network attachments (default []) + -p, --publish value Publish a port as a node port (default []) + --replicas value Number of tasks (default none) + --reserve-cpu value Reserve CPUs (default 0.000) + --reserve-memory value Reserve Memory (default 0 B) + --restart-condition string Restart when condition is met (none, on-failure, or any) + --restart-delay value Delay between restart attempts (default none) + --restart-max-attempts value Maximum number of restarts before giving up (default none) + --restart-window value Window used to evaluate the restart policy (default none) + --stop-grace-period value Time to wait before force killing a container (default none) + --update-delay duration Delay between updates + --update-failure-action string Action on update failure (pause|continue) (default "pause") + --update-parallelism uint Maximum number of tasks updated simultaneously (0 to update all at once) (default 1) + -u, --user string Username or UID + --with-registry-auth Send registry authentication details to Swarm agents + -w, --workdir string Working directory inside the container ``` 
Creates a service as described by the specified parameters. This command has to diff --git a/docs/reference/commandline/service_update.md b/docs/reference/commandline/service_update.md index 17e834fc7d..6bb4c7a2d1 100644 --- a/docs/reference/commandline/service_update.md +++ b/docs/reference/commandline/service_update.md @@ -18,40 +18,41 @@ Usage: docker service update [OPTIONS] SERVICE Update a service Options: - --args string Service command args - --constraint-add value Add or update placement constraints (default []) - --constraint-rm value Remove a constraint (default []) - --endpoint-mode string Endpoint mode (vip or dnsrr) - --env-add value Add or update environment variables (default []) - --env-rm value Remove an environment variable (default []) - --help Print usage - --image string Service image tag - --label-add value Add or update service labels (default []) - --label-rm value Remove a label by its key (default []) - --limit-cpu value Limit CPUs (default 0.000) - --limit-memory value Limit Memory (default 0 B) - --log-driver string Logging driver for service - --log-opt value Logging driver options (default []) - --mount-add value Add or update a mount on a service - --mount-rm value Remove a mount by its target path (default []) - --name string Service name - --network-add value Add or update network attachments (default []) - --network-rm value Remove a network by name (default []) - --publish-add value Add or update a published port (default []) - --publish-rm value Remove a published port by its target port (default []) - --replicas value Number of tasks (default none) - --reserve-cpu value Reserve CPUs (default 0.000) - --reserve-memory value Reserve Memory (default 0 B) - --restart-condition string Restart when condition is met (none, on-failure, or any) - --restart-delay value Delay between restart attempts (default none) - --restart-max-attempts value Maximum number of restarts before giving up (default none) - --restart-window value Window 
used to evaluate the restart policy (default none) - --stop-grace-period value Time to wait before force killing a container (default none) - --update-delay duration Delay between updates - --update-parallelism uint Maximum number of tasks updated simultaneously (0 to update all at once) (default 1) - -u, --user string Username or UID - --with-registry-auth Send registry authentication details to Swarm agents - -w, --workdir string Working directory inside the container + --args string Service command args + --constraint-add value Add or update placement constraints (default []) + --constraint-rm value Remove a constraint (default []) + --endpoint-mode string Endpoint mode (vip or dnsrr) + --env-add value Add or update environment variables (default []) + --env-rm value Remove an environment variable (default []) + --help Print usage + --image string Service image tag + --label-add value Add or update service labels (default []) + --label-rm value Remove a label by its key (default []) + --limit-cpu value Limit CPUs (default 0.000) + --limit-memory value Limit Memory (default 0 B) + --log-driver string Logging driver for service + --log-opt value Logging driver options (default []) + --mount-add value Add or update a mount on a service + --mount-rm value Remove a mount by its target path (default []) + --name string Service name + --network-add value Add or update network attachments (default []) + --network-rm value Remove a network by name (default []) + --publish-add value Add or update a published port (default []) + --publish-rm value Remove a published port by its target port (default []) + --replicas value Number of tasks (default none) + --reserve-cpu value Reserve CPUs (default 0.000) + --reserve-memory value Reserve Memory (default 0 B) + --restart-condition string Restart when condition is met (none, on-failure, or any) + --restart-delay value Delay between restart attempts (default none) + --restart-max-attempts value Maximum number of restarts before 
giving up (default none) + --restart-window value Window used to evaluate the restart policy (default none) + --stop-grace-period value Time to wait before force killing a container (default none) + --update-delay duration Delay between updates + --update-failure-action string Action on update failure (pause|continue) (default "pause") + --update-parallelism uint Maximum number of tasks updated simultaneously (0 to update all at once) (default 1) + -u, --user string Username or UID + --with-registry-auth Send registry authentication details to Swarm agents + -w, --workdir string Working directory inside the container ``` Updates a service as described by the specified parameters. This command has to be run targeting a manager node. diff --git a/integration-cli/docker_api_swarm_test.go b/integration-cli/docker_api_swarm_test.go index e08d4da88f..57ef0b679e 100644 --- a/integration-cli/docker_api_swarm_test.go +++ b/integration-cli/docker_api_swarm_test.go @@ -766,8 +766,9 @@ func serviceForUpdate(s *swarm.Service) { }, }, UpdateConfig: &swarm.UpdateConfig{ - Parallelism: 2, - Delay: 8 * time.Second, + Parallelism: 2, + Delay: 8 * time.Second, + FailureAction: swarm.UpdateFailureActionContinue, }, } s.Spec.Name = "updatetest"