mirror of https://github.com/docker/docs.git
Add failure action for rolling updates
This changes the default behavior so that rolling updates will not proceed once an updated task fails to start, or stops running during the update. Users can use docker service inspect --pretty servicename to see the update status, and if it pauses due to a failure, it will explain that the update is paused, and show the task ID that caused it to pause. It also shows the time since the update started. A new --update-failure-action=(pause|continue) flag selects the behavior. Pause means the update stops once a task fails, continue means the old behavior of continuing the update anyway. In the future this will be extended with additional behaviors like automatic rollback, and flags controlling parameters like how many tasks need to fail for the update to stop proceeding. This is a minimal solution for 1.12. Signed-off-by: Aaron Lehmann <aaron.lehmann@docker.com> (cherry picked from commit 57ae29aa74e77ade3c91b1c77ba766512dae9ab4) Signed-off-by: Tibor Vass <tibor@docker.com>
This commit is contained in:
parent
92cbdfece9
commit
a3639e6ac6
|
|
@ -4,6 +4,7 @@ import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"strings"
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
"golang.org/x/net/context"
|
"golang.org/x/net/context"
|
||||||
|
|
||||||
|
|
@ -101,6 +102,17 @@ func printService(out io.Writer, service swarm.Service) {
|
||||||
fmt.Fprintf(out, " Replicas:\t%d\n", *service.Spec.Mode.Replicated.Replicas)
|
fmt.Fprintf(out, " Replicas:\t%d\n", *service.Spec.Mode.Replicated.Replicas)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if service.UpdateStatus.State != "" {
|
||||||
|
fmt.Fprintln(out, "Update status:")
|
||||||
|
fmt.Fprintf(out, " State:\t\t%s\n", service.UpdateStatus.State)
|
||||||
|
fmt.Fprintf(out, " Started:\t%s ago\n", strings.ToLower(units.HumanDuration(time.Since(service.UpdateStatus.StartedAt))))
|
||||||
|
if service.UpdateStatus.State == swarm.UpdateStateCompleted {
|
||||||
|
fmt.Fprintf(out, " Completed:\t%s ago\n", strings.ToLower(units.HumanDuration(time.Since(service.UpdateStatus.CompletedAt))))
|
||||||
|
}
|
||||||
|
fmt.Fprintf(out, " Message:\t%s\n", service.UpdateStatus.Message)
|
||||||
|
}
|
||||||
|
|
||||||
fmt.Fprintln(out, "Placement:")
|
fmt.Fprintln(out, "Placement:")
|
||||||
if service.Spec.TaskTemplate.Placement != nil && len(service.Spec.TaskTemplate.Placement.Constraints) > 0 {
|
if service.Spec.TaskTemplate.Placement != nil && len(service.Spec.TaskTemplate.Placement.Constraints) > 0 {
|
||||||
ioutils.FprintfIfNotEmpty(out, " Constraints\t: %s\n", strings.Join(service.Spec.TaskTemplate.Placement.Constraints, ", "))
|
ioutils.FprintfIfNotEmpty(out, " Constraints\t: %s\n", strings.Join(service.Spec.TaskTemplate.Placement.Constraints, ", "))
|
||||||
|
|
@ -110,6 +122,7 @@ func printService(out io.Writer, service swarm.Service) {
|
||||||
if service.Spec.UpdateConfig.Delay.Nanoseconds() > 0 {
|
if service.Spec.UpdateConfig.Delay.Nanoseconds() > 0 {
|
||||||
fmt.Fprintf(out, " Delay:\t\t%s\n", service.Spec.UpdateConfig.Delay)
|
fmt.Fprintf(out, " Delay:\t\t%s\n", service.Spec.UpdateConfig.Delay)
|
||||||
}
|
}
|
||||||
|
fmt.Fprintf(out, " On failure:\t%s\n", service.Spec.UpdateConfig.FailureAction)
|
||||||
fmt.Fprintf(out, "ContainerSpec:\n")
|
fmt.Fprintf(out, "ContainerSpec:\n")
|
||||||
printContainerSpec(out, service.Spec.TaskTemplate.ContainerSpec)
|
printContainerSpec(out, service.Spec.TaskTemplate.ContainerSpec)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -274,6 +274,7 @@ func (m *MountOpt) Value() []swarm.Mount {
|
||||||
type updateOptions struct {
|
type updateOptions struct {
|
||||||
parallelism uint64
|
parallelism uint64
|
||||||
delay time.Duration
|
delay time.Duration
|
||||||
|
onFailure string
|
||||||
}
|
}
|
||||||
|
|
||||||
type resourceOptions struct {
|
type resourceOptions struct {
|
||||||
|
|
@ -457,6 +458,7 @@ func (opts *serviceOptions) ToService() (swarm.ServiceSpec, error) {
|
||||||
UpdateConfig: &swarm.UpdateConfig{
|
UpdateConfig: &swarm.UpdateConfig{
|
||||||
Parallelism: opts.update.parallelism,
|
Parallelism: opts.update.parallelism,
|
||||||
Delay: opts.update.delay,
|
Delay: opts.update.delay,
|
||||||
|
FailureAction: opts.update.onFailure,
|
||||||
},
|
},
|
||||||
Networks: convertNetworks(opts.networks),
|
Networks: convertNetworks(opts.networks),
|
||||||
EndpointSpec: opts.endpoint.ToEndpointSpec(),
|
EndpointSpec: opts.endpoint.ToEndpointSpec(),
|
||||||
|
|
@ -503,6 +505,7 @@ func addServiceFlags(cmd *cobra.Command, opts *serviceOptions) {
|
||||||
|
|
||||||
flags.Uint64Var(&opts.update.parallelism, flagUpdateParallelism, 1, "Maximum number of tasks updated simultaneously (0 to update all at once)")
|
flags.Uint64Var(&opts.update.parallelism, flagUpdateParallelism, 1, "Maximum number of tasks updated simultaneously (0 to update all at once)")
|
||||||
flags.DurationVar(&opts.update.delay, flagUpdateDelay, time.Duration(0), "Delay between updates")
|
flags.DurationVar(&opts.update.delay, flagUpdateDelay, time.Duration(0), "Delay between updates")
|
||||||
|
flags.StringVar(&opts.update.onFailure, flagUpdateFailureAction, "pause", "Action on update failure (pause|continue)")
|
||||||
|
|
||||||
flags.StringVar(&opts.endpoint.mode, flagEndpointMode, "", "Endpoint mode (vip or dnsrr)")
|
flags.StringVar(&opts.endpoint.mode, flagEndpointMode, "", "Endpoint mode (vip or dnsrr)")
|
||||||
|
|
||||||
|
|
@ -545,6 +548,7 @@ const (
|
||||||
flagRestartWindow = "restart-window"
|
flagRestartWindow = "restart-window"
|
||||||
flagStopGracePeriod = "stop-grace-period"
|
flagStopGracePeriod = "stop-grace-period"
|
||||||
flagUpdateDelay = "update-delay"
|
flagUpdateDelay = "update-delay"
|
||||||
|
flagUpdateFailureAction = "update-failure-action"
|
||||||
flagUpdateParallelism = "update-parallelism"
|
flagUpdateParallelism = "update-parallelism"
|
||||||
flagUser = "user"
|
flagUser = "user"
|
||||||
flagRegistryAuth = "with-registry-auth"
|
flagRegistryAuth = "with-registry-auth"
|
||||||
|
|
|
||||||
|
|
@ -191,12 +191,13 @@ func updateService(flags *pflag.FlagSet, spec *swarm.ServiceSpec) error {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
if anyChanged(flags, flagUpdateParallelism, flagUpdateDelay) {
|
if anyChanged(flags, flagUpdateParallelism, flagUpdateDelay, flagUpdateFailureAction) {
|
||||||
if spec.UpdateConfig == nil {
|
if spec.UpdateConfig == nil {
|
||||||
spec.UpdateConfig = &swarm.UpdateConfig{}
|
spec.UpdateConfig = &swarm.UpdateConfig{}
|
||||||
}
|
}
|
||||||
updateUint64(flagUpdateParallelism, &spec.UpdateConfig.Parallelism)
|
updateUint64(flagUpdateParallelism, &spec.UpdateConfig.Parallelism)
|
||||||
updateDuration(flagUpdateDelay, &spec.UpdateConfig.Delay)
|
updateDuration(flagUpdateDelay, &spec.UpdateConfig.Delay)
|
||||||
|
updateString(flagUpdateFailureAction, &spec.UpdateConfig.FailureAction)
|
||||||
}
|
}
|
||||||
|
|
||||||
updateNetworks(flags, &spec.Networks)
|
updateNetworks(flags, &spec.Networks)
|
||||||
|
|
|
||||||
|
|
@ -1726,6 +1726,7 @@ _docker_service_update() {
|
||||||
--restart-window
|
--restart-window
|
||||||
--stop-grace-period
|
--stop-grace-period
|
||||||
--update-delay
|
--update-delay
|
||||||
|
--update-failure-action
|
||||||
--update-parallelism
|
--update-parallelism
|
||||||
--user -u
|
--user -u
|
||||||
--workdir -w
|
--workdir -w
|
||||||
|
|
|
||||||
|
|
@ -1095,6 +1095,7 @@ __docker_service_subcommand() {
|
||||||
"($help)--restart-window=[Window used to evaluate the restart policy]:window: "
|
"($help)--restart-window=[Window used to evaluate the restart policy]:window: "
|
||||||
"($help)--stop-grace-period=[Time to wait before force killing a container]:grace period: "
|
"($help)--stop-grace-period=[Time to wait before force killing a container]:grace period: "
|
||||||
"($help)--update-delay=[Delay between updates]:delay: "
|
"($help)--update-delay=[Delay between updates]:delay: "
|
||||||
|
"($help)--update-failure-action=[Action on update failure]:mode:(pause continue)"
|
||||||
"($help)--update-parallelism=[Maximum number of tasks updated simultaneously]:number: "
|
"($help)--update-parallelism=[Maximum number of tasks updated simultaneously]:number: "
|
||||||
"($help -u --user)"{-u=,--user=}"[Username or UID]:user:_users"
|
"($help -u --user)"{-u=,--user=}"[Username or UID]:user:_users"
|
||||||
"($help)--with-registry-auth[Send registry authentication details to swarm agents]"
|
"($help)--with-registry-auth[Send registry authentication details to swarm agents]"
|
||||||
|
|
|
||||||
|
|
@ -53,6 +53,13 @@ func ServiceFromGRPC(s swarmapi.Service) types.Service {
|
||||||
}
|
}
|
||||||
|
|
||||||
service.Spec.UpdateConfig.Delay, _ = ptypes.Duration(&s.Spec.Update.Delay)
|
service.Spec.UpdateConfig.Delay, _ = ptypes.Duration(&s.Spec.Update.Delay)
|
||||||
|
|
||||||
|
switch s.Spec.Update.FailureAction {
|
||||||
|
case swarmapi.UpdateConfig_PAUSE:
|
||||||
|
service.Spec.UpdateConfig.FailureAction = types.UpdateFailureActionPause
|
||||||
|
case swarmapi.UpdateConfig_CONTINUE:
|
||||||
|
service.Spec.UpdateConfig.FailureAction = types.UpdateFailureActionContinue
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Mode
|
// Mode
|
||||||
|
|
@ -65,6 +72,23 @@ func ServiceFromGRPC(s swarmapi.Service) types.Service {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// UpdateStatus
|
||||||
|
service.UpdateStatus = types.UpdateStatus{}
|
||||||
|
if s.UpdateStatus != nil {
|
||||||
|
switch s.UpdateStatus.State {
|
||||||
|
case swarmapi.UpdateStatus_UPDATING:
|
||||||
|
service.UpdateStatus.State = types.UpdateStateUpdating
|
||||||
|
case swarmapi.UpdateStatus_PAUSED:
|
||||||
|
service.UpdateStatus.State = types.UpdateStatePaused
|
||||||
|
case swarmapi.UpdateStatus_COMPLETED:
|
||||||
|
service.UpdateStatus.State = types.UpdateStateCompleted
|
||||||
|
}
|
||||||
|
|
||||||
|
service.UpdateStatus.StartedAt, _ = ptypes.Timestamp(s.UpdateStatus.StartedAt)
|
||||||
|
service.UpdateStatus.CompletedAt, _ = ptypes.Timestamp(s.UpdateStatus.CompletedAt)
|
||||||
|
service.UpdateStatus.Message = s.UpdateStatus.Message
|
||||||
|
}
|
||||||
|
|
||||||
return service
|
return service
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -111,9 +135,19 @@ func ServiceSpecToGRPC(s types.ServiceSpec) (swarmapi.ServiceSpec, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if s.UpdateConfig != nil {
|
if s.UpdateConfig != nil {
|
||||||
|
var failureAction swarmapi.UpdateConfig_FailureAction
|
||||||
|
switch s.UpdateConfig.FailureAction {
|
||||||
|
case types.UpdateFailureActionPause, "":
|
||||||
|
failureAction = swarmapi.UpdateConfig_PAUSE
|
||||||
|
case types.UpdateFailureActionContinue:
|
||||||
|
failureAction = swarmapi.UpdateConfig_CONTINUE
|
||||||
|
default:
|
||||||
|
return swarmapi.ServiceSpec{}, fmt.Errorf("unrecongized update failure action %s", s.UpdateConfig.FailureAction)
|
||||||
|
}
|
||||||
spec.Update = &swarmapi.UpdateConfig{
|
spec.Update = &swarmapi.UpdateConfig{
|
||||||
Parallelism: s.UpdateConfig.Parallelism,
|
Parallelism: s.UpdateConfig.Parallelism,
|
||||||
Delay: *ptypes.DurationProto(s.UpdateConfig.Delay),
|
Delay: *ptypes.DurationProto(s.UpdateConfig.Delay),
|
||||||
|
FailureAction: failureAction,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3967,7 +3967,8 @@ Create a service
|
||||||
},
|
},
|
||||||
"UpdateConfig": {
|
"UpdateConfig": {
|
||||||
"Delay": 30000000000.0,
|
"Delay": 30000000000.0,
|
||||||
"Parallelism": 2
|
"Parallelism": 2,
|
||||||
|
"FailureAction": "pause"
|
||||||
},
|
},
|
||||||
"EndpointSpec": {
|
"EndpointSpec": {
|
||||||
"Ports": [
|
"Ports": [
|
||||||
|
|
@ -4057,6 +4058,8 @@ JSON Parameters:
|
||||||
- **Parallelism** – Maximum number of tasks to be updated in one iteration (0 means unlimited
|
- **Parallelism** – Maximum number of tasks to be updated in one iteration (0 means unlimited
|
||||||
parallelism).
|
parallelism).
|
||||||
- **Delay** – Amount of time between updates.
|
- **Delay** – Amount of time between updates.
|
||||||
|
- **FailureAction** – Action to take if an updated task fails to run, or stops running during the
|
||||||
|
update. Values are `continue` and `pause`.
|
||||||
- **Networks** – Array of network names or IDs to attach the service to.
|
- **Networks** – Array of network names or IDs to attach the service to.
|
||||||
- **Endpoint** – Properties that can be configured to access and load balance a service.
|
- **Endpoint** – Properties that can be configured to access and load balance a service.
|
||||||
- **Spec** –
|
- **Spec** –
|
||||||
|
|
|
||||||
|
|
@ -42,6 +42,7 @@ Options:
|
||||||
--restart-window value Window used to evaluate the restart policy (default none)
|
--restart-window value Window used to evaluate the restart policy (default none)
|
||||||
--stop-grace-period value Time to wait before force killing a container (default none)
|
--stop-grace-period value Time to wait before force killing a container (default none)
|
||||||
--update-delay duration Delay between updates
|
--update-delay duration Delay between updates
|
||||||
|
--update-failure-action string Action on update failure (pause|continue) (default "pause")
|
||||||
--update-parallelism uint Maximum number of tasks updated simultaneously (0 to update all at once) (default 1)
|
--update-parallelism uint Maximum number of tasks updated simultaneously (0 to update all at once) (default 1)
|
||||||
-u, --user string Username or UID
|
-u, --user string Username or UID
|
||||||
--with-registry-auth Send registry authentication details to Swarm agents
|
--with-registry-auth Send registry authentication details to Swarm agents
|
||||||
|
|
|
||||||
|
|
@ -48,6 +48,7 @@ Options:
|
||||||
--restart-window value Window used to evaluate the restart policy (default none)
|
--restart-window value Window used to evaluate the restart policy (default none)
|
||||||
--stop-grace-period value Time to wait before force killing a container (default none)
|
--stop-grace-period value Time to wait before force killing a container (default none)
|
||||||
--update-delay duration Delay between updates
|
--update-delay duration Delay between updates
|
||||||
|
--update-failure-action string Action on update failure (pause|continue) (default "pause")
|
||||||
--update-parallelism uint Maximum number of tasks updated simultaneously (0 to update all at once) (default 1)
|
--update-parallelism uint Maximum number of tasks updated simultaneously (0 to update all at once) (default 1)
|
||||||
-u, --user string Username or UID
|
-u, --user string Username or UID
|
||||||
--with-registry-auth Send registry authentication details to Swarm agents
|
--with-registry-auth Send registry authentication details to Swarm agents
|
||||||
|
|
|
||||||
|
|
@ -768,6 +768,7 @@ func serviceForUpdate(s *swarm.Service) {
|
||||||
UpdateConfig: &swarm.UpdateConfig{
|
UpdateConfig: &swarm.UpdateConfig{
|
||||||
Parallelism: 2,
|
Parallelism: 2,
|
||||||
Delay: 8 * time.Second,
|
Delay: 8 * time.Second,
|
||||||
|
FailureAction: swarm.UpdateFailureActionContinue,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
s.Spec.Name = "updatetest"
|
s.Spec.Name = "updatetest"
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue