diff --git a/cmd/kops/rolling-update_cluster.go b/cmd/kops/rolling-update_cluster.go
index 8081fc02a9..6e601e15d3 100644
--- a/cmd/kops/rolling-update_cluster.go
+++ b/cmd/kops/rolling-update_cluster.go
@@ -107,10 +107,6 @@ type RollingUpdateOptions struct {
 	// does not validate, after a validation period.
 	FailOnValidate bool
 
-	// ExitOnFirstError exits the rolling update when a single instancegroup's
-	// rolling update experiences an error instead of retrying all instancegroups.
-	ExitOnFirstError bool
-
 	// DrainTimeout is the maximum time to wait while draining a node.
 	DrainTimeout time.Duration
 
@@ -155,7 +151,6 @@ func (o *RollingUpdateOptions) InitDefaults() {
 	o.CloudOnly = false
 	o.FailOnDrainError = false
 	o.FailOnValidate = true
-	o.ExitOnFirstError = false
 
 	o.ControlPlaneInterval = 15 * time.Second
 	o.NodeInterval = 15 * time.Second
@@ -213,7 +208,6 @@ func NewCmdRollingUpdateCluster(f *util.Factory, out io.Writer) *cobra.Command {
 	cmd.Flags().BoolVar(&options.FailOnDrainError, "fail-on-drain-error", true, "Fail if draining a node fails")
 	cmd.Flags().BoolVar(&options.FailOnValidate, "fail-on-validate-error", true, "Fail if the cluster fails to validate")
-	cmd.Flags().BoolVar(&options.ExitOnFirstError, "exit-on-first-error", false, "Exit on the first node or apiserver instancegroup error")
 
 	cmd.Flags().SetNormalizeFunc(func(f *pflag.FlagSet, name string) pflag.NormalizedName {
 		switch name {
@@ -368,7 +362,6 @@ func RunRollingUpdateCluster(ctx context.Context, f *util.Factory, out io.Writer
 		ValidationTimeout:       options.ValidationTimeout,
 		ValidateCount:           int(options.ValidateCount),
 		DrainTimeout:            options.DrainTimeout,
-		ExitOnFirstError:        options.ExitOnFirstError,
 		// TODO should we expose this to the UI?
 		ValidateTickDuration:    30 * time.Second,
 		ValidateSuccessDuration: 10 * time.Second,
diff --git a/docs/cli/kops_rolling-update_cluster.md b/docs/cli/kops_rolling-update_cluster.md
index c0089a69d2..93d8e849e2 100644
--- a/docs/cli/kops_rolling-update_cluster.md
+++ b/docs/cli/kops_rolling-update_cluster.md
@@ -58,7 +58,6 @@ kops rolling-update cluster [CLUSTER] [flags]
 ```
       --bastion-interval duration         Time to wait between restarting bastions (default 15s)
       --cloudonly                         Perform rolling update without confirming progress with Kubernetes
       --control-plane-interval duration   Time to wait between restarting control plane nodes (default 15s)
       --drain-timeout duration            Maximum time to wait for a node to drain (default 15m0s)
-      --exit-on-first-error               Exit on the first node or apiserver instancegroup error
       --fail-on-drain-error               Fail if draining a node fails (default true)
diff --git a/pkg/instancegroups/rollingupdate.go b/pkg/instancegroups/rollingupdate.go
index 760a6a45fd..0c20cdcdf7 100644
--- a/pkg/instancegroups/rollingupdate.go
+++ b/pkg/instancegroups/rollingupdate.go
@@ -20,6 +20,7 @@ import (
 	"context"
 	"fmt"
 	"sort"
+	"strings"
 	"sync"
 	"time"
 
@@ -82,11 +83,6 @@ type RollingUpdateCluster struct {
 	// DrainTimeout is the maximum amount of time to wait while draining a node.
 	DrainTimeout time.Duration
 
-	// ExitOnFirstError ensures the rolling update stops on the first error returned by any
-	// node or apiserver instancegroup. The default is `false` which will try to roll every instance
-	// group in serial and then return any errors.
-	ExitOnFirstError bool
-
 	// Options holds user-specified options
 	Options RollingUpdateOptions
 }
@@ -192,7 +188,7 @@ func (c *RollingUpdateCluster) RollingUpdate(groups map[string]*cloudinstances.C
 	for _, k := range sortGroups(apiServerGroups) {
 		err := c.rollingUpdateInstanceGroup(apiServerGroups[k], c.NodeInterval)
 
-		if err != nil && c.ExitOnFirstError {
+		if err != nil && exitableError(err) {
 			return err
 		}
 
@@ -214,7 +210,7 @@ func (c *RollingUpdateCluster) RollingUpdate(groups map[string]*cloudinstances.C
 	for _, k := range sortGroups(nodeGroups) {
 		err := c.rollingUpdateInstanceGroup(nodeGroups[k], c.NodeInterval)
 
-		if err != nil && c.ExitOnFirstError {
+		if err != nil && exitableError(err) {
 			return err
 		}
 
@@ -241,3 +237,13 @@ func sortGroups(groupMap map[string]*cloudinstances.CloudInstanceGroup) []string
 	sort.Strings(groups)
 	return groups
 }
+
+// exitableError inspects an error to determine if the error is
+// fatal enough that the rolling update cannot continue.
+//
+// For example, if a cluster is unable to be validated by the deadline, then it
+// is unlikely that it will validate on the next instance roll, so an early exit as a
+// warning to the user is more appropriate.
+func exitableError(err error) bool {
+	return strings.HasPrefix(err.Error(), "error validating cluster")
+}
diff --git a/pkg/instancegroups/rollingupdate_test.go b/pkg/instancegroups/rollingupdate_test.go
index dcf904e31e..767aca626b 100644
--- a/pkg/instancegroups/rollingupdate_test.go
+++ b/pkg/instancegroups/rollingupdate_test.go
@@ -562,22 +562,20 @@ func TestRollingUpdateValidationErrorInstanceGroupNil(t *testing.T) {
 	assertGroupInstanceCount(t, cloud, "bastion-1", 1)
 }
 
-func TestRollingUpdateValidationErrorInstanceGroupExitFirstFailure(t *testing.T) {
+func TestRollingUpdateValidationErrorInstanceGroupExitableError(t *testing.T) {
 	c, cloud := getTestSetup()
 
 	groups := make(map[string]*cloudinstances.CloudInstanceGroup)
 	makeGroup(groups, c.K8sClient, cloud, "node-1", kopsapi.InstanceGroupRoleNode, 3, 3)
 	makeGroup(groups, c.K8sClient, cloud, "node-2", kopsapi.InstanceGroupRoleNode, 3, 3)
 	makeGroup(groups, c.K8sClient, cloud, "node-3", kopsapi.InstanceGroupRoleNode, 3, 3)
 	makeGroup(groups, c.K8sClient, cloud, "master-1", kopsapi.InstanceGroupRoleControlPlane, 2, 0)
 	makeGroup(groups, c.K8sClient, cloud, "bastion-1", kopsapi.InstanceGroupRoleBastion, 1, 0)
 
 	c.ClusterValidator = &instanceGroupNodeSpecificErrorClusterValidator{
 		InstanceGroup: groups["node-2"].InstanceGroup,
 	}
-	c.ExitOnFirstError = true
-
 	err := c.RollingUpdate(groups, &kopsapi.InstanceGroupList{})
 	assert.Error(t, err, "rolling update")
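
Reviewer note, not part of the patch: with `--exit-on-first-error` gone, the early-exit decision now rests entirely on `exitableError`, which matches the error message prefix produced by cluster validation failures; any other per-group error is still reported but does not stop the remaining instance groups. Below is a minimal, self-contained sketch of that prefix check. Only the `exitableError` body and the `"error validating cluster"` prefix come from the patch; the two sample error messages are invented for the demo.

```go
package main

import (
	"errors"
	"fmt"
	"strings"
)

// exitableError mirrors the helper added in pkg/instancegroups/rollingupdate.go:
// an error aborts the rolling update only if its message starts with the
// prefix emitted when cluster validation fails.
func exitableError(err error) bool {
	return strings.HasPrefix(err.Error(), "error validating cluster")
}

func main() {
	// Hypothetical validation failure: wrapping keeps the prefix at the front
	// of the message, so the rolling update would return immediately.
	validateErr := fmt.Errorf("error validating cluster: %w",
		errors.New("did not validate within the deadline"))

	// Hypothetical drain failure: no matching prefix, so the update would
	// continue with the remaining instance groups and surface the error later.
	drainErr := errors.New("failed to drain node: pod disruption budget exceeded")

	fmt.Println(exitableError(validateErr)) // true
	fmt.Println(exitableError(drainErr))    // false
}
```

One consequence of the string-based check worth keeping in mind: if a caller re-wraps a validation error behind its own message (e.g. `fmt.Errorf("rolling update failed: %w", err)`), the prefix no longer leads the string and the early exit is defeated, so validation errors need to be returned unmodified for this detection to work.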