Remove optionality and exit when specific error prefix is matched

commit 6efd68f428 (parent f9ea9b3ef8)
Author: Jack Andersen <jandersen@plaid.com>
Date:   2022-11-25 08:03:55 -08:00
Signed-off-by: Jack Andersen <jandersen@plaid.com>
4 changed files with 15 additions and 19 deletions

cmd/kops/rollingupdatecluster.go

@@ -107,10 +107,6 @@ type RollingUpdateOptions struct {
 	// does not validate, after a validation period.
 	FailOnValidate bool
-	// ExitOnFirstError exits the rolling update when a single instancegroup's
-	// rolling update experiences an error instead of retrying all instancegroups.
-	ExitOnFirstError bool
 	// DrainTimeout is the maximum time to wait while draining a node.
 	DrainTimeout time.Duration
@@ -155,7 +151,6 @@ func (o *RollingUpdateOptions) InitDefaults() {
 	o.CloudOnly = false
 	o.FailOnDrainError = false
 	o.FailOnValidate = true
-	o.ExitOnFirstError = false
 	o.ControlPlaneInterval = 15 * time.Second
 	o.NodeInterval = 15 * time.Second
@@ -213,7 +208,6 @@ func NewCmdRollingUpdateCluster(f *util.Factory, out io.Writer) *cobra.Command {
 	cmd.Flags().BoolVar(&options.FailOnDrainError, "fail-on-drain-error", true, "Fail if draining a node fails")
 	cmd.Flags().BoolVar(&options.FailOnValidate, "fail-on-validate-error", true, "Fail if the cluster fails to validate")
-	cmd.Flags().BoolVar(&options.ExitOnFirstError, "exit-on-first-error", false, "Exit on the first node or apiserver instancegroup error")
 	cmd.Flags().SetNormalizeFunc(func(f *pflag.FlagSet, name string) pflag.NormalizedName {
 		switch name {
@@ -368,7 +362,6 @@ func RunRollingUpdateCluster(ctx context.Context, f *util.Factory, out io.Writer
 		ValidationTimeout: options.ValidationTimeout,
 		ValidateCount:     int(options.ValidateCount),
 		DrainTimeout:      options.DrainTimeout,
-		ExitOnFirstError:  options.ExitOnFirstError,
 		// TODO should we expose this to the UI?
 		ValidateTickDuration:    30 * time.Second,
 		ValidateSuccessDuration: 10 * time.Second,

docs/cli/kops_rolling-update_cluster.md

@@ -58,7 +58,6 @@ kops rolling-update cluster [CLUSTER] [flags]
 ```
       --bastion-interval duration         Time to wait between restarting bastions (default 15s)
       --cloudonly                         Perform rolling update without confirming progress with Kubernetes
       --control-plane-interval duration   Time to wait between restarting control plane nodes (default 15s)
       --drain-timeout duration            Maximum time to wait for a node to drain (default 15m0s)
-      --exit-on-first-error               Exit on the first node or apiserver instancegroup error
       --fail-on-drain-error               Fail if draining a node fails (default true)

pkg/instancegroups/rollingupdate.go

@@ -20,6 +20,7 @@ import (
 	"context"
 	"fmt"
 	"sort"
+	"strings"
 	"sync"
 	"time"
@@ -82,11 +83,6 @@ type RollingUpdateCluster struct {
 	// DrainTimeout is the maximum amount of time to wait while draining a node.
 	DrainTimeout time.Duration
-	// ExitOnFirstError ensures the rolling update stops on the first error returned by any
-	// node or apiserver instancegroup. The default is `false` which will try to roll every instance
-	// group in serial and then return any errors.
-	ExitOnFirstError bool
 	// Options holds user-specified options
 	Options RollingUpdateOptions
 }
@@ -192,7 +188,7 @@ func (c *RollingUpdateCluster) RollingUpdate(groups map[string]*cloudinstances.C
 	for _, k := range sortGroups(apiServerGroups) {
 		err := c.rollingUpdateInstanceGroup(apiServerGroups[k], c.NodeInterval)
-		if err != nil && c.ExitOnFirstError {
+		if err != nil && exitableError(err) {
 			return err
 		}
@@ -214,7 +210,7 @@ func (c *RollingUpdateCluster) RollingUpdate(groups map[string]*cloudinstances.C
 	for _, k := range sortGroups(nodeGroups) {
 		err := c.rollingUpdateInstanceGroup(nodeGroups[k], c.NodeInterval)
-		if err != nil && c.ExitOnFirstError {
+		if err != nil && exitableError(err) {
 			return err
 		}
@@ -241,3 +237,13 @@ func sortGroups(groupMap map[string]*cloudinstances.CloudInstanceGroup) []string
 	sort.Strings(groups)
 	return groups
 }
+
+// exitableError inspects an error to determine if the error is
+// fatal enough that the rolling update cannot continue.
+//
+// For example, if a cluster is unable to be validated by the deadline, then it
+// is unlikely that it will validate on the next instance roll, so an early exit as a
+// warning to the user is more appropriate.
+func exitableError(err error) bool {
+	return strings.HasPrefix(err.Error(), "error validating cluster")
+}
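
The prefix match above is deliberately narrow: only validation failures abort the update, while other per-group errors keep the pre-existing roll-everything-then-report behavior. A minimal table-driven sketch of how one might exercise that boundary (a hypothetical test, not part of this commit; the error messages are invented, and it assumes placement alongside exitableError in the same package):

```go
package instancegroups

import (
	"errors"
	"fmt"
	"testing"
)

// Hypothetical coverage for exitableError: only errors whose message
// begins with "error validating cluster" should stop the rolling update.
func TestExitableError(t *testing.T) {
	cases := []struct {
		err  error
		want bool
	}{
		// Matches the prefix, so the rolling update should exit early.
		{fmt.Errorf("error validating cluster: masters not ready"), true},
		// Anything else keeps the old behavior: roll the remaining groups.
		{errors.New(`error draining node "node-1"`), false},
		{errors.New("unexpected EOF"), false},
	}
	for _, c := range cases {
		if got := exitableError(c.err); got != c.want {
			t.Errorf("exitableError(%v) = %v, want %v", c.err, got, c.want)
		}
	}
}
```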

pkg/instancegroups/rollingupdate_test.go

@@ -562,22 +562,20 @@ func TestRollingUpdateValidationErrorInstanceGroupNil(t *testing.T) {
 	assertGroupInstanceCount(t, cloud, "bastion-1", 1)
 }
 
-func TestRollingUpdateValidationErrorInstanceGroupExitFirstFailure(t *testing.T) {
+func TestRollingUpdateValidationErrorInstanceGroupExitableError(t *testing.T) {
 	c, cloud := getTestSetup()
 
 	groups := make(map[string]*cloudinstances.CloudInstanceGroup)
 	makeGroup(groups, c.K8sClient, cloud, "node-1", kopsapi.InstanceGroupRoleNode, 3, 3)
 	makeGroup(groups, c.K8sClient, cloud, "node-2", kopsapi.InstanceGroupRoleNode, 3, 3)
 	makeGroup(groups, c.K8sClient, cloud, "node-3", kopsapi.InstanceGroupRoleNode, 3, 3)
-	makeGroup(groups, c.K8sClient, cloud, "master-1", kopsapi.InstanceGroupRoleMaster, 2, 0)
+	makeGroup(groups, c.K8sClient, cloud, "master-1", kopsapi.InstanceGroupRoleControlPlane, 2, 0)
 	makeGroup(groups, c.K8sClient, cloud, "bastion-1", kopsapi.InstanceGroupRoleBastion, 1, 0)
 
 	c.ClusterValidator = &instanceGroupNodeSpecificErrorClusterValidator{
 		InstanceGroup: groups["node-2"].InstanceGroup,
 	}
-	c.ExitOnFirstError = true
 
 	err := c.RollingUpdate(groups, &kopsapi.InstanceGroupList{})
 	assert.Error(t, err, "rolling update")
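
For readers tracing why this test now fails fast without setting any option: the cluster-validation path (not shown in this diff) surfaces failures with messages beginning with "error validating cluster", which is exactly what exitableError keys on. A self-contained sketch of that interaction, under that assumption (the exact wording in kops may differ; the error text below is invented):

```go
package main

import (
	"errors"
	"fmt"
	"strings"
)

// exitableError, as added in this commit.
func exitableError(err error) bool {
	return strings.HasPrefix(err.Error(), "error validating cluster")
}

func main() {
	// Hypothetical wrapped failure from the validation loop.
	valErr := errors.New(`InstanceGroup "node-2" did not pass validation`)
	err := fmt.Errorf("error validating cluster after removing a node: %w", valErr)

	fmt.Println(exitableError(err))                           // true: RollingUpdate returns this error
	fmt.Println(exitableError(errors.New("drain timed out"))) // false: remaining groups still roll
}
```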