Merge pull request #14194 from jandersen-plaid/jandersen-plaid-exit-first-error

Exit rolling updates when encountering specific errors
This commit is contained in:
Kubernetes Prow Robot 2023-01-09 23:59:25 -08:00 committed by GitHub
commit f6a36bfc42
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 79 additions and 4 deletions

View File

@ -13,6 +13,12 @@ with "control-plane-". The names of groups for existing clusters are unchanged.
* The channels CLI that kOps uses to manage addons is now bundled with the kOps binary. These commands are useful for addon diagnostics and troubleshooting. For example, to list installed addons, run `kops toolbox addons get addons`.
* Since kOps 1.24, by default during rolling updates, kOps will time out after
spending 15 minutes on an InstanceGroup (instead of hanging indefinitely on
eviction errors), proceeding to the next InstanceGroup after timing out.
As of kOps 1.26, rolling updates will not proceed if a cluster validation
error is encountered while updating an InstanceGroup.
## AWS
* Bastions are now fronted by a Network Load Balancer.
@ -69,4 +75,4 @@ CNIs, use the "cni" networking option instead.
# Help Wanted
* kOps needs maintainers for Canal, Flannel, and Kube-Router, to keep versions up to date and move the integration from experimental to stable.
If no volunteers step up by the time kOps 1.27 is released, support will be phased out.
If no volunteers step up by the time kOps 1.27 is released, support will be phased out.

View File

@ -43,6 +43,30 @@ import (
const rollingUpdateTaintKey = "kops.k8s.io/scheduled-for-update"
// ValidationTimeoutError represents an error that occurs when
// the cluster fails to validate within the designated timeout.
//
// It wraps the underlying validation error, which stays reachable through
// Unwrap (and therefore through errors.Is / errors.As on the chain).
type ValidationTimeoutError struct {
	// operation is inserted verbatim right after "error validating cluster"
	// in the rendered message.
	operation string
	// err is the underlying error being wrapped; it may be nil on a
	// zero-value instance used purely as a matching target.
	err error
}

// Error renders the message as "error validating cluster<operation>: <err>".
func (v *ValidationTimeoutError) Error() string {
	// Format the wrapped error with %v instead of calling v.err.Error():
	// a zero-value ValidationTimeoutError (for example the one built as an
	// errors.Is target) has a nil err, and calling Error() on it would panic.
	return fmt.Sprintf("error validating cluster%s: %v", v.operation, v.err)
}

// Unwrap returns the wrapped error so the standard errors helpers can walk
// the chain.
func (v *ValidationTimeoutError) Unwrap() error {
	return v.err
}

// Is reports whether err (the comparison target handed in by errors.Is) is a
// *ValidationTimeoutError.
func (v *ValidationTimeoutError) Is(err error) bool {
	// Currently all validation timeout errors are equivalent: only the
	// dynamic type of the target is inspected, never its fields.
	// If you wish to differentiate, please update the instances of `errors.Is`
	// that check this error to take that into account.
	_, ok := err.(*ValidationTimeoutError)
	return ok
}
// promptInteractive asks the user to continue, mostly copied from vendor/google.golang.org/api/examples/gmail.go.
func promptInteractive(upgradedHostID, upgradedHostName string) (stopPrompting bool, err error) {
stopPrompting = false
@ -480,7 +504,10 @@ func (c *RollingUpdateCluster) maybeValidate(operation string, validateCount int
if c.FailOnValidate {
klog.Errorf("Cluster did not validate within %s", c.ValidationTimeout)
return fmt.Errorf("error validating cluster%s: %v", operation, err)
return &ValidationTimeoutError{
operation: operation,
err: err,
}
}
klog.Warningf("Cluster validation failed%s, proceeding since fail-on-validate is set to false: %v", operation, err)

View File

@ -18,6 +18,7 @@ package instancegroups
import (
"context"
stderrors "errors"
"fmt"
"sort"
"sync"
@ -191,7 +192,10 @@ func (c *RollingUpdateCluster) RollingUpdate(groups map[string]*cloudinstances.C
if err != nil {
klog.Errorf("failed to roll InstanceGroup %q: %v", k, err)
}
// TODO: Bail on error?
if isExitableError(err) {
return err
}
}
}
@ -213,7 +217,10 @@ func (c *RollingUpdateCluster) RollingUpdate(groups map[string]*cloudinstances.C
if err != nil {
klog.Errorf("failed to roll InstanceGroup %q: %v", k, err)
}
// TODO: Bail on error?
if isExitableError(err) {
return err
}
}
}
@ -236,3 +243,13 @@ func sortGroups(groupMap map[string]*cloudinstances.CloudInstanceGroup) []string
sort.Strings(groups)
return groups
}
// isExitableError reports whether err should stop the rolling update
// instead of letting it move on to the next InstanceGroup.
//
// The only error treated this way is a validation timeout: when a cluster
// could not be validated by the deadline it is unlikely to validate after the
// next instance roll either, so exiting early as a warning to the user is
// more appropriate than continuing.
func isExitableError(err error) bool {
	// An empty value is enough as the target: matching is by type only.
	timeoutTarget := &ValidationTimeoutError{}
	return stderrors.Is(err, timeoutTarget)
}

View File

@ -562,6 +562,31 @@ func TestRollingUpdateValidationErrorInstanceGroupNil(t *testing.T) {
assertGroupInstanceCount(t, cloud, "bastion-1", 1)
}
// TestRollingUpdateValidationErrorInstanceGroupExitableError runs a rolling
// update with a cluster validator configured with the "node-2" InstanceGroup
// and expects RollingUpdate to return an error. Afterwards only "node-1" is
// expected to be at an instance count of 0; every other group must still
// report the count it was created with.
func TestRollingUpdateValidationErrorInstanceGroupExitableError(t *testing.T) {
	c, cloud := getTestSetup()

	groups := make(map[string]*cloudinstances.CloudInstanceGroup)
	// Three node groups created with identical arguments, in name order.
	for _, nodeGroup := range []string{"node-1", "node-2", "node-3"} {
		makeGroup(groups, c.K8sClient, cloud, nodeGroup, kopsapi.InstanceGroupRoleNode, 3, 3)
	}
	makeGroup(groups, c.K8sClient, cloud, "master-1", kopsapi.InstanceGroupRoleControlPlane, 2, 0)
	makeGroup(groups, c.K8sClient, cloud, "bastion-1", kopsapi.InstanceGroupRoleBastion, 1, 0)

	// NOTE(review): presumably this validator fails validation for the given
	// InstanceGroup only — confirm against its definition.
	c.ClusterValidator = &instanceGroupNodeSpecificErrorClusterValidator{
		InstanceGroup: groups["node-2"].InstanceGroup,
	}

	err := c.RollingUpdate(groups, &kopsapi.InstanceGroupList{})
	assert.Error(t, err, "rolling update")

	// Expected instance count per group, checked in the original order.
	expectedCounts := []struct {
		group string
		count int
	}{
		{"node-1", 0},
		{"node-2", 3},
		{"node-3", 3},
		{"master-1", 2},
		{"bastion-1", 1},
	}
	for _, want := range expectedCounts {
		assertGroupInstanceCount(t, cloud, want.group, want.count)
	}
}
func TestRollingUpdateClusterFailsValidationAfterOneNode(t *testing.T) {
c, cloud := getTestSetup()