instancegroups: Ignore validation errors in unrelated instance groups

When unrelated instance groups produce validation errors, validation of the instance
group being updated fails and the rolling update is forced to wait before it can continue.

This can be avoided, since failures in other node instance groups usually don't affect
the instance group being updated in any way.
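
To illustrate the idea, here is a minimal, self-contained Go sketch of the relevance check described above. It uses simplified placeholder types (InstanceGroup, ValidationFailure) rather than the real kops types; the actual implementation is the hasFailureRelevantToGroup helper added in the diff below.

package main

import "fmt"

// Simplified stand-ins for the real kops types (validation.ValidationError
// and cloudinstances.CloudInstanceGroup); for illustration only.
type InstanceGroup struct {
	Name   string
	Master bool
}

type ValidationFailure struct {
	Message       string
	InstanceGroup *InstanceGroup // nil when the owning group cannot be determined
}

// relevantToGroup reports whether any failure should block the rolling update
// of the given group: failures whose group is unknown, failures in a master
// group (cluster-wide impact), and failures in the group being updated itself.
func relevantToGroup(failures []ValidationFailure, group *InstanceGroup) bool {
	for _, f := range failures {
		if f.InstanceGroup == nil || f.InstanceGroup.Master || f.InstanceGroup == group {
			return true
		}
	}
	return false
}

func main() {
	nodesA := &InstanceGroup{Name: "nodes-a"}
	nodesB := &InstanceGroup{Name: "nodes-b"}

	failures := []ValidationFailure{
		{Message: "InstanceGroup \"nodes-b\" did not have enough nodes", InstanceGroup: nodesB},
	}

	// A failure confined to nodes-b does not block the rolling update of
	// nodes-a, but it does block nodes-b itself.
	fmt.Println(relevantToGroup(failures, nodesA)) // false
	fmt.Println(relevantToGroup(failures, nodesB)) // true
}

As in the diff, failures that cannot be attributed to any group are treated conservatively as relevant.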
This commit is contained in:
Bharath Vedartham 2020-10-31 19:17:24 +05:30
parent f99c04fafa
commit 7067f5f47a
1 changed file with 32 additions and 10 deletions


@@ -36,6 +36,7 @@ import (
"k8s.io/klog/v2"
api "k8s.io/kops/pkg/apis/kops"
"k8s.io/kops/pkg/cloudinstances"
"k8s.io/kops/pkg/validation"
"k8s.io/kubectl/pkg/drain"
)
@@ -92,7 +93,7 @@ func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(group *cloudinstances.
if isBastion {
klog.V(3).Info("Not validating the cluster as instance is a bastion.")
} else if err = c.maybeValidate("", 1); err != nil {
} else if err = c.maybeValidate("", 1, group); err != nil {
return err
}
@@ -147,7 +148,7 @@ func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(group *cloudinstances.
klog.Infof("waiting for %v after detaching instance", sleepAfterTerminate)
time.Sleep(sleepAfterTerminate)
if err := c.maybeValidate(" after detaching instance", c.ValidateCount); err != nil {
if err := c.maybeValidate(" after detaching instance", c.ValidateCount, group); err != nil {
return err
}
noneReady = false
@@ -181,7 +182,7 @@ func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(group *cloudinstances.
return waitForPendingBeforeReturningError(runningDrains, terminateChan, err)
}
-err = c.maybeValidate(" after terminating instance", c.ValidateCount)
+err = c.maybeValidate(" after terminating instance", c.ValidateCount, group)
if err != nil {
return waitForPendingBeforeReturningError(runningDrains, terminateChan, err)
}
@@ -227,7 +228,7 @@ func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(group *cloudinstances.
}
}
-err = c.maybeValidate(" after terminating instance", c.ValidateCount)
+err = c.maybeValidate(" after terminating instance", c.ValidateCount, group)
if err != nil {
return err
}
@@ -410,14 +411,14 @@ func (c *RollingUpdateCluster) reconcileInstanceGroup() error {
}
-func (c *RollingUpdateCluster) maybeValidate(operation string, validateCount int) error {
+func (c *RollingUpdateCluster) maybeValidate(operation string, validateCount int, group *cloudinstances.CloudInstanceGroup) error {
if c.CloudOnly {
klog.Warningf("Not validating cluster as cloudonly flag is set.")
} else {
klog.Info("Validating the cluster.")
-if err := c.validateClusterWithTimeout(validateCount); err != nil {
+if err := c.validateClusterWithTimeout(validateCount, group); err != nil {
if c.FailOnValidate {
klog.Errorf("Cluster did not validate within %s", c.ValidationTimeout)
@@ -431,7 +432,7 @@ func (c *RollingUpdateCluster) maybeValidate(operation string, validateCount int
}
// validateClusterWithTimeout runs validation.ValidateCluster until either we get positive result or the timeout expires
-func (c *RollingUpdateCluster) validateClusterWithTimeout(validateCount int) error {
+func (c *RollingUpdateCluster) validateClusterWithTimeout(validateCount int, group *cloudinstances.CloudInstanceGroup) error {
ctx, cancel := context.WithTimeout(context.Background(), c.ValidationTimeout)
defer cancel()
@@ -445,7 +446,7 @@ func (c *RollingUpdateCluster) validateClusterWithTimeout(validateCount int) err
for {
// Note that we validate at least once before checking the timeout, in case the cluster is healthy with a short timeout
result, err := c.ClusterValidator.Validate()
-if err == nil && len(result.Failures) == 0 {
+if err == nil && !hasFailureRelevantToGroup(result.Failures, group) {
successCount++
if successCount >= validateCount {
klog.Info("Cluster validated.")
@@ -477,7 +478,7 @@ func (c *RollingUpdateCluster) validateClusterWithTimeout(validateCount int) err
// Reset the success count; we want N consecutive successful validations
successCount = 0
-// Wait before retrying
+// Wait before retrying in some cases
// TODO: Should we check if we have enough time left before the deadline?
time.Sleep(c.ValidateTickDuration)
}
@@ -485,6 +486,27 @@ func (c *RollingUpdateCluster) validateClusterWithTimeout(validateCount int) err
return fmt.Errorf("cluster did not validate within a duration of %q", c.ValidationTimeout)
}
+// checks if the validation failures returned after cluster validation are relevant to the current
+// instance group whose rolling update is occurring
+func hasFailureRelevantToGroup(failures []*validation.ValidationError, group *cloudinstances.CloudInstanceGroup) bool {
+// Ignore non critical validation errors in other instance groups like below target size errors
+for _, failure := range failures {
+// Determining InstanceGroups for certain resources like Pods, ComponentStatus is not straightforward.
+// Till we are able to determine the InstanceGroups for these resources without ambiguity, the
+// InstanceGroup field of the ValidationErrors for these resources will be nil
+if failure.InstanceGroup == nil {
+return true
+}
+// if there is a failure in the same instance group or a failure which has cluster wide impact
+if (failure.InstanceGroup.IsMaster()) || (failure.InstanceGroup == group.InstanceGroup) {
+return true
+}
+}
+return false
+}
// detachInstance detaches a Cloud Instance
func (c *RollingUpdateCluster) detachInstance(u *cloudinstances.CloudInstance) error {
id := u.ID
@@ -608,7 +630,7 @@ func (c *RollingUpdateCluster) UpdateSingleInstance(cloudMember *cloudinstances.
if err != nil {
return fmt.Errorf("failed to detach instance: %v", err)
}
if err := c.maybeValidate(" after detaching instance", c.ValidateCount); err != nil {
if err := c.maybeValidate(" after detaching instance", c.ValidateCount, cloudMember.CloudInstanceGroup); err != nil {
return err
}
}