Rolling-update validation harmonization

This is a follow-on to #8868; I believe the intent of that was to
expose the option to do more (or fewer) retries.

We previously had a single retry to prevent flapping; this change
unifies that behaviour with the new option by making the retry count
configurable.

* validate-count=0 effectively turns off validation.

* validate-count=1 will do a single validation, without flapping
  detection.

* validate-count=N (for N >= 2) will require N successful validations
  in a row, waiting ValidateSuccessDuration in between.

A nice side effect of this is that the tests now explicitly specify
ValidateCount=1 instead of setting ValidateSuccessDuration=0, which
previously had the same effect as ValidateCount=1.
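
For illustration, the retry semantics above can be sketched as a small standalone loop. This is only a sketch, not the kops implementation (the real logic is the rewritten tryValidateCluster in this diff); the names waitForConsecutiveValidations, clusterValidator, and flappingValidator are invented for the example:

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

// clusterValidator is a stand-in for the real cluster validator.
type clusterValidator interface {
	// Validate returns nil when the cluster currently looks healthy.
	Validate() error
}

// waitForConsecutiveValidations mirrors the semantics described above:
// validateCount=0 skips validation, validateCount>=1 requires that many
// successful validations in a row, sleeping successDuration between
// successes and tickDuration after a failure, until timeout expires.
func waitForConsecutiveValidations(v clusterValidator, validateCount int, successDuration, tickDuration, timeout time.Duration) error {
	if validateCount == 0 {
		// validate-count=0 effectively turns off validation.
		return nil
	}

	deadline := time.Now().Add(timeout)
	successes := 0
	for {
		// Validate at least once before checking the deadline, so a healthy
		// cluster still passes with a very short timeout.
		err := v.Validate()
		if err == nil {
			successes++
			if successes >= validateCount {
				return nil
			}
			// Revalidate after a pause to make sure the result does not flap.
			time.Sleep(successDuration)
			continue
		}

		// Any failure resets the consecutive-success counter.
		successes = 0
		if time.Now().After(deadline) {
			return fmt.Errorf("cluster did not validate within %v: %w", timeout, err)
		}
		time.Sleep(tickDuration)
	}
}

// flappingValidator fails on its second call to simulate a flapping cluster.
type flappingValidator struct{ calls int }

func (f *flappingValidator) Validate() error {
	f.calls++
	if f.calls == 2 {
		return errors.New("node not ready")
	}
	return nil
}

func main() {
	v := &flappingValidator{}
	// Require 3 consecutive successes; the flap on the second call resets the counter.
	err := waitForConsecutiveValidations(v, 3, 10*time.Millisecond, 10*time.Millisecond, time.Second)
	fmt.Println("validation result:", err)
}
```

In the real code the two pauses are ValidateSuccessDuration (between consecutive successes) and ValidateTickDuration (after a failed validation), as wired up in the first hunk below.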
Justin Santa Barbara 2020-04-17 01:36:12 -04:00
parent ea3e974d81
commit ffb6cd61aa
4 changed files with 83 additions and 69 deletions

View File

@@ -341,10 +341,10 @@ func RunRollingUpdateCluster(ctx context.Context, f *util.Factory, out io.Writer
ClusterName: options.ClusterName,
PostDrainDelay: options.PostDrainDelay,
ValidationTimeout: options.ValidationTimeout,
ValidateCount: options.ValidateCount,
ValidateSucceeded: 0,
ValidateCount: int(options.ValidateCount),
// TODO should we expose this to the UI?
ValidateTickDuration: 30 * time.Second,
ValidateSuccessDuration: 10 * time.Second,
}
err = d.AdjustNeedUpdate(groups, cluster, list)

View File

@@ -69,7 +69,7 @@ func promptInteractive(upgradedHostId, upgradedHostName string) (stopPrompting b
}
// RollingUpdate performs a rolling update on a list of instances.
func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(ctx context.Context, cluster *api.Cluster, group *cloudinstances.CloudInstanceGroup, isBastion bool, sleepAfterTerminate time.Duration, validationTimeout time.Duration) (err error) {
func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(ctx context.Context, cluster *api.Cluster, group *cloudinstances.CloudInstanceGroup, isBastion bool, sleepAfterTerminate time.Duration) (err error) {
// Do not need a k8s client if you are doing cloudonly.
if c.K8sClient == nil && !c.CloudOnly {
return fmt.Errorf("rollingUpdate is missing a k8s client")
@@ -156,7 +156,7 @@ func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(ctx context.Context, c
klog.Infof("waiting for %v after detaching instance", sleepAfterTerminate)
time.Sleep(sleepAfterTerminate)
if err := c.maybeValidate(validationTimeout, "detaching"); err != nil {
if err := c.maybeValidate(c.ValidationTimeout, "detaching"); err != nil {
return err
}
noneReady = false
@@ -185,7 +185,7 @@ func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(ctx context.Context, c
return waitForPendingBeforeReturningError(runningDrains, terminateChan, err)
}
err = c.maybeValidate(validationTimeout, "removing")
err = c.maybeValidate(c.ValidationTimeout, "removing")
if err != nil {
return waitForPendingBeforeReturningError(runningDrains, terminateChan, err)
}
@@ -231,7 +231,7 @@ func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(ctx context.Context, c
}
}
err = c.maybeValidate(validationTimeout, "removing")
err = c.maybeValidate(c.ValidationTimeout, "removing")
if err != nil {
return err
}
@@ -401,57 +401,64 @@ func (c *RollingUpdateCluster) maybeValidate(validationTimeout time.Duration, op
}
// validateClusterWithDuration runs validation.ValidateCluster until either we get positive result or the timeout expires
func (c *RollingUpdateCluster) validateClusterWithDuration(duration time.Duration) error {
// Try to validate cluster at least once, this will handle durations that are lower
// than our tick time
if c.tryValidateCluster(duration) {
func (c *RollingUpdateCluster) validateClusterWithDuration(validationTimeout time.Duration) error {
ctx, cancel := context.WithTimeout(context.Background(), validationTimeout)
defer cancel()
if c.tryValidateCluster(ctx) {
return nil
}
timeout := time.After(duration)
ticker := time.NewTicker(c.ValidateTickDuration)
defer ticker.Stop()
// Keep trying until we're timed out or got a result or got an error
for {
select {
case <-timeout:
// Got a timeout fail with a timeout error
return fmt.Errorf("cluster did not validate within a duration of %q", duration)
case <-ticker.C:
// Got a tick, validate cluster
if c.tryValidateCluster(duration) {
return nil
}
// ValidateCluster didn't work yet, so let's try again
// this will exit up to the for loop
}
}
return fmt.Errorf("cluster did not validate within a duration of %q", validationTimeout)
}
func (c *RollingUpdateCluster) tryValidateCluster(duration time.Duration) bool {
result, err := c.ClusterValidator.Validate()
func (c *RollingUpdateCluster) tryValidateCluster(ctx context.Context) bool {
if c.ValidateCount == 0 {
klog.Warningf("skipping cluster validation because validate-count was 0")
return true
}
if err == nil && len(result.Failures) == 0 && c.ValidateCount > 0 {
c.ValidateSucceeded++
if c.ValidateSucceeded < c.ValidateCount {
klog.Infof("Cluster validated; revalidating %d time(s) to make sure it does not flap.", c.ValidateCount-c.ValidateSucceeded)
return false
successCount := 0
for {
// Note that we validate at least once before checking the timeout, in case the cluster is healthy with a short timeout
result, err := c.ClusterValidator.Validate()
if err == nil && len(result.Failures) == 0 {
successCount++
if successCount >= c.ValidateCount {
klog.Info("Cluster validated.")
return true
} else {
klog.Infof("Cluster validated; revalidating in %s to make sure it does not flap.", c.ValidateSuccessDuration)
time.Sleep(c.ValidateSuccessDuration)
continue
}
}
if err != nil {
klog.Infof("Cluster did not validate, will try again in %q until duration %q expires: %v.", c.ValidateTickDuration, duration, err)
if ctx.Err() != nil {
klog.Infof("Cluster did not validate within deadline: %v.", err)
return false
}
klog.Infof("Cluster did not validate, will retry in %q: %v.", c.ValidateTickDuration, err)
} else if len(result.Failures) > 0 {
messages := []string{}
for _, failure := range result.Failures {
messages = append(messages, failure.Message)
}
klog.Infof("Cluster did not pass validation, will try again in %q until duration %q expires: %s.", c.ValidateTickDuration, duration, strings.Join(messages, ", "))
if ctx.Err() != nil {
klog.Infof("Cluster did not pass validation within deadline: %s.", strings.Join(messages, ", "))
return false
} else {
klog.Info("Cluster validated.")
return true
}
klog.Infof("Cluster did not pass validation, will retry in %q: %s.", c.ValidateTickDuration, strings.Join(messages, ", "))
}
// Reset the success count; we want N consecutive successful validations
successCount = 0
// Wait before retrying
// TODO: Should we check if we have enough time left before the deadline?
time.Sleep(c.ValidateTickDuration)
}
}

View File

@@ -65,11 +65,12 @@ type RollingUpdateCluster struct {
// ValidateTickDuration is the amount of time to wait between cluster validation attempts
ValidateTickDuration time.Duration
// ValidateCount is the amount of time that a cluster needs to be validated after single node update
ValidateCount int32
// ValidateSuccessDuration is the amount of time a cluster must continue to validate successfully
// before updating the next node
ValidateSuccessDuration time.Duration
// ValidateSucceeded is the amount of times that a cluster validate is succeeded already
ValidateSucceeded int32
// ValidateCount is the number of times that a cluster needs to validate successfully after a single node update
ValidateCount int
}
// AdjustNeedUpdate adjusts the set of instances that need updating, using factors outside those known by the cloud implementation
@@ -136,7 +137,7 @@ func (c *RollingUpdateCluster) RollingUpdate(ctx context.Context, groups map[str
defer wg.Done()
err := c.rollingUpdateInstanceGroup(ctx, cluster, group, true, c.BastionInterval, c.ValidationTimeout)
err := c.rollingUpdateInstanceGroup(ctx, cluster, group, true, c.BastionInterval)
resultsMutex.Lock()
results[k] = err
@@ -161,7 +162,7 @@ func (c *RollingUpdateCluster) RollingUpdate(ctx context.Context, groups map[str
// and we don't want to roll all the masters at the same time. See issue #284
for _, group := range masterGroups {
err := c.rollingUpdateInstanceGroup(ctx, cluster, group, false, c.MasterInterval, c.ValidationTimeout)
err := c.rollingUpdateInstanceGroup(ctx, cluster, group, false, c.MasterInterval)
// Do not continue update if master(s) failed, cluster is potentially in an unhealthy state
if err != nil {
@@ -183,7 +184,7 @@ func (c *RollingUpdateCluster) RollingUpdate(ctx context.Context, groups map[str
}
for k, group := range nodeGroups {
err := c.rollingUpdateInstanceGroup(ctx, cluster, group, false, c.NodeInterval, c.ValidationTimeout)
err := c.rollingUpdateInstanceGroup(ctx, cluster, group, false, c.NodeInterval)
results[k] = err

View File

@@ -69,8 +69,8 @@ func getTestSetup() (*RollingUpdateCluster, *awsup.MockAWSCloud, *kopsapi.Cluste
ClusterValidator: &successfulClusterValidator{},
FailOnValidate: true,
ValidateTickDuration: 1 * time.Millisecond,
ValidateCount: 1,
ValidateSucceeded: 0,
ValidateSuccessDuration: 5 * time.Millisecond,
ValidateCount: 2,
}
return c, mockcloud, cluster
@@ -545,7 +545,6 @@ func TestRollingUpdateFlappingValidation(t *testing.T) {
ctx := context.Background()
c, cloud, cluster := getTestSetup()
c.ValidateCount = 3
// This should only take a few milliseconds,
// but we have to pad to allow for random delays (e.g. GC)
@@ -1024,6 +1023,7 @@ func TestRollingUpdateMaxUnavailableAllNeedUpdate(t *testing.T) {
c, cloud, cluster := getTestSetup()
concurrentTest := newConcurrentTest(t, cloud, 0, true)
c.ValidateCount = 1
c.ClusterValidator = concurrentTest
cloud.MockEC2 = concurrentTest
@@ -1048,6 +1048,7 @@ func TestRollingUpdateMaxUnavailableAllButOneNeedUpdate(t *testing.T) {
c, cloud, cluster := getTestSetup()
concurrentTest := newConcurrentTest(t, cloud, 0, false)
c.ValidateCount = 1
c.ClusterValidator = concurrentTest
cloud.MockEC2 = concurrentTest
@@ -1071,6 +1072,7 @@ func TestRollingUpdateMaxUnavailableAllNeedUpdateMaster(t *testing.T) {
c, cloud, cluster := getTestSetup()
concurrentTest := newConcurrentTest(t, cloud, 0, true)
c.ValidateCount = 1
c.ClusterValidator = concurrentTest
cloud.MockEC2 = concurrentTest
@@ -1124,6 +1126,7 @@ func TestRollingUpdateMaxSurgeAllNeedUpdate(t *testing.T) {
c, cloud, cluster := getTestSetup()
concurrentTest := newConcurrentTest(t, cloud, 2, true)
c.ValidateCount = 1
c.ClusterValidator = concurrentTest
cloud.MockAutoscaling = &concurrentTestAutoscaling{
AutoScalingAPI: cloud.MockAutoscaling,
@@ -1152,6 +1155,7 @@ func TestRollingUpdateMaxSurgeAllButOneNeedUpdate(t *testing.T) {
c, cloud, cluster := getTestSetup()
concurrentTest := newConcurrentTest(t, cloud, 2, false)
c.ValidateCount = 1
c.ClusterValidator = concurrentTest
cloud.MockAutoscaling = &concurrentTestAutoscaling{
AutoScalingAPI: cloud.MockAutoscaling,
@@ -1302,6 +1306,7 @@ func TestRollingUpdateMaxSurgeAllNeedUpdateOneAlreadyDetached(t *testing.T) {
detached: map[string]bool{},
}
c.ValidateCount = 1
c.ClusterValidator = alreadyDetachedTest
cloud.MockAutoscaling = &alreadyDetachedTestAutoscaling{
AutoScalingAPI: cloud.MockAutoscaling,
@@ -1332,6 +1337,7 @@ func TestRollingUpdateMaxSurgeAllNeedUpdateMaxAlreadyDetached(t *testing.T) {
// Should behave the same as TestRollingUpdateMaxUnavailableAllNeedUpdate
concurrentTest := newConcurrentTest(t, cloud, 0, true)
c.ValidateCount = 1
c.ClusterValidator = concurrentTest
cloud.MockEC2 = concurrentTest