mirror of https://github.com/kubernetes/kops.git
Rolling-update validation harmonization
This is a follow-on to #8868; I believe the intent of that was to expose the option to do more (or fewer) retries. We previously had a single retry to prevent flapping; this basically unifies the previous behaviour with the idea of making it configurable.

* validate-count=0 effectively turns off validation.
* validate-count=1 will do a single validation, without flapping detection.
* validate-count>=2 will require N successful validations in a row, waiting ValidateSuccessDuration in between.

A nice side-effect of this is that the tests now explicitly specify ValidateCount=1 instead of setting ValidateSuccessDuration=0, which had the side effect of doing the equivalent of ValidateCount=1.
This commit is contained in:
parent ea3e974d81
commit ffb6cd61aa
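To make the semantics above concrete, here is a minimal, self-contained Go sketch of the validate-count behaviour (the helper and names are illustrative, not the kops implementation; the real code also keeps retrying failed validations until an overall timeout, as the diff below shows):

package main

import (
	"errors"
	"fmt"
	"time"
)

// requireConsecutiveSuccesses is a hypothetical helper illustrating the
// validate-count semantics: 0 skips validation, 1 validates once, and >= 2
// requires that many successes in a row, pausing successDuration in between.
func requireConsecutiveSuccesses(validate func() error, count int, successDuration time.Duration) error {
	if count == 0 {
		// validate-count=0: validation is effectively turned off.
		return nil
	}
	successes := 0
	for {
		if err := validate(); err != nil {
			// The real rolling update retries here until ValidationTimeout expires;
			// this sketch simply reports the failure.
			return fmt.Errorf("cluster did not validate: %w", err)
		}
		successes++
		if successes >= count {
			return nil
		}
		// Waiting between consecutive successes is what catches a cluster
		// that validates once and then flaps.
		time.Sleep(successDuration)
	}
}

func main() {
	calls := 0
	validate := func() error {
		calls++
		if calls == 2 {
			return errors.New("node not ready") // simulate a flap on the second check
		}
		return nil
	}
	fmt.Println(requireConsecutiveSuccesses(validate, 3, 10*time.Millisecond))
}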
@@ -341,10 +341,10 @@ func RunRollingUpdateCluster(ctx context.Context, f *util.Factory, out io.Writer
         ClusterName:       options.ClusterName,
         PostDrainDelay:    options.PostDrainDelay,
         ValidationTimeout: options.ValidationTimeout,
-        ValidateCount:     options.ValidateCount,
-        ValidateSucceeded: 0,
+        ValidateCount:     int(options.ValidateCount),
         // TODO should we expose this to the UI?
         ValidateTickDuration: 30 * time.Second,
+        ValidateSuccessDuration: 10 * time.Second,
     }

     err = d.AdjustNeedUpdate(groups, cluster, list)
@@ -69,7 +69,7 @@ func promptInteractive(upgradedHostId, upgradedHostName string) (stopPrompting b
 }

 // RollingUpdate performs a rolling update on a list of instances.
-func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(ctx context.Context, cluster *api.Cluster, group *cloudinstances.CloudInstanceGroup, isBastion bool, sleepAfterTerminate time.Duration, validationTimeout time.Duration) (err error) {
+func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(ctx context.Context, cluster *api.Cluster, group *cloudinstances.CloudInstanceGroup, isBastion bool, sleepAfterTerminate time.Duration) (err error) {
     // Do not need a k8s client if you are doing cloudonly.
     if c.K8sClient == nil && !c.CloudOnly {
         return fmt.Errorf("rollingUpdate is missing a k8s client")
@@ -156,7 +156,7 @@ func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(ctx context.Context, c
     klog.Infof("waiting for %v after detaching instance", sleepAfterTerminate)
     time.Sleep(sleepAfterTerminate)

-    if err := c.maybeValidate(validationTimeout, "detaching"); err != nil {
+    if err := c.maybeValidate(c.ValidationTimeout, "detaching"); err != nil {
         return err
     }
     noneReady = false
@@ -185,7 +185,7 @@ func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(ctx context.Context, c
         return waitForPendingBeforeReturningError(runningDrains, terminateChan, err)
     }

-    err = c.maybeValidate(validationTimeout, "removing")
+    err = c.maybeValidate(c.ValidationTimeout, "removing")
     if err != nil {
         return waitForPendingBeforeReturningError(runningDrains, terminateChan, err)
     }
@@ -231,7 +231,7 @@ func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(ctx context.Context, c
         }
     }

-    err = c.maybeValidate(validationTimeout, "removing")
+    err = c.maybeValidate(c.ValidationTimeout, "removing")
     if err != nil {
         return err
     }
@@ -401,57 +401,64 @@ func (c *RollingUpdateCluster) maybeValidate(validationTimeout time.Duration, op
 }

 // validateClusterWithDuration runs validation.ValidateCluster until either we get positive result or the timeout expires
-func (c *RollingUpdateCluster) validateClusterWithDuration(duration time.Duration) error {
-    // Try to validate cluster at least once, this will handle durations that are lower
-    // than our tick time
-    if c.tryValidateCluster(duration) {
+func (c *RollingUpdateCluster) validateClusterWithDuration(validationTimeout time.Duration) error {
+    ctx, cancel := context.WithTimeout(context.Background(), validationTimeout)
+    defer cancel()
+
+    if c.tryValidateCluster(ctx) {
         return nil
     }

-    timeout := time.After(duration)
-    ticker := time.NewTicker(c.ValidateTickDuration)
-    defer ticker.Stop()
-    // Keep trying until we're timed out or got a result or got an error
-    for {
-        select {
-        case <-timeout:
-            // Got a timeout fail with a timeout error
-            return fmt.Errorf("cluster did not validate within a duration of %q", duration)
-        case <-ticker.C:
-            // Got a tick, validate cluster
-            if c.tryValidateCluster(duration) {
-                return nil
-            }
-            // ValidateCluster didn't work yet, so let's try again
-            // this will exit up to the for loop
-        }
-    }
+    return fmt.Errorf("cluster did not validate within a duration of %q", validationTimeout)
 }

-func (c *RollingUpdateCluster) tryValidateCluster(duration time.Duration) bool {
-    result, err := c.ClusterValidator.Validate()
+func (c *RollingUpdateCluster) tryValidateCluster(ctx context.Context) bool {
+    if c.ValidateCount == 0 {
+        klog.Warningf("skipping cluster validation because validate-count was 0")
+        return true
+    }

-    if err == nil && len(result.Failures) == 0 && c.ValidateCount > 0 {
-        c.ValidateSucceeded++
-        if c.ValidateSucceeded < c.ValidateCount {
-            klog.Infof("Cluster validated; revalidating %d time(s) to make sure it does not flap.", c.ValidateCount-c.ValidateSucceeded)
-            return false
+    successCount := 0
+
+    for {
+        // Note that we validate at least once before checking the timeout, in case the cluster is healthy with a short timeout
+        result, err := c.ClusterValidator.Validate()
+
+        if err == nil && len(result.Failures) == 0 {
+            successCount++
+            if successCount >= c.ValidateCount {
+                klog.Info("Cluster validated.")
+                return true
+            } else {
+                klog.Infof("Cluster validated; revalidating in %s to make sure it does not flap.", c.ValidateSuccessDuration)
+                time.Sleep(c.ValidateSuccessDuration)
+                continue
+            }
         }
-    }

-    if err != nil {
-        klog.Infof("Cluster did not validate, will try again in %q until duration %q expires: %v.", c.ValidateTickDuration, duration, err)
-        return false
-    } else if len(result.Failures) > 0 {
-        messages := []string{}
-        for _, failure := range result.Failures {
-            messages = append(messages, failure.Message)
+        if err != nil {
+            if ctx.Err() != nil {
+                klog.Infof("Cluster did not validate within deadline: %v.", err)
+                return false
+            }
+            klog.Infof("Cluster did not validate, will retry in %q: %v.", c.ValidateTickDuration, err)
+        } else if len(result.Failures) > 0 {
+            messages := []string{}
+            for _, failure := range result.Failures {
+                messages = append(messages, failure.Message)
+            }
+            if ctx.Err() != nil {
+                klog.Infof("Cluster did not pass validation within deadline: %s.", strings.Join(messages, ", "))
+                return false
+            }
+            klog.Infof("Cluster did not pass validation, will retry in %q: %s.", c.ValidateTickDuration, strings.Join(messages, ", "))
         }
-        klog.Infof("Cluster did not pass validation, will try again in %q until duration %q expires: %s.", c.ValidateTickDuration, duration, strings.Join(messages, ", "))
-        return false
-    } else {
-        klog.Info("Cluster validated.")
-        return true
+
+        // Reset the success count; we want N consecutive successful validations
+        successCount = 0
+
+        // Wait before retrying
+        // TODO: Should we check if we have enough time left before the deadline?
+        time.Sleep(c.ValidateTickDuration)
     }
 }
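The key change above is that tryValidateCluster now loops internally and checks the context deadline only after a failed attempt, so a healthy cluster always gets at least one validation even with a very short timeout. Below is a minimal standalone sketch of that pattern, using a stand-in validator rather than the kops types:

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// validateWithDeadline always attempts at least one validation and only gives
// up once the context deadline has already expired, mirroring the pattern in
// the new tryValidateCluster. The validator and tick duration are stand-ins.
func validateWithDeadline(ctx context.Context, validate func() error, tick time.Duration) bool {
	for {
		err := validate()
		if err == nil {
			return true
		}
		if ctx.Err() != nil {
			fmt.Println("did not validate within deadline:", err)
			return false
		}
		// Deadline not reached yet; wait a tick and try again.
		time.Sleep(tick)
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
	defer cancel()

	attempts := 0
	ok := validateWithDeadline(ctx, func() error {
		attempts++
		if attempts < 3 {
			return errors.New("node not ready")
		}
		return nil
	}, 10*time.Millisecond)
	fmt.Println("validated:", ok, "after", attempts, "attempts")
}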
@@ -65,11 +65,12 @@ type RollingUpdateCluster struct {
     // ValidateTickDuration is the amount of time to wait between cluster validation attempts
     ValidateTickDuration time.Duration

-    // ValidateCount is the amount of time that a cluster needs to be validated after single node update
-    ValidateCount int32
+    // ValidateSuccessDuration is the amount of time a cluster must continue to validate successfully
+    // before updating the next node
+    ValidateSuccessDuration time.Duration

-    // ValidateSucceeded is the amount of times that a cluster validate is succeeded already
-    ValidateSucceeded int32
+    // ValidateCount is the amount of time that a cluster needs to be validated after single node update
+    ValidateCount int
 }

 // AdjustNeedUpdate adjusts the set of instances that need updating, using factors outside those known by the cloud implementation
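Rough arithmetic for these fields, assuming the 10-second ValidateSuccessDuration default wired up in the first hunk: even a cluster that validates cleanly every time spends at least (ValidateCount-1) × ValidateSuccessDuration re-validating after each node, on top of the validation calls themselves. A small illustration:

package main

import (
	"fmt"
	"time"
)

func main() {
	// Assumed default from this change; adjust if the default moves.
	validateSuccessDuration := 10 * time.Second

	for _, validateCount := range []int{0, 1, 2, 3} {
		var minWait time.Duration
		if validateCount >= 2 {
			minWait = time.Duration(validateCount-1) * validateSuccessDuration
		}
		fmt.Printf("validate-count=%d: at least %v of extra waiting per validation phase\n", validateCount, minWait)
	}
}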
@@ -136,7 +137,7 @@ func (c *RollingUpdateCluster) RollingUpdate(ctx context.Context, groups map[str

     defer wg.Done()

-    err := c.rollingUpdateInstanceGroup(ctx, cluster, group, true, c.BastionInterval, c.ValidationTimeout)
+    err := c.rollingUpdateInstanceGroup(ctx, cluster, group, true, c.BastionInterval)

     resultsMutex.Lock()
     results[k] = err
@@ -161,7 +162,7 @@ func (c *RollingUpdateCluster) RollingUpdate(ctx context.Context, groups map[str
     // and we don't want to roll all the masters at the same time. See issue #284

     for _, group := range masterGroups {
-        err := c.rollingUpdateInstanceGroup(ctx, cluster, group, false, c.MasterInterval, c.ValidationTimeout)
+        err := c.rollingUpdateInstanceGroup(ctx, cluster, group, false, c.MasterInterval)

         // Do not continue update if master(s) failed, cluster is potentially in an unhealthy state
         if err != nil {
@@ -183,7 +184,7 @@ func (c *RollingUpdateCluster) RollingUpdate(ctx context.Context, groups map[str
     }

     for k, group := range nodeGroups {
-        err := c.rollingUpdateInstanceGroup(ctx, cluster, group, false, c.NodeInterval, c.ValidationTimeout)
+        err := c.rollingUpdateInstanceGroup(ctx, cluster, group, false, c.NodeInterval)

         results[k] = err

@@ -69,8 +69,8 @@ func getTestSetup() (*RollingUpdateCluster, *awsup.MockAWSCloud, *kopsapi.Cluste
         ClusterValidator:     &successfulClusterValidator{},
         FailOnValidate:       true,
         ValidateTickDuration: 1 * time.Millisecond,
-        ValidateCount:        1,
-        ValidateSucceeded:    0,
+        ValidateSuccessDuration: 5 * time.Millisecond,
+        ValidateCount:           2,
     }

     return c, mockcloud, cluster
@@ -545,7 +545,6 @@ func TestRollingUpdateFlappingValidation(t *testing.T) {
     ctx := context.Background()

     c, cloud, cluster := getTestSetup()
-    c.ValidateCount = 3

     // This should only take a few milliseconds,
     // but we have to pad to allow for random delays (e.g. GC)
@@ -1024,6 +1023,7 @@ func TestRollingUpdateMaxUnavailableAllNeedUpdate(t *testing.T) {
     c, cloud, cluster := getTestSetup()

     concurrentTest := newConcurrentTest(t, cloud, 0, true)
+    c.ValidateCount = 1
     c.ClusterValidator = concurrentTest
     cloud.MockEC2 = concurrentTest

@@ -1048,6 +1048,7 @@ func TestRollingUpdateMaxUnavailableAllButOneNeedUpdate(t *testing.T) {
     c, cloud, cluster := getTestSetup()

     concurrentTest := newConcurrentTest(t, cloud, 0, false)
+    c.ValidateCount = 1
     c.ClusterValidator = concurrentTest
     cloud.MockEC2 = concurrentTest

@@ -1071,6 +1072,7 @@ func TestRollingUpdateMaxUnavailableAllNeedUpdateMaster(t *testing.T) {
     c, cloud, cluster := getTestSetup()

     concurrentTest := newConcurrentTest(t, cloud, 0, true)
+    c.ValidateCount = 1
     c.ClusterValidator = concurrentTest
     cloud.MockEC2 = concurrentTest

@@ -1124,6 +1126,7 @@ func TestRollingUpdateMaxSurgeAllNeedUpdate(t *testing.T) {
     c, cloud, cluster := getTestSetup()

     concurrentTest := newConcurrentTest(t, cloud, 2, true)
+    c.ValidateCount = 1
     c.ClusterValidator = concurrentTest
     cloud.MockAutoscaling = &concurrentTestAutoscaling{
         AutoScalingAPI: cloud.MockAutoscaling,
@@ -1152,6 +1155,7 @@ func TestRollingUpdateMaxSurgeAllButOneNeedUpdate(t *testing.T) {
     c, cloud, cluster := getTestSetup()

     concurrentTest := newConcurrentTest(t, cloud, 2, false)
+    c.ValidateCount = 1
     c.ClusterValidator = concurrentTest
     cloud.MockAutoscaling = &concurrentTestAutoscaling{
         AutoScalingAPI: cloud.MockAutoscaling,
@@ -1302,6 +1306,7 @@ func TestRollingUpdateMaxSurgeAllNeedUpdateOneAlreadyDetached(t *testing.T) {
         detached: map[string]bool{},
     }

+    c.ValidateCount = 1
     c.ClusterValidator = alreadyDetachedTest
     cloud.MockAutoscaling = &alreadyDetachedTestAutoscaling{
         AutoScalingAPI: cloud.MockAutoscaling,
@@ -1332,6 +1337,7 @@ func TestRollingUpdateMaxSurgeAllNeedUpdateMaxAlreadyDetached(t *testing.T) {

     // Should behave the same as TestRollingUpdateMaxUnavailableAllNeedUpdate
     concurrentTest := newConcurrentTest(t, cloud, 0, true)
+    c.ValidateCount = 1
     c.ClusterValidator = concurrentTest
     cloud.MockEC2 = concurrentTest
