From e1e79790ef5e384bf3a565ca1a667abe130d6576 Mon Sep 17 00:00:00 2001 From: Jesse Haka Date: Wed, 8 Apr 2020 00:34:15 +0300 Subject: [PATCH] validate cluster n times in rolling update --- cmd/kops/rollingupdatecluster.go | 10 ++++++-- docs/cli/kops_rolling-update_cluster.md | 1 + pkg/instancegroups/instancegroups.go | 10 ++++---- pkg/instancegroups/rollingupdate.go | 8 ++++--- pkg/instancegroups/rollingupdate_test.go | 29 ++++++++++-------------- 5 files changed, 32 insertions(+), 26 deletions(-) diff --git a/cmd/kops/rollingupdatecluster.go b/cmd/kops/rollingupdatecluster.go index 32907bfa1f..dc4770965d 100644 --- a/cmd/kops/rollingupdatecluster.go +++ b/cmd/kops/rollingupdatecluster.go @@ -121,6 +121,9 @@ type RollingUpdateOptions struct { // ValidationTimeout is the timeout for validation to succeed after the drain and pause ValidationTimeout time.Duration + // ValidateTimes is the amount of time that a cluster needs to be validated after single node update + ValidateTimes int32 + // MasterInterval is the minimum time to wait after stopping a master node. This does not include drain and validate time. MasterInterval time.Duration @@ -158,6 +161,7 @@ func (o *RollingUpdateOptions) InitDefaults() { o.PostDrainDelay = 5 * time.Second o.ValidationTimeout = 15 * time.Minute + o.ValidateTimes = 2 } func NewCmdRollingUpdateCluster(f *util.Factory, out io.Writer) *cobra.Command { @@ -177,6 +181,7 @@ func NewCmdRollingUpdateCluster(f *util.Factory, out io.Writer) *cobra.Command { cmd.Flags().BoolVar(&options.CloudOnly, "cloudonly", options.CloudOnly, "Perform rolling update without confirming progress with k8s") cmd.Flags().DurationVar(&options.ValidationTimeout, "validation-timeout", options.ValidationTimeout, "Maximum time to wait for a cluster to validate") + cmd.Flags().Int32Var(&options.ValidateTimes, "validate-times", options.ValidateTimes, "Amount of times that a cluster needs to be validated after single node update") cmd.Flags().DurationVar(&options.MasterInterval, "master-interval", options.MasterInterval, "Time to wait between restarting masters") cmd.Flags().DurationVar(&options.NodeInterval, "node-interval", options.NodeInterval, "Time to wait between restarting nodes") cmd.Flags().DurationVar(&options.BastionInterval, "bastion-interval", options.BastionInterval, "Time to wait between restarting bastions") @@ -333,9 +338,10 @@ func RunRollingUpdateCluster(f *util.Factory, out io.Writer, options *RollingUpd ClusterName: options.ClusterName, PostDrainDelay: options.PostDrainDelay, ValidationTimeout: options.ValidationTimeout, + ValidateTimes: options.ValidateTimes, + ValidateSucceeded: 0, // TODO should we expose this to the UI? - ValidateTickDuration: 30 * time.Second, - ValidateSuccessDuration: 10 * time.Second, + ValidateTickDuration: 30 * time.Second, } err = d.AdjustNeedUpdate(groups, cluster, list) diff --git a/docs/cli/kops_rolling-update_cluster.md b/docs/cli/kops_rolling-update_cluster.md index 493072e3e4..2f868030b4 100644 --- a/docs/cli/kops_rolling-update_cluster.md +++ b/docs/cli/kops_rolling-update_cluster.md @@ -80,6 +80,7 @@ kops rolling-update cluster [flags] --master-interval duration Time to wait between restarting masters (default 15s) --node-interval duration Time to wait between restarting nodes (default 15s) --post-drain-delay duration Time to wait after draining each node (default 5s) + --validate-times int32 Amount of times that a cluster needs to be validated after single node update (default 2) --validation-timeout duration Maximum time to wait for a cluster to validate (default 15m0s) -y, --yes Perform rolling update immediately, without --yes rolling-update executes a dry-run ``` diff --git a/pkg/instancegroups/instancegroups.go b/pkg/instancegroups/instancegroups.go index a34f503b3e..077c1849c4 100644 --- a/pkg/instancegroups/instancegroups.go +++ b/pkg/instancegroups/instancegroups.go @@ -430,10 +430,12 @@ func (c *RollingUpdateCluster) validateClusterWithDuration(duration time.Duratio func (c *RollingUpdateCluster) tryValidateCluster(duration time.Duration) bool { result, err := c.ClusterValidator.Validate() - if err == nil && len(result.Failures) == 0 && c.ValidateSuccessDuration > 0 { - klog.Infof("Cluster validated; revalidating in %s to make sure it does not flap.", c.ValidateSuccessDuration) - time.Sleep(c.ValidateSuccessDuration) - result, err = c.ClusterValidator.Validate() + if err == nil && len(result.Failures) == 0 && c.ValidateTimes > 0 { + c.ValidateSucceeded++ + if c.ValidateSucceeded < c.ValidateTimes { + klog.Infof("Cluster validated; revalidating %d time(s) to make sure it does not flap.", c.ValidateTimes-c.ValidateSucceeded) + return false + } } if err != nil { diff --git a/pkg/instancegroups/rollingupdate.go b/pkg/instancegroups/rollingupdate.go index e16d9e795c..1ca4332261 100644 --- a/pkg/instancegroups/rollingupdate.go +++ b/pkg/instancegroups/rollingupdate.go @@ -64,9 +64,11 @@ type RollingUpdateCluster struct { // ValidateTickDuration is the amount of time to wait between cluster validation attempts ValidateTickDuration time.Duration - // ValidateSuccessDuration is the amount of time a cluster must continue to validate successfully - // before updating the next node - ValidateSuccessDuration time.Duration + // ValidateTimes is the amount of time that a cluster needs to be validated after single node update + ValidateTimes int32 + + // ValidateSucceeded is the amount of times that a cluster validate is succeeded already + ValidateSucceeded int32 } // AdjustNeedUpdate adjusts the set of instances that need updating, using factors outside those known by the cloud implementation diff --git a/pkg/instancegroups/rollingupdate_test.go b/pkg/instancegroups/rollingupdate_test.go index 3635ea7c44..2f905efd7d 100644 --- a/pkg/instancegroups/rollingupdate_test.go +++ b/pkg/instancegroups/rollingupdate_test.go @@ -59,16 +59,17 @@ func getTestSetup() (*RollingUpdateCluster, *awsup.MockAWSCloud, *kopsapi.Cluste cluster.Name = "test.k8s.local" c := &RollingUpdateCluster{ - Cloud: mockcloud, - MasterInterval: 1 * time.Millisecond, - NodeInterval: 1 * time.Millisecond, - BastionInterval: 1 * time.Millisecond, - Force: false, - K8sClient: k8sClient, - ClusterValidator: &successfulClusterValidator{}, - FailOnValidate: true, - ValidateTickDuration: 1 * time.Millisecond, - ValidateSuccessDuration: 5 * time.Millisecond, + Cloud: mockcloud, + MasterInterval: 1 * time.Millisecond, + NodeInterval: 1 * time.Millisecond, + BastionInterval: 1 * time.Millisecond, + Force: false, + K8sClient: k8sClient, + ClusterValidator: &successfulClusterValidator{}, + FailOnValidate: true, + ValidateTickDuration: 1 * time.Millisecond, + ValidateTimes: 1, + ValidateSucceeded: 0, } return c, mockcloud, cluster @@ -511,6 +512,7 @@ func (v *flappingClusterValidator) Validate() (*validation.ValidationCluster, er func TestRollingUpdateFlappingValidation(t *testing.T) { c, cloud, cluster := getTestSetup() + c.ValidateTimes = 3 // This should only take a few milliseconds, // but we have to pad to allow for random delays (e.g. GC) @@ -977,7 +979,6 @@ func TestRollingUpdateMaxUnavailableAllNeedUpdate(t *testing.T) { c, cloud, cluster := getTestSetup() concurrentTest := newConcurrentTest(t, cloud, 0, true) - c.ValidateSuccessDuration = 0 c.ClusterValidator = concurrentTest cloud.MockEC2 = concurrentTest @@ -1000,7 +1001,6 @@ func TestRollingUpdateMaxUnavailableAllButOneNeedUpdate(t *testing.T) { c, cloud, cluster := getTestSetup() concurrentTest := newConcurrentTest(t, cloud, 0, false) - c.ValidateSuccessDuration = 0 c.ClusterValidator = concurrentTest cloud.MockEC2 = concurrentTest @@ -1022,7 +1022,6 @@ func TestRollingUpdateMaxUnavailableAllNeedUpdateMaster(t *testing.T) { c, cloud, cluster := getTestSetup() concurrentTest := newConcurrentTest(t, cloud, 0, true) - c.ValidateSuccessDuration = 0 c.ClusterValidator = concurrentTest cloud.MockEC2 = concurrentTest @@ -1074,7 +1073,6 @@ func TestRollingUpdateMaxSurgeAllNeedUpdate(t *testing.T) { c, cloud, cluster := getTestSetup() concurrentTest := newConcurrentTest(t, cloud, 2, true) - c.ValidateSuccessDuration = 0 c.ClusterValidator = concurrentTest cloud.MockAutoscaling = &concurrentTestAutoscaling{ AutoScalingAPI: cloud.MockAutoscaling, @@ -1101,7 +1099,6 @@ func TestRollingUpdateMaxSurgeAllButOneNeedUpdate(t *testing.T) { c, cloud, cluster := getTestSetup() concurrentTest := newConcurrentTest(t, cloud, 2, false) - c.ValidateSuccessDuration = 0 c.ClusterValidator = concurrentTest cloud.MockAutoscaling = &concurrentTestAutoscaling{ AutoScalingAPI: cloud.MockAutoscaling, @@ -1248,7 +1245,6 @@ func TestRollingUpdateMaxSurgeAllNeedUpdateOneAlreadyDetached(t *testing.T) { detached: map[string]bool{}, } - c.ValidateSuccessDuration = 0 c.ClusterValidator = alreadyDetachedTest cloud.MockAutoscaling = &alreadyDetachedTestAutoscaling{ AutoScalingAPI: cloud.MockAutoscaling, @@ -1277,7 +1273,6 @@ func TestRollingUpdateMaxSurgeAllNeedUpdateMaxAlreadyDetached(t *testing.T) { // Should behave the same as TestRollingUpdateMaxUnavailableAllNeedUpdate concurrentTest := newConcurrentTest(t, cloud, 0, true) - c.ValidateSuccessDuration = 0 c.ClusterValidator = concurrentTest cloud.MockEC2 = concurrentTest