diff --git a/cmd/kops/rollingupdatecluster.go b/cmd/kops/rollingupdatecluster.go index acb9824940..1df6c67d94 100644 --- a/cmd/kops/rollingupdatecluster.go +++ b/cmd/kops/rollingupdatecluster.go @@ -414,7 +414,8 @@ func RunRollingUpdateCluster(f *util.Factory, out io.Writer, options *RollingUpd PostDrainDelay: options.PostDrainDelay, ValidationTimeout: options.ValidationTimeout, // TODO should we expose this to the UI? - ValidateTickDuration: 30 * time.Second, + ValidateTickDuration: 30 * time.Second, + ValidateSuccessDuration: 10 * time.Second, } return d.RollingUpdate(groups, cluster, list) } diff --git a/pkg/instancegroups/instancegroups.go b/pkg/instancegroups/instancegroups.go index 3b02488d53..0de9c23c8a 100644 --- a/pkg/instancegroups/instancegroups.go +++ b/pkg/instancegroups/instancegroups.go @@ -262,8 +262,15 @@ func (r *RollingUpdateInstanceGroup) tryValidateCluster(rollingUpdateData *Rolli klog.Infof("Cluster did not pass validation, will try again in %q until duration %q expires: %s.", tickDuration, duration, strings.Join(messages, ", ")) return false } else { - klog.Info("Cluster validated.") - return true + klog.Info("Cluster validated, revalidating to make sure it does not flap.") + time.Sleep(rollingUpdateData.ValidateSuccessDuration) + result, err = rollingUpdateData.ClusterValidator.Validate() + if err == nil && len(result.Failures) == 0 { + klog.Info("Cluster validated.") + return true + } + klog.Info("Cluster did not revalidate.") + return false } } diff --git a/pkg/instancegroups/rollingupdate.go b/pkg/instancegroups/rollingupdate.go index 7ae9cd27e7..c8cc340ee4 100644 --- a/pkg/instancegroups/rollingupdate.go +++ b/pkg/instancegroups/rollingupdate.go @@ -63,6 +63,10 @@ type RollingUpdateCluster struct { // ValidateTickDuration is the amount of time to wait between cluster validation attempts ValidateTickDuration time.Duration + + // ValidateSuccessDuration is the amount of time a cluster must continue to validate successfully + // before updating the next node + ValidateSuccessDuration time.Duration } // RollingUpdate performs a rolling update on a K8s Cluster. diff --git a/pkg/instancegroups/rollingupdate_test.go b/pkg/instancegroups/rollingupdate_test.go index 1518cd515a..027fd6ab10 100644 --- a/pkg/instancegroups/rollingupdate_test.go +++ b/pkg/instancegroups/rollingupdate_test.go @@ -46,15 +46,16 @@ func getTestSetup() (*RollingUpdateCluster, awsup.AWSCloud, *kopsapi.Cluster) { cluster.Name = "test.k8s.local" c := &RollingUpdateCluster{ - Cloud: mockcloud, - MasterInterval: 1 * time.Millisecond, - NodeInterval: 1 * time.Millisecond, - BastionInterval: 1 * time.Millisecond, - Force: false, - K8sClient: k8sClient, - ClusterValidator: &successfulClusterValidator{}, - FailOnValidate: true, - ValidateTickDuration: 1 * time.Millisecond, + Cloud: mockcloud, + MasterInterval: 1 * time.Millisecond, + NodeInterval: 1 * time.Millisecond, + BastionInterval: 1 * time.Millisecond, + Force: false, + K8sClient: k8sClient, + ClusterValidator: &successfulClusterValidator{}, + FailOnValidate: true, + ValidateTickDuration: 1 * time.Millisecond, + ValidateSuccessDuration: 5 * time.Millisecond, } return c, mockcloud, cluster