mirror of https://github.com/kubernetes/kops.git
cluster validation - allow flapping of validation errors
Previously, with --wait, if a cluster successfully validated and a subsequent validation then failed (perhaps due to a new critical pod being scheduled and not yet being ready), we would fail the `validate cluster` command immediately. A failed validation will now reset the success counter that approaches --count, allowing validation attempts to continue until we time out from --wait. I'm hoping this fixes prow job failures like this: https://prow.k8s.io/view/gs/kubernetes-jenkins/logs/e2e-kops-grid-u1804-k18-containerd/1370875829445201920 where `kops validate cluster --count 10 --wait 15m` was invoked at `23:15:48` but exited with failure at `23:22:59`. In my opinion, `kops validate cluster --count 10 --wait 15m` should only ever exit with failure if the 15 minute timeout has been reached.
This commit is contained in:
parent
a92992e10a
commit
ce073593da
|
|
@ -217,8 +217,9 @@ func RunValidateCluster(ctx context.Context, f *util.Factory, cmd *cobra.Command
|
|||
return result, nil
|
||||
}
|
||||
} else {
|
||||
if options.wait > 0 && consecutive == 0 {
|
||||
if options.wait > 0 {
|
||||
klog.Warningf("(will retry): cluster not yet healthy")
|
||||
consecutive = 0
|
||||
time.Sleep(pollInterval)
|
||||
continue
|
||||
} else {
|
||||
|
|
|
|||
Loading…
Reference in New Issue