mirror of https://github.com/kubernetes/kops.git
cluster validation - allow flapping of validation errors
Previously, with --wait, if a cluster successfully validated and then a subsequent validation failed (perhaps due to a new critical pod being scheduled and not yet being ready), the `validate cluster` command would fail immediately. A failed validation now instead resets the success counter that approaches --count, allowing validation attempts to continue until we time out from --wait. I'm hoping this fixes prow job failures like this: https://prow.k8s.io/view/gs/kubernetes-jenkins/logs/e2e-kops-grid-u1804-k18-containerd/1370875829445201920 where `kops validate cluster --count 10 --wait 15m` was invoked at `23:15:48` but exited with failure at `23:22:59`. In my opinion, `kops validate cluster --count 10 --wait 15m` should only ever exit with failure if the 15 minute timeout has been reached.
This commit is contained in:
parent
a92992e10a
commit
ce073593da
|
|
@ -217,8 +217,9 @@ func RunValidateCluster(ctx context.Context, f *util.Factory, cmd *cobra.Command
|
||||||
return result, nil
|
return result, nil
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if options.wait > 0 && consecutive == 0 {
|
if options.wait > 0 {
|
||||||
klog.Warningf("(will retry): cluster not yet healthy")
|
klog.Warningf("(will retry): cluster not yet healthy")
|
||||||
|
consecutive = 0
|
||||||
time.Sleep(pollInterval)
|
time.Sleep(pollInterval)
|
||||||
continue
|
continue
|
||||||
} else {
|
} else {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue