Merge pull request #13103 from heybronson/set-eviction-timeout

add drain-timeout flag to rolling-update
This commit is contained in:
Kubernetes Prow Robot 2022-01-28 01:40:31 -08:00 committed by GitHub
commit 9023720a08
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 13 additions and 3 deletions

View File

@ -107,6 +107,9 @@ type RollingUpdateOptions struct {
// does not validate, after a validation period.
FailOnValidate bool
// DrainTimeout is the maximum time to wait while draining a node.
DrainTimeout time.Duration
// PostDrainDelay is the duration of a pause after a drain operation.
PostDrainDelay time.Duration
@ -154,6 +157,8 @@ func (o *RollingUpdateOptions) InitDefaults() {
o.PostDrainDelay = 5 * time.Second
o.ValidationTimeout = 15 * time.Minute
o.ValidateCount = 2
o.DrainTimeout = 15 * time.Minute
}
func NewCmdRollingUpdateCluster(f *util.Factory, out io.Writer) *cobra.Command {
@ -182,6 +187,7 @@ func NewCmdRollingUpdateCluster(f *util.Factory, out io.Writer) *cobra.Command {
cmd.Flags().BoolVar(&options.CloudOnly, "cloudonly", options.CloudOnly, "Perform rolling update without confirming progress with Kubernetes")
cmd.Flags().DurationVar(&options.ValidationTimeout, "validation-timeout", options.ValidationTimeout, "Maximum time to wait for a cluster to validate")
cmd.Flags().DurationVar(&options.DrainTimeout, "drain-timeout", options.DrainTimeout, "Maximum time to wait for a node to drain")
cmd.Flags().Int32Var(&options.ValidateCount, "validate-count", options.ValidateCount, "Number of times that a cluster needs to be validated after single node update")
cmd.Flags().DurationVar(&options.MasterInterval, "master-interval", options.MasterInterval, "Time to wait between restarting control plane nodes")
cmd.Flags().DurationVar(&options.NodeInterval, "node-interval", options.NodeInterval, "Time to wait between restarting worker nodes")
@ -336,6 +342,7 @@ func RunRollingUpdateCluster(ctx context.Context, f *util.Factory, out io.Writer
PostDrainDelay: options.PostDrainDelay,
ValidationTimeout: options.ValidationTimeout,
ValidateCount: int(options.ValidateCount),
DrainTimeout: options.DrainTimeout,
// TODO should we expose this to the UI?
ValidateTickDuration: 30 * time.Second,
ValidateSuccessDuration: 10 * time.Second,

View File

@ -58,6 +58,7 @@ kops rolling-update cluster [CLUSTER] [flags]
```
--bastion-interval duration Time to wait between restarting bastions (default 15s)
--cloudonly Perform rolling update without confirming progress with Kubernetes
--drain-timeout duration Maximum time to wait for a node to drain (default 15m0s)
--fail-on-drain-error Fail if draining a node fails (default true)
--fail-on-validate-error Fail if the cluster fails to validate (default true)
--force Force rolling update, even if no changes

View File

@ -62,6 +62,7 @@ It is recommended to keep using the `v1alpha2` API version.
* IPv6 pod subnets is in a working state using public IPv6 addresses for the Pod network. This works with both Cilium and Calico. IPv6 is still behind a feature flag until service controllers and addons implement support for IPv6. See [the IPv6 documentation](https://kops.sigs.k8s.io/networking/ipv6/).
* The `kops rolling-update cluster` command has a new `--drain-timeout` flag for specifying the maximum amount of time to wait when attempting to drain a node. Previously, rolling-updates would attempt to drain a node for an indefinite amount of time. If `--drain-timeout` is not specified, a default of 15 minutes is applied.
# Full change list since 1.22.0 release

View File

@ -624,12 +624,10 @@ func (c *RollingUpdateCluster) drainNode(u *cloudinstances.CloudInstance) error
IgnoreAllDaemonSets: true,
Out: os.Stdout,
ErrOut: os.Stderr,
Timeout: c.DrainTimeout,
// We want to proceed even when pods are using emptyDir volumes
DeleteEmptyDirData: true,
// Other options we might want to set:
// Timeout?
}
if err := drain.RunCordonOrUncordon(helper, u.Node, true); err != nil {

View File

@ -77,6 +77,9 @@ type RollingUpdateCluster struct {
// ValidateCount is the number of times that a cluster needs to be validated after a single node update.
ValidateCount int
// DrainTimeout is the maximum amount of time to wait while draining a node.
DrainTimeout time.Duration
}
// AdjustNeedUpdate adjusts the set of instances that need updating, using factors outside those known by the cloud implementation