Merge pull request #13103 from heybronson/set-eviction-timeout

add drain-timeout flag to rolling-update
This commit is contained in:
Kubernetes Prow Robot 2022-01-28 01:40:31 -08:00 committed by GitHub
commit 9023720a08
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 13 additions and 3 deletions

View File

@ -107,6 +107,9 @@ type RollingUpdateOptions struct {
// does not validate, after a validation period.
FailOnValidate bool
// DrainTimeout is the maximum time to wait while draining a node.
DrainTimeout time.Duration
// PostDrainDelay is the duration of a pause after a drain operation.
PostDrainDelay time.Duration
@ -154,6 +157,8 @@ func (o *RollingUpdateOptions) InitDefaults() {
o.PostDrainDelay = 5 * time.Second
o.ValidationTimeout = 15 * time.Minute
o.ValidateCount = 2
o.DrainTimeout = 15 * time.Minute
}
func NewCmdRollingUpdateCluster(f *util.Factory, out io.Writer) *cobra.Command {
@ -182,6 +187,7 @@ func NewCmdRollingUpdateCluster(f *util.Factory, out io.Writer) *cobra.Command {
cmd.Flags().BoolVar(&options.CloudOnly, "cloudonly", options.CloudOnly, "Perform rolling update without confirming progress with Kubernetes")
cmd.Flags().DurationVar(&options.ValidationTimeout, "validation-timeout", options.ValidationTimeout, "Maximum time to wait for a cluster to validate")
cmd.Flags().DurationVar(&options.DrainTimeout, "drain-timeout", options.DrainTimeout, "Maximum time to wait for a node to drain")
cmd.Flags().Int32Var(&options.ValidateCount, "validate-count", options.ValidateCount, "Number of times that a cluster needs to be validated after single node update")
cmd.Flags().DurationVar(&options.MasterInterval, "master-interval", options.MasterInterval, "Time to wait between restarting control plane nodes")
cmd.Flags().DurationVar(&options.NodeInterval, "node-interval", options.NodeInterval, "Time to wait between restarting worker nodes")
@ -336,6 +342,7 @@ func RunRollingUpdateCluster(ctx context.Context, f *util.Factory, out io.Writer
PostDrainDelay: options.PostDrainDelay,
ValidationTimeout: options.ValidationTimeout,
ValidateCount: int(options.ValidateCount),
DrainTimeout: options.DrainTimeout,
// TODO should we expose this to the UI?
ValidateTickDuration: 30 * time.Second,
ValidateSuccessDuration: 10 * time.Second,

View File

@ -58,6 +58,7 @@ kops rolling-update cluster [CLUSTER] [flags]
```
--bastion-interval duration Time to wait between restarting bastions (default 15s)
--cloudonly Perform rolling update without confirming progress with Kubernetes
--drain-timeout duration Maximum time to wait for a node to drain (default 15m0s)
--fail-on-drain-error Fail if draining a node fails (default true)
--fail-on-validate-error Fail if the cluster fails to validate (default true)
--force Force rolling update, even if no changes

View File

@ -62,6 +62,7 @@ It is recommended to keep using the `v1alpha2` API version.
* IPv6 pod subnets is in a working state using public IPv6 addresses for the Pod network. This works with both Cilium and Calico. IPv6 is still behind a feature flag until service controllers and addons implement support for IPv6. See [the IPv6 documentation](https://kops.sigs.k8s.io/networking/ipv6/).
* The `kops rolling-update cluster` command has a new `--drain-timeout` flag for specifying the maximum amount of time to wait when attempting to drain a node. Previously, rolling-updates would attempt to drain a node for an indefinite amount of time. If `--drain-timeout` is not specified, a default of 15 minutes is applied.
# Full change list since 1.22.0 release

View File

@ -624,12 +624,10 @@ func (c *RollingUpdateCluster) drainNode(u *cloudinstances.CloudInstance) error
IgnoreAllDaemonSets: true,
Out: os.Stdout,
ErrOut: os.Stderr,
Timeout: c.DrainTimeout,
// We want to proceed even when pods are using emptyDir volumes
DeleteEmptyDirData: true,
// Other options we might want to set:
// Timeout?
}
if err := drain.RunCordonOrUncordon(helper, u.Node, true); err != nil {

View File

@ -77,6 +77,9 @@ type RollingUpdateCluster struct {
// ValidateCount is the number of times that a cluster needs to be validated after a single node update.
ValidateCount int
// DrainTimeout is the maximum amount of time to wait while draining a node.
DrainTimeout time.Duration
}
// AdjustNeedUpdate adjusts the set of instances that need updating, using factors outside those known by the cloud implementation