From 86b0ef0d0c3b516718bc17da134a72556eef7720 Mon Sep 17 00:00:00 2001
From: Bronson Mirafuentes
Date: Thu, 20 Jan 2022 14:05:55 -0800
Subject: [PATCH] add drain-timeout flag to rolling-update cluster

---
 cmd/kops/rolling-update_cluster.go      | 7 +++++++
 docs/cli/kops_rolling-update_cluster.md | 1 +
 docs/releases/1.23-NOTES.md             | 1 +
 pkg/instancegroups/instancegroups.go    | 4 +---
 pkg/instancegroups/rollingupdate.go     | 3 +++
 5 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/cmd/kops/rolling-update_cluster.go b/cmd/kops/rolling-update_cluster.go
index fb4d1f664b..cfbb51e46a 100644
--- a/cmd/kops/rolling-update_cluster.go
+++ b/cmd/kops/rolling-update_cluster.go
@@ -107,6 +107,9 @@ type RollingUpdateOptions struct {
 	// does not validate, after a validation period.
 	FailOnValidate bool
 
+	// DrainTimeout is the maximum time to wait while draining a node.
+	DrainTimeout time.Duration
+
 	// PostDrainDelay is the duration of a pause after a drain operation
 	PostDrainDelay time.Duration
 
@@ -154,6 +157,8 @@ func (o *RollingUpdateOptions) InitDefaults() {
 	o.PostDrainDelay = 5 * time.Second
 	o.ValidationTimeout = 15 * time.Minute
 	o.ValidateCount = 2
+
+	o.DrainTimeout = 15 * time.Minute
 }
 
 func NewCmdRollingUpdateCluster(f *util.Factory, out io.Writer) *cobra.Command {
@@ -182,6 +187,7 @@ func NewCmdRollingUpdateCluster(f *util.Factory, out io.Writer) *cobra.Command {
 	cmd.Flags().BoolVar(&options.CloudOnly, "cloudonly", options.CloudOnly, "Perform rolling update without confirming progress with Kubernetes")
 
 	cmd.Flags().DurationVar(&options.ValidationTimeout, "validation-timeout", options.ValidationTimeout, "Maximum time to wait for a cluster to validate")
+	cmd.Flags().DurationVar(&options.DrainTimeout, "drain-timeout", options.DrainTimeout, "Maximum time to wait for a node to drain")
 	cmd.Flags().Int32Var(&options.ValidateCount, "validate-count", options.ValidateCount, "Number of times that a cluster needs to be validated after single node update")
 	cmd.Flags().DurationVar(&options.MasterInterval, "master-interval", options.MasterInterval, "Time to wait between restarting control plane nodes")
 	cmd.Flags().DurationVar(&options.NodeInterval, "node-interval", options.NodeInterval, "Time to wait between restarting worker nodes")
@@ -336,6 +342,7 @@ func RunRollingUpdateCluster(ctx context.Context, f *util.Factory, out io.Writer
 		PostDrainDelay:    options.PostDrainDelay,
 		ValidationTimeout: options.ValidationTimeout,
 		ValidateCount:     int(options.ValidateCount),
+		DrainTimeout:      options.DrainTimeout,
 
 		// TODO should we expose this to the UI?
 		ValidateTickDuration:    30 * time.Second,
 		ValidateSuccessDuration: 10 * time.Second,
diff --git a/docs/cli/kops_rolling-update_cluster.md b/docs/cli/kops_rolling-update_cluster.md
index 76746e4ce8..ee024f0f37 100644
--- a/docs/cli/kops_rolling-update_cluster.md
+++ b/docs/cli/kops_rolling-update_cluster.md
@@ -58,6 +58,7 @@ kops rolling-update cluster [CLUSTER] [flags]
 ```
       --bastion-interval duration    Time to wait between restarting bastions (default 15s)
       --cloudonly                    Perform rolling update without confirming progress with Kubernetes
+      --drain-timeout duration       Maximum time to wait for a node to drain (default 15m0s)
       --fail-on-drain-error          Fail if draining a node fails (default true)
       --fail-on-validate-error       Fail if the cluster fails to validate (default true)
       --force                        Force rolling update, even if no changes
diff --git a/docs/releases/1.23-NOTES.md b/docs/releases/1.23-NOTES.md
index 3d28993a3f..4208ceb13e 100644
--- a/docs/releases/1.23-NOTES.md
+++ b/docs/releases/1.23-NOTES.md
@@ -62,6 +62,7 @@ It is recommended to keep using the `v1alpha2` API version.
 * IPv6 pod subnets is in a working state using public IPv6 addresses for the Pod network. This works with both Cilium and Calico. IPv6 is still behind a feature flag until service controllers and addons implement support for IPv6. See [the IPv6 documentation](https://kops.sigs.k8s.io/networking/ipv6/).
+* The `kops rolling-update cluster` command has a new `--drain-timeout` flag for specifying the maximum amount of time to wait when attempting to drain a node. Previously, rolling-updates would attempt to drain a node for an indefinite amount of time. If `--drain-timeout` is not specified, a default of 15 minutes is applied.
 
 # Full change list since 1.22.0 release
diff --git a/pkg/instancegroups/instancegroups.go b/pkg/instancegroups/instancegroups.go
index 4ecc55f6ab..712021d818 100644
--- a/pkg/instancegroups/instancegroups.go
+++ b/pkg/instancegroups/instancegroups.go
@@ -624,12 +624,10 @@ func (c *RollingUpdateCluster) drainNode(u *cloudinstances.CloudInstance) error
 		IgnoreAllDaemonSets: true,
 		Out:                 os.Stdout,
 		ErrOut:              os.Stderr,
+		Timeout:             c.DrainTimeout,
 
 		// We want to proceed even when pods are using emptyDir volumes
 		DeleteEmptyDirData: true,
-
-		// Other options we might want to set:
-		// Timeout?
 	}
 
 	if err := drain.RunCordonOrUncordon(helper, u.Node, true); err != nil {
diff --git a/pkg/instancegroups/rollingupdate.go b/pkg/instancegroups/rollingupdate.go
index 28ebbca32d..079bbf4a10 100644
--- a/pkg/instancegroups/rollingupdate.go
+++ b/pkg/instancegroups/rollingupdate.go
@@ -77,6 +77,9 @@ type RollingUpdateCluster struct {
 
 	// ValidateCount is the amount of time that a cluster needs to be validated after single node update
 	ValidateCount int
+
+	// DrainTimeout is the maximum amount of time to wait while draining a node.
+	DrainTimeout time.Duration
 }
 
 // AdjustNeedUpdate adjusts the set of instances that need updating, using factors outside those known by the cloud implementation
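
Usage sketch (not part of the patch itself; the cluster name and the 30m value below are illustrative): with this change applied, a rolling update that gives each node up to 30 minutes to drain could be run as

    kops rolling-update cluster my-cluster.example.com --drain-timeout 30m --yes

If the flag is omitted, InitDefaults applies the 15-minute default added above, so existing invocations change behavior only in that a previously unbounded drain now times out after 15 minutes.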