From 86b0ef0d0c3b516718bc17da134a72556eef7720 Mon Sep 17 00:00:00 2001
From: Bronson Mirafuentes
Date: Thu, 20 Jan 2022 14:05:55 -0800
Subject: [PATCH] add drain-timeout flag to rolling-update cluster

---
 cmd/kops/rolling-update_cluster.go      | 7 +++++++
 docs/cli/kops_rolling-update_cluster.md | 1 +
 docs/releases/1.23-NOTES.md             | 1 +
 pkg/instancegroups/instancegroups.go    | 4 +---
 pkg/instancegroups/rollingupdate.go     | 3 +++
 5 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/cmd/kops/rolling-update_cluster.go b/cmd/kops/rolling-update_cluster.go
index fb4d1f664b..cfbb51e46a 100644
--- a/cmd/kops/rolling-update_cluster.go
+++ b/cmd/kops/rolling-update_cluster.go
@@ -107,6 +107,9 @@ type RollingUpdateOptions struct {
 	// does not validate, after a validation period.
 	FailOnValidate bool
 
+	// DrainTimeout is the maximum time to wait while draining a node.
+	DrainTimeout time.Duration
+
 	// PostDrainDelay is the duration of a pause after a drain operation
 	PostDrainDelay time.Duration
 
@@ -154,6 +157,8 @@ func (o *RollingUpdateOptions) InitDefaults() {
 	o.PostDrainDelay = 5 * time.Second
 	o.ValidationTimeout = 15 * time.Minute
 	o.ValidateCount = 2
+
+	o.DrainTimeout = 15 * time.Minute
 }
 
 func NewCmdRollingUpdateCluster(f *util.Factory, out io.Writer) *cobra.Command {
@@ -182,6 +187,7 @@ func NewCmdRollingUpdateCluster(f *util.Factory, out io.Writer) *cobra.Command {
 	cmd.Flags().BoolVar(&options.CloudOnly, "cloudonly", options.CloudOnly, "Perform rolling update without confirming progress with Kubernetes")
 
 	cmd.Flags().DurationVar(&options.ValidationTimeout, "validation-timeout", options.ValidationTimeout, "Maximum time to wait for a cluster to validate")
+	cmd.Flags().DurationVar(&options.DrainTimeout, "drain-timeout", options.DrainTimeout, "Maximum time to wait for a node to drain")
 	cmd.Flags().Int32Var(&options.ValidateCount, "validate-count", options.ValidateCount, "Number of times that a cluster needs to be validated after single node update")
 	cmd.Flags().DurationVar(&options.MasterInterval, "master-interval", options.MasterInterval, "Time to wait between restarting control plane nodes")
 	cmd.Flags().DurationVar(&options.NodeInterval, "node-interval", options.NodeInterval, "Time to wait between restarting worker nodes")
@@ -336,6 +342,7 @@ func RunRollingUpdateCluster(ctx context.Context, f *util.Factory, out io.Writer
 		PostDrainDelay:    options.PostDrainDelay,
 		ValidationTimeout: options.ValidationTimeout,
 		ValidateCount:     int(options.ValidateCount),
+		DrainTimeout:      options.DrainTimeout,
 
 		// TODO should we expose this to the UI?
 		ValidateTickDuration:    30 * time.Second,
 		ValidateSuccessDuration: 10 * time.Second,
diff --git a/docs/cli/kops_rolling-update_cluster.md b/docs/cli/kops_rolling-update_cluster.md
index 76746e4ce8..ee024f0f37 100644
--- a/docs/cli/kops_rolling-update_cluster.md
+++ b/docs/cli/kops_rolling-update_cluster.md
@@ -58,6 +58,7 @@ kops rolling-update cluster [CLUSTER] [flags]
 ```
       --bastion-interval duration    Time to wait between restarting bastions (default 15s)
       --cloudonly                    Perform rolling update without confirming progress with Kubernetes
+      --drain-timeout duration       Maximum time to wait for a node to drain (default 15m0s)
       --fail-on-drain-error          Fail if draining a node fails (default true)
       --fail-on-validate-error       Fail if the cluster fails to validate (default true)
       --force                        Force rolling update, even if no changes
diff --git a/docs/releases/1.23-NOTES.md b/docs/releases/1.23-NOTES.md
index 3d28993a3f..4208ceb13e 100644
--- a/docs/releases/1.23-NOTES.md
+++ b/docs/releases/1.23-NOTES.md
@@ -62,6 +62,7 @@ It is recommended to keep using the `v1alpha2` API version.
 * IPv6 pod subnets is in a working state using public IPv6 addresses for the Pod network. This works with both Cilium and Calico. IPv6 is still behind a feature flag until service controllers and addons implement support for IPv6. See [the IPv6 documentation](https://kops.sigs.k8s.io/networking/ipv6/).
+* The `kops rolling-update cluster` command has a new `--drain-timeout` flag for specifying the maximum amount of time to wait when attempting to drain a node. Previously, rolling-updates would attempt to drain a node for an indefinite amount of time. If `--drain-timeout` is not specified, a default of 15 minutes is applied.
 
 # Full change list since 1.22.0 release
diff --git a/pkg/instancegroups/instancegroups.go b/pkg/instancegroups/instancegroups.go
index 4ecc55f6ab..712021d818 100644
--- a/pkg/instancegroups/instancegroups.go
+++ b/pkg/instancegroups/instancegroups.go
@@ -624,12 +624,10 @@ func (c *RollingUpdateCluster) drainNode(u *cloudinstances.CloudInstance) error
 		IgnoreAllDaemonSets: true,
 		Out:                 os.Stdout,
 		ErrOut:              os.Stderr,
+		Timeout:             c.DrainTimeout,
 
 		// We want to proceed even when pods are using emptyDir volumes
 		DeleteEmptyDirData: true,
-
-		// Other options we might want to set:
-		// Timeout?
 	}
 
 	if err := drain.RunCordonOrUncordon(helper, u.Node, true); err != nil {
diff --git a/pkg/instancegroups/rollingupdate.go b/pkg/instancegroups/rollingupdate.go
index 28ebbca32d..079bbf4a10 100644
--- a/pkg/instancegroups/rollingupdate.go
+++ b/pkg/instancegroups/rollingupdate.go
@@ -77,6 +77,9 @@ type RollingUpdateCluster struct {
 
 	// ValidateCount is the amount of time that a cluster needs to be validated after single node update
 	ValidateCount int
+
+	// DrainTimeout is the maximum amount of time to wait while draining a node.
+	DrainTimeout time.Duration
 }
 
 // AdjustNeedUpdate adjusts the set of instances that need updating, using factors outside those known by the cloud implementation
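
Usage sketch (not part of the patch itself; the cluster name and the 30m value below are illustrative): with this change applied, a rolling update that gives each node up to 30 minutes to drain could be run as

    kops rolling-update cluster my-cluster.example.com --drain-timeout 30m --yes

If the flag is omitted, InitDefaults applies the 15-minute default added above, so existing invocations change behavior only in that a previously unbounded drain now times out after 15 minutes.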