add option to keep node group backoff on OutOfResource error

2023-05-11 19:04:27 -07:00 · 2023-05-11 19:04:27 -07:00 · aa1af03862
parent 6c14a3a3cd
commit aa1af03862
5 changed files with 21 additions and 1 deletions
--- a/cluster-autoscaler/clusterstate/clusterstate.go
+++ b/cluster-autoscaler/clusterstate/clusterstate.go
@@ -86,6 +86,8 @@ type ClusterStateRegistryConfig struct {
 	// Minimum number of nodes that must be unready for MaxTotalUnreadyPercentage to apply.
 	// This is to ensure that in very small clusters (e.g. 2 nodes) a single node's failure doesn't disable autoscaling.
 	OkTotalUnreadyCount int
+	// NodeGroupKeepBackoffOutOfResources, when true, prevents a node group's backoff from being removed before it expires if the backoff was caused by the cloud provider being out of resources.
+	NodeGroupKeepBackoffOutOfResources bool
 }

 // IncorrectNodeGroupSize contains information about how much the current size of the node group
@@ -264,7 +266,11 @@ func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) {
 			// scale-out finished successfully
 			// remove it and reset node group backoff
 			delete(csr.scaleUpRequests, nodeGroupName)
-			csr.backoff.RemoveBackoff(scaleUpRequest.NodeGroup, csr.nodeInfosForGroups[scaleUpRequest.NodeGroup.Id()])
+			shouldKeepBackoff := csr.config.NodeGroupKeepBackoffOutOfResources && csr.backoff.IsNodeGroupOutOfResources(scaleUpRequest.NodeGroup)
+			if !shouldKeepBackoff {
+				klog.V(4).Infof("Removing backoff for node group %v", scaleUpRequest.NodeGroup.Id())
+				csr.backoff.RemoveBackoff(scaleUpRequest.NodeGroup, csr.nodeInfosForGroups[scaleUpRequest.NodeGroup.Id()])
+			}
 			klog.V(4).Infof("Scale up in group %v finished successfully in %v",
 				nodeGroupName, currentTime.Sub(scaleUpRequest.Time))
 			continue
--- a/cluster-autoscaler/config/autoscaling_options.go
+++ b/cluster-autoscaler/config/autoscaling_options.go
@@ -249,6 +249,8 @@ type AutoscalingOptions struct {
 	MaxNodeGroupBackoffDuration time.Duration
 	// NodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.
 	NodeGroupBackoffResetTimeout time.Duration
+	// NodeGroupKeepBackoffOutOfResources, when true, prevents a node group's backoff from being removed before it expires if the backoff was caused by the cloud provider being out of resources.
+	NodeGroupKeepBackoffOutOfResources bool
 	// MaxScaleDownParallelism is the maximum number of nodes (both empty and needing drain) that can be deleted in parallel.
 	MaxScaleDownParallelism int
 	// MaxDrainParallelism is the maximum number of nodes needing drain, that can be drained and deleted in parallel.
--- a/cluster-autoscaler/main.go
+++ b/cluster-autoscaler/main.go
@@ -231,6 +231,7 @@ var (
 		"maxNodeGroupBackoffDuration is the maximum backoff duration for a NodeGroup after new nodes failed to start.")
 	nodeGroupBackoffResetTimeout = flag.Duration("node-group-backoff-reset-timeout", 3*time.Hour,
 		"nodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.")
+	nodeGroupKeepBackoffOutOfResources      = flag.Bool("node-group-keep-backoff-out-of-resources", false, "Prevents removal of backoff before expiration when a scale-up fails due to the cloud provider being out of resources.")
 	maxScaleDownParallelismFlag             = flag.Int("max-scale-down-parallelism", 10, "Maximum number of nodes (both empty and needing drain) that can be deleted in parallel.")
 	maxDrainParallelismFlag                 = flag.Int("max-drain-parallelism", 1, "Maximum number of nodes needing drain, that can be drained and deleted in parallel.")
 	recordDuplicatedEvents                  = flag.Bool("record-duplicated-events", false, "enable duplication of similar events within a 5 minute window.")
@@ -406,6 +407,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
 		InitialNodeGroupBackoffDuration:    *initialNodeGroupBackoffDuration,
 		MaxNodeGroupBackoffDuration:        *maxNodeGroupBackoffDuration,
 		NodeGroupBackoffResetTimeout:       *nodeGroupBackoffResetTimeout,
+		NodeGroupKeepBackoffOutOfResources: *nodeGroupKeepBackoffOutOfResources,
 		MaxScaleDownParallelism:            *maxScaleDownParallelismFlag,
 		MaxDrainParallelism:                *maxDrainParallelismFlag,
 		RecordDuplicatedEvents:             *recordDuplicatedEvents,
--- a/cluster-autoscaler/utils/backoff/backoff.go
+++ b/cluster-autoscaler/utils/backoff/backoff.go
@@ -39,4 +39,6 @@ type Backoff interface {
 	RemoveBackoff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo)
 	// RemoveStaleBackoffData removes stale backoff data.
 	RemoveStaleBackoffData(currentTime time.Time)
+	// IsNodeGroupOutOfResources returns true if the given node group is out of resources.
+	IsNodeGroupOutOfResources(nodeGroup cloudprovider.NodeGroup) bool
 }
--- a/cluster-autoscaler/utils/backoff/exponential_backoff.go
+++ b/cluster-autoscaler/utils/backoff/exponential_backoff.go
@@ -38,6 +38,7 @@ type exponentialBackoffInfo struct {
 	backoffUntil        time.Time
 	lastFailedExecution time.Time
 	errorInfo           cloudprovider.InstanceErrorInfo
+	errorClass          cloudprovider.InstanceErrorClass
 }

 // NewExponentialBackoff creates an instance of exponential backoff.
@@ -89,6 +90,7 @@ func (b *exponentialBackoff) Backoff(nodeGroup cloudprovider.NodeGroup, nodeInfo
 		backoffUntil:        backoffUntil,
 		lastFailedExecution: currentTime,
 		errorInfo:           errorInfo,
+		errorClass:          errorClass,
 	}
 	return backoffUntil
 }
@@ -118,3 +120,9 @@ func (b *exponentialBackoff) RemoveStaleBackoffData(currentTime time.Time) {
 		}
 	}
 }
+
+// IsNodeGroupOutOfResources reports whether the backoff entry stored for nodeGroup has the out-of-resources error class; it is false when no entry is stored.
+func (b *exponentialBackoff) IsNodeGroupOutOfResources(nodeGroup cloudprovider.NodeGroup) bool {
+	info, ok := b.backoffInfo[b.nodeGroupKey(nodeGroup)]
+	return ok && info.errorClass == cloudprovider.OutOfResourcesErrorClass
+}