add option to keep node group backoff on OutOfResource error

This commit is contained in:
Will Bowers 2023-05-11 19:04:27 -07:00
parent 6c14a3a3cd
commit aa1af03862
5 changed files with 21 additions and 1 deletions

View File

@ -86,6 +86,8 @@ type ClusterStateRegistryConfig struct {
// Minimum number of nodes that must be unready for MaxTotalUnreadyPercentage to apply.
// This is to ensure that in very small clusters (e.g. 2 nodes) a single node's failure doesn't disable autoscaling.
OkTotalUnreadyCount int
// NodeGroupKeepBackoffOutOfResources, when true, prevents a node group's backoff from being removed before it expires if the backoff was caused by the cloud provider being out of resources.
NodeGroupKeepBackoffOutOfResources bool
}
// IncorrectNodeGroupSize contains information about how much the current size of the node group
@ -264,7 +266,11 @@ func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) {
// scale-out finished successfully
// remove it and reset node group backoff
delete(csr.scaleUpRequests, nodeGroupName)
csr.backoff.RemoveBackoff(scaleUpRequest.NodeGroup, csr.nodeInfosForGroups[scaleUpRequest.NodeGroup.Id()])
shouldKeepBackoff := csr.config.NodeGroupKeepBackoffOutOfResources && csr.backoff.IsNodeGroupOutOfResources(scaleUpRequest.NodeGroup)
if !shouldKeepBackoff {
klog.V(4).Infof("Removing backoff for node group %v", scaleUpRequest.NodeGroup.Id())
csr.backoff.RemoveBackoff(scaleUpRequest.NodeGroup, csr.nodeInfosForGroups[scaleUpRequest.NodeGroup.Id()])
}
klog.V(4).Infof("Scale up in group %v finished successfully in %v",
nodeGroupName, currentTime.Sub(scaleUpRequest.Time))
continue

View File

@ -249,6 +249,8 @@ type AutoscalingOptions struct {
MaxNodeGroupBackoffDuration time.Duration
// NodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.
NodeGroupBackoffResetTimeout time.Duration
// NodeGroupKeepBackoffOutOfResources, when true, prevents a node group's backoff from being removed before it expires if the backoff was caused by the cloud provider being out of resources.
NodeGroupKeepBackoffOutOfResources bool
// MaxScaleDownParallelism is the maximum number of nodes (both empty and needing drain) that can be deleted in parallel.
MaxScaleDownParallelism int
// MaxDrainParallelism is the maximum number of nodes needing drain, that can be drained and deleted in parallel.

View File

@ -231,6 +231,7 @@ var (
"maxNodeGroupBackoffDuration is the maximum backoff duration for a NodeGroup after new nodes failed to start.")
nodeGroupBackoffResetTimeout = flag.Duration("node-group-backoff-reset-timeout", 3*time.Hour,
"nodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.")
nodeGroupKeepBackoffOutOfResources = flag.Bool("node-group-keep-backoff-out-of-resources", false, "Prevents removal of backoff before expiration when a scale-up fails due to the cloud provider being out of resources.")
maxScaleDownParallelismFlag = flag.Int("max-scale-down-parallelism", 10, "Maximum number of nodes (both empty and needing drain) that can be deleted in parallel.")
maxDrainParallelismFlag = flag.Int("max-drain-parallelism", 1, "Maximum number of nodes needing drain, that can be drained and deleted in parallel.")
recordDuplicatedEvents = flag.Bool("record-duplicated-events", false, "enable duplication of similar events within a 5 minute window.")
@ -406,6 +407,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
InitialNodeGroupBackoffDuration: *initialNodeGroupBackoffDuration,
MaxNodeGroupBackoffDuration: *maxNodeGroupBackoffDuration,
NodeGroupBackoffResetTimeout: *nodeGroupBackoffResetTimeout,
NodeGroupKeepBackoffOutOfResources: *nodeGroupKeepBackoffOutOfResources,
MaxScaleDownParallelism: *maxScaleDownParallelismFlag,
MaxDrainParallelism: *maxDrainParallelismFlag,
RecordDuplicatedEvents: *recordDuplicatedEvents,

View File

@ -39,4 +39,6 @@ type Backoff interface {
RemoveBackoff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo)
// RemoveStaleBackoffData removes stale backoff data.
RemoveStaleBackoffData(currentTime time.Time)
// IsNodeGroupOutOfResources returns true if the given node group is out of resources.
IsNodeGroupOutOfResources(nodeGroup cloudprovider.NodeGroup) bool
}

View File

@ -38,6 +38,7 @@ type exponentialBackoffInfo struct {
backoffUntil time.Time
lastFailedExecution time.Time
errorInfo cloudprovider.InstanceErrorInfo
errorClass cloudprovider.InstanceErrorClass
}
// NewExponentialBackoff creates an instance of exponential backoff.
@ -89,6 +90,7 @@ func (b *exponentialBackoff) Backoff(nodeGroup cloudprovider.NodeGroup, nodeInfo
backoffUntil: backoffUntil,
lastFailedExecution: currentTime,
errorInfo: errorInfo,
errorClass: errorClass,
}
return backoffUntil
}
@ -118,3 +120,9 @@ func (b *exponentialBackoff) RemoveStaleBackoffData(currentTime time.Time) {
}
}
}
// IsNodeGroupOutOfResources reports whether a backoff entry is currently recorded
// for the given node group and that entry's error class is
// cloudprovider.OutOfResourcesErrorClass. It returns false when no entry exists
// for the node group.
func (b *exponentialBackoff) IsNodeGroupOutOfResources(nodeGroup cloudprovider.NodeGroup) bool {
	key := b.nodeGroupKey(nodeGroup)
	if info, ok := b.backoffInfo[key]; ok {
		return info.errorClass == cloudprovider.OutOfResourcesErrorClass
	}
	return false
}