add option to keep node group backoff on OutOfResource error
This commit is contained in:
parent
6c14a3a3cd
commit
aa1af03862
|
@ -86,6 +86,8 @@ type ClusterStateRegistryConfig struct {
|
|||
// Minimum number of nodes that must be unready for MaxTotalUnreadyPercentage to apply.
|
||||
// This is to ensure that in very small clusters (e.g. 2 nodes) a single node's failure doesn't disable autoscaling.
|
||||
OkTotalUnreadyCount int
|
||||
// NodeGroupKeepBackoffOutOfResources, when true, prevents a node group's backoff from being removed before it expires if the scale-up failed because the cloud provider was out of resources.
|
||||
NodeGroupKeepBackoffOutOfResources bool
|
||||
}
|
||||
|
||||
// IncorrectNodeGroupSize contains information about how much the current size of the node group
|
||||
|
@ -264,7 +266,11 @@ func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) {
|
|||
// scale-out finished successfully
|
||||
// remove it and reset node group backoff
|
||||
delete(csr.scaleUpRequests, nodeGroupName)
|
||||
csr.backoff.RemoveBackoff(scaleUpRequest.NodeGroup, csr.nodeInfosForGroups[scaleUpRequest.NodeGroup.Id()])
|
||||
shouldKeepBackoff := csr.config.NodeGroupKeepBackoffOutOfResources && csr.backoff.IsNodeGroupOutOfResources(scaleUpRequest.NodeGroup)
|
||||
if !shouldKeepBackoff {
|
||||
klog.V(4).Infof("Removing backoff for node group %v", scaleUpRequest.NodeGroup.Id())
|
||||
csr.backoff.RemoveBackoff(scaleUpRequest.NodeGroup, csr.nodeInfosForGroups[scaleUpRequest.NodeGroup.Id()])
|
||||
}
|
||||
klog.V(4).Infof("Scale up in group %v finished successfully in %v",
|
||||
nodeGroupName, currentTime.Sub(scaleUpRequest.Time))
|
||||
continue
|
||||
|
|
|
@ -249,6 +249,8 @@ type AutoscalingOptions struct {
|
|||
MaxNodeGroupBackoffDuration time.Duration
|
||||
// NodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.
|
||||
NodeGroupBackoffResetTimeout time.Duration
|
||||
// NodeGroupKeepBackoffOutOfResources, when true, prevents a node group's backoff from being removed before it expires if the scale-up failed because the cloud provider was out of resources.
|
||||
NodeGroupKeepBackoffOutOfResources bool
|
||||
// MaxScaleDownParallelism is the maximum number of nodes (both empty and needing drain) that can be deleted in parallel.
|
||||
MaxScaleDownParallelism int
|
||||
// MaxDrainParallelism is the maximum number of nodes needing drain, that can be drained and deleted in parallel.
|
||||
|
|
|
@ -231,6 +231,7 @@ var (
|
|||
"maxNodeGroupBackoffDuration is the maximum backoff duration for a NodeGroup after new nodes failed to start.")
|
||||
nodeGroupBackoffResetTimeout = flag.Duration("node-group-backoff-reset-timeout", 3*time.Hour,
|
||||
"nodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.")
|
||||
nodeGroupKeepBackoffOutOfResources = flag.Bool("node-group-keep-backoff-out-of-resources", false, "Prevents removal of backoff before expiration when a scale-up fails due to the cloud provider being out of resources.")
|
||||
maxScaleDownParallelismFlag = flag.Int("max-scale-down-parallelism", 10, "Maximum number of nodes (both empty and needing drain) that can be deleted in parallel.")
|
||||
maxDrainParallelismFlag = flag.Int("max-drain-parallelism", 1, "Maximum number of nodes needing drain, that can be drained and deleted in parallel.")
|
||||
recordDuplicatedEvents = flag.Bool("record-duplicated-events", false, "enable duplication of similar events within a 5 minute window.")
|
||||
|
@ -406,6 +407,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
|
|||
InitialNodeGroupBackoffDuration: *initialNodeGroupBackoffDuration,
|
||||
MaxNodeGroupBackoffDuration: *maxNodeGroupBackoffDuration,
|
||||
NodeGroupBackoffResetTimeout: *nodeGroupBackoffResetTimeout,
|
||||
NodeGroupKeepBackoffOutOfResources: *nodeGroupKeepBackoffOutOfResources,
|
||||
MaxScaleDownParallelism: *maxScaleDownParallelismFlag,
|
||||
MaxDrainParallelism: *maxDrainParallelismFlag,
|
||||
RecordDuplicatedEvents: *recordDuplicatedEvents,
|
||||
|
|
|
@ -39,4 +39,6 @@ type Backoff interface {
|
|||
RemoveBackoff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo)
|
||||
// RemoveStaleBackoffData removes stale backoff data.
|
||||
RemoveStaleBackoffData(currentTime time.Time)
|
||||
// IsNodeGroupOutOfResources returns true if the given node group is out of resources.
|
||||
IsNodeGroupOutOfResources(nodeGroup cloudprovider.NodeGroup) bool
|
||||
}
|
||||
|
|
|
@ -38,6 +38,7 @@ type exponentialBackoffInfo struct {
|
|||
backoffUntil time.Time
|
||||
lastFailedExecution time.Time
|
||||
errorInfo cloudprovider.InstanceErrorInfo
|
||||
errorClass cloudprovider.InstanceErrorClass
|
||||
}
|
||||
|
||||
// NewExponentialBackoff creates an instance of exponential backoff.
|
||||
|
@ -89,6 +90,7 @@ func (b *exponentialBackoff) Backoff(nodeGroup cloudprovider.NodeGroup, nodeInfo
|
|||
backoffUntil: backoffUntil,
|
||||
lastFailedExecution: currentTime,
|
||||
errorInfo: errorInfo,
|
||||
errorClass: errorClass,
|
||||
}
|
||||
return backoffUntil
|
||||
}
|
||||
|
@ -118,3 +120,9 @@ func (b *exponentialBackoff) RemoveStaleBackoffData(currentTime time.Time) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// IsNodeGroupOutOfResources returns true if the given node group is out of resources.
|
||||
func (b *exponentialBackoff) IsNodeGroupOutOfResources(nodeGroup cloudprovider.NodeGroup) bool {
|
||||
backoffInfo, found := b.backoffInfo[b.nodeGroupKey(nodeGroup)]
|
||||
return found && backoffInfo.errorClass == cloudprovider.OutOfResourcesErrorClass
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue