Add error details to autoscaling backoff.

Change-Id: I3b5c62ba13c2e048ce2d7170016af07182c11eee
Walid Ghallab 2023-11-29 20:50:47 +00:00
parent 8f75e9cf2b
commit cf6176f80d
7 changed files with 165 additions and 74 deletions


@@ -143,6 +143,13 @@ type ClusterStateRegistry struct {
scaleUpFailures map[string][]ScaleUpFailure
}
// NodeGroupScalingSafety contains information about the safety of the node group to scale up/down.
type NodeGroupScalingSafety struct {
SafeToScale bool
Healthy bool
BackoffStatus backoff.Status
}
// NewClusterStateRegistry creates new ClusterStateRegistry.
func NewClusterStateRegistry(cloudProvider cloudprovider.CloudProvider, config ClusterStateRegistryConfig, logRecorder *utils.LogEventRecorder, backoff backoff.Backoff, nodeGroupConfigProcessor nodegroupconfig.NodeGroupConfigProcessor) *ClusterStateRegistry {
emptyStatus := &api.ClusterAutoscalerStatus{
@@ -276,7 +283,11 @@ func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) {
} else {
gpuResource, gpuType = gpu.GetGpuInfoForMetrics(csr.cloudProvider.GetNodeGpuConfig(nodeInfo.Node()), availableGPUTypes, nodeInfo.Node(), scaleUpRequest.NodeGroup)
}
csr.registerFailedScaleUpNoLock(scaleUpRequest.NodeGroup, metrics.Timeout, cloudprovider.OtherErrorClass, "timeout", gpuResource, gpuType, currentTime)
csr.registerFailedScaleUpNoLock(scaleUpRequest.NodeGroup, metrics.Timeout, cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OtherErrorClass,
ErrorCode: "timeout",
ErrorMessage: fmt.Sprintf("Scale-up timed out for node group %v after %v", nodeGroupName, currentTime.Sub(scaleUpRequest.Time)),
}, gpuResource, gpuType, currentTime)
delete(csr.scaleUpRequests, nodeGroupName)
}
}
@@ -291,25 +302,29 @@ func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) {
}
// To be executed under a lock.
func (csr *ClusterStateRegistry) backoffNodeGroup(nodeGroup cloudprovider.NodeGroup, errorClass cloudprovider.InstanceErrorClass, errorCode string, currentTime time.Time) {
func (csr *ClusterStateRegistry) backoffNodeGroup(nodeGroup cloudprovider.NodeGroup, errorInfo cloudprovider.InstanceErrorInfo, currentTime time.Time) {
nodeGroupInfo := csr.nodeInfosForGroups[nodeGroup.Id()]
backoffUntil := csr.backoff.Backoff(nodeGroup, nodeGroupInfo, errorClass, errorCode, currentTime)
klog.Warningf("Disabling scale-up for node group %v until %v; errorClass=%v; errorCode=%v", nodeGroup.Id(), backoffUntil, errorClass, errorCode)
backoffUntil := csr.backoff.Backoff(nodeGroup, nodeGroupInfo, errorInfo, currentTime)
klog.Warningf("Disabling scale-up for node group %v until %v; errorClass=%v; errorCode=%v", nodeGroup.Id(), backoffUntil, errorInfo.ErrorClass, errorInfo.ErrorCode)
}
// RegisterFailedScaleUp should be called after getting error from cloudprovider
// when trying to scale-up node group. It will mark this group as not safe to autoscale
// for some time.
func (csr *ClusterStateRegistry) RegisterFailedScaleUp(nodeGroup cloudprovider.NodeGroup, reason metrics.FailedScaleUpReason, gpuResourceName, gpuType string, currentTime time.Time) {
func (csr *ClusterStateRegistry) RegisterFailedScaleUp(nodeGroup cloudprovider.NodeGroup, reason metrics.FailedScaleUpReason, errorMessage, gpuResourceName, gpuType string, currentTime time.Time) {
csr.Lock()
defer csr.Unlock()
csr.registerFailedScaleUpNoLock(nodeGroup, reason, cloudprovider.OtherErrorClass, string(reason), gpuResourceName, gpuType, currentTime)
csr.registerFailedScaleUpNoLock(nodeGroup, reason, cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OtherErrorClass,
ErrorCode: string(reason),
ErrorMessage: errorMessage,
}, gpuResourceName, gpuType, currentTime)
}
func (csr *ClusterStateRegistry) registerFailedScaleUpNoLock(nodeGroup cloudprovider.NodeGroup, reason metrics.FailedScaleUpReason, errorClass cloudprovider.InstanceErrorClass, errorCode string, gpuResourceName, gpuType string, currentTime time.Time) {
func (csr *ClusterStateRegistry) registerFailedScaleUpNoLock(nodeGroup cloudprovider.NodeGroup, reason metrics.FailedScaleUpReason, errorInfo cloudprovider.InstanceErrorInfo, gpuResourceName, gpuType string, currentTime time.Time) {
csr.scaleUpFailures[nodeGroup.Id()] = append(csr.scaleUpFailures[nodeGroup.Id()], ScaleUpFailure{NodeGroup: nodeGroup, Reason: reason, Time: currentTime})
metrics.RegisterFailedScaleUp(reason, gpuResourceName, gpuType)
csr.backoffNodeGroup(nodeGroup, errorClass, errorCode, currentTime)
csr.backoffNodeGroup(nodeGroup, errorInfo, currentTime)
}
// UpdateNodes updates the state of the nodes in the ClusterStateRegistry and recalculates the stats
@@ -414,7 +429,6 @@ func (csr *ClusterStateRegistry) IsNodeGroupHealthy(nodeGroupName string) bool {
unjustifiedUnready += acceptable.MinNodes - len(readiness.Ready)
}
// TODO: verify against max nodes as well.
if unjustifiedUnready > csr.config.OkTotalUnreadyCount &&
float64(unjustifiedUnready) > csr.config.MaxTotalUnreadyPercentage/100.0*
float64(len(readiness.Ready)+len(readiness.Unready)+len(readiness.NotStarted)) {
@@ -441,12 +455,11 @@ func (csr *ClusterStateRegistry) updateNodeGroupMetrics() {
metrics.UpdateNodeGroupsCount(autoscaled, autoprovisioned)
}
// IsNodeGroupSafeToScaleUp returns true if node group can be scaled up now.
func (csr *ClusterStateRegistry) IsNodeGroupSafeToScaleUp(nodeGroup cloudprovider.NodeGroup, now time.Time) bool {
if !csr.IsNodeGroupHealthy(nodeGroup.Id()) {
return false
}
return !csr.backoff.IsBackedOff(nodeGroup, csr.nodeInfosForGroups[nodeGroup.Id()], now)
// IsNodeGroupSafeToScaleUp returns information about whether the node group is currently safe to scale up.
func (csr *ClusterStateRegistry) IsNodeGroupSafeToScaleUp(nodeGroup cloudprovider.NodeGroup, now time.Time) NodeGroupScalingSafety {
isHealthy := csr.IsNodeGroupHealthy(nodeGroup.Id())
backoffStatus := csr.backoff.BackoffStatus(nodeGroup, csr.nodeInfosForGroups[nodeGroup.Id()], now)
return NodeGroupScalingSafety{SafeToScale: isHealthy && !backoffStatus.IsBackedOff, Healthy: isHealthy, BackoffStatus: backoffStatus}
}
func (csr *ClusterStateRegistry) getProvisionedAndTargetSizesForNodeGroup(nodeGroupName string) (provisioned, target int, ok bool) {
@@ -788,7 +801,7 @@ func (csr *ClusterStateRegistry) GetClusterReadiness() Readiness {
return csr.totalReadiness
}
func buildHealthStatusNodeGroup(isReady bool, readiness Readiness, acceptable AcceptableRange, minSize, maxSize int) api.ClusterAutoscalerCondition {
func buildHealthStatusNodeGroup(isHealthy bool, readiness Readiness, acceptable AcceptableRange, minSize, maxSize int) api.ClusterAutoscalerCondition {
condition := api.ClusterAutoscalerCondition{
Type: api.ClusterAutoscalerHealth,
Message: fmt.Sprintf("ready=%d unready=%d (resourceUnready=%d) notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d cloudProviderTarget=%d (minSize=%d, maxSize=%d)",
@@ -803,7 +816,7 @@ func buildHealthStatusNodeGroup(isReady bool, readiness Readiness, acceptable Ac
maxSize),
LastProbeTime: metav1.Time{Time: readiness.Time},
}
if isReady {
if isHealthy {
condition.Status = api.ClusterAutoscalerHealthy
} else {
condition.Status = api.ClusterAutoscalerUnhealthy
@@ -811,7 +824,7 @@ func buildHealthStatusNodeGroup(isReady bool, readiness Readiness, acceptable Ac
return condition
}
func buildScaleUpStatusNodeGroup(isScaleUpInProgress bool, isSafeToScaleUp bool, readiness Readiness, acceptable AcceptableRange) api.ClusterAutoscalerCondition {
func buildScaleUpStatusNodeGroup(isScaleUpInProgress bool, scaleUpSafety NodeGroupScalingSafety, readiness Readiness, acceptable AcceptableRange) api.ClusterAutoscalerCondition {
condition := api.ClusterAutoscalerCondition{
Type: api.ClusterAutoscalerScaleUp,
Message: fmt.Sprintf("ready=%d cloudProviderTarget=%d",
@@ -821,7 +834,9 @@ func buildScaleUpStatusNodeGroup(isScaleUpInProgress bool, isSafeToScaleUp bool,
}
if isScaleUpInProgress {
condition.Status = api.ClusterAutoscalerInProgress
} else if !isSafeToScaleUp {
} else if !scaleUpSafety.Healthy {
condition.Status = api.ClusterAutoscalerUnhealthy
} else if !scaleUpSafety.SafeToScale {
condition.Status = api.ClusterAutoscalerBackoff
} else {
condition.Status = api.ClusterAutoscalerNoActivity
@@ -1123,7 +1138,12 @@ func (csr *ClusterStateRegistry) handleInstanceCreationErrorsForNodeGroup(
}
// Decrease the scale up request by the number of deleted nodes
csr.registerOrUpdateScaleUpNoLock(nodeGroup, -len(unseenInstanceIds), currentTime)
csr.registerFailedScaleUpNoLock(nodeGroup, metrics.FailedScaleUpReason(errorCode.code), errorCode.class, errorCode.code, gpuResource, gpuType, currentTime)
csr.registerFailedScaleUpNoLock(nodeGroup, metrics.FailedScaleUpReason(errorCode.code), cloudprovider.InstanceErrorInfo{
ErrorClass: errorCode.class,
ErrorCode: errorCode.code,
ErrorMessage: csr.buildErrorMessageEventString(currentUniqueErrorMessagesForErrorCode[errorCode]),
}, gpuResource, gpuType, currentTime)
}
}
}

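Not part of the commit: a minimal sketch of how the new NodeGroupScalingSafety result could be rendered for status or events. It assumes it sits in the same clusterstate package with fmt imported; scalingSafetyMessage is a hypothetical helper name.

// scalingSafetyMessage (hypothetical) turns a NodeGroupScalingSafety into a log/event line.
func scalingSafetyMessage(safety NodeGroupScalingSafety) string {
	switch {
	case safety.SafeToScale:
		return "node group is safe to scale up"
	case !safety.Healthy:
		return "node group is not safe to scale up: unhealthy"
	case safety.BackoffStatus.ErrorInfo != nil:
		// Surface the error details now carried by the backoff status.
		return fmt.Sprintf("node group is not safe to scale up: backed off (class=%v, code=%q): %s",
			safety.BackoffStatus.ErrorInfo.ErrorClass, safety.BackoffStatus.ErrorInfo.ErrorCode, safety.BackoffStatus.ErrorInfo.ErrorMessage)
	default:
		return "node group is not safe to scale up: backed off"
	}
}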

@@ -923,13 +923,31 @@ func TestScaleUpBackoff(t *testing.T) {
assert.NoError(t, err)
assert.True(t, clusterstate.IsClusterHealthy())
assert.True(t, clusterstate.IsNodeGroupHealthy("ng1"))
assert.False(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
assert.Equal(t, NodeGroupScalingSafety{
SafeToScale: false,
Healthy: true,
BackoffStatus: backoff.Status{
IsBackedOff: true,
ErrorInfo: &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OtherErrorClass,
ErrorCode: "timeout",
ErrorMessage: "Scale-up timed out for node group ng1 after 3m0s",
},
},
}, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
assert.Equal(t, backoff.Status{
IsBackedOff: true,
ErrorInfo: &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OtherErrorClass,
ErrorCode: "timeout",
ErrorMessage: "Scale-up timed out for node group ng1 after 3m0s",
}}, clusterstate.backoff.BackoffStatus(ng1, nil, now))
// Backoff should expire after timeout
now = now.Add(5 * time.Minute /*InitialNodeGroupBackoffDuration*/).Add(time.Second)
assert.True(t, clusterstate.IsClusterHealthy())
assert.True(t, clusterstate.IsNodeGroupHealthy("ng1"))
assert.True(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
assert.Equal(t, NodeGroupScalingSafety{SafeToScale: true, Healthy: true}, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
// Another failed scale up should cause longer backoff
clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 1, now.Add(-121*time.Second))
@@ -938,10 +956,32 @@ func TestScaleUpBackoff(t *testing.T) {
assert.NoError(t, err)
assert.True(t, clusterstate.IsClusterHealthy())
assert.True(t, clusterstate.IsNodeGroupHealthy("ng1"))
assert.False(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
assert.Equal(t, NodeGroupScalingSafety{
SafeToScale: false,
Healthy: true,
BackoffStatus: backoff.Status{
IsBackedOff: true,
ErrorInfo: &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OtherErrorClass,
ErrorCode: "timeout",
ErrorMessage: "Scale-up timed out for node group ng1 after 2m1s",
},
},
}, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
now = now.Add(5 * time.Minute /*InitialNodeGroupBackoffDuration*/).Add(time.Second)
assert.False(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
assert.Equal(t, NodeGroupScalingSafety{
SafeToScale: false,
Healthy: true,
BackoffStatus: backoff.Status{
IsBackedOff: true,
ErrorInfo: &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OtherErrorClass,
ErrorCode: "timeout",
ErrorMessage: "Scale-up timed out for node group ng1 after 2m1s",
},
},
}, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
// The backoff should be cleared after a successful scale-up
clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 1, now)
@@ -952,8 +992,8 @@ func TestScaleUpBackoff(t *testing.T) {
assert.NoError(t, err)
assert.True(t, clusterstate.IsClusterHealthy())
assert.True(t, clusterstate.IsNodeGroupHealthy("ng1"))
assert.True(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
assert.False(t, clusterstate.backoff.IsBackedOff(ng1, nil, now))
assert.Equal(t, NodeGroupScalingSafety{SafeToScale: true, Healthy: true}, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
assert.Equal(t, backoff.Status{IsBackedOff: false}, clusterstate.backoff.BackoffStatus(ng1, nil, now))
}
func TestGetClusterSize(t *testing.T) {
@@ -1070,9 +1110,9 @@ func TestScaleUpFailures(t *testing.T) {
fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap")
clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{}, fakeLogRecorder, newBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}))
clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng1"), metrics.Timeout, "", "", now)
clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng2"), metrics.Timeout, "", "", now)
clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng1"), metrics.APIError, "", "", now.Add(time.Minute))
clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng1"), metrics.Timeout, "", "", "", now)
clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng2"), metrics.Timeout, "", "", "", now)
clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng1"), metrics.APIError, "", "", "", now.Add(time.Minute))
failures := clusterstate.GetScaleUpFailures()
assert.Equal(t, map[string][]ScaleUpFailure{


@@ -151,7 +151,7 @@ func (e *scaleUpExecutor) executeScaleUp(
if err := info.Group.IncreaseSize(increase); err != nil {
e.autoscalingContext.LogRecorder.Eventf(apiv1.EventTypeWarning, "FailedToScaleUpGroup", "Scale-up failed for group %s: %v", info.Group.Id(), err)
aerr := errors.ToAutoscalerError(errors.CloudProviderError, err).AddPrefix("failed to increase node group size: ")
e.clusterStateRegistry.RegisterFailedScaleUp(info.Group, metrics.FailedScaleUpReason(string(aerr.Type())), gpuResourceName, gpuType, now)
e.clusterStateRegistry.RegisterFailedScaleUp(info.Group, metrics.FailedScaleUpReason(string(aerr.Type())), aerr.Error(), gpuResourceName, gpuType, now)
return aerr
}
e.clusterStateRegistry.RegisterOrUpdateScaleUp(


@@ -569,13 +569,15 @@ func (o *ScaleUpOrchestrator) UpcomingNodes(nodeInfos map[string]*schedulerframe
// IsNodeGroupReadyToScaleUp returns nil if node group is ready to be scaled up, otherwise a reason is provided.
func (o *ScaleUpOrchestrator) IsNodeGroupReadyToScaleUp(nodeGroup cloudprovider.NodeGroup, now time.Time) *SkippedReasons {
// Non-existing node groups are created later so skip check for them.
if nodeGroup.Exist() && !o.clusterStateRegistry.IsNodeGroupSafeToScaleUp(nodeGroup, now) {
// Hack that depends on internals of IsNodeGroupSafeToScaleUp.
if !o.clusterStateRegistry.IsNodeGroupHealthy(nodeGroup.Id()) {
if !nodeGroup.Exist() {
return nil
}
if scaleUpSafety := o.clusterStateRegistry.IsNodeGroupSafeToScaleUp(nodeGroup, now); !scaleUpSafety.SafeToScale {
if !scaleUpSafety.Healthy {
klog.Warningf("Node group %s is not ready for scaleup - unhealthy", nodeGroup.Id())
return NotReadyReason
}
klog.Warningf("Node group %s is not ready for scaleup - backoff", nodeGroup.Id())
klog.Warningf("Node group %s is not ready for scaleup - backoff with status: %v", nodeGroup.Id(), scaleUpSafety.BackoffStatus)
return BackoffReason
}
return nil
@@ -658,7 +660,7 @@ func (o *ScaleUpOrchestrator) ComputeSimilarNodeGroups(
var validSimilarNodeGroups []cloudprovider.NodeGroup
for _, ng := range similarNodeGroups {
// Non-existing node groups are created later so skip check for them.
if ng.Exist() && !o.clusterStateRegistry.IsNodeGroupSafeToScaleUp(ng, now) {
if ng.Exist() && !o.clusterStateRegistry.IsNodeGroupSafeToScaleUp(ng, now).SafeToScale {
klog.V(2).Infof("Ignoring node group %s when balancing: group is not ready for scaleup", ng.Id())
} else if similarSchedulablePods, found := schedulablePods[ng.Id()]; found && matchingSchedulablePods(groupSchedulablePods, similarSchedulablePods) {
validSimilarNodeGroups = append(validSimilarNodeGroups, ng)


@@ -23,12 +23,18 @@ import (
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
)
// Status contains information about back off status.
type Status struct {
IsBackedOff bool
ErrorInfo *cloudprovider.InstanceErrorInfo
}
// Backoff allows time-based backing off of node groups considered in scale up algorithm
type Backoff interface {
// Backoff execution for the given node group. Returns time till execution is backed off.
Backoff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, errorClass cloudprovider.InstanceErrorClass, errorCode string, currentTime time.Time) time.Time
// IsBackedOff returns true if execution is backed off for the given node group.
IsBackedOff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, currentTime time.Time) bool
Backoff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, errorInfo cloudprovider.InstanceErrorInfo, currentTime time.Time) time.Time
// BackoffStatus returns whether execution is backed off for the given node group and, if it is, the error info recorded for that backoff.
BackoffStatus(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, currentTime time.Time) Status
// RemoveBackoff removes backoff data for the given node group.
RemoveBackoff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo)
// RemoveStaleBackoffData removes stale backoff data.

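For illustration only (not in this commit): a sketch of the new call shape, where callers pass a full cloudprovider.InstanceErrorInfo instead of a bare error class and code, then read the details back through BackoffStatus. recordScaleUpTimeout is a hypothetical helper and assumes the same imports as the file above (cloudprovider, schedulerframework, time).

// recordScaleUpTimeout (hypothetical) records a timeout with its details and returns the resulting status.
func recordScaleUpTimeout(b Backoff, ng cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, now time.Time) Status {
	b.Backoff(ng, nodeInfo, cloudprovider.InstanceErrorInfo{
		ErrorClass:   cloudprovider.OtherErrorClass,
		ErrorCode:    "timeout",
		ErrorMessage: "scale-up timed out",
	}, now)
	// With the exponential implementation below, the status reported at now
	// is backed off and carries the error info recorded above.
	return b.BackoffStatus(ng, nodeInfo, now)
}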

@@ -37,6 +37,7 @@ type exponentialBackoffInfo struct {
duration time.Duration
backoffUntil time.Time
lastFailedExecution time.Time
errorInfo cloudprovider.InstanceErrorInfo
}
// NewExponentialBackoff creates an instance of exponential backoff.
@@ -66,7 +67,7 @@ func NewIdBasedExponentialBackoff(initialBackoffDuration time.Duration, maxBacko
}
// Backoff execution for the given node group. Returns time till execution is backed off.
func (b *exponentialBackoff) Backoff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, errorClass cloudprovider.InstanceErrorClass, errorCode string, currentTime time.Time) time.Time {
func (b *exponentialBackoff) Backoff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, errorInfo cloudprovider.InstanceErrorInfo, currentTime time.Time) time.Time {
duration := b.initialBackoffDuration
key := b.nodeGroupKey(nodeGroup)
if backoffInfo, found := b.backoffInfo[key]; found {
@@ -87,14 +88,21 @@ func (b *exponentialBackoff) Backoff(nodeGroup cloudprovider.NodeGroup, nodeInfo
duration: duration,
backoffUntil: backoffUntil,
lastFailedExecution: currentTime,
errorInfo: errorInfo,
}
return backoffUntil
}
// IsBackedOff returns true if execution is backed off for the given node group.
func (b *exponentialBackoff) IsBackedOff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, currentTime time.Time) bool {
// BackoffStatus returns whether execution is backed off for the given node group and, if it is, the error info recorded for that backoff.
func (b *exponentialBackoff) BackoffStatus(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, currentTime time.Time) Status {
backoffInfo, found := b.backoffInfo[b.nodeGroupKey(nodeGroup)]
return found && backoffInfo.backoffUntil.After(currentTime)
if !found || backoffInfo.backoffUntil.Before(currentTime) {
return Status{IsBackedOff: false}
}
return Status{
IsBackedOff: true,
ErrorInfo: &backoffInfo.errorInfo,
}
}
// RemoveBackoff removes backoff data for the given node group.

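A rough usage sketch (not part of the commit) mirroring the tests below: a quota failure recorded through the exponential backoff is surfaced, message included, by BackoffStatus. The constructor arguments and the exampleQuotaBackoff name are illustrative; ng can be any cloudprovider.NodeGroup, such as the test stub used below.

// exampleQuotaBackoff (hypothetical) backs a node group off with a quota error and returns its status.
func exampleQuotaBackoff(ng cloudprovider.NodeGroup) Status {
	b := NewIdBasedExponentialBackoff(5*time.Minute, 30*time.Minute, 3*time.Hour)
	now := time.Now()
	b.Backoff(ng, nil, cloudprovider.InstanceErrorInfo{
		ErrorClass:   cloudprovider.OutOfResourcesErrorClass,
		ErrorCode:    "QUOTA_EXCEEDED",
		ErrorMessage: "Not enough CPU",
	}, now)
	return b.BackoffStatus(ng, nil, now) // backed off, with the quota error attached
}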

@@ -35,45 +35,58 @@ func nodeGroup(id string) cloudprovider.NodeGroup {
var nodeGroup1 = nodeGroup("id1")
var nodeGroup2 = nodeGroup("id2")
var quotaError = cloudprovider.InstanceErrorInfo{ErrorClass: cloudprovider.OutOfResourcesErrorClass, ErrorCode: "QUOTA_EXCEEDED", ErrorMessage: "Not enough CPU"}
var ipSpaceExhaustedError = cloudprovider.InstanceErrorInfo{ErrorClass: cloudprovider.OtherErrorClass, ErrorCode: "IP_SPACE_EXHAUSTED", ErrorMessage: "IP space has been exhausted"}
var noBackOff = Status{IsBackedOff: false}
var backoffWithQuotaError = Status{
IsBackedOff: true,
ErrorInfo: &quotaError,
}
var backoffWithIpSpaceExhaustedError = Status{
IsBackedOff: true,
ErrorInfo: &ipSpaceExhaustedError,
}
func TestBackoffTwoKeys(t *testing.T) {
backoff := NewIdBasedExponentialBackoff(10*time.Minute, time.Hour, 3*time.Hour)
startTime := time.Now()
assert.False(t, backoff.IsBackedOff(nodeGroup1, nil, startTime))
assert.False(t, backoff.IsBackedOff(nodeGroup2, nil, startTime))
backoff.Backoff(nodeGroup1, nil, cloudprovider.OtherErrorClass, "", startTime.Add(time.Minute))
assert.True(t, backoff.IsBackedOff(nodeGroup1, nil, startTime.Add(2*time.Minute)))
assert.False(t, backoff.IsBackedOff(nodeGroup2, nil, startTime))
assert.False(t, backoff.IsBackedOff(nodeGroup1, nil, startTime.Add(11*time.Minute)))
assert.Equal(t, noBackOff, backoff.BackoffStatus(nodeGroup1, nil, startTime))
assert.Equal(t, noBackOff, backoff.BackoffStatus(nodeGroup2, nil, startTime))
backoff.Backoff(nodeGroup1, nil, quotaError, startTime.Add(time.Minute))
assert.Equal(t, backoffWithQuotaError, backoff.BackoffStatus(nodeGroup1, nil, startTime.Add(2*time.Minute)))
assert.Equal(t, noBackOff, backoff.BackoffStatus(nodeGroup2, nil, startTime))
assert.Equal(t, noBackOff, backoff.BackoffStatus(nodeGroup1, nil, startTime.Add(11*time.Minute+1*time.Millisecond)))
}
func TestMaxBackoff(t *testing.T) {
backoff := NewIdBasedExponentialBackoff(1*time.Minute, 3*time.Minute, 3*time.Hour)
startTime := time.Now()
backoff.Backoff(nodeGroup1, nil, cloudprovider.OtherErrorClass, "", startTime)
assert.True(t, backoff.IsBackedOff(nodeGroup1, nil, startTime))
assert.False(t, backoff.IsBackedOff(nodeGroup1, nil, startTime.Add(1*time.Minute)))
backoff.Backoff(nodeGroup1, nil, cloudprovider.OtherErrorClass, "", startTime.Add(1*time.Minute))
assert.True(t, backoff.IsBackedOff(nodeGroup1, nil, startTime.Add(1*time.Minute)))
assert.False(t, backoff.IsBackedOff(nodeGroup1, nil, startTime.Add(3*time.Minute)))
backoff.Backoff(nodeGroup1, nil, cloudprovider.OtherErrorClass, "", startTime.Add(3*time.Minute))
assert.True(t, backoff.IsBackedOff(nodeGroup1, nil, startTime.Add(3*time.Minute)))
assert.False(t, backoff.IsBackedOff(nodeGroup1, nil, startTime.Add(6*time.Minute)))
backoff.Backoff(nodeGroup1, nil, ipSpaceExhaustedError, startTime)
assert.Equal(t, backoffWithIpSpaceExhaustedError, backoff.BackoffStatus(nodeGroup1, nil, startTime))
assert.Equal(t, noBackOff, backoff.BackoffStatus(nodeGroup1, nil, startTime.Add(1*time.Minute+1*time.Millisecond)))
backoff.Backoff(nodeGroup1, nil, ipSpaceExhaustedError, startTime.Add(1*time.Minute))
assert.Equal(t, backoffWithIpSpaceExhaustedError, backoff.BackoffStatus(nodeGroup1, nil, startTime.Add(1*time.Minute)))
assert.Equal(t, noBackOff, backoff.BackoffStatus(nodeGroup1, nil, startTime.Add(3*time.Minute)))
backoff.Backoff(nodeGroup1, nil, ipSpaceExhaustedError, startTime.Add(3*time.Minute))
assert.Equal(t, backoffWithIpSpaceExhaustedError, backoff.BackoffStatus(nodeGroup1, nil, startTime.Add(3*time.Minute)))
assert.Equal(t, noBackOff, backoff.BackoffStatus(nodeGroup1, nil, startTime.Add(6*time.Minute)))
}
func TestRemoveBackoff(t *testing.T) {
backoff := NewIdBasedExponentialBackoff(1*time.Minute, 3*time.Minute, 3*time.Hour)
startTime := time.Now()
backoff.Backoff(nodeGroup1, nil, cloudprovider.OtherErrorClass, "", startTime)
assert.True(t, backoff.IsBackedOff(nodeGroup1, nil, startTime))
backoff.Backoff(nodeGroup1, nil, quotaError, startTime)
assert.Equal(t, backoffWithQuotaError, backoff.BackoffStatus(nodeGroup1, nil, startTime))
backoff.RemoveBackoff(nodeGroup1, nil)
assert.False(t, backoff.IsBackedOff(nodeGroup1, nil, startTime))
assert.Equal(t, noBackOff, backoff.BackoffStatus(nodeGroup1, nil, startTime))
}
func TestResetStaleBackoffData(t *testing.T) {
backoff := NewIdBasedExponentialBackoff(1*time.Minute, 3*time.Minute, 3*time.Hour)
startTime := time.Now()
backoff.Backoff(nodeGroup1, nil, cloudprovider.OtherErrorClass, "", startTime)
backoff.Backoff(nodeGroup2, nil, cloudprovider.OtherErrorClass, "", startTime.Add(time.Hour))
backoff.Backoff(nodeGroup1, nil, quotaError, startTime)
backoff.Backoff(nodeGroup2, nil, quotaError, startTime.Add(time.Hour))
backoff.RemoveStaleBackoffData(startTime.Add(time.Hour))
assert.Equal(t, 2, len(backoff.(*exponentialBackoff).backoffInfo))
backoff.RemoveStaleBackoffData(startTime.Add(4 * time.Hour))
@@ -84,20 +97,22 @@ func TestResetStaleBackoffData(t *testing.T) {
func TestIncreaseExistingBackoff(t *testing.T) {
backoff := NewIdBasedExponentialBackoff(1*time.Second, 10*time.Minute, 3*time.Hour)
startTime := time.Now()
backoff.Backoff(nodeGroup1, nil, cloudprovider.OtherErrorClass, "", startTime)
currentTime := time.Date(2023, 12, 12, 12, 0, 0, 0, time.UTC)
backoff.Backoff(nodeGroup1, nil, quotaError, currentTime)
// NG in backoff for one second here
assert.True(t, backoff.IsBackedOff(nodeGroup1, nil, startTime))
assert.Equal(t, backoffWithQuotaError, backoff.BackoffStatus(nodeGroup1, nil, currentTime))
// Come out of backoff
time.Sleep(1 * time.Second)
assert.False(t, backoff.IsBackedOff(nodeGroup1, nil, time.Now()))
// Confirm existing backoff duration has been increased by backing off again
backoff.Backoff(nodeGroup1, nil, cloudprovider.OtherErrorClass, "", time.Now())
currentTime = currentTime.Add(1*time.Second + 1*time.Millisecond)
assert.Equal(t, noBackOff, backoff.BackoffStatus(nodeGroup1, nil, currentTime))
// Confirm that backing off again increases the existing backoff duration and updates the error info
backoff.Backoff(nodeGroup1, nil, ipSpaceExhaustedError, currentTime)
// Backoff should be for 2 seconds now
assert.True(t, backoff.IsBackedOff(nodeGroup1, nil, time.Now()))
time.Sleep(1 * time.Second)
assert.True(t, backoff.IsBackedOff(nodeGroup1, nil, time.Now()))
time.Sleep(1 * time.Second)
assert.False(t, backoff.IsBackedOff(nodeGroup1, nil, time.Now()))
assert.Equal(t, backoffWithIpSpaceExhaustedError, backoff.BackoffStatus(nodeGroup1, nil, currentTime))
currentTime = currentTime.Add(1 * time.Second)
// Doing backoff during existing backoff should change error info and backoff end period but doesn't change the duration.
backoff.Backoff(nodeGroup1, nil, quotaError, currentTime)
assert.Equal(t, backoffWithQuotaError, backoff.BackoffStatus(nodeGroup1, nil, currentTime))
currentTime = currentTime.Add(2*time.Second + 1*time.Millisecond)
assert.Equal(t, noBackOff, backoff.BackoffStatus(nodeGroup1, nil, currentTime))
// Result: existing backoff duration was scaled up beyond initial duration
}