Expose backoff time parameters

This commit is contained in:
weidongcai 2022-05-12 15:10:08 +08:00
parent 52a2adebf1
commit 03a0475502
6 changed files with 28 additions and 21 deletions

View File

@ -42,15 +42,6 @@ import (
const (
// MaxNodeStartupTime is the maximum time from the moment the node is registered to the time the node is ready.
MaxNodeStartupTime = 15 * time.Minute
// MaxNodeGroupBackoffDuration is the maximum backoff duration for a NodeGroup after new nodes failed to start.
MaxNodeGroupBackoffDuration = 30 * time.Minute
// InitialNodeGroupBackoffDuration is the duration of first backoff after a new node failed to start.
InitialNodeGroupBackoffDuration = 5 * time.Minute
// NodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.
NodeGroupBackoffResetTimeout = 3 * time.Hour
)
// ScaleUpRequest contains information about the requested node group scale up.

View File

@ -653,7 +653,7 @@ func TestUpdateLastTransitionTimes(t *testing.T) {
}
}
expectedNgTimestamps := make(map[string](map[api.ClusterAutoscalerConditionType]metav1.Time), 0)
expectedNgTimestamps := make(map[string]map[api.ClusterAutoscalerConditionType]metav1.Time, 0)
// Same as cluster-wide
expectedNgTimestamps["ng1"] = map[api.ClusterAutoscalerConditionType]metav1.Time{
api.ClusterAutoscalerHealth: now,
@ -710,7 +710,7 @@ func TestScaleUpBackoff(t *testing.T) {
assert.False(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
// Backoff should expire after timeout
now = now.Add(InitialNodeGroupBackoffDuration).Add(time.Second)
now = now.Add(5 * time.Minute /*InitialNodeGroupBackoffDuration*/).Add(time.Second)
assert.True(t, clusterstate.IsClusterHealthy())
assert.True(t, clusterstate.IsNodeGroupHealthy("ng1"))
assert.True(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
@ -724,7 +724,7 @@ func TestScaleUpBackoff(t *testing.T) {
assert.True(t, clusterstate.IsNodeGroupHealthy("ng1"))
assert.False(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
now = now.Add(InitialNodeGroupBackoffDuration).Add(time.Second)
now = now.Add(5 * time.Minute /*InitialNodeGroupBackoffDuration*/).Add(time.Second)
assert.False(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
// The backoff should be cleared after a successful scale-up
@ -873,5 +873,6 @@ func TestScaleUpFailures(t *testing.T) {
}
// newBackoff returns the ID-based exponential backoff shared by the tests in
// this file. It is configured with a 5-minute initial backoff, a 30-minute
// maximum backoff, and a 3-hour reset timeout; the literal durations carry the
// names of the package constants they replace as inline comments.
func newBackoff() backoff.Backoff {
return backoff.NewIdBasedExponentialBackoff(InitialNodeGroupBackoffDuration, MaxNodeGroupBackoffDuration, NodeGroupBackoffResetTimeout)
return backoff.NewIdBasedExponentialBackoff(5*time.Minute, /*InitialNodeGroupBackoffDuration*/
30*time.Minute /*MaxNodeGroupBackoffDuration*/, 3*time.Hour /*NodeGroupBackoffResetTimeout*/)
}

View File

@ -171,4 +171,10 @@ type AutoscalingOptions struct {
DaemonSetEvictionForOccupiedNodes bool
// UserAgent is the user agent to use for HTTP calls.
UserAgent string
// InitialNodeGroupBackoffDuration is the duration of the first backoff after a new node failed to start.
InitialNodeGroupBackoffDuration time.Duration
// MaxNodeGroupBackoffDuration is the maximum backoff duration for a NodeGroup after new nodes failed to start.
MaxNodeGroupBackoffDuration time.Duration
// NodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.
NodeGroupBackoffResetTimeout time.Duration
}

View File

@ -22,7 +22,6 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
cloudBuilder "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/builder"
"k8s.io/autoscaler/cluster-autoscaler/clusterstate"
"k8s.io/autoscaler/cluster-autoscaler/config"
"k8s.io/autoscaler/cluster-autoscaler/context"
"k8s.io/autoscaler/cluster-autoscaler/debuggingsnapshot"
@ -121,7 +120,7 @@ func initializeDefaultOptions(opts *AutoscalerOptions) error {
}
if opts.Backoff == nil {
opts.Backoff =
backoff.NewIdBasedExponentialBackoff(clusterstate.InitialNodeGroupBackoffDuration, clusterstate.MaxNodeGroupBackoffDuration, clusterstate.NodeGroupBackoffResetTimeout)
backoff.NewIdBasedExponentialBackoff(opts.InitialNodeGroupBackoffDuration, opts.MaxNodeGroupBackoffDuration, opts.NodeGroupBackoffResetTimeout)
}
return nil

View File

@ -20,6 +20,7 @@ import (
"fmt"
"reflect"
"testing"
"time"
"k8s.io/autoscaler/cluster-autoscaler/debuggingsnapshot"
@ -53,7 +54,6 @@ import (
"github.com/stretchr/testify/assert"
apiv1 "k8s.io/api/core/v1"
"k8s.io/autoscaler/cluster-autoscaler/clusterstate"
"k8s.io/autoscaler/cluster-autoscaler/utils/backoff"
kube_client "k8s.io/client-go/kubernetes"
kube_record "k8s.io/client-go/tools/record"
@ -296,5 +296,6 @@ func (p *MockAutoprovisioningNodeGroupListProcessor) CleanUp() {
// NewBackoff creates a new ID-based exponential backoff object configured with
// a 5-minute initial backoff, a 30-minute maximum backoff, and a 3-hour reset
// timeout. The inline comments on the literal durations name the parameter
// each value stands for.
func NewBackoff() backoff.Backoff {
return backoff.NewIdBasedExponentialBackoff(clusterstate.InitialNodeGroupBackoffDuration, clusterstate.MaxNodeGroupBackoffDuration, clusterstate.NodeGroupBackoffResetTimeout)
return backoff.NewIdBasedExponentialBackoff(5*time.Minute, /*InitialNodeGroupBackoffDuration*/
30*time.Minute /*MaxNodeGroupBackoffDuration*/, 3*time.Hour /*NodeGroupBackoffResetTimeout*/)
}

View File

@ -62,7 +62,7 @@ import (
componentbaseconfig "k8s.io/component-base/config"
"k8s.io/component-base/config/options"
"k8s.io/component-base/metrics/legacyregistry"
klog "k8s.io/klog/v2"
"k8s.io/klog/v2"
)
// MultiStringFlag is a flag for passing multiple parameters using same flag
@ -188,10 +188,16 @@ var (
daemonSetEvictionForEmptyNodes = flag.Bool("daemonset-eviction-for-empty-nodes", false, "DaemonSet pods will be gracefully terminated from empty nodes")
daemonSetEvictionForOccupiedNodes = flag.Bool("daemonset-eviction-for-occupied-nodes", true, "DaemonSet pods will be gracefully terminated from non-empty nodes")
userAgent = flag.String("user-agent", "cluster-autoscaler", "User agent used for HTTP calls.")
emitPerNodeGroupMetrics = flag.Bool("emit-per-nodegroup-metrics", false, "If true, emit per node group metrics.")
debuggingSnapshotEnabled = flag.Bool("debugging-snapshot-enabled", false, "Whether the debugging snapshot of cluster autoscaler feature is enabled")
nodeInfoCacheExpireTime = flag.Duration("node-info-cache-expire-time", 87600*time.Hour, "Node Info cache expire time for each item. Default value is 10 years.")
emitPerNodeGroupMetrics = flag.Bool("emit-per-nodegroup-metrics", false, "If true, emit per node group metrics.")
debuggingSnapshotEnabled = flag.Bool("debugging-snapshot-enabled", false, "Whether the debugging snapshot of cluster autoscaler feature is enabled")
nodeInfoCacheExpireTime = flag.Duration("node-info-cache-expire-time", 87600*time.Hour, "Node Info cache expire time for each item. Default value is 10 years.")
initialNodeGroupBackoffDuration = flag.Duration("initial-node-group-backoff-duration", 5*time.Minute,
"initialNodeGroupBackoffDuration is the duration of first backoff after a new node failed to start.")
maxNodeGroupBackoffDuration = flag.Duration("max-node-group-backoff-duration", 30*time.Minute,
"maxNodeGroupBackoffDuration is the maximum backoff duration for a NodeGroup after new nodes failed to start.")
nodeGroupBackoffResetTimeout = flag.Duration("node-group-backoff-reset-timeout", 3*time.Hour,
"nodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.")
)
func createAutoscalingOptions() config.AutoscalingOptions {
@ -272,6 +278,9 @@ func createAutoscalingOptions() config.AutoscalingOptions {
DaemonSetEvictionForEmptyNodes: *daemonSetEvictionForEmptyNodes,
DaemonSetEvictionForOccupiedNodes: *daemonSetEvictionForOccupiedNodes,
UserAgent: *userAgent,
InitialNodeGroupBackoffDuration: *initialNodeGroupBackoffDuration,
MaxNodeGroupBackoffDuration: *maxNodeGroupBackoffDuration,
NodeGroupBackoffResetTimeout: *nodeGroupBackoffResetTimeout,
}
}