Expose backoff time parameters
This commit is contained in:
parent
52a2adebf1
commit
03a0475502
|
|
@ -42,15 +42,6 @@ import (
|
||||||
const (
|
const (
|
||||||
// MaxNodeStartupTime is the maximum time from the moment the node is registered to the time the node is ready.
|
// MaxNodeStartupTime is the maximum time from the moment the node is registered to the time the node is ready.
|
||||||
MaxNodeStartupTime = 15 * time.Minute
|
MaxNodeStartupTime = 15 * time.Minute
|
||||||
|
|
||||||
// MaxNodeGroupBackoffDuration is the maximum backoff duration for a NodeGroup after new nodes failed to start.
|
|
||||||
MaxNodeGroupBackoffDuration = 30 * time.Minute
|
|
||||||
|
|
||||||
// InitialNodeGroupBackoffDuration is the duration of first backoff after a new node failed to start.
|
|
||||||
InitialNodeGroupBackoffDuration = 5 * time.Minute
|
|
||||||
|
|
||||||
// NodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.
|
|
||||||
NodeGroupBackoffResetTimeout = 3 * time.Hour
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// ScaleUpRequest contains information about the requested node group scale up.
|
// ScaleUpRequest contains information about the requested node group scale up.
|
||||||
|
|
|
||||||
|
|
@ -653,7 +653,7 @@ func TestUpdateLastTransitionTimes(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
expectedNgTimestamps := make(map[string](map[api.ClusterAutoscalerConditionType]metav1.Time), 0)
|
expectedNgTimestamps := make(map[string]map[api.ClusterAutoscalerConditionType]metav1.Time, 0)
|
||||||
// Same as cluster-wide
|
// Same as cluster-wide
|
||||||
expectedNgTimestamps["ng1"] = map[api.ClusterAutoscalerConditionType]metav1.Time{
|
expectedNgTimestamps["ng1"] = map[api.ClusterAutoscalerConditionType]metav1.Time{
|
||||||
api.ClusterAutoscalerHealth: now,
|
api.ClusterAutoscalerHealth: now,
|
||||||
|
|
@ -710,7 +710,7 @@ func TestScaleUpBackoff(t *testing.T) {
|
||||||
assert.False(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
|
assert.False(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
|
||||||
|
|
||||||
// Backoff should expire after timeout
|
// Backoff should expire after timeout
|
||||||
now = now.Add(InitialNodeGroupBackoffDuration).Add(time.Second)
|
now = now.Add(5 * time.Minute /*InitialNodeGroupBackoffDuration*/).Add(time.Second)
|
||||||
assert.True(t, clusterstate.IsClusterHealthy())
|
assert.True(t, clusterstate.IsClusterHealthy())
|
||||||
assert.True(t, clusterstate.IsNodeGroupHealthy("ng1"))
|
assert.True(t, clusterstate.IsNodeGroupHealthy("ng1"))
|
||||||
assert.True(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
|
assert.True(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
|
||||||
|
|
@ -724,7 +724,7 @@ func TestScaleUpBackoff(t *testing.T) {
|
||||||
assert.True(t, clusterstate.IsNodeGroupHealthy("ng1"))
|
assert.True(t, clusterstate.IsNodeGroupHealthy("ng1"))
|
||||||
assert.False(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
|
assert.False(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
|
||||||
|
|
||||||
now = now.Add(InitialNodeGroupBackoffDuration).Add(time.Second)
|
now = now.Add(5 * time.Minute /*InitialNodeGroupBackoffDuration*/).Add(time.Second)
|
||||||
assert.False(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
|
assert.False(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
|
||||||
|
|
||||||
// The backoff should be cleared after a successful scale-up
|
// The backoff should be cleared after a successful scale-up
|
||||||
|
|
@ -873,5 +873,6 @@ func TestScaleUpFailures(t *testing.T) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func newBackoff() backoff.Backoff {
|
func newBackoff() backoff.Backoff {
|
||||||
return backoff.NewIdBasedExponentialBackoff(InitialNodeGroupBackoffDuration, MaxNodeGroupBackoffDuration, NodeGroupBackoffResetTimeout)
|
return backoff.NewIdBasedExponentialBackoff(5*time.Minute, /*InitialNodeGroupBackoffDuration*/
|
||||||
|
30*time.Minute /*MaxNodeGroupBackoffDuration*/, 3*time.Hour /*NodeGroupBackoffResetTimeout*/)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -171,4 +171,10 @@ type AutoscalingOptions struct {
|
||||||
DaemonSetEvictionForOccupiedNodes bool
|
DaemonSetEvictionForOccupiedNodes bool
|
||||||
// User agent to use for HTTP calls.
|
// User agent to use for HTTP calls.
|
||||||
UserAgent string
|
UserAgent string
|
||||||
|
// InitialNodeGroupBackoffDuration is the duration of the first backoff after a new node failed to start.
|
||||||
|
InitialNodeGroupBackoffDuration time.Duration
|
||||||
|
// MaxNodeGroupBackoffDuration is the maximum backoff duration for a NodeGroup after new nodes failed to start.
|
||||||
|
MaxNodeGroupBackoffDuration time.Duration
|
||||||
|
// NodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.
|
||||||
|
NodeGroupBackoffResetTimeout time.Duration
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -22,7 +22,6 @@ import (
|
||||||
|
|
||||||
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
|
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
|
||||||
cloudBuilder "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/builder"
|
cloudBuilder "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/builder"
|
||||||
"k8s.io/autoscaler/cluster-autoscaler/clusterstate"
|
|
||||||
"k8s.io/autoscaler/cluster-autoscaler/config"
|
"k8s.io/autoscaler/cluster-autoscaler/config"
|
||||||
"k8s.io/autoscaler/cluster-autoscaler/context"
|
"k8s.io/autoscaler/cluster-autoscaler/context"
|
||||||
"k8s.io/autoscaler/cluster-autoscaler/debuggingsnapshot"
|
"k8s.io/autoscaler/cluster-autoscaler/debuggingsnapshot"
|
||||||
|
|
@ -121,7 +120,7 @@ func initializeDefaultOptions(opts *AutoscalerOptions) error {
|
||||||
}
|
}
|
||||||
if opts.Backoff == nil {
|
if opts.Backoff == nil {
|
||||||
opts.Backoff =
|
opts.Backoff =
|
||||||
backoff.NewIdBasedExponentialBackoff(clusterstate.InitialNodeGroupBackoffDuration, clusterstate.MaxNodeGroupBackoffDuration, clusterstate.NodeGroupBackoffResetTimeout)
|
backoff.NewIdBasedExponentialBackoff(opts.InitialNodeGroupBackoffDuration, opts.MaxNodeGroupBackoffDuration, opts.NodeGroupBackoffResetTimeout)
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
|
|
|
||||||
|
|
@ -20,6 +20,7 @@ import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"reflect"
|
"reflect"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
"k8s.io/autoscaler/cluster-autoscaler/debuggingsnapshot"
|
"k8s.io/autoscaler/cluster-autoscaler/debuggingsnapshot"
|
||||||
|
|
||||||
|
|
@ -53,7 +54,6 @@ import (
|
||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
|
|
||||||
apiv1 "k8s.io/api/core/v1"
|
apiv1 "k8s.io/api/core/v1"
|
||||||
"k8s.io/autoscaler/cluster-autoscaler/clusterstate"
|
|
||||||
"k8s.io/autoscaler/cluster-autoscaler/utils/backoff"
|
"k8s.io/autoscaler/cluster-autoscaler/utils/backoff"
|
||||||
kube_client "k8s.io/client-go/kubernetes"
|
kube_client "k8s.io/client-go/kubernetes"
|
||||||
kube_record "k8s.io/client-go/tools/record"
|
kube_record "k8s.io/client-go/tools/record"
|
||||||
|
|
@ -296,5 +296,6 @@ func (p *MockAutoprovisioningNodeGroupListProcessor) CleanUp() {
|
||||||
|
|
||||||
// NewBackoff creates a new backoff object
|
// NewBackoff creates a new backoff object
|
||||||
func NewBackoff() backoff.Backoff {
|
func NewBackoff() backoff.Backoff {
|
||||||
return backoff.NewIdBasedExponentialBackoff(clusterstate.InitialNodeGroupBackoffDuration, clusterstate.MaxNodeGroupBackoffDuration, clusterstate.NodeGroupBackoffResetTimeout)
|
return backoff.NewIdBasedExponentialBackoff(5*time.Minute, /*InitialNodeGroupBackoffDuration*/
|
||||||
|
30*time.Minute /*MaxNodeGroupBackoffDuration*/, 3*time.Hour /*NodeGroupBackoffResetTimeout*/)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -62,7 +62,7 @@ import (
|
||||||
componentbaseconfig "k8s.io/component-base/config"
|
componentbaseconfig "k8s.io/component-base/config"
|
||||||
"k8s.io/component-base/config/options"
|
"k8s.io/component-base/config/options"
|
||||||
"k8s.io/component-base/metrics/legacyregistry"
|
"k8s.io/component-base/metrics/legacyregistry"
|
||||||
klog "k8s.io/klog/v2"
|
"k8s.io/klog/v2"
|
||||||
)
|
)
|
||||||
|
|
||||||
// MultiStringFlag is a flag for passing multiple parameters using same flag
|
// MultiStringFlag is a flag for passing multiple parameters using same flag
|
||||||
|
|
@ -188,10 +188,16 @@ var (
|
||||||
daemonSetEvictionForEmptyNodes = flag.Bool("daemonset-eviction-for-empty-nodes", false, "DaemonSet pods will be gracefully terminated from empty nodes")
|
daemonSetEvictionForEmptyNodes = flag.Bool("daemonset-eviction-for-empty-nodes", false, "DaemonSet pods will be gracefully terminated from empty nodes")
|
||||||
daemonSetEvictionForOccupiedNodes = flag.Bool("daemonset-eviction-for-occupied-nodes", true, "DaemonSet pods will be gracefully terminated from non-empty nodes")
|
daemonSetEvictionForOccupiedNodes = flag.Bool("daemonset-eviction-for-occupied-nodes", true, "DaemonSet pods will be gracefully terminated from non-empty nodes")
|
||||||
userAgent = flag.String("user-agent", "cluster-autoscaler", "User agent used for HTTP calls.")
|
userAgent = flag.String("user-agent", "cluster-autoscaler", "User agent used for HTTP calls.")
|
||||||
|
emitPerNodeGroupMetrics = flag.Bool("emit-per-nodegroup-metrics", false, "If true, emit per node group metrics.")
|
||||||
|
debuggingSnapshotEnabled = flag.Bool("debugging-snapshot-enabled", false, "Whether the debugging snapshot of cluster autoscaler feature is enabled")
|
||||||
|
nodeInfoCacheExpireTime = flag.Duration("node-info-cache-expire-time", 87600*time.Hour, "Node Info cache expire time for each item. Default value is 10 years.")
|
||||||
|
|
||||||
emitPerNodeGroupMetrics = flag.Bool("emit-per-nodegroup-metrics", false, "If true, emit per node group metrics.")
|
initialNodeGroupBackoffDuration = flag.Duration("initial-node-group-backoff-duration", 5*time.Minute,
|
||||||
debuggingSnapshotEnabled = flag.Bool("debugging-snapshot-enabled", false, "Whether the debugging snapshot of cluster autoscaler feature is enabled")
|
"initialNodeGroupBackoffDuration is the duration of first backoff after a new node failed to start.")
|
||||||
nodeInfoCacheExpireTime = flag.Duration("node-info-cache-expire-time", 87600*time.Hour, "Node Info cache expire time for each item. Default value is 10 years.")
|
maxNodeGroupBackoffDuration = flag.Duration("max-node-group-backoff-duration", 30*time.Minute,
|
||||||
|
"maxNodeGroupBackoffDuration is the maximum backoff duration for a NodeGroup after new nodes failed to start.")
|
||||||
|
nodeGroupBackoffResetTimeout = flag.Duration("node-group-backoff-reset-timeout", 3*time.Hour,
|
||||||
|
"nodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.")
|
||||||
)
|
)
|
||||||
|
|
||||||
func createAutoscalingOptions() config.AutoscalingOptions {
|
func createAutoscalingOptions() config.AutoscalingOptions {
|
||||||
|
|
@ -272,6 +278,9 @@ func createAutoscalingOptions() config.AutoscalingOptions {
|
||||||
DaemonSetEvictionForEmptyNodes: *daemonSetEvictionForEmptyNodes,
|
DaemonSetEvictionForEmptyNodes: *daemonSetEvictionForEmptyNodes,
|
||||||
DaemonSetEvictionForOccupiedNodes: *daemonSetEvictionForOccupiedNodes,
|
DaemonSetEvictionForOccupiedNodes: *daemonSetEvictionForOccupiedNodes,
|
||||||
UserAgent: *userAgent,
|
UserAgent: *userAgent,
|
||||||
|
InitialNodeGroupBackoffDuration: *initialNodeGroupBackoffDuration,
|
||||||
|
MaxNodeGroupBackoffDuration: *maxNodeGroupBackoffDuration,
|
||||||
|
NodeGroupBackoffResetTimeout: *nodeGroupBackoffResetTimeout,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue