Merge pull request #354 from electronicarts/feature/mtcode/scale-down-delay-flags

Introduce new flags to control scale down behavior
Marcin Wielgus 2017-09-19 18:35:37 +02:00 committed by GitHub
commit 012909d64e
4 changed files with 42 additions and 31 deletions


@@ -100,10 +100,12 @@ type AutoscalingOptions struct {
  NodeGroups []string
  // ScaleDownEnabled is used to allow CA to scale down the cluster
  ScaleDownEnabled bool
- // ScaleDownDelay sets the duration from the last scale up to the time when CA starts to check scale down options
- ScaleDownDelay time.Duration
- // ScaleDownTrialInterval sets how often scale down possibility is check
- ScaleDownTrialInterval time.Duration
+ // ScaleDownDelayAfterAdd sets the duration from the last scale up to the time when CA starts to check scale down options
+ ScaleDownDelayAfterAdd time.Duration
+ // ScaleDownDelayAfterDelete sets the duration between scale down attempts if scale down removes one or more nodes
+ ScaleDownDelayAfterDelete time.Duration
+ // ScaleDownDelayAfterFailure sets the duration before the next scale down attempt if scale down results in an error
+ ScaleDownDelayAfterFailure time.Duration
  // ScaleDownNonEmptyCandidatesCount is the maximum number of non empty nodes
  // considered at once as candidates for scale down.
  ScaleDownNonEmptyCandidatesCount int
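
Note: the single ScaleDownDelay plus ScaleDownTrialInterval pair is replaced by three independent delays. A minimal sketch of how a caller might populate the new fields (the field names and the 10m/3m values come from this change; the 10s value and the bare struct literal are illustrative assumptions):

opts := AutoscalingOptions{
    ScaleDownEnabled:           true,
    ScaleDownDelayAfterAdd:     10 * time.Minute, // wait this long after a scale up
    ScaleDownDelayAfterDelete:  10 * time.Second, // wait this long after a successful node deletion (assumed value)
    ScaleDownDelayAfterFailure: 3 * time.Minute,  // wait this long after a failed scale down
}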


@@ -36,9 +36,10 @@ type StaticAutoscaler struct {
  // AutoscalingContext consists of validated settings and options for this autoscaler
  *AutoscalingContext
  kube_util.ListerRegistry
  lastScaleUpTime time.Time
- lastScaleDownFailedTrial time.Time
+ lastScaleDownDeleteTime time.Time
+ lastScaleDownFailTime time.Time
  scaleDown *ScaleDown
 }

 // NewStaticAutoscaler creates an instance of Autoscaler filled with provided parameters
@@ -59,11 +60,12 @@ func NewStaticAutoscaler(opts AutoscalingOptions, predicateChecker *simulator.Pr
  scaleDown := NewScaleDown(autoscalingContext)
  return &StaticAutoscaler{
   AutoscalingContext: autoscalingContext,
   ListerRegistry: listerRegistry,
   lastScaleUpTime: time.Now(),
-  lastScaleDownFailedTrial: time.Now(),
-  scaleDown: scaleDown,
+  lastScaleDownDeleteTime: time.Now(),
+  lastScaleDownFailTime: time.Now(),
+  scaleDown: scaleDown,
  }, nil
 }
@@ -278,14 +280,16 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
 }

 // In dry run only utilization is updated
- calculateUnneededOnly := a.lastScaleUpTime.Add(a.ScaleDownDelay).After(currentTime) ||
-  a.lastScaleDownFailedTrial.Add(a.ScaleDownTrialInterval).After(currentTime) ||
+ calculateUnneededOnly := a.lastScaleUpTime.Add(a.ScaleDownDelayAfterAdd).After(currentTime) ||
+  a.lastScaleDownFailTime.Add(a.ScaleDownDelayAfterFailure).After(currentTime) ||
+  a.lastScaleDownDeleteTime.Add(a.ScaleDownDelayAfterDelete).After(currentTime) ||
   schedulablePodsPresent ||
   scaleDown.nodeDeleteStatus.IsDeleteInProgress()

 glog.V(4).Infof("Scale down status: unneededOnly=%v lastScaleUpTime=%s "+
-  "lastScaleDownFailedTrail=%s schedulablePodsPresent=%v", calculateUnneededOnly,
-  a.lastScaleUpTime, a.lastScaleDownFailedTrial, schedulablePodsPresent)
+  "lastScaleDownDeleteTime=%v lastScaleDownFailTime=%s schedulablePodsPresent=%v isDeleteInProgress=%v",
+  calculateUnneededOnly, a.lastScaleUpTime, a.lastScaleDownDeleteTime, a.lastScaleDownFailTime,
+  schedulablePodsPresent, scaleDown.nodeDeleteStatus.IsDeleteInProgress())

 if !calculateUnneededOnly {
  glog.V(4).Infof("Starting scale down")
@@ -310,7 +314,9 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
   return typedErr
  }
  if result == ScaleDownError {
-  a.lastScaleDownFailTime = currentTime
+  a.lastScaleDownFailTime = currentTime
+ } else if result == ScaleDownNodeDeleted {
+  a.lastScaleDownDeleteTime = currentTime
  }
 }
 }
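
Taken together, the hunks above make the dry-run decision depend on three separate timestamps instead of one. A standalone sketch of the resulting condition (variable names follow the diff; the program shape and sample durations are illustrative, not part of the change, and the real condition also ORs in schedulablePodsPresent and IsDeleteInProgress):

package main

import (
	"fmt"
	"time"
)

func main() {
	// Sample values only; in the autoscaler these come from flags and from
	// the timestamps updated in RunOnce.
	scaleDownDelayAfterAdd := 10 * time.Minute
	scaleDownDelayAfterDelete := 10 * time.Second
	scaleDownDelayAfterFailure := 3 * time.Minute

	lastScaleUpTime := time.Now().Add(-15 * time.Minute)
	lastScaleDownDeleteTime := time.Now().Add(-1 * time.Minute)
	lastScaleDownFailTime := time.Now().Add(-30 * time.Minute)
	currentTime := time.Now()

	// Mirrors the condition in the diff: only compute "unneeded" stats (dry run)
	// while any of the three delays is still in effect.
	calculateUnneededOnly := lastScaleUpTime.Add(scaleDownDelayAfterAdd).After(currentTime) ||
		lastScaleDownFailTime.Add(scaleDownDelayAfterFailure).After(currentTime) ||
		lastScaleDownDeleteTime.Add(scaleDownDelayAfterDelete).After(currentTime)

	fmt.Printf("calculateUnneededOnly=%v\n", calculateUnneededOnly)
}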


@@ -188,10 +188,10 @@ func TestStaticAutoscalerRunOnce(t *testing.T) {
  sd := NewScaleDown(context)
  autoscaler := &StaticAutoscaler{AutoscalingContext: context,
   ListerRegistry: listerRegistry,
   lastScaleUpTime: time.Now(),
-  lastScaleDownFailedTrial: time.Now(),
+  lastScaleDownFailTime: time.Now(),
   scaleDown: sd}

  // MaxNodesTotal reached.
  readyNodeListerMock.On("List").Return([]*apiv1.Node{n1}, nil).Once()
@@ -360,10 +360,10 @@ func TestStaticAutoscalerRunOnceWithAutoprovisionedEnabled(t *testing.T) {
  sd := NewScaleDown(context)
  autoscaler := &StaticAutoscaler{AutoscalingContext: context,
   ListerRegistry: listerRegistry,
   lastScaleUpTime: time.Now(),
-  lastScaleDownFailedTrial: time.Now(),
+  lastScaleDownFailTime: time.Now(),
   scaleDown: sd}

  // Scale up.
  readyNodeListerMock.On("List").Return([]*apiv1.Node{n1}, nil).Once()
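
Worth noting: the test fixtures only swap lastScaleDownFailedTrial for lastScaleDownFailTime and leave the new lastScaleDownDeleteTime at its zero value, so the after-delete delay never blocks scale down in these tests (a zero time.Time plus any delay lies far in the past). A fixture that also exercises the new field might look like this sketch, where context, listerRegistry and sd are the surrounding test's variables:

autoscaler := &StaticAutoscaler{
	AutoscalingContext:      context,
	ListerRegistry:          listerRegistry,
	lastScaleUpTime:         time.Now(),
	lastScaleDownDeleteTime: time.Now(),
	lastScaleDownFailTime:   time.Now(),
	scaleDown:               sd,
}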


@@ -75,16 +75,18 @@ var (
 namespace = flag.String("namespace", "kube-system", "Namespace in which cluster-autoscaler run. If a --configmap flag is also provided, ensure that the configmap exists in this namespace before CA runs.")
 nodeGroupAutoDiscovery = flag.String("node-group-auto-discovery", "", "One or more definition(s) of node group auto-discovery. A definition is expressed `<name of discoverer per cloud provider>:[<key>[=<value>]]`. Only the `aws` cloud provider is currently supported. The only valid discoverer for it is `asg` and the valid key is `tag`. For example, specifying `--cloud-provider aws` and `--node-group-auto-discovery asg:tag=cluster-autoscaler/auto-discovery/enabled,kubernetes.io/cluster/<YOUR CLUSTER NAME>` results in ASGs tagged with `cluster-autoscaler/auto-discovery/enabled` and `kubernetes.io/cluster/<YOUR CLUSTER NAME>` to be considered as target node groups")
 scaleDownEnabled = flag.Bool("scale-down-enabled", true, "Should CA scale down the cluster")
- scaleDownDelay = flag.Duration("scale-down-delay", 10*time.Minute,
-  "Duration from the last scale up to the time when CA starts to check scale down options")
+ scaleDownDelayAfterAdd = flag.Duration("scale-down-delay-after-add", 10*time.Minute,
+  "How long after scale up that scale down evaluation resumes")
+ scaleDownDelayAfterDelete = flag.Duration("scale-down-delay-after-delete", *scanInterval,
+  "How long after node deletion that scale down evaluation resumes, defaults to scanInterval")
+ scaleDownDelayAfterFailure = flag.Duration("scale-down-delay-after-failure", 3*time.Minute,
+  "How long after scale down failure that scale down evaluation resumes")
 scaleDownUnneededTime = flag.Duration("scale-down-unneeded-time", 10*time.Minute,
  "How long a node should be unneeded before it is eligible for scale down")
 scaleDownUnreadyTime = flag.Duration("scale-down-unready-time", 20*time.Minute,
  "How long an unready node should be unneeded before it is eligible for scale down")
 scaleDownUtilizationThreshold = flag.Float64("scale-down-utilization-threshold", 0.5,
  "Node utilization level, defined as sum of requested resources divided by capacity, below which a node can be considered for scale down")
- scaleDownTrialInterval = flag.Duration("scale-down-trial-interval", 1*time.Minute,
-  "How often scale down possiblity is check")
 scaleDownNonEmptyCandidatesCount = flag.Int("scale-down-non-empty-candidates-count", 30,
  "Maximum number of non empty nodes considered in one iteration as candidates for scale down with drain."+
  "Lower value means better CA responsiveness but possible slower scale down latency."+
@@ -158,9 +160,10 @@ func createAutoscalerOptions() core.AutoscalerOptions {
  MinMemoryTotal: minMemoryTotal,
  NodeGroups: nodeGroupsFlag,
  UnregisteredNodeRemovalTime: *unregisteredNodeRemovalTime,
- ScaleDownDelay: *scaleDownDelay,
+ ScaleDownDelayAfterAdd: *scaleDownDelayAfterAdd,
+ ScaleDownDelayAfterDelete: *scaleDownDelayAfterDelete,
+ ScaleDownDelayAfterFailure: *scaleDownDelayAfterFailure,
  ScaleDownEnabled: *scaleDownEnabled,
- ScaleDownTrialInterval: *scaleDownTrialInterval,
  ScaleDownUnneededTime: *scaleDownUnneededTime,
  ScaleDownUnreadyTime: *scaleDownUnreadyTime,
  ScaleDownUtilizationThreshold: *scaleDownUtilizationThreshold,
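
One subtlety in the flag wiring above: the default for --scale-down-delay-after-delete is *scanInterval, which is dereferenced when the var block is initialized, i.e. before flag.Parse runs. Assuming scanInterval is an ordinary flag.Duration defined earlier in the same block (it is not shown in this diff), the after-delete default therefore equals the scan-interval default rather than a --scan-interval value passed on the command line. A minimal standalone sketch of that behaviour, with hypothetical names and defaults:

package main

import (
	"flag"
	"fmt"
	"time"
)

var (
	// Hypothetical stand-in for the autoscaler's scan-interval flag.
	scanInterval = flag.Duration("scan-interval", 10*time.Second, "how often the cluster is re-evaluated")
	// *scanInterval is read here, during package initialization and before flag.Parse,
	// so the default below is 10s even if the process is started with --scan-interval=1m.
	scaleDownDelayAfterDelete = flag.Duration("scale-down-delay-after-delete", *scanInterval,
		"how long after node deletion that scale down evaluation resumes")
)

func main() {
	flag.Parse()
	fmt.Println("scale-down-delay-after-delete:", *scaleDownDelayAfterDelete)
}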