Merge pull request #354 from electronicarts/feature/mtcode/scale-down-delay-flags

Introduce new flags to control scale down behavior
Marcin Wielgus 2017-09-19 18:35:37 +02:00 committed by GitHub
commit 012909d64e
4 changed files with 42 additions and 31 deletions

View File

@@ -100,10 +100,12 @@ type AutoscalingOptions struct {
NodeGroups []string
// ScaleDownEnabled is used to allow CA to scale down the cluster
ScaleDownEnabled bool
- // ScaleDownDelay sets the duration from the last scale up to the time when CA starts to check scale down options
- ScaleDownDelay time.Duration
- // ScaleDownTrialInterval sets how often scale down possibility is check
- ScaleDownTrialInterval time.Duration
+ // ScaleDownDelayAfterAdd sets the duration from the last scale up to the time when CA starts to check scale down options
+ ScaleDownDelayAfterAdd time.Duration
+ // ScaleDownDelayAfterDelete sets the duration between scale down attempts if scale down removes one or more nodes
+ ScaleDownDelayAfterDelete time.Duration
+ // ScaleDownDelayAfterFailure sets the duration before the next scale down attempt if scale down results in an error
+ ScaleDownDelayAfterFailure time.Duration
// ScaleDownNonEmptyCandidatesCount is the maximum number of non empty nodes
// considered at once as candidates for scale down.
ScaleDownNonEmptyCandidatesCount int

View File

@@ -36,9 +36,10 @@ type StaticAutoscaler struct {
// AutoscalingContext consists of validated settings and options for this autoscaler
*AutoscalingContext
kube_util.ListerRegistry
- lastScaleUpTime time.Time
- lastScaleDownFailedTrial time.Time
- scaleDown *ScaleDown
+ lastScaleUpTime time.Time
+ lastScaleDownDeleteTime time.Time
+ lastScaleDownFailTime time.Time
+ scaleDown *ScaleDown
}
// NewStaticAutoscaler creates an instance of Autoscaler filled with provided parameters
@@ -59,11 +60,12 @@ func NewStaticAutoscaler(opts AutoscalingOptions, predicateChecker *simulator.Pr
scaleDown := NewScaleDown(autoscalingContext)
return &StaticAutoscaler{
- AutoscalingContext: autoscalingContext,
- ListerRegistry: listerRegistry,
- lastScaleUpTime: time.Now(),
- lastScaleDownFailedTrial: time.Now(),
- scaleDown: scaleDown,
+ AutoscalingContext: autoscalingContext,
+ ListerRegistry: listerRegistry,
+ lastScaleUpTime: time.Now(),
+ lastScaleDownDeleteTime: time.Now(),
+ lastScaleDownFailTime: time.Now(),
+ scaleDown: scaleDown,
}, nil
}
@@ -278,14 +280,16 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
}
// In dry run only utilization is updated
- calculateUnneededOnly := a.lastScaleUpTime.Add(a.ScaleDownDelay).After(currentTime) ||
- a.lastScaleDownFailedTrial.Add(a.ScaleDownTrialInterval).After(currentTime) ||
+ calculateUnneededOnly := a.lastScaleUpTime.Add(a.ScaleDownDelayAfterAdd).After(currentTime) ||
+ a.lastScaleDownFailTime.Add(a.ScaleDownDelayAfterFailure).After(currentTime) ||
+ a.lastScaleDownDeleteTime.Add(a.ScaleDownDelayAfterDelete).After(currentTime) ||
schedulablePodsPresent ||
scaleDown.nodeDeleteStatus.IsDeleteInProgress()
glog.V(4).Infof("Scale down status: unneededOnly=%v lastScaleUpTime=%s "+
"lastScaleDownFailedTrail=%s schedulablePodsPresent=%v", calculateUnneededOnly,
a.lastScaleUpTime, a.lastScaleDownFailedTrial, schedulablePodsPresent)
"lastScaleDownDeleteTime=%v lastScaleDownFailTime=%s schedulablePodsPresent=%v isDeleteInProgress=%v",
calculateUnneededOnly, a.lastScaleUpTime, a.lastScaleDownDeleteTime, a.lastScaleDownFailTime,
schedulablePodsPresent, scaleDown.nodeDeleteStatus.IsDeleteInProgress())
if !calculateUnneededOnly {
glog.V(4).Infof("Starting scale down")
@@ -310,7 +314,9 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
return typedErr
}
if result == ScaleDownError {
- a.lastScaleDownFailedTrial = currentTime
+ a.lastScaleDownFailTime = currentTime
+ } else if result == ScaleDownNodeDeleted {
+ a.lastScaleDownDeleteTime = currentTime
}
}
}
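To make the gating change in RunOnce above easier to follow in isolation, here is a minimal, self-contained Go sketch of the three new time gates. The type and function names are simplified placeholders (not the autoscaler's own), the delay values are illustrative, and the real condition additionally ORs in schedulablePodsPresent and scaleDown.nodeDeleteStatus.IsDeleteInProgress().

package main

import (
	"fmt"
	"time"
)

// scaleDownDelays mirrors the three new delays; the after-delete default in the
// real code comes from the scan interval flag.
type scaleDownDelays struct {
	afterAdd, afterDelete, afterFailure time.Duration
}

// unneededOnly reports whether scale down should stay in "dry run" mode because
// the delay since the last scale up, node deletion, or scale down failure has
// not elapsed yet.
func unneededOnly(now, lastAdd, lastDelete, lastFailure time.Time, d scaleDownDelays) bool {
	return lastAdd.Add(d.afterAdd).After(now) ||
		lastFailure.Add(d.afterFailure).After(now) ||
		lastDelete.Add(d.afterDelete).After(now)
}

func main() {
	now := time.Now()
	d := scaleDownDelays{afterAdd: 10 * time.Minute, afterDelete: 1 * time.Minute, afterFailure: 3 * time.Minute}
	// A scale up 5 minutes ago still blocks scale down (prints true); a deletion
	// or failure an hour ago no longer does.
	fmt.Println(unneededOnly(now, now.Add(-5*time.Minute), now.Add(-time.Hour), now.Add(-time.Hour), d))
}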

View File

@@ -188,10 +188,10 @@ func TestStaticAutoscalerRunOnce(t *testing.T) {
sd := NewScaleDown(context)
autoscaler := &StaticAutoscaler{AutoscalingContext: context,
- ListerRegistry: listerRegistry,
- lastScaleUpTime: time.Now(),
- lastScaleDownFailedTrial: time.Now(),
- scaleDown: sd}
+ ListerRegistry: listerRegistry,
+ lastScaleUpTime: time.Now(),
+ lastScaleDownFailTime: time.Now(),
+ scaleDown: sd}
// MaxNodesTotal reached.
readyNodeListerMock.On("List").Return([]*apiv1.Node{n1}, nil).Once()
@@ -360,10 +360,10 @@ func TestStaticAutoscalerRunOnceWithAutoprovisionedEnabled(t *testing.T) {
sd := NewScaleDown(context)
autoscaler := &StaticAutoscaler{AutoscalingContext: context,
- ListerRegistry: listerRegistry,
- lastScaleUpTime: time.Now(),
- lastScaleDownFailedTrial: time.Now(),
- scaleDown: sd}
+ ListerRegistry: listerRegistry,
+ lastScaleUpTime: time.Now(),
+ lastScaleDownFailTime: time.Now(),
+ scaleDown: sd}
// Scale up.
readyNodeListerMock.On("List").Return([]*apiv1.Node{n1}, nil).Once()

View File

@@ -75,16 +75,18 @@ var (
namespace = flag.String("namespace", "kube-system", "Namespace in which cluster-autoscaler run. If a --configmap flag is also provided, ensure that the configmap exists in this namespace before CA runs.")
nodeGroupAutoDiscovery = flag.String("node-group-auto-discovery", "", "One or more definition(s) of node group auto-discovery. A definition is expressed `<name of discoverer per cloud provider>:[<key>[=<value>]]`. Only the `aws` cloud provider is currently supported. The only valid discoverer for it is `asg` and the valid key is `tag`. For example, specifying `--cloud-provider aws` and `--node-group-auto-discovery asg:tag=cluster-autoscaler/auto-discovery/enabled,kubernetes.io/cluster/<YOUR CLUSTER NAME>` results in ASGs tagged with `cluster-autoscaler/auto-discovery/enabled` and `kubernetes.io/cluster/<YOUR CLUSTER NAME>` to be considered as target node groups")
scaleDownEnabled = flag.Bool("scale-down-enabled", true, "Should CA scale down the cluster")
- scaleDownDelay = flag.Duration("scale-down-delay", 10*time.Minute,
- "Duration from the last scale up to the time when CA starts to check scale down options")
+ scaleDownDelayAfterAdd = flag.Duration("scale-down-delay-after-add", 10*time.Minute,
+ "How long after scale up that scale down evaluation resumes")
+ scaleDownDelayAfterDelete = flag.Duration("scale-down-delay-after-delete", *scanInterval,
+ "How long after node deletion that scale down evaluation resumes, defaults to scanInterval")
+ scaleDownDelayAfterFailure = flag.Duration("scale-down-delay-after-failure", 3*time.Minute,
+ "How long after scale down failure that scale down evaluation resumes")
scaleDownUnneededTime = flag.Duration("scale-down-unneeded-time", 10*time.Minute,
"How long a node should be unneeded before it is eligible for scale down")
scaleDownUnreadyTime = flag.Duration("scale-down-unready-time", 20*time.Minute,
"How long an unready node should be unneeded before it is eligible for scale down")
scaleDownUtilizationThreshold = flag.Float64("scale-down-utilization-threshold", 0.5,
"Node utilization level, defined as sum of requested resources divided by capacity, below which a node can be considered for scale down")
- scaleDownTrialInterval = flag.Duration("scale-down-trial-interval", 1*time.Minute,
- "How often scale down possiblity is check")
scaleDownNonEmptyCandidatesCount = flag.Int("scale-down-non-empty-candidates-count", 30,
"Maximum number of non empty nodes considered in one iteration as candidates for scale down with drain."+
"Lower value means better CA responsiveness but possible slower scale down latency."+
@@ -158,9 +160,10 @@ func createAutoscalerOptions() core.AutoscalerOptions {
MinMemoryTotal: minMemoryTotal,
NodeGroups: nodeGroupsFlag,
UnregisteredNodeRemovalTime: *unregisteredNodeRemovalTime,
- ScaleDownDelay: *scaleDownDelay,
+ ScaleDownDelayAfterAdd: *scaleDownDelayAfterAdd,
+ ScaleDownDelayAfterDelete: *scaleDownDelayAfterDelete,
+ ScaleDownDelayAfterFailure: *scaleDownDelayAfterFailure,
ScaleDownEnabled: *scaleDownEnabled,
- ScaleDownTrialInterval: *scaleDownTrialInterval,
ScaleDownUnneededTime: *scaleDownUnneededTime,
ScaleDownUnreadyTime: *scaleDownUnreadyTime,
ScaleDownUtilizationThreshold: *scaleDownUtilizationThreshold,
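One detail of the flag wiring above worth noting: scale-down-delay-after-delete takes its default from *scanInterval, which is dereferenced when the flag is registered at package init, so (assuming scanInterval is itself registered via flag.Duration earlier in main.go, as the "defaults to scanInterval" help text suggests) the default reflects scan-interval's built-in default rather than any value later passed on the command line. Below is a minimal sketch of that pattern; the 10-second scan-interval default is illustrative, not taken from this diff.

package main

import (
	"flag"
	"fmt"
	"time"
)

var (
	// scanInterval is initialized first; the returned pointer holds its built-in default.
	scanInterval = flag.Duration("scan-interval", 10*time.Second, "how often the cluster is re-evaluated")
	// The default below is *scanInterval as it is at registration time (10s here),
	// not whatever --scan-interval is later set to on the command line.
	scaleDownDelayAfterDelete = flag.Duration("scale-down-delay-after-delete", *scanInterval,
		"delay between scale down attempts after a node deletion")
)

func main() {
	flag.Parse()
	// Run with `--scan-interval=30s` and no explicit --scale-down-delay-after-delete:
	// this prints "30s 10s", i.e. the delete delay keeps the captured default.
	fmt.Println(*scanInterval, *scaleDownDelayAfterDelete)
}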