Merge pull request #354 from electronicarts/feature/mtcode/scale-down-delay-flags
Introduce new flags to control scale down behavior
This commit is contained in:
commit
012909d64e
|
|
@ -100,10 +100,12 @@ type AutoscalingOptions struct {
|
|||
NodeGroups []string
|
||||
// ScaleDownEnabled is used to allow CA to scale down the cluster
|
||||
ScaleDownEnabled bool
|
||||
// ScaleDownDelay sets the duration from the last scale up to the time when CA starts to check scale down options
|
||||
ScaleDownDelay time.Duration
|
||||
// ScaleDownTrialInterval sets how often scale down possibility is checked
|
||||
ScaleDownTrialInterval time.Duration
|
||||
// ScaleDownDelayAfterAdd sets the duration from the last scale up to the time when CA starts to check scale down options
|
||||
ScaleDownDelayAfterAdd time.Duration
|
||||
// ScaleDownDelayAfterDelete sets the duration between scale down attempts if scale down removes one or more nodes
|
||||
ScaleDownDelayAfterDelete time.Duration
|
||||
// ScaleDownDelayAfterFailure sets the duration before the next scale down attempt if scale down results in an error
|
||||
ScaleDownDelayAfterFailure time.Duration
|
||||
// ScaleDownNonEmptyCandidatesCount is the maximum number of non empty nodes
|
||||
// considered at once as candidates for scale down.
|
||||
ScaleDownNonEmptyCandidatesCount int
|
||||
|
|
|
|||
|
|
@ -36,9 +36,10 @@ type StaticAutoscaler struct {
|
|||
// AutoscalingContext consists of validated settings and options for this autoscaler
|
||||
*AutoscalingContext
|
||||
kube_util.ListerRegistry
|
||||
lastScaleUpTime time.Time
|
||||
lastScaleDownFailedTrial time.Time
|
||||
scaleDown *ScaleDown
|
||||
lastScaleUpTime time.Time
|
||||
lastScaleDownDeleteTime time.Time
|
||||
lastScaleDownFailTime time.Time
|
||||
scaleDown *ScaleDown
|
||||
}
|
||||
|
||||
// NewStaticAutoscaler creates an instance of Autoscaler filled with provided parameters
|
||||
|
|
@ -59,11 +60,12 @@ func NewStaticAutoscaler(opts AutoscalingOptions, predicateChecker *simulator.Pr
|
|||
scaleDown := NewScaleDown(autoscalingContext)
|
||||
|
||||
return &StaticAutoscaler{
|
||||
AutoscalingContext: autoscalingContext,
|
||||
ListerRegistry: listerRegistry,
|
||||
lastScaleUpTime: time.Now(),
|
||||
lastScaleDownFailedTrial: time.Now(),
|
||||
scaleDown: scaleDown,
|
||||
AutoscalingContext: autoscalingContext,
|
||||
ListerRegistry: listerRegistry,
|
||||
lastScaleUpTime: time.Now(),
|
||||
lastScaleDownDeleteTime: time.Now(),
|
||||
lastScaleDownFailTime: time.Now(),
|
||||
scaleDown: scaleDown,
|
||||
}, nil
|
||||
}
|
||||
|
||||
|
|
@ -278,14 +280,16 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
|
|||
}
|
||||
|
||||
// In dry run only utilization is updated
|
||||
calculateUnneededOnly := a.lastScaleUpTime.Add(a.ScaleDownDelay).After(currentTime) ||
|
||||
a.lastScaleDownFailedTrial.Add(a.ScaleDownTrialInterval).After(currentTime) ||
|
||||
calculateUnneededOnly := a.lastScaleUpTime.Add(a.ScaleDownDelayAfterAdd).After(currentTime) ||
|
||||
a.lastScaleDownFailTime.Add(a.ScaleDownDelayAfterFailure).After(currentTime) ||
|
||||
a.lastScaleDownDeleteTime.Add(a.ScaleDownDelayAfterDelete).After(currentTime) ||
|
||||
schedulablePodsPresent ||
|
||||
scaleDown.nodeDeleteStatus.IsDeleteInProgress()
|
||||
|
||||
glog.V(4).Infof("Scale down status: unneededOnly=%v lastScaleUpTime=%s "+
|
||||
"lastScaleDownFailedTrail=%s schedulablePodsPresent=%v", calculateUnneededOnly,
|
||||
a.lastScaleUpTime, a.lastScaleDownFailedTrial, schedulablePodsPresent)
|
||||
"lastScaleDownDeleteTime=%v lastScaleDownFailTime=%s schedulablePodsPresent=%v isDeleteInProgress=%v",
|
||||
calculateUnneededOnly, a.lastScaleUpTime, a.lastScaleDownDeleteTime, a.lastScaleDownFailTime,
|
||||
schedulablePodsPresent, scaleDown.nodeDeleteStatus.IsDeleteInProgress())
|
||||
|
||||
if !calculateUnneededOnly {
|
||||
glog.V(4).Infof("Starting scale down")
|
||||
|
|
@ -310,7 +314,9 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
|
|||
return typedErr
|
||||
}
|
||||
if result == ScaleDownError {
|
||||
a.lastScaleDownFailedTrial = currentTime
|
||||
a.lastScaleDownFailTime = currentTime
|
||||
} else if result == ScaleDownNodeDeleted {
|
||||
a.lastScaleDownDeleteTime = currentTime
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -188,10 +188,10 @@ func TestStaticAutoscalerRunOnce(t *testing.T) {
|
|||
sd := NewScaleDown(context)
|
||||
|
||||
autoscaler := &StaticAutoscaler{AutoscalingContext: context,
|
||||
ListerRegistry: listerRegistry,
|
||||
lastScaleUpTime: time.Now(),
|
||||
lastScaleDownFailedTrial: time.Now(),
|
||||
scaleDown: sd}
|
||||
ListerRegistry: listerRegistry,
|
||||
lastScaleUpTime: time.Now(),
|
||||
lastScaleDownFailTime: time.Now(),
|
||||
scaleDown: sd}
|
||||
|
||||
// MaxNodesTotal reached.
|
||||
readyNodeListerMock.On("List").Return([]*apiv1.Node{n1}, nil).Once()
|
||||
|
|
@ -360,10 +360,10 @@ func TestStaticAutoscalerRunOnceWithAutoprovisionedEnabled(t *testing.T) {
|
|||
sd := NewScaleDown(context)
|
||||
|
||||
autoscaler := &StaticAutoscaler{AutoscalingContext: context,
|
||||
ListerRegistry: listerRegistry,
|
||||
lastScaleUpTime: time.Now(),
|
||||
lastScaleDownFailedTrial: time.Now(),
|
||||
scaleDown: sd}
|
||||
ListerRegistry: listerRegistry,
|
||||
lastScaleUpTime: time.Now(),
|
||||
lastScaleDownFailTime: time.Now(),
|
||||
scaleDown: sd}
|
||||
|
||||
// Scale up.
|
||||
readyNodeListerMock.On("List").Return([]*apiv1.Node{n1}, nil).Once()
|
||||
|
|
|
|||
|
|
@ -75,16 +75,18 @@ var (
|
|||
namespace = flag.String("namespace", "kube-system", "Namespace in which cluster-autoscaler run. If a --configmap flag is also provided, ensure that the configmap exists in this namespace before CA runs.")
|
||||
nodeGroupAutoDiscovery = flag.String("node-group-auto-discovery", "", "One or more definition(s) of node group auto-discovery. A definition is expressed `<name of discoverer per cloud provider>:[<key>[=<value>]]`. Only the `aws` cloud provider is currently supported. The only valid discoverer for it is `asg` and the valid key is `tag`. For example, specifying `--cloud-provider aws` and `--node-group-auto-discovery asg:tag=cluster-autoscaler/auto-discovery/enabled,kubernetes.io/cluster/<YOUR CLUSTER NAME>` results in ASGs tagged with `cluster-autoscaler/auto-discovery/enabled` and `kubernetes.io/cluster/<YOUR CLUSTER NAME>` to be considered as target node groups")
|
||||
scaleDownEnabled = flag.Bool("scale-down-enabled", true, "Should CA scale down the cluster")
|
||||
scaleDownDelay = flag.Duration("scale-down-delay", 10*time.Minute,
|
||||
"Duration from the last scale up to the time when CA starts to check scale down options")
|
||||
scaleDownDelayAfterAdd = flag.Duration("scale-down-delay-after-add", 10*time.Minute,
|
||||
"How long after scale up that scale down evaluation resumes")
|
||||
scaleDownDelayAfterDelete = flag.Duration("scale-down-delay-after-delete", *scanInterval,
|
||||
"How long after node deletion that scale down evaluation resumes, defaults to scanInterval")
|
||||
scaleDownDelayAfterFailure = flag.Duration("scale-down-delay-after-failure", 3*time.Minute,
|
||||
"How long after scale down failure that scale down evaluation resumes")
|
||||
scaleDownUnneededTime = flag.Duration("scale-down-unneeded-time", 10*time.Minute,
|
||||
"How long a node should be unneeded before it is eligible for scale down")
|
||||
scaleDownUnreadyTime = flag.Duration("scale-down-unready-time", 20*time.Minute,
|
||||
"How long an unready node should be unneeded before it is eligible for scale down")
|
||||
scaleDownUtilizationThreshold = flag.Float64("scale-down-utilization-threshold", 0.5,
|
||||
"Node utilization level, defined as sum of requested resources divided by capacity, below which a node can be considered for scale down")
|
||||
scaleDownTrialInterval = flag.Duration("scale-down-trial-interval", 1*time.Minute,
|
||||
"How often scale down possiblity is check")
|
||||
scaleDownNonEmptyCandidatesCount = flag.Int("scale-down-non-empty-candidates-count", 30,
|
||||
"Maximum number of non empty nodes considered in one iteration as candidates for scale down with drain."+
|
||||
"Lower value means better CA responsiveness but possible slower scale down latency."+
|
||||
|
|
@ -158,9 +160,10 @@ func createAutoscalerOptions() core.AutoscalerOptions {
|
|||
MinMemoryTotal: minMemoryTotal,
|
||||
NodeGroups: nodeGroupsFlag,
|
||||
UnregisteredNodeRemovalTime: *unregisteredNodeRemovalTime,
|
||||
ScaleDownDelay: *scaleDownDelay,
|
||||
ScaleDownDelayAfterAdd: *scaleDownDelayAfterAdd,
|
||||
ScaleDownDelayAfterDelete: *scaleDownDelayAfterDelete,
|
||||
ScaleDownDelayAfterFailure: *scaleDownDelayAfterFailure,
|
||||
ScaleDownEnabled: *scaleDownEnabled,
|
||||
ScaleDownTrialInterval: *scaleDownTrialInterval,
|
||||
ScaleDownUnneededTime: *scaleDownUnneededTime,
|
||||
ScaleDownUnreadyTime: *scaleDownUnreadyTime,
|
||||
ScaleDownUtilizationThreshold: *scaleDownUtilizationThreshold,
|
||||
|
|
|
|||
Loading…
Reference in New Issue