feat: set `IgnoreDaemonSetsUtilization` per nodegroup

Signed-off-by: vadasambar <surajrbanakar@gmail.com>

fix: test cases failing for actuator and scaledown/eligibility
- abstract default values into `config`
Signed-off-by: vadasambar <surajrbanakar@gmail.com>

refactor: rename global `IgnoreDaemonSetsUtilization` -> `GlobalIgnoreDaemonSetsUtilization` in code
- there is no change in the flag name
- rename `thresholdGetter` -> `configGetter` and tweak it to accommodate `GetIgnoreDaemonSetsUtilization`
Signed-off-by: vadasambar <surajrbanakar@gmail.com>

refactor: reset help text for `ignore-daemonsets-utilization` flag
- because the per-nodegroup override is currently supported only via AWS ASG tags
Signed-off-by: vadasambar <surajrbanakar@gmail.com>

docs: add info about overriding `--ignore-daemonsets-utilization` per ASG
- in AWS cloud provider README
Signed-off-by: vadasambar <surajrbanakar@gmail.com>

refactor: use a limiting interface in actuator in place of `NodeGroupConfigProcessor` interface
- to limit the functions that can be used
- since we need it only for `GetIgnoreDaemonSetsUtilization`
Signed-off-by: vadasambar <surajrbanakar@gmail.com>

fix: tests failing for actuator
- rename `staticNodeGroupConfigProcessor` -> `MockNodeGroupConfigGetter`
- move `MockNodeGroupConfigGetter` to test/common so that it can be used in different tests
Signed-off-by: vadasambar <surajrbanakar@gmail.com>

fix: go lint errors for `MockNodeGroupConfigGetter`
Signed-off-by: vadasambar <surajrbanakar@gmail.com>

test: add tests for `IgnoreDaemonSetsUtilization` in cloud provider dir
Signed-off-by: vadasambar <surajrbanakar@gmail.com>

test: update node group config processor tests for `IgnoreDaemonSetsUtilization`
Signed-off-by: vadasambar <surajrbanakar@gmail.com>

test: update eligibility test cases for `IgnoreDaemonSetsUtilization`
Signed-off-by: vadasambar <surajrbanakar@gmail.com>

test: run actuation tests for 2 node groups
- one with `IgnoreDaemonSetsUtilization`: `false`
- one with `IgnoreDaemonSetsUtilization`: `true`
Signed-off-by: vadasambar <surajrbanakar@gmail.com>

test: add tests for `IgnoreDaemonSetsUtilization` in actuator
- add helper to generate multiple ds pods dynamically
- get rid of mock config processor because it is not required
Signed-off-by: vadasambar <surajrbanakar@gmail.com>

test: fix failing tests for actuator
Signed-off-by: vadasambar <surajrbanakar@gmail.com>

refactor: remove `GlobalIgnoreDaemonSetsUtilization` autoscaling option
- not required
Signed-off-by: vadasambar <surajrbanakar@gmail.com>

fix: warn message `DefaultScaleDownUnreadyTimeKey` -> `DefaultIgnoreDaemonSetsUtilizationKey`
Signed-off-by: vadasambar <surajrbanakar@gmail.com>

refactor: use `generateDsPods` instead of `generateDsPod`
Signed-off-by: vadasambar <surajrbanakar@gmail.com>

refactor: `globaIgnoreDaemonSetsUtilization` -> `ignoreDaemonSetsUtilization`
Signed-off-by: vadasambar <surajrbanakar@gmail.com>
Author: vadasambar
Date: 2023-04-10 11:20:00 +05:30
Parent: b569db410f
Commit: 7941bab214
15 changed files with 205 additions and 60 deletions

View File

@@ -246,6 +246,8 @@ as string). Currently supported autoscaling options (and example values) are:
(overrides `--scale-down-unneeded-time` value for that specific ASG)
* `k8s.io/cluster-autoscaler/node-template/autoscaling-options/scaledownunreadytime`: `20m0s`
(overrides `--scale-down-unready-time` value for that specific ASG)
* `k8s.io/cluster-autoscaler/node-template/autoscaling-options/ignoredaemonsetsutilization`: `true`
(overrides `--ignore-daemonsets-utilization` value for that specific ASG)
**NOTE:** It is your responsibility to ensure such labels and/or taints are
applied via the node's kubelet configuration at startup. Cluster Autoscaler will not set the node taints for you.
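As a worked example of the new option tag above, the override can be applied by tagging the ASG directly. A minimal sketch using aws-sdk-go v1 (the ASG name and region are placeholders; tagging via the AWS CLI or console works equally well):

```go
package main

import (
	"log"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/aws/session"
	"github.com/aws/aws-sdk-go/service/autoscaling"
)

func main() {
	sess := session.Must(session.NewSession(&aws.Config{Region: aws.String("us-east-1")}))
	svc := autoscaling.New(sess)

	// Tag the ASG so Cluster Autoscaler picks up the per-ASG override.
	// The tag is read from the ASG itself, so it does not need to
	// propagate to instances at launch.
	_, err := svc.CreateOrUpdateTags(&autoscaling.CreateOrUpdateTagsInput{
		Tags: []*autoscaling.Tag{{
			ResourceId:        aws.String("my-asg"),
			ResourceType:      aws.String("auto-scaling-group"),
			Key:               aws.String("k8s.io/cluster-autoscaler/node-template/autoscaling-options/ignoredaemonsetsutilization"),
			Value:             aws.String("true"),
			PropagateAtLaunch: aws.Bool(false),
		}},
	})
	if err != nil {
		log.Fatalf("failed to tag ASG: %v", err)
	}
}
```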

View File

@@ -245,6 +245,15 @@ func (m *AwsManager) GetAsgOptions(asg asg, defaults config.NodeGroupAutoscaling
}
}
if stringOpt, found := options[config.DefaultIgnoreDaemonSetsUtilizationKey]; found {
if opt, err := strconv.ParseBool(stringOpt); err != nil {
klog.Warningf("failed to convert asg %s %s tag to bool: %v",
asg.Name, config.DefaultIgnoreDaemonSetsUtilizationKey, err)
} else {
defaults.IgnoreDaemonSetsUtilization = opt
}
}
return &defaults
}
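The override pattern above is deliberately forgiving: a malformed tag value is logged and ignored rather than failing the whole options lookup. A standalone sketch of that behavior (helper name and tag map are illustrative, not autoscaler API):

```go
package main

import (
	"fmt"
	"strconv"
)

// applyBoolOverride mirrors the pattern in GetAsgOptions: if the tag is
// present and parses as a bool, it overrides the default; a malformed
// value keeps the default.
func applyBoolOverride(tags map[string]string, key string, def bool) bool {
	if raw, found := tags[key]; found {
		if v, err := strconv.ParseBool(raw); err == nil {
			return v
		}
	}
	return def
}

func main() {
	key := "ignoredaemonsetsutilization"
	fmt.Println(applyBoolOverride(map[string]string{key: "true"}, key, false))       // true: override applied
	fmt.Println(applyBoolOverride(map[string]string{key: "not-a-bool"}, key, false)) // false: default kept
}
```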

View File

@@ -130,6 +130,7 @@ func TestGetAsgOptions(t *testing.T) {
ScaleDownGpuUtilizationThreshold: 0.2,
ScaleDownUnneededTime: time.Second,
ScaleDownUnreadyTime: time.Minute,
IgnoreDaemonSetsUtilization: false,
}
tests := []struct {
@@ -145,39 +146,60 @@
{
description: "keep defaults on invalid tags values",
tags: map[string]string{
"scaledownutilizationthreshold": "not-a-float",
"scaledownunneededtime": "not-a-duration",
"ScaleDownUnreadyTime": "",
config.DefaultScaleDownUtilizationThresholdKey: "not-a-float",
config.DefaultScaleDownUnneededTimeKey: "not-a-duration",
"ScaleDownUnreadyTime": "",
config.DefaultIgnoreDaemonSetsUtilizationKey: "not-a-bool",
},
expected: &defaultOptions,
},
{
description: "use provided tags and fill missing with defaults",
tags: map[string]string{
"scaledownutilizationthreshold": "0.42",
"scaledownunneededtime": "1h",
config.DefaultScaleDownUtilizationThresholdKey: "0.42",
config.DefaultScaleDownUnneededTimeKey: "1h",
config.DefaultIgnoreDaemonSetsUtilizationKey: "true",
},
expected: &config.NodeGroupAutoscalingOptions{
ScaleDownUtilizationThreshold: 0.42,
ScaleDownGpuUtilizationThreshold: defaultOptions.ScaleDownGpuUtilizationThreshold,
ScaleDownUnneededTime: time.Hour,
ScaleDownUnreadyTime: defaultOptions.ScaleDownUnreadyTime,
IgnoreDaemonSetsUtilization: true,
},
},
{
description: "use provided tags (happy path)",
tags: map[string]string{
config.DefaultScaleDownUtilizationThresholdKey: "0.42",
config.DefaultScaleDownUnneededTimeKey: "1h",
config.DefaultScaleDownGpuUtilizationThresholdKey: "0.7",
config.DefaultScaleDownUnreadyTimeKey: "25m",
config.DefaultIgnoreDaemonSetsUtilizationKey: "true",
},
expected: &config.NodeGroupAutoscalingOptions{
ScaleDownUtilizationThreshold: 0.42,
ScaleDownGpuUtilizationThreshold: 0.7,
ScaleDownUnneededTime: time.Hour,
ScaleDownUnreadyTime: 25 * time.Minute,
IgnoreDaemonSetsUtilization: true,
},
},
{
description: "ignore unknown tags",
tags: map[string]string{
"scaledownutilizationthreshold": "0.6",
"scaledowngpuutilizationthreshold": "0.7",
"scaledownunneededtime": "1m",
"scaledownunreadytime": "1h",
"notyetspecified": "42",
config.DefaultScaleDownUtilizationThresholdKey: "0.6",
config.DefaultScaleDownGpuUtilizationThresholdKey: "0.7",
config.DefaultScaleDownUnneededTimeKey: "1m",
config.DefaultScaleDownUnreadyTimeKey: "1h",
"notyetspecified": "42",
},
expected: &config.NodeGroupAutoscalingOptions{
ScaleDownUtilizationThreshold: 0.6,
ScaleDownGpuUtilizationThreshold: 0.7,
ScaleDownUnneededTime: time.Minute,
ScaleDownUnreadyTime: time.Hour,
IgnoreDaemonSetsUtilization: false,
},
},
}

View File

@@ -48,6 +48,8 @@ type NodeGroupAutoscalingOptions struct {
MaxNodeProvisionTime time.Duration
// ZeroOrMaxNodeScaling means that a node group should be scaled up to maximum size or down to zero nodes all at once instead of one-by-one.
ZeroOrMaxNodeScaling bool
// IgnoreDaemonSetsUtilization is whether CA will ignore DaemonSet pods when calculating resource utilization for scaling down
IgnoreDaemonSetsUtilization bool
}
// GCEOptions contain autoscaling options specific to GCE cloud provider.
@@ -117,8 +119,6 @@ type AutoscalingOptions struct {
GRPCExpanderCert string
// GRPCExpanderURL is the url of the gRPC server when using the gRPC expander
GRPCExpanderURL string
// IgnoreDaemonSetsUtilization is whether CA will ignore DaemonSet pods when calculating resource utilization for scaling down
IgnoreDaemonSetsUtilization bool
// IgnoreMirrorPodsUtilization is whether CA will ignore Mirror pods when calculating resource utilization for scaling down
IgnoreMirrorPodsUtilization bool
// MaxGracefulTerminationSec is maximum number of seconds scale down waits for pods to terminate before

View File

@@ -16,6 +16,8 @@ limitations under the License.
package config
import "time"
const (
// DefaultMaxClusterCores is the default maximum number of cores in the cluster.
DefaultMaxClusterCores = 5000 * 64
@@ -32,4 +34,14 @@ const (
DefaultScaleDownUnreadyTimeKey = "scaledownunreadytime"
// DefaultMaxNodeProvisionTimeKey identifies MaxNodeProvisionTime autoscaling option
DefaultMaxNodeProvisionTimeKey = "maxnodeprovisiontime"
// DefaultIgnoreDaemonSetsUtilizationKey identifies IgnoreDaemonSetsUtilization autoscaling option
DefaultIgnoreDaemonSetsUtilizationKey = "ignoredaemonsetsutilization"
// DefaultScaleDownUnneededTime is the default value of the ScaleDownUnneededTime autoscaling option
DefaultScaleDownUnneededTime = 10 * time.Minute
// DefaultScaleDownUnreadyTime is the default value of the ScaleDownUnreadyTime autoscaling option
DefaultScaleDownUnreadyTime = 20 * time.Minute
// DefaultScaleDownUtilizationThreshold is the default value of the ScaleDownUtilizationThreshold autoscaling option
DefaultScaleDownUtilizationThreshold = 0.5
// DefaultScaleDownGpuUtilizationThreshold is the default value of the ScaleDownGpuUtilizationThreshold autoscaling option
DefaultScaleDownGpuUtilizationThreshold = 0.5
)

View File

@@ -33,6 +33,7 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/status"
"k8s.io/autoscaler/cluster-autoscaler/core/utils"
"k8s.io/autoscaler/cluster-autoscaler/metrics"
"k8s.io/autoscaler/cluster-autoscaler/processors"
"k8s.io/autoscaler/cluster-autoscaler/simulator"
"k8s.io/autoscaler/cluster-autoscaler/simulator/clustersnapshot"
"k8s.io/autoscaler/cluster-autoscaler/simulator/utilization"
@@ -52,10 +53,18 @@ type Actuator struct {
// This is a larger change to the code structure which impacts some existing actuator unit tests
// as well as Cluster Autoscaler implementations that may override ScaleDownSetProcessor
budgetProcessor *budgets.ScaleDownBudgetProcessor
configGetter actuatorNodeGroupConfigGetter
}
// actuatorNodeGroupConfigGetter is an interface to limit the functions that can be used
// from the NodeGroupConfigProcessor interface
type actuatorNodeGroupConfigGetter interface {
// GetIgnoreDaemonSetsUtilization returns IgnoreDaemonSetsUtilization value that should be used for a given NodeGroup.
GetIgnoreDaemonSetsUtilization(context *context.AutoscalingContext, nodeGroup cloudprovider.NodeGroup) (bool, error)
}
// NewActuator returns a new instance of Actuator.
func NewActuator(ctx *context.AutoscalingContext, csr *clusterstate.ClusterStateRegistry, ndt *deletiontracker.NodeDeletionTracker, deleteOptions simulator.NodeDeleteOptions) *Actuator {
func NewActuator(ctx *context.AutoscalingContext, csr *clusterstate.ClusterStateRegistry, ndt *deletiontracker.NodeDeletionTracker, deleteOptions simulator.NodeDeleteOptions, processors *processors.AutoscalingProcessors) *Actuator {
ndb := NewNodeDeletionBatcher(ctx, csr, ndt, ctx.NodeDeletionBatcherInterval)
return &Actuator{
ctx: ctx,
@@ -64,6 +73,7 @@ func NewActuator(ctx *context.AutoscalingContext, csr *clusterstate.ClusterState
nodeDeletionScheduler: NewGroupDeletionScheduler(ctx, ndt, ndb, NewDefaultEvictor(deleteOptions, ndt)),
budgetProcessor: budgets.NewScaleDownBudgetProcessor(ctx),
deleteOptions: deleteOptions,
configGetter: processors.NodeGroupConfigProcessor,
}
}
@@ -263,8 +273,14 @@ func (a *Actuator) scaleDownNodeToReport(node *apiv1.Node, drain bool) (*status.
if err != nil {
return nil, err
}
ignoreDaemonSetsUtilization, err := a.configGetter.GetIgnoreDaemonSetsUtilization(a.ctx, nodeGroup)
if err != nil {
return nil, err
}
gpuConfig := a.ctx.CloudProvider.GetNodeGpuConfig(node)
utilInfo, err := utilization.Calculate(nodeInfo, a.ctx.IgnoreDaemonSetsUtilization, a.ctx.IgnoreMirrorPodsUtilization, gpuConfig, time.Now())
utilInfo, err := utilization.Calculate(nodeInfo, ignoreDaemonSetsUtilization, a.ctx.IgnoreMirrorPodsUtilization, gpuConfig, time.Now())
if err != nil {
return nil, err
}
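The `actuatorNodeGroupConfigGetter` interface above is a consumer-side narrowing of `NodeGroupConfigProcessor`: the full processor still satisfies it implicitly, but the actuator can only reach the one method it declares. A simplified sketch of the pattern (signatures reduced for illustration; the real methods take an `*context.AutoscalingContext` and a `cloudprovider.NodeGroup`):

```go
package main

import "fmt"

// Narrow interface declared where it is consumed: only the method the
// actuator actually needs.
type ignoreDSGetter interface {
	GetIgnoreDaemonSetsUtilization(nodeGroup string) (bool, error)
}

// fullProcessor stands in for the much larger NodeGroupConfigProcessor.
type fullProcessor struct{}

func (fullProcessor) GetIgnoreDaemonSetsUtilization(nodeGroup string) (bool, error) {
	return true, nil
}

// Extra methods exist on the concrete type but stay invisible through
// the narrow interface.
func (fullProcessor) GetScaleDownUnneededTime(nodeGroup string) (string, error) {
	return "10m", nil
}

func main() {
	var g ignoreDSGetter = fullProcessor{} // implicit satisfaction
	v, _ := g.GetIgnoreDaemonSetsUtilization("ng-1")
	fmt.Println(v) // true
}
```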

View File

@@ -41,20 +41,22 @@ const (
// Checker is responsible for deciding which nodes pass the criteria for scale down.
type Checker struct {
thresholdGetter utilizationThresholdGetter
configGetter nodeGroupConfigGetter
}
type utilizationThresholdGetter interface {
type nodeGroupConfigGetter interface {
// GetScaleDownUtilizationThreshold returns ScaleDownUtilizationThreshold value that should be used for a given NodeGroup.
GetScaleDownUtilizationThreshold(context *context.AutoscalingContext, nodeGroup cloudprovider.NodeGroup) (float64, error)
// GetScaleDownGpuUtilizationThreshold returns ScaleDownGpuUtilizationThreshold value that should be used for a given NodeGroup.
GetScaleDownGpuUtilizationThreshold(context *context.AutoscalingContext, nodeGroup cloudprovider.NodeGroup) (float64, error)
// GetIgnoreDaemonSetsUtilization returns IgnoreDaemonSetsUtilization value that should be used for a given NodeGroup.
GetIgnoreDaemonSetsUtilization(context *context.AutoscalingContext, nodeGroup cloudprovider.NodeGroup) (bool, error)
}
// NewChecker creates a new Checker object.
func NewChecker(thresholdGetter utilizationThresholdGetter) *Checker {
func NewChecker(configGetter nodeGroupConfigGetter) *Checker {
return &Checker{
thresholdGetter: thresholdGetter,
configGetter: configGetter,
}
}
@@ -118,12 +120,6 @@ func (c *Checker) unremovableReasonAndNodeUtilization(context *context.Autoscali
return simulator.ScaleDownDisabledAnnotation, nil
}
gpuConfig := context.CloudProvider.GetNodeGpuConfig(node)
utilInfo, err := utilization.Calculate(nodeInfo, context.IgnoreDaemonSetsUtilization, context.IgnoreMirrorPodsUtilization, gpuConfig, timestamp)
if err != nil {
klog.Warningf("Failed to calculate utilization for %s: %v", node.Name, err)
}
nodeGroup, err := context.CloudProvider.NodeGroupForNode(node)
if err != nil {
klog.Warningf("Node group not found for node %v: %v", node.Name, err)
@@ -136,6 +132,18 @@
return simulator.NotAutoscaled, nil
}
ignoreDaemonSetsUtilization, err := c.configGetter.GetIgnoreDaemonSetsUtilization(context, nodeGroup)
if err != nil {
klog.Warningf("Couldn't retrieve `IgnoreDaemonSetsUtilization` option for node %v: %v", node.Name, err)
return simulator.UnexpectedError, nil
}
gpuConfig := context.CloudProvider.GetNodeGpuConfig(node)
utilInfo, err := utilization.Calculate(nodeInfo, ignoreDaemonSetsUtilization, context.IgnoreMirrorPodsUtilization, gpuConfig, timestamp)
if err != nil {
klog.Warningf("Failed to calculate utilization for %s: %v", node.Name, err)
}
// If scale down of unready nodes is disabled, skip the node if it is unready
if !context.ScaleDownUnreadyEnabled {
ready, _, _ := kube_util.GetReadinessState(node)
@@ -166,12 +174,12 @@ func (c *Checker) isNodeBelowUtilizationThreshold(context *context.AutoscalingCo
var err error
gpuConfig := context.CloudProvider.GetNodeGpuConfig(node)
if gpuConfig != nil {
threshold, err = c.thresholdGetter.GetScaleDownGpuUtilizationThreshold(context, nodeGroup)
threshold, err = c.configGetter.GetScaleDownGpuUtilizationThreshold(context, nodeGroup)
if err != nil {
return false, err
}
} else {
threshold, err = c.thresholdGetter.GetScaleDownUtilizationThreshold(context, nodeGroup)
threshold, err = c.configGetter.GetScaleDownUtilizationThreshold(context, nodeGroup)
if err != nil {
return false, err
}

View File

@@ -21,12 +21,11 @@ import (
"testing"
"time"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
testprovider "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/test"
"k8s.io/autoscaler/cluster-autoscaler/config"
"k8s.io/autoscaler/cluster-autoscaler/context"
"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/unremovable"
. "k8s.io/autoscaler/cluster-autoscaler/core/test"
"k8s.io/autoscaler/cluster-autoscaler/processors/nodegroupconfig"
"k8s.io/autoscaler/cluster-autoscaler/simulator/clustersnapshot"
"k8s.io/autoscaler/cluster-autoscaler/utils/taints"
. "k8s.io/autoscaler/cluster-autoscaler/utils/test"
@@ -36,6 +35,15 @@
"k8s.io/client-go/kubernetes/fake"
)
type testCase struct {
desc string
nodes []*apiv1.Node
pods []*apiv1.Pod
want []string
scaleDownUnready bool
ignoreDaemonSetsUtilization bool
}
func TestFilterOutUnremovable(t *testing.T) {
now := time.Now()
@@ -59,13 +67,10 @@ func TestFilterOutUnremovable(t *testing.T) {
smallPod := BuildTestPod("smallPod", 100, 0)
smallPod.Spec.NodeName = "regular"
testCases := []struct {
desc string
nodes []*apiv1.Node
pods []*apiv1.Pod
want []string
scaleDownUnready bool
}{
dsPod := BuildDSTestPod("dsPod", 500, 0)
dsPod.Spec.NodeName = "regular"
testCases := []testCase{
{
desc: "regular node stays",
nodes: []*apiv1.Node{regularNode},
@@ -111,14 +116,53 @@
scaleDownUnready: false,
},
}
allTestCases := testCases
// run all test cases again with `IgnoreDaemonSetsUtilization` set to true
for _, tc := range testCases {
tc := tc // shallow copy; avoids shadowing the *testing.T parameter with a testCase named t
tc.ignoreDaemonSetsUtilization = true
allTestCases = append(allTestCases, tc)
}
ignoreDsCases := []testCase{
{
desc: "high utilization daemonsets node is filtered out",
nodes: []*apiv1.Node{regularNode},
pods: []*apiv1.Pod{smallPod, dsPod},
want: []string{},
scaleDownUnready: true,
ignoreDaemonSetsUtilization: false,
},
{
desc: "high utilization daemonsets node stays",
nodes: []*apiv1.Node{regularNode},
pods: []*apiv1.Pod{smallPod, dsPod},
want: []string{"regular"},
scaleDownUnready: true,
ignoreDaemonSetsUtilization: true,
},
}
allTestCases = append(allTestCases, ignoreDsCases...)
for _, tc := range allTestCases {
tc := tc
t.Run(tc.desc, func(t *testing.T) {
t.Parallel()
c := NewChecker(&staticThresholdGetter{0.5})
s := nodegroupconfig.DelegatingNodeGroupConfigProcessor{}
c := NewChecker(&s)
options := config.AutoscalingOptions{
UnremovableNodeRecheckTimeout: 5 * time.Minute,
ScaleDownUnreadyEnabled: tc.scaleDownUnready,
NodeGroupDefaults: config.NodeGroupAutoscalingOptions{
ScaleDownUtilizationThreshold: config.DefaultScaleDownUtilizationThreshold,
ScaleDownGpuUtilizationThreshold: config.DefaultScaleDownGpuUtilizationThreshold,
ScaleDownUnneededTime: config.DefaultScaleDownUnneededTime,
ScaleDownUnreadyTime: config.DefaultScaleDownUnreadyTime,
IgnoreDaemonSetsUtilization: tc.ignoreDaemonSetsUtilization,
},
}
provider := testprovider.NewTestCloudProvider(nil, nil)
provider.AddNodeGroup("ng1", 1, 10, 2)
@@ -136,15 +180,3 @@
})
}
}
type staticThresholdGetter struct {
threshold float64
}
func (s *staticThresholdGetter) GetScaleDownUtilizationThreshold(_ *context.AutoscalingContext, _ cloudprovider.NodeGroup) (float64, error) {
return s.threshold, nil
}
func (s *staticThresholdGetter) GetScaleDownGpuUtilizationThreshold(_ *context.AutoscalingContext, _ cloudprovider.NodeGroup) (float64, error) {
return s.threshold, nil
}

View File

@@ -1303,7 +1303,8 @@ func newWrapperForTesting(ctx *context.AutoscalingContext, clusterStateRegistry
MinReplicaCount: 0,
SkipNodesWithCustomControllerPods: true,
}
sd := NewScaleDown(ctx, NewTestProcessors(ctx), ndt, deleteOptions)
actuator := actuation.NewActuator(ctx, clusterStateRegistry, ndt, deleteOptions)
processors := NewTestProcessors(ctx)
sd := NewScaleDown(ctx, processors, ndt, deleteOptions)
actuator := actuation.NewActuator(ctx, clusterStateRegistry, ndt, deleteOptions, processors)
return NewScaleDownWrapper(sd, actuator)
}

View File

@@ -172,7 +172,7 @@ func NewStaticAutoscaler(
// during the struct creation rather than here.
ndt := deletiontracker.NewNodeDeletionTracker(0 * time.Second)
scaleDown := legacy.NewScaleDown(autoscalingContext, processors, ndt, deleteOptions)
actuator := actuation.NewActuator(autoscalingContext, clusterStateRegistry, ndt, deleteOptions)
actuator := actuation.NewActuator(autoscalingContext, clusterStateRegistry, ndt, deleteOptions, processors)
autoscalingContext.ScaleDownActuator = actuator
var scaleDownPlanner scaledown.Planner

View File

@@ -159,7 +159,7 @@ func (m *onNodeGroupDeleteMock) Delete(id string) error {
func setUpScaleDownActuator(ctx *context.AutoscalingContext, options config.AutoscalingOptions) {
deleteOptions := simulator.NewNodeDeleteOptions(options)
ctx.ScaleDownActuator = actuation.NewActuator(ctx, nil, deletiontracker.NewNodeDeletionTracker(0*time.Second), deleteOptions)
ctx.ScaleDownActuator = actuation.NewActuator(ctx, nil, deletiontracker.NewNodeDeletionTracker(0*time.Second), deleteOptions, NewTestProcessors(ctx))
}
func TestStaticAutoscalerRunOnce(t *testing.T) {
@@ -1433,7 +1433,7 @@ func TestStaticAutoscalerUpcomingScaleDownCandidates(t *testing.T) {
csr.RegisterProviders(clusterstate.NewMockMaxNodeProvisionTimeProvider(15 * time.Minute))
// Setting the Actuator is necessary for testing any scale-down logic, it shouldn't have anything to do in this test.
actuator := actuation.NewActuator(&ctx, csr, deletiontracker.NewNodeDeletionTracker(0*time.Second), simulator.NodeDeleteOptions{})
actuator := actuation.NewActuator(&ctx, csr, deletiontracker.NewNodeDeletionTracker(0*time.Second), simulator.NodeDeleteOptions{}, NewTestProcessors(&ctx))
ctx.ScaleDownActuator = actuator
// Fake planner that keeps track of the scale-down candidates passed to UpdateClusterState.
@@ -1761,7 +1761,7 @@ func newScaleDownPlannerAndActuator(t *testing.T, ctx *context.AutoscalingContex
}
ndt := deletiontracker.NewNodeDeletionTracker(0 * time.Second)
sd := legacy.NewScaleDown(ctx, p, ndt, deleteOptions)
actuator := actuation.NewActuator(ctx, cs, ndt, deleteOptions)
actuator := actuation.NewActuator(ctx, cs, ndt, deleteOptions, p)
wrapper := legacy.NewScaleDownWrapper(sd, actuator)
return wrapper, wrapper
}

View File

@@ -107,13 +107,13 @@ var (
"How long after node deletion that scale down evaluation resumes, defaults to scanInterval")
scaleDownDelayAfterFailure = flag.Duration("scale-down-delay-after-failure", 3*time.Minute,
"How long after scale down failure that scale down evaluation resumes")
scaleDownUnneededTime = flag.Duration("scale-down-unneeded-time", 10*time.Minute,
scaleDownUnneededTime = flag.Duration("scale-down-unneeded-time", config.DefaultScaleDownUnneededTime,
"How long a node should be unneeded before it is eligible for scale down")
scaleDownUnreadyTime = flag.Duration("scale-down-unready-time", 20*time.Minute,
scaleDownUnreadyTime = flag.Duration("scale-down-unready-time", config.DefaultScaleDownUnreadyTime,
"How long an unready node should be unneeded before it is eligible for scale down")
scaleDownUtilizationThreshold = flag.Float64("scale-down-utilization-threshold", 0.5,
scaleDownUtilizationThreshold = flag.Float64("scale-down-utilization-threshold", config.DefaultScaleDownUtilizationThreshold,
"Sum of cpu or memory of all pods running on the node divided by node's corresponding allocatable resource, below which a node can be considered for scale down")
scaleDownGpuUtilizationThreshold = flag.Float64("scale-down-gpu-utilization-threshold", 0.5,
scaleDownGpuUtilizationThreshold = flag.Float64("scale-down-gpu-utilization-threshold", config.DefaultScaleDownGpuUtilizationThreshold,
"Sum of gpu requests of all pods running on the node divided by node's allocatable resource, below which a node can be considered for scale down."+
"Utilization calculation only cares about gpu resource for accelerator node. cpu and memory utilization will be ignored.")
scaleDownNonEmptyCandidatesCount = flag.Int("scale-down-non-empty-candidates-count", 30,
@@ -258,7 +258,9 @@ func createAutoscalingOptions() config.AutoscalingOptions {
ScaleDownGpuUtilizationThreshold: *scaleDownGpuUtilizationThreshold,
ScaleDownUnneededTime: *scaleDownUnneededTime,
ScaleDownUnreadyTime: *scaleDownUnreadyTime,
MaxNodeProvisionTime: *maxNodeProvisionTime,
// per-nodegroup setting, defaulted to the global --ignore-daemonsets-utilization flag value
IgnoreDaemonSetsUtilization: *ignoreDaemonSetsUtilization,
MaxNodeProvisionTime: *maxNodeProvisionTime,
},
CloudConfig: *cloudConfig,
CloudProviderName: *cloudProviderFlag,
@@ -271,7 +273,6 @@ func createAutoscalingOptions() config.AutoscalingOptions {
ExpanderNames: *expanderFlag,
GRPCExpanderCert: *grpcExpanderCert,
GRPCExpanderURL: *grpcExpanderURL,
IgnoreDaemonSetsUtilization: *ignoreDaemonSetsUtilization,
IgnoreMirrorPodsUtilization: *ignoreMirrorPodsUtilization,
MaxBulkSoftTaintCount: *maxBulkSoftTaintCount,
MaxBulkSoftTaintTime: *maxBulkSoftTaintTime,

View File

@@ -35,6 +35,8 @@ type NodeGroupConfigProcessor interface {
GetScaleDownGpuUtilizationThreshold(context *context.AutoscalingContext, nodeGroup cloudprovider.NodeGroup) (float64, error)
// GetMaxNodeProvisionTime return MaxNodeProvisionTime value that should be used for a given NodeGroup.
GetMaxNodeProvisionTime(context *context.AutoscalingContext, nodeGroup cloudprovider.NodeGroup) (time.Duration, error)
// GetIgnoreDaemonSetsUtilization returns IgnoreDaemonSetsUtilization value that should be used for a given NodeGroup.
GetIgnoreDaemonSetsUtilization(context *context.AutoscalingContext, nodeGroup cloudprovider.NodeGroup) (bool, error)
// CleanUp cleans up processor's internal structures.
CleanUp()
}
@@ -105,6 +107,18 @@ func (p *DelegatingNodeGroupConfigProcessor) GetMaxNodeProvisionTime(context *co
return ngConfig.MaxNodeProvisionTime, nil
}
// GetIgnoreDaemonSetsUtilization returns IgnoreDaemonSetsUtilization value that should be used for a given NodeGroup.
func (p *DelegatingNodeGroupConfigProcessor) GetIgnoreDaemonSetsUtilization(context *context.AutoscalingContext, nodeGroup cloudprovider.NodeGroup) (bool, error) {
ngConfig, err := nodeGroup.GetOptions(context.NodeGroupDefaults)
if err != nil && err != cloudprovider.ErrNotImplemented {
return false, err
}
if ngConfig == nil || err == cloudprovider.ErrNotImplemented {
return context.NodeGroupDefaults.IgnoreDaemonSetsUtilization, nil
}
return ngConfig.IgnoreDaemonSetsUtilization, nil
}
// CleanUp cleans up processor's internal structures.
func (p *DelegatingNodeGroupConfigProcessor) CleanUp() {
}
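The delegation above resolves in three steps: a real error propagates, a provider that returns `nil` options or `ErrNotImplemented` falls back to the global `NodeGroupDefaults`, and otherwise the per-nodegroup value wins. A self-contained sketch of that resolution order (types simplified; `errNotImplemented` stands in for `cloudprovider.ErrNotImplemented`):

```go
package main

import (
	"errors"
	"fmt"
)

var errNotImplemented = errors.New("not implemented")

type ngOptions struct{ IgnoreDaemonSetsUtilization bool }

// resolveIgnoreDS mirrors GetIgnoreDaemonSetsUtilization with plain values.
func resolveIgnoreDS(getOptions func() (*ngOptions, error), globalDefault bool) (bool, error) {
	opts, err := getOptions()
	if err != nil && err != errNotImplemented {
		return false, err // real error: propagate
	}
	if opts == nil || err == errNotImplemented {
		return globalDefault, nil // no per-group opinion: fall back
	}
	return opts.IgnoreDaemonSetsUtilization, nil // per-group override wins
}

func main() {
	// Provider without per-group options: the global default (true) wins.
	v, _ := resolveIgnoreDS(func() (*ngOptions, error) { return nil, errNotImplemented }, true)
	fmt.Println(v) // true

	// Provider with per-group options: the override (false) wins.
	v, _ = resolveIgnoreDS(func() (*ngOptions, error) {
		return &ngOptions{IgnoreDaemonSetsUtilization: false}, nil
	}, true)
	fmt.Println(v) // false
}
```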

View File

@@ -49,6 +49,7 @@ func TestDelegatingNodeGroupConfigProcessor(t *testing.T) {
ScaleDownGpuUtilizationThreshold: 0.6,
ScaleDownUtilizationThreshold: 0.5,
MaxNodeProvisionTime: 15 * time.Minute,
IgnoreDaemonSetsUtilization: true,
}
ngOpts := &config.NodeGroupAutoscalingOptions{
ScaleDownUnneededTime: 10 * time.Minute,
@@ -56,6 +57,7 @@
ScaleDownGpuUtilizationThreshold: 0.85,
ScaleDownUtilizationThreshold: 0.75,
MaxNodeProvisionTime: 60 * time.Minute,
IgnoreDaemonSetsUtilization: false,
}
testUnneededTime := func(t *testing.T, p DelegatingNodeGroupConfigProcessor, c *context.AutoscalingContext, ng cloudprovider.NodeGroup, w Want, we error) {
@@ -109,18 +111,32 @@
assert.Equal(t, res, results[w])
}
// for IgnoreDaemonSetsUtilization
testIgnoreDSUtilization := func(t *testing.T, p DelegatingNodeGroupConfigProcessor, c *context.AutoscalingContext, ng cloudprovider.NodeGroup, w Want, we error) {
res, err := p.GetIgnoreDaemonSetsUtilization(c, ng)
assert.Equal(t, err, we)
results := map[Want]bool{
NIL: false,
GLOBAL: true,
NG: false,
}
assert.Equal(t, res, results[w])
}
funcs := map[string]func(*testing.T, DelegatingNodeGroupConfigProcessor, *context.AutoscalingContext, cloudprovider.NodeGroup, Want, error){
"ScaleDownUnneededTime": testUnneededTime,
"ScaleDownUnreadyTime": testUnreadyTime,
"ScaleDownUtilizationThreshold": testUtilizationThreshold,
"ScaleDownGpuUtilizationThreshold": testGpuThreshold,
"MaxNodeProvisionTime": testMaxNodeProvisionTime,
"IgnoreDaemonSetsUtilization": testIgnoreDSUtilization,
"MultipleOptions": func(t *testing.T, p DelegatingNodeGroupConfigProcessor, c *context.AutoscalingContext, ng cloudprovider.NodeGroup, w Want, we error) {
testUnneededTime(t, p, c, ng, w, we)
testUnreadyTime(t, p, c, ng, w, we)
testUtilizationThreshold(t, p, c, ng, w, we)
testGpuThreshold(t, p, c, ng, w, we)
testMaxNodeProvisionTime(t, p, c, ng, w, we)
testIgnoreDSUtilization(t, p, c, ng, w, we)
},
"RepeatingTheSameCallGivesConsistentResults": func(t *testing.T, p DelegatingNodeGroupConfigProcessor, c *context.AutoscalingContext, ng cloudprovider.NodeGroup, w Want, we error) {
testUnneededTime(t, p, c, ng, w, we)
@@ -128,6 +144,9 @@
// throw in a different call
testGpuThreshold(t, p, c, ng, w, we)
testUnneededTime(t, p, c, ng, w, we)
// throw in another different call
testIgnoreDSUtilization(t, p, c, ng, w, we)
testUnneededTime(t, p, c, ng, w, we)
},
}

View File

@@ -67,6 +67,15 @@ func BuildTestPod(name string, cpu int64, mem int64) *apiv1.Pod {
return pod
}
// BuildDSTestPod creates a DaemonSet pod with cpu and memory.
func BuildDSTestPod(name string, cpu int64, mem int64) *apiv1.Pod {
pod := BuildTestPod(name, cpu, mem)
pod.OwnerReferences = GenerateOwnerReferences("ds", "DaemonSet", "apps/v1", "some-uid")
return pod
}
// BuildTestPodWithEphemeralStorage creates a pod with cpu, memory and ephemeral storage resources.
func BuildTestPodWithEphemeralStorage(name string, cpu, mem, ephemeralStorage int64) *apiv1.Pod {
startTime := metav1.Unix(0, 0)
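
The commit messages above also mention a `generateDsPods` helper built on top of the `BuildDSTestPod` helper shown in this file. A possible shape for it (hypothetical signature; the helper in the actuator tests may differ, and this assumes the surrounding test package's `fmt` and `apiv1` imports):

```go
// generateDsPods builds count DaemonSet-owned pods, all pinned to the
// same node, for utilization tests.
func generateDsPods(count int, nodeName string, cpu, mem int64) []*apiv1.Pod {
	pods := make([]*apiv1.Pod, 0, count)
	for i := 0; i < count; i++ {
		pod := BuildDSTestPod(fmt.Sprintf("ds-pod-%d", i), cpu, mem)
		pod.Spec.NodeName = nodeName
		pods = append(pods, pod)
	}
	return pods
}
```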