Break node categorization in scale down planner on timeout.

Aleksandra Gacek 2022-12-02 09:38:17 +01:00
parent 170cf0f2aa
commit bae587d20c
4 changed files with 14 additions and 5 deletions

View File

@@ -119,6 +119,9 @@ type AutoscalingOptions struct {
 	// The formula to calculate additional candidates number is following:
 	// max(#nodes * ScaleDownCandidatesPoolRatio, ScaleDownCandidatesPoolMinCount)
 	ScaleDownCandidatesPoolMinCount int
+	// ScaleDownSimulationTimeout defines the maximum time that can be
+	// spent on scale down simulation.
+	ScaleDownSimulationTimeout time.Duration
 	// NodeDeletionDelayTimeout is maximum time CA waits for removing delay-deletion.cluster-autoscaler.kubernetes.io/ annotations before deleting the node.
 	NodeDeletionDelayTimeout time.Duration
 	// WriteStatusConfigMap tells if the status information should be written to a ConfigMap

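For reference, a minimal sketch of how a caller might populate the new field; the package, helper name, and 30-second value are illustrative only (the flag added further down defaults to 5 minutes) and are not taken from this commit.

package example

import (
	"time"

	"k8s.io/autoscaler/cluster-autoscaler/config"
)

// newOptions shows the new field being set; every other option a real caller
// would configure is omitted here.
func newOptions() config.AutoscalingOptions {
	return config.AutoscalingOptions{
		// Bound the time spent simulating node removals in one scale-down iteration.
		ScaleDownSimulationTimeout: 30 * time.Second,
	}
}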
View File

@@ -285,10 +285,14 @@ func (p *Planner) categorizeNodes(podDestinations map[string]bool, scaleDownCand
 		p.unremovableNodes.Add(n)
 	}
 	p.nodeUtilizationMap = utilizationMap
-	for _, node := range currentlyUnneededNodeNames {
-		// TODO(x13n): break on timeout. Figure out how to handle nodes
-		// identified as unneeded in previous iteration, but now
-		// skipped due to timeout.
+	timer := time.NewTimer(p.context.ScaleDownSimulationTimeout)
+	for i, node := range currentlyUnneededNodeNames {
+		select {
+		case <-timer.C:
+			klog.Warningf("%d out of %d nodes skipped in scale down simulation due to timeout.", len(currentlyUnneededNodeNames)-i, len(currentlyUnneededNodeNames))
+			break
+		default:
+		}
 		removable, unremovable := p.rs.SimulateNodeRemoval(node, podDestinations, p.latestUpdate, pdbs)
 		if unremovable != nil {
 			unremovableCount += 1

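A Go detail worth keeping in mind when reading the loop above: a bare break inside a select exits only the select statement, not the enclosing for loop. A self-contained sketch of a timer-bounded loop that stops early via a labeled break (the function, names, and durations are illustrative and not taken from this commit):

package main

import (
	"fmt"
	"time"
)

// simulateAll runs work for each name but gives up once timeout elapses,
// reporting how many names were skipped. The labeled break is what actually
// terminates the loop when the timer fires.
func simulateAll(names []string, timeout time.Duration, work func(string)) {
	timer := time.NewTimer(timeout)
	defer timer.Stop()
loop:
	for i, name := range names {
		select {
		case <-timer.C:
			fmt.Printf("%d out of %d nodes skipped due to timeout.\n", len(names)-i, len(names))
			break loop
		default:
		}
		work(name)
	}
}

func main() {
	nodes := []string{"node-1", "node-2", "node-3", "node-4"}
	simulateAll(nodes, 25*time.Millisecond, func(n string) {
		time.Sleep(10 * time.Millisecond) // stand-in for an expensive removal simulation
	})
}

An equivalent alternative is a small helper that does a non-blocking receive on timer.C and returns a bool, checked with a plain if at the top of each iteration.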
View File

@@ -395,7 +395,7 @@ func TestUpdateClusterState(t *testing.T) {
 			assert.NoError(t, err)
 			registry := kube_util.NewListerRegistry(nil, nil, nil, nil, nil, nil, nil, nil, rsLister, nil)
 			provider := testprovider.NewTestCloudProvider(nil, nil)
-			context, err := NewScaleTestAutoscalingContext(config.AutoscalingOptions{}, &fake.Clientset{}, registry, provider, nil, nil)
+			context, err := NewScaleTestAutoscalingContext(config.AutoscalingOptions{ScaleDownSimulationTimeout: 5 * time.Minute}, &fake.Clientset{}, registry, provider, nil, nil)
 			assert.NoError(t, err)
 			clustersnapshot.InitializeClusterSnapshotOrDie(t, context.ClusterSnapshot, tc.nodes, tc.pods)
 			deleteOptions := simulator.NodeDeleteOptions{}

View File

@@ -211,6 +211,7 @@ var (
 	skipNodesWithLocalStorage = flag.Bool("skip-nodes-with-local-storage", true, "If true cluster autoscaler will never delete nodes with pods with local storage, e.g. EmptyDir or HostPath")
 	minReplicaCount = flag.Int("min-replica-count", 0, "Minimum number or replicas that a replica set or replication controller should have to allow their pods deletion in scale down")
 	nodeDeleteDelayAfterTaint = flag.Duration("node-delete-delay-after-taint", 5*time.Second, "How long to wait before deleting a node after tainting it")
+	scaleDownSimulationTimeout = flag.Duration("scale-down-simulation-timeout", 5*time.Minute, "How long should we run scale down simulation.")
 )
 
 func createAutoscalingOptions() config.AutoscalingOptions {
@@ -307,6 +308,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
 		SkipNodesWithLocalStorage: *skipNodesWithLocalStorage,
 		MinReplicaCount: *minReplicaCount,
 		NodeDeleteDelayAfterTaint: *nodeDeleteDelayAfterTaint,
+		ScaleDownSimulationTimeout: *scaleDownSimulationTimeout,
 	}
 }
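With the flag wired through createAutoscalingOptions, the simulation budget can be tuned at startup; a hypothetical invocation with every other flag omitted:

cluster-autoscaler --scale-down-simulation-timeout=2m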