make DecreaseTargetSize more accurate for clusterapi

this change ensures that when DecreaseTargetSize counts nodes, it does not
include any instances that are considered pending (i.e. not having a node
ref), deleting, or failed. this allows the core autoscaler to decrease the
size of the node group accordingly, instead of raising an error.

This change also adds some code to the unit tests to make detection of
this condition easier.
elmiko 2025-03-05 12:43:04 -05:00
parent f04fd5b231
commit 003e6cd67c
4 changed files with 283 additions and 37 deletions
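
As a rough sketch of the counting idea described in the commit message (a hypothetical standalone helper, not part of the diff below; it assumes the prefix predicates introduced in this commit), the filter simply skips instances whose normalized provider IDs carry one of the pending, failed, or deleting prefixes:

// countActualNodes is a hypothetical helper mirroring the filtering added
// to DecreaseTargetSize: instances whose normalized provider IDs carry the
// pending, failed, or deleting prefixes are not counted as registered nodes.
func countActualNodes(ids []normalizedProviderID) int {
	actual := 0
	for _, id := range ids {
		if isPendingMachineProviderID(id) ||
			isFailedMachineProviderID(id) ||
			isDeletingMachineProviderID(id) {
			continue
		}
		actual++
	}
	return actual
}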


@@ -55,6 +55,7 @@ const (
resourceNameMachineSet = "machinesets"
resourceNameMachineDeployment = "machinedeployments"
resourceNameMachinePool = "machinepools"
deletingMachinePrefix = "deleting-machine-"
failedMachinePrefix = "failed-machine-"
pendingMachinePrefix = "pending-machine-"
machineTemplateKind = "MachineTemplate"
@@ -314,6 +315,9 @@ func (c *machineController) findMachineByProviderID(providerID normalizedProvide
return u.DeepCopy(), nil
}
if isDeletingMachineProviderID(providerID) {
return c.findMachine(machineKeyFromDeletingMachineProviderID(providerID))
}
if isFailedMachineProviderID(providerID) {
return c.findMachine(machineKeyFromFailedProviderID(providerID))
}
@@ -339,6 +343,19 @@ func (c *machineController) findMachineByProviderID(providerID normalizedProvide
return c.findMachine(path.Join(ns, machineID))
}
func createDeletingMachineNormalizedProviderID(namespace, name string) string {
return fmt.Sprintf("%s%s_%s", deletingMachinePrefix, namespace, name)
}
func isDeletingMachineProviderID(providerID normalizedProviderID) bool {
return strings.HasPrefix(string(providerID), deletingMachinePrefix)
}
func machineKeyFromDeletingMachineProviderID(providerID normalizedProviderID) string {
namespaceName := strings.TrimPrefix(string(providerID), deletingMachinePrefix)
return strings.Replace(namespaceName, "_", "/", 1)
}
func isPendingMachineProviderID(providerID normalizedProviderID) bool {
return strings.HasPrefix(string(providerID), pendingMachinePrefix)
}
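Taken together, these helpers are intended to round-trip a namespace/name pair through the prefixed provider ID. A hedged usage sketch with illustrative values (it assumes normalizedProviderID is a string-based type, as the conversions above suggest; the unit tests further down cover the same round trip):

// exampleDeletingMachineRoundTrip is a hypothetical snippet: build the guarded
// provider ID for a machine that is being deleted, detect the guard, and
// recover the "namespace/name" key that findMachine expects.
func exampleDeletingMachineRoundTrip() string {
	id := normalizedProviderID(createDeletingMachineNormalizedProviderID("cluster-api", "id-0001"))
	if isDeletingMachineProviderID(id) {
		return machineKeyFromDeletingMachineProviderID(id) // "cluster-api/id-0001"
	}
	return string(id)
}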
@@ -610,6 +627,12 @@ func (c *machineController) findScalableResourceProviderIDs(scalableResource *un
if found {
if providerID != "" {
// If the machine is deleting, prepend the deletion guard on the provider id
// so that it can be properly filtered when counting the number of nodes and instances.
if !machine.GetDeletionTimestamp().IsZero() {
klog.V(4).Infof("Machine %q has a non-zero deletion timestamp", machine.GetName())
providerID = createDeletingMachineNormalizedProviderID(machine.GetNamespace(), machine.GetName())
}
providerIDs = append(providerIDs, providerID)
continue
}


@@ -2131,3 +2131,116 @@ func Test_machineController_nodeGroups(t *testing.T) {
})
}
}
func Test_isDeletingMachineProviderID(t *testing.T) {
type testCase struct {
description string
providerID string
expectedReturn bool
}
testCases := []testCase{
{
description: "proper provider ID without deletion prefix should return false",
providerID: "fake-provider://a.provider.id-0001",
expectedReturn: false,
},
{
description: "provider ID with deletion prefix should return true",
providerID: fmt.Sprintf("%s%s_%s", deletingMachinePrefix, "cluster-api", "id-0001"),
expectedReturn: true,
},
{
description: "empty provider ID should return false",
providerID: "",
expectedReturn: false,
},
{
description: "provider ID created with createDeletingMachineNormalizedProviderID returns true",
providerID: createDeletingMachineNormalizedProviderID("cluster-api", "id-0001"),
expectedReturn: true,
},
}
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
observed := isDeletingMachineProviderID(normalizedProviderID(tc.providerID))
if observed != tc.expectedReturn {
t.Fatalf("unexpected return for provider ID %q, expected %t, observed %t", tc.providerID, tc.expectedReturn, observed)
}
})
}
}
func Test_machineKeyFromDeletingMachineProviderID(t *testing.T) {
type testCase struct {
description string
providerID string
expectedReturn string
}
testCases := []testCase{
{
description: "real looking provider ID with no deletion prefix returns provider id",
providerID: "fake-provider://a.provider.id-0001",
expectedReturn: "fake-provider://a.provider.id-0001",
},
{
description: "namespace_name provider ID with no deletion prefix returns proper provider ID",
providerID: "cluster-api_id-0001",
expectedReturn: "cluster-api/id-0001",
},
{
description: "namespace_name provider ID with deletion prefix returns proper provider ID",
providerID: fmt.Sprintf("%s%s_%s", deletingMachinePrefix, "cluster-api", "id-0001"),
expectedReturn: "cluster-api/id-0001",
},
{
description: "namespace_name provider ID with deletion prefix and two underscores returns proper provider ID",
providerID: fmt.Sprintf("%s%s_%s", deletingMachinePrefix, "cluster-api", "id_0001"),
expectedReturn: "cluster-api/id_0001",
},
{
description: "provider ID created with createDeletingMachineNormalizedProviderID returns proper provider ID",
providerID: createDeletingMachineNormalizedProviderID("cluster-api", "id-0001"),
expectedReturn: "cluster-api/id-0001",
},
}
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
observed := machineKeyFromDeletingMachineProviderID(normalizedProviderID(tc.providerID))
if observed != tc.expectedReturn {
t.Fatalf("unexpected return for provider ID %q, expected %q, observed %q", tc.providerID, tc.expectedReturn, observed)
}
})
}
}
func Test_createDeletingMachineNormalizedProviderID(t *testing.T) {
type testCase struct {
description string
namespace string
name string
expectedReturn string
}
testCases := []testCase{
{
description: "namespace and name return proper normalized ID",
namespace: "cluster-api",
name: "id-0001",
expectedReturn: fmt.Sprintf("%s%s_%s", deletingMachinePrefix, "cluster-api", "id-0001"),
},
}
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
observed := createDeletingMachineNormalizedProviderID(tc.namespace, tc.name)
if observed != tc.expectedReturn {
t.Fatalf("unexpected return for (namespace %q, name %q), expected %q, observed %q", tc.namespace, tc.name, tc.expectedReturn, observed)
}
})
}
}


@@ -203,11 +203,28 @@ func (ng *nodegroup) DecreaseTargetSize(delta int) error {
return err
}
if size+delta < len(nodes) {
return fmt.Errorf("attempt to delete existing nodes targetSize:%d delta:%d existingNodes: %d",
size, delta, len(nodes))
// we want to filter out machines that are not nodes (e.g. failed or pending)
// so that the node group size can be set properly. this affects situations
// where an instance is created, but cannot become a node due to cloud provider
// issues such as quota limitations, and thus the autoscaler needs to properly
// set the size of the node group. without this adjustment, the core autoscaler
// will become confused about the state of nodes and instances in the clusterapi
// provider.
actualNodes := 0
for _, node := range nodes {
if !isPendingMachineProviderID(normalizedProviderID(node.Id)) &&
!isFailedMachineProviderID(normalizedProviderID(node.Id)) &&
!isDeletingMachineProviderID(normalizedProviderID(node.Id)) {
actualNodes += 1
}
}
if size+delta < actualNodes {
return fmt.Errorf("node group %s: attempt to delete existing nodes currentReplicas:%d delta:%d existingNodes: %d",
ng.scalableResource.Name(), size, delta, actualNodes)
}
klog.V(4).Infof("%s: DecreaseTargetSize: scaling down: currentReplicas: %d, delta: %d, existingNodes: %d", ng.scalableResource.Name(), size, delta, len(nodes))
return ng.scalableResource.SetSize(size + delta)
}
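To make the adjusted guard concrete, here is a hedged walk-through with illustrative numbers (not taken from the tests; it reuses the package helpers shown earlier): a node group reporting targetSize 4 that tracks three registered nodes plus one pending instance.

// exampleDecreaseTargetSizeGuard is a hypothetical walk-through of the new check.
func exampleDecreaseTargetSizeGuard() bool {
	size, delta := 4, -1
	ids := []normalizedProviderID{
		"fake-provider://node-0",
		"fake-provider://node-1",
		"fake-provider://node-2",
		normalizedProviderID(pendingMachinePrefix + "cluster-api_id-0003"),
	}
	actualNodes := 0
	for _, id := range ids {
		if !isPendingMachineProviderID(id) && !isFailedMachineProviderID(id) && !isDeletingMachineProviderID(id) {
			actualNodes++
		}
	}
	// old guard: size+delta < len(ids)     -> 3 < 4 -> error
	// new guard: size+delta < actualNodes  -> 3 < 3 -> no error, SetSize(3) proceeds
	return size+delta < actualNodes
}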


@@ -406,18 +406,73 @@ func TestNodeGroupIncreaseSize(t *testing.T) {
func TestNodeGroupDecreaseTargetSize(t *testing.T) {
type testCase struct {
description string
delta int
initial int32
targetSizeIncrement int32
expected int32
expectedError bool
description string
delta int
initial int32
targetSizeIncrement int32
expected int32
expectedError bool
includeDeletingMachine bool
includeFailedMachine bool
includePendingMachine bool
}
test := func(t *testing.T, tc *testCase, testConfig *testConfig) {
controller, stop := mustCreateTestController(t, testConfig)
defer stop()
// machines in deletion should not be counted towards the active nodes when calculating a decrease in size.
if tc.includeDeletingMachine {
if tc.initial < 1 {
t.Fatal("test cannot pass, deleted machine requires at least 1 machine in machineset")
}
// Simulate a machine in deleting
machine := testConfig.machines[0].DeepCopy()
timestamp := metav1.Now()
machine.SetDeletionTimestamp(&timestamp)
if err := updateResource(controller.managementClient, controller.machineInformer, controller.machineResource, machine); err != nil {
t.Fatalf("unexpected error updating machine, got %v", err)
}
}
// machines that have failed should not be counted towards the active nodes when calculating a decrease in size.
if tc.includeFailedMachine {
// because we want to allow for tests that have deleted machines and failed machines, we use the second machine in the test data.
if tc.initial < 2 {
t.Fatal("test cannot pass, failed machine requires at least 2 machine in machineset")
}
// Simulate a failed machine
machine := testConfig.machines[1].DeepCopy()
unstructured.RemoveNestedField(machine.Object, "spec", "providerID")
unstructured.SetNestedField(machine.Object, "FailureMessage", "status", "failureMessage")
if err := updateResource(controller.managementClient, controller.machineInformer, controller.machineResource, machine); err != nil {
t.Fatalf("unexpected error updating machine, got %v", err)
}
}
// machines that are in pending state should not be counted towards the active nodes when calculating a decrease in size.
if tc.includePendingMachine {
// because we want to allow for tests that have deleted, failed, and pending machines, we use the third machine in the test data.
if tc.initial < 3 {
t.Fatal("test cannot pass, pending machine requires at least 3 machine in machineset")
}
// Simulate a pending machine
machine := testConfig.machines[2].DeepCopy()
unstructured.RemoveNestedField(machine.Object, "spec", "providerID")
unstructured.RemoveNestedField(machine.Object, "status", "nodeRef")
if err := updateResource(controller.managementClient, controller.machineInformer, controller.machineResource, machine); err != nil {
t.Fatalf("unexpected error updating machine, got %v", err)
}
}
nodegroups, err := controller.nodeGroups()
if err != nil {
t.Fatalf("unexpected error: %v", err)
@@ -522,45 +577,83 @@ func TestNodeGroupDecreaseTargetSize(t *testing.T) {
}
}
annotations := map[string]string{
nodeGroupMinSizeAnnotationKey: "1",
nodeGroupMaxSizeAnnotationKey: "10",
}
t.Run("MachineSet", func(t *testing.T) {
tc := testCase{
testCases := []testCase{
{
description: "Same number of existing instances and node group target size should error",
initial: 3,
targetSizeIncrement: 0,
expected: 3,
delta: -1,
expectedError: true,
}
test(t, &tc, createMachineSetTestConfig(RandomString(6), RandomString(6), RandomString(6), int(tc.initial), annotations, nil))
})
t.Run("MachineSet", func(t *testing.T) {
tc := testCase{
},
{
description: "A node group with target size 4 but only 3 existing instances should decrease by 1",
initial: 3,
targetSizeIncrement: 1,
expected: 3,
delta: -1,
}
test(t, &tc, createMachineSetTestConfig(RandomString(6), RandomString(6), RandomString(6), int(tc.initial), annotations, nil))
})
},
{
description: "A node group with 4 replicas with one machine in deleting state should decrease by 1",
initial: 4,
targetSizeIncrement: 0,
expected: 3,
delta: -1,
includeDeletingMachine: true,
},
{
description: "A node group with 4 replicas with one failed machine should decrease by 1",
initial: 4,
targetSizeIncrement: 0,
expected: 3,
delta: -1,
includeFailedMachine: true,
},
{
description: "A node group with 4 replicas with one pending machine should decrease by 1",
initial: 4,
targetSizeIncrement: 0,
expected: 3,
delta: -1,
includePendingMachine: true,
},
{
description: "A node group with 5 replicas with one pending and one failed machine should decrease by 2",
initial: 5,
targetSizeIncrement: 0,
expected: 3,
delta: -2,
includeFailedMachine: true,
includePendingMachine: true,
},
{
description: "A node group with 5 replicas with one pending, one failed, and one deleting machine should decrease by 3",
initial: 5,
targetSizeIncrement: 0,
expected: 2,
delta: -3,
includeFailedMachine: true,
includePendingMachine: true,
includeDeletingMachine: true,
},
}
t.Run("MachineDeployment", func(t *testing.T) {
tc := testCase{
description: "Same number of existing instances and node group target size should error",
initial: 3,
targetSizeIncrement: 0,
expected: 3,
delta: -1,
expectedError: true,
}
test(t, &tc, createMachineDeploymentTestConfig(RandomString(6), RandomString(6), RandomString(6), int(tc.initial), annotations, nil))
})
annotations := map[string]string{
nodeGroupMinSizeAnnotationKey: "1",
nodeGroupMaxSizeAnnotationKey: "10",
}
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
test(t, &tc, createMachineSetTestConfig(RandomString(6), RandomString(6), RandomString(6), int(tc.initial), annotations, nil))
})
}
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
test(t, &tc, createMachineDeploymentTestConfig(RandomString(6), RandomString(6), RandomString(6), int(tc.initial), annotations, nil))
})
}
}
func TestNodeGroupDecreaseSizeErrors(t *testing.T) {
@@ -580,7 +673,7 @@ func TestNodeGroupDecreaseSizeErrors(t *testing.T) {
description: "errors because initial+delta < len(nodes)",
delta: -1,
initial: 3,
errorMsg: "attempt to delete existing nodes targetSize:3 delta:-1 existingNodes: 3",
errorMsg: "attempt to delete existing nodes currentReplicas:3 delta:-1 existingNodes: 3",
}}
test := func(t *testing.T, tc *testCase, testConfig *testConfig) {