From 51f1660db717ba40ba128ea5994c4b58bd00c8de Mon Sep 17 00:00:00 2001
From: mendelski
Date: Fri, 5 May 2023 22:30:35 +0000
Subject: [PATCH] deduplicate scale-up test setup

---
 .../core/scaleup/orchestrator/executor.go     |  80 +-
 .../scaleup/orchestrator/executor_test.go     |  16 +-
 .../scaleup/orchestrator/orchestrator_test.go | 778 +++++++-----------
 cluster-autoscaler/core/test/common.go        | 110 ++-
 cluster-autoscaler/main.go                    |   2 +-
 5 files changed, 454 insertions(+), 532 deletions(-)

diff --git a/cluster-autoscaler/core/scaleup/orchestrator/executor.go b/cluster-autoscaler/core/scaleup/orchestrator/executor.go
index 7039c7dee5..55d0d66293 100644
--- a/cluster-autoscaler/core/scaleup/orchestrator/executor.go
+++ b/cluster-autoscaler/core/scaleup/orchestrator/executor.go
@@ -93,7 +93,7 @@ func (e *scaleUpExecutor) executeScaleUpsParallel(
 	nodeInfos map[string]*schedulerframework.NodeInfo,
 	now time.Time,
 ) (errors.AutoscalerError, []cloudprovider.NodeGroup) {
-	if err := assertUniqueNodeGroups(scaleUpInfos); err != nil {
+	if err := checkUniqueNodeGroups(scaleUpInfos); err != nil {
 		return err, extractNodeGroups(scaleUpInfos)
 	}
 	type errResult struct {
@@ -171,6 +171,15 @@ func combineConcurrentScaleUpErrors(errs []errors.AutoscalerError) errors.Autosc
 	if len(errs) == 1 {
 		return errs[0]
 	}
+	uniqueMessages := make(map[string]bool)
+	uniqueTypes := make(map[errors.AutoscalerErrorType]bool)
+	for _, err := range errs {
+		uniqueTypes[err.Type()] = true
+		uniqueMessages[err.Error()] = true
+	}
+	if len(uniqueTypes) == 1 && len(uniqueMessages) == 1 {
+		return errs[0]
+	}
 	// sort to stabilize the results and easier log aggregation
 	sort.Slice(errs, func(i, j int) bool {
 		errA := errs[i]
@@ -180,44 +189,43 @@ func combineConcurrentScaleUpErrors(errs []errors.AutoscalerError) errors.Autosc
 		}
 		return errA.Type() < errB.Type()
 	})
-	firstErrMsg := errs[0].Error()
-	firstErrType := errs[0].Type()
-	singleError := true
-	singleType := true
-	for _, err := range errs {
-		singleType = singleType && firstErrType == err.Type()
-		singleError = singleError && singleType && firstErrMsg == err.Error()
-	}
-	if singleError {
-		return errs[0]
-	}
-	var builder strings.Builder
-	builder.WriteString(firstErrMsg)
-	builder.WriteString(" ...other concurrent errors: [")
-	concurrentErrs := make(map[string]bool)
-	for _, err := range errs {
-		var concurrentErr string
-		if singleType {
-			concurrentErr = err.Error()
-		} else {
-			concurrentErr = fmt.Sprintf("[%s] %s", err.Type(), err.Error())
-		}
-		n := len(concurrentErrs)
-		if !concurrentErrs[concurrentErr] && n > 0 {
-			if n > 1 {
-				builder.WriteString(", ")
-			}
-			builder.WriteString(fmt.Sprintf("%q", concurrentErr))
-		}
-		concurrentErrs[concurrentErr] = true
-	}
-	builder.WriteString("]")
-	return errors.NewAutoscalerError(firstErrType, builder.String())
+	firstErr := errs[0]
+	printErrorTypes := len(uniqueTypes) > 1
+	message := formatMessageFromConcurrentErrors(errs, printErrorTypes)
+	return errors.NewAutoscalerError(firstErr.Type(), message)
 }
 
-// Asserts all groups are scaled only once.
+func formatMessageFromConcurrentErrors(errs []errors.AutoscalerError, printErrorTypes bool) string { + firstErr := errs[0] + var builder strings.Builder + builder.WriteString(firstErr.Error()) + builder.WriteString(" ...and other concurrent errors: [") + formattedErrs := map[errors.AutoscalerError]bool{ + firstErr: true, + } + for _, err := range errs { + if _, has := formattedErrs[err]; has { + continue + } + formattedErrs[err] = true + var message string + if printErrorTypes { + message = fmt.Sprintf("[%s] %s", err.Type(), err.Error()) + } else { + message = err.Error() + } + if len(formattedErrs) > 2 { + builder.WriteString(", ") + } + builder.WriteString(fmt.Sprintf("%q", message)) + } + builder.WriteString("]") + return builder.String() +} + +// Checks if all groups are scaled only once. // Scaling one group multiple times concurrently may cause problems. -func assertUniqueNodeGroups(scaleUpInfos []nodegroupset.ScaleUpInfo) errors.AutoscalerError { +func checkUniqueNodeGroups(scaleUpInfos []nodegroupset.ScaleUpInfo) errors.AutoscalerError { uniqueGroups := make(map[string]bool) for _, info := range scaleUpInfos { if uniqueGroups[info.Group.Id()] { diff --git a/cluster-autoscaler/core/scaleup/orchestrator/executor_test.go b/cluster-autoscaler/core/scaleup/orchestrator/executor_test.go index 304cf0eb62..a7ef5d60f5 100644 --- a/cluster-autoscaler/core/scaleup/orchestrator/executor_test.go +++ b/cluster-autoscaler/core/scaleup/orchestrator/executor_test.go @@ -19,9 +19,9 @@ package orchestrator import ( "testing" - "github.com/stretchr/testify/assert" - "k8s.io/autoscaler/cluster-autoscaler/utils/errors" + + "github.com/stretchr/testify/assert" ) func TestCombinedConcurrentScaleUpErrors(t *testing.T) { @@ -58,7 +58,7 @@ func TestCombinedConcurrentScaleUpErrors(t *testing.T) { }, expectedErr: errors.NewAutoscalerError( errors.CloudProviderError, - "provider error ...other concurrent errors: [\"[internalError] internal error\"]", + "provider error ...and other concurrent errors: [\"[internalError] internal error\"]", ), }, { @@ -69,7 +69,7 @@ func TestCombinedConcurrentScaleUpErrors(t *testing.T) { }, expectedErr: errors.NewAutoscalerError( errors.CloudProviderError, - "provider error ...other concurrent errors: [\"[internalError] internal error\"]", + "provider error ...and other concurrent errors: [\"[internalError] internal error\"]", ), }, { @@ -81,7 +81,7 @@ func TestCombinedConcurrentScaleUpErrors(t *testing.T) { }, expectedErr: errors.NewAutoscalerError( errors.InternalError, - "A ...other concurrent errors: [\"B\", \"C\"]"), + "A ...and other concurrent errors: [\"B\", \"C\"]"), }, { desc: "errors with the same type and some duplicated messages", @@ -92,7 +92,7 @@ func TestCombinedConcurrentScaleUpErrors(t *testing.T) { }, expectedErr: errors.NewAutoscalerError( errors.InternalError, - "A ...other concurrent errors: [\"B\"]"), + "A ...and other concurrent errors: [\"B\"]"), }, { desc: "some duplicated errors", @@ -104,7 +104,7 @@ func TestCombinedConcurrentScaleUpErrors(t *testing.T) { }, expectedErr: errors.NewAutoscalerError( errors.CloudProviderError, - "A ...other concurrent errors: [\"[cloudProviderError] B\", \"[internalError] A\"]"), + "A ...and other concurrent errors: [\"[cloudProviderError] B\", \"[internalError] A\"]"), }, { desc: "different errors with quotes in messages", @@ -114,7 +114,7 @@ func TestCombinedConcurrentScaleUpErrors(t *testing.T) { }, expectedErr: errors.NewAutoscalerError( errors.InternalError, - "\"first\" ...other concurrent errors: 
[\"\\\"second\\\"\"]"), + "\"first\" ...and other concurrent errors: [\"\\\"second\\\"\"]"), }, } diff --git a/cluster-autoscaler/core/scaleup/orchestrator/orchestrator_test.go b/cluster-autoscaler/core/scaleup/orchestrator/orchestrator_test.go index 5ae3aad23f..bc6e6d9eea 100644 --- a/cluster-autoscaler/core/scaleup/orchestrator/orchestrator_test.go +++ b/cluster-autoscaler/core/scaleup/orchestrator/orchestrator_test.go @@ -52,8 +52,6 @@ import ( schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework" "github.com/stretchr/testify/assert" - - "k8s.io/autoscaler/cluster-autoscaler/expander" ) var defaultOptions = config.AutoscalingOptions{ @@ -66,7 +64,7 @@ var defaultOptions = config.AutoscalingOptions{ // Scale up scenarios. func TestScaleUpOK(t *testing.T) { - config := &ScaleTestConfig{ + config := &ScaleUpTestConfig{ Nodes: []NodeConfig{ {Name: "n1", Cpu: 100, Memory: 100, Gpu: 0, Ready: true, Group: "ng1"}, {Name: "n2", Cpu: 1000, Memory: 1000, Gpu: 0, Ready: true, Group: "ng2"}, @@ -78,8 +76,7 @@ func TestScaleUpOK(t *testing.T) { ExtraPods: []PodConfig{ {Name: "p-new", Cpu: 500, Memory: 0, Gpu: 0, Node: "", ToleratesGpu: false}, }, - Options: defaultOptions, - ExpansionOptionToChoose: GroupSizeChange{GroupName: "ng2", SizeChange: 1}, + ExpansionOptionToChoose: &GroupSizeChange{GroupName: "ng2", SizeChange: 1}, } expectedResults := &ScaleTestResults{ FinalOption: GroupSizeChange{GroupName: "ng2", SizeChange: 1}, @@ -93,7 +90,7 @@ func TestScaleUpOK(t *testing.T) { // There are triggering, remaining & awaiting pods. func TestMixedScaleUp(t *testing.T) { - config := &ScaleTestConfig{ + config := &ScaleUpTestConfig{ Nodes: []NodeConfig{ {Name: "n1", Cpu: 100, Memory: 1000, Gpu: 0, Ready: true, Group: "ng1"}, {Name: "n2", Cpu: 1000, Memory: 100, Gpu: 0, Ready: true, Group: "ng2"}, @@ -110,8 +107,7 @@ func TestMixedScaleUp(t *testing.T) { // can only be scheduled on ng1 {Name: "awaiting", Cpu: 0, Memory: 200, Gpu: 0, Node: "", ToleratesGpu: false}, }, - Options: defaultOptions, - ExpansionOptionToChoose: GroupSizeChange{GroupName: "ng2", SizeChange: 1}, + ExpansionOptionToChoose: &GroupSizeChange{GroupName: "ng2", SizeChange: 1}, } expectedResults := &ScaleTestResults{ FinalOption: GroupSizeChange{GroupName: "ng2", SizeChange: 1}, @@ -128,7 +124,7 @@ func TestMixedScaleUp(t *testing.T) { func TestScaleUpMaxCoresLimitHit(t *testing.T) { options := defaultOptions options.MaxCoresTotal = 9 - config := &ScaleTestConfig{ + config := &ScaleUpTestConfig{ Nodes: []NodeConfig{ {Name: "n1", Cpu: 2000, Memory: 100, Gpu: 0, Ready: true, Group: "ng1"}, {Name: "n2", Cpu: 4000, Memory: 1000, Gpu: 0, Ready: true, Group: "ng2"}, @@ -141,8 +137,8 @@ func TestScaleUpMaxCoresLimitHit(t *testing.T) { {Name: "p-new-1", Cpu: 2000, Memory: 0, Gpu: 0, Node: "", ToleratesGpu: false}, {Name: "p-new-2", Cpu: 2000, Memory: 0, Gpu: 0, Node: "", ToleratesGpu: false}, }, - ExpansionOptionToChoose: GroupSizeChange{GroupName: "ng1", SizeChange: 2}, - Options: options, + ExpansionOptionToChoose: &GroupSizeChange{GroupName: "ng1", SizeChange: 2}, + Options: &options, } results := &ScaleTestResults{ FinalOption: GroupSizeChange{GroupName: "ng1", SizeChange: 1}, @@ -157,7 +153,7 @@ func TestScaleUpMaxCoresLimitHit(t *testing.T) { func TestScaleUpMaxCoresLimitHitWithNotAutoscaledGroup(t *testing.T) { options := defaultOptions options.MaxCoresTotal = 9 - config := &ScaleTestConfig{ + config := &ScaleUpTestConfig{ Nodes: []NodeConfig{ {Name: "n1", Cpu: 2000, Memory: 100, Gpu: 0, Ready: true, Group: "ng1"}, {Name: "n2", 
Cpu: 4000, Memory: 1000, Gpu: 0, Ready: true, Group: ""}, @@ -170,8 +166,8 @@ func TestScaleUpMaxCoresLimitHitWithNotAutoscaledGroup(t *testing.T) { {Name: "p-new-1", Cpu: 2000, Memory: 0, Gpu: 0, Node: "", ToleratesGpu: false}, {Name: "p-new-2", Cpu: 2000, Memory: 0, Gpu: 0, Node: "", ToleratesGpu: false}, }, - ExpansionOptionToChoose: GroupSizeChange{GroupName: "ng1", SizeChange: 2}, - Options: options, + ExpansionOptionToChoose: &GroupSizeChange{GroupName: "ng1", SizeChange: 2}, + Options: &options, } results := &ScaleTestResults{ FinalOption: GroupSizeChange{GroupName: "ng1", SizeChange: 1}, @@ -186,7 +182,7 @@ func TestScaleUpMaxCoresLimitHitWithNotAutoscaledGroup(t *testing.T) { func TestScaleUpMaxMemoryLimitHit(t *testing.T) { options := defaultOptions options.MaxMemoryTotal = 1300 * utils.MiB - config := &ScaleTestConfig{ + config := &ScaleUpTestConfig{ Nodes: []NodeConfig{ {Name: "n1", Cpu: 2000, Memory: 100 * utils.MiB, Gpu: 0, Ready: true, Group: "ng1"}, {Name: "n2", Cpu: 4000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: "ng2"}, @@ -200,8 +196,8 @@ func TestScaleUpMaxMemoryLimitHit(t *testing.T) { {Name: "p-new-2", Cpu: 2000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false}, {Name: "p-new-3", Cpu: 2000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false}, }, - ExpansionOptionToChoose: GroupSizeChange{GroupName: "ng1", SizeChange: 3}, - Options: options, + ExpansionOptionToChoose: &GroupSizeChange{GroupName: "ng1", SizeChange: 3}, + Options: &options, } results := &ScaleTestResults{ FinalOption: GroupSizeChange{GroupName: "ng1", SizeChange: 2}, @@ -216,7 +212,7 @@ func TestScaleUpMaxMemoryLimitHit(t *testing.T) { func TestScaleUpMaxMemoryLimitHitWithNotAutoscaledGroup(t *testing.T) { options := defaultOptions options.MaxMemoryTotal = 1300 * utils.MiB - config := &ScaleTestConfig{ + config := &ScaleUpTestConfig{ Nodes: []NodeConfig{ {Name: "n1", Cpu: 2000, Memory: 100 * utils.MiB, Gpu: 0, Ready: true, Group: "ng1"}, {Name: "n2", Cpu: 4000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: ""}, @@ -230,8 +226,8 @@ func TestScaleUpMaxMemoryLimitHitWithNotAutoscaledGroup(t *testing.T) { {Name: "p-new-2", Cpu: 2000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false}, {Name: "p-new-3", Cpu: 2000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false}, }, - ExpansionOptionToChoose: GroupSizeChange{GroupName: "ng1", SizeChange: 3}, - Options: options, + ExpansionOptionToChoose: &GroupSizeChange{GroupName: "ng1", SizeChange: 3}, + Options: &options, } results := &ScaleTestResults{ FinalOption: GroupSizeChange{GroupName: "ng1", SizeChange: 2}, @@ -247,11 +243,7 @@ func TestScaleUpTwoGroups(t *testing.T) { options := defaultOptions options.BalanceSimilarNodeGroups = true options.ParallelScaleUp = true - config := &GroupsScaleUpTestConfig{ - Groups: []NodeGroupConfig{ - {Name: "ng1", MaxSize: 2}, - {Name: "ng2", MaxSize: 2}, - }, + config := &ScaleUpTestConfig{ Nodes: []NodeConfig{ {Name: "ng1-n1", Cpu: 1500, Memory: 1000 * utils.MiB, Ready: true, Group: "ng1"}, {Name: "ng2-n1", Cpu: 1500, Memory: 1000 * utils.MiB, Ready: true, Group: "ng2"}, @@ -267,25 +259,29 @@ func TestScaleUpTwoGroups(t *testing.T) { Options: &options, } testCases := []struct { - desc string - parallelScaleUp bool + desc string + parallel bool }{ { - desc: "synchronous scale up", - parallelScaleUp: false, + desc: "synchronous scale up", + parallel: false, }, { - desc: "parallel scale up", - parallelScaleUp: true, + desc: "parallel scale up", + parallel: 
true, }, } for _, tc := range testCases { t.Run(tc.desc, func(t *testing.T) { - config.Options.ParallelScaleUp = tc.parallelScaleUp - result := scaleUpGroups(t, config) - assert.NoError(t, result.Error) - assert.Equal(t, 2, result.TargetSizes["ng1"]) - assert.Equal(t, 2, result.TargetSizes["ng2"]) + config.Options.ParallelScaleUp = tc.parallel + result := runSimpleScaleUpTest(t, config) + assert.True(t, result.ScaleUpStatus.WasSuccessful()) + assert.Nil(t, result.ScaleUpError) + assert.Equal(t, result.GroupTargetSizes, map[string]int{ + "ng1": 2, + "ng2": 2, + }) + assert.ElementsMatch(t, result.ScaleUpStatus.PodsTriggeredScaleUp, []string{"p3", "p4"}) }) } } @@ -293,7 +289,7 @@ func TestScaleUpTwoGroups(t *testing.T) { func TestCloudProviderFailingToScaleUpGroups(t *testing.T) { options := defaultOptions options.BalanceSimilarNodeGroups = true - config := &GroupsScaleUpTestConfig{ + config := &ScaleUpTestConfig{ Groups: []NodeGroupConfig{ {Name: "ng1", MaxSize: 2}, {Name: "ng2", MaxSize: 2}, @@ -326,35 +322,35 @@ func TestCloudProviderFailingToScaleUpGroups(t *testing.T) { } testCases := []struct { desc string - parallelScaleUp bool + parallel bool onScaleUp testprovider.OnScaleUpFunc expectConcurrentErrors bool expectedTotalTargetSizes int }{ { desc: "synchronous scale up - two failures", - parallelScaleUp: false, + parallel: false, onScaleUp: failAlwaysScaleUp, expectConcurrentErrors: false, expectedTotalTargetSizes: 3, // first error stops scale up process }, { desc: "parallel scale up - two failures", - parallelScaleUp: true, + parallel: true, onScaleUp: failAlwaysScaleUp, expectConcurrentErrors: true, expectedTotalTargetSizes: 4, }, { desc: "synchronous scale up - one failure", - parallelScaleUp: false, + parallel: false, onScaleUp: failOnceScaleUp(), expectConcurrentErrors: false, expectedTotalTargetSizes: 3, }, { desc: "parallel scale up - one failure", - parallelScaleUp: true, + parallel: true, onScaleUp: failOnceScaleUp(), expectConcurrentErrors: false, expectedTotalTargetSizes: 4, @@ -362,18 +358,272 @@ func TestCloudProviderFailingToScaleUpGroups(t *testing.T) { } for _, tc := range testCases { t.Run(tc.desc, func(t *testing.T) { - config.Options.ParallelScaleUp = tc.parallelScaleUp + config.Options.ParallelScaleUp = tc.parallel config.OnScaleUp = tc.onScaleUp - result := scaleUpGroups(t, config) - assert.Error(t, result.Error) - assert.Equal(t, errors.CloudProviderError, result.Error.Type()) - assert.Equal(t, tc.expectConcurrentErrors, strings.Contains(result.Error.Error(), "...other concurrent errors")) - assert.Equal(t, tc.expectedTotalTargetSizes, result.TargetSizes["ng1"]+result.TargetSizes["ng2"]) + result := runSimpleScaleUpTest(t, config) + assert.False(t, result.ScaleUpStatus.WasSuccessful()) + assert.Equal(t, errors.CloudProviderError, result.ScaleUpError.Type()) + assert.Equal(t, tc.expectedTotalTargetSizes, result.GroupTargetSizes["ng1"]+result.GroupTargetSizes["ng2"]) + assert.Equal(t, tc.expectConcurrentErrors, strings.Contains(result.ScaleUpError.Error(), "...and other concurrent errors")) }) } } -func scaleUpGroups(t *testing.T, config *GroupsScaleUpTestConfig) *GroupsScaleUpTestResult { +func TestScaleUpCapToMaxTotalNodesLimit(t *testing.T) { + options := defaultOptions + options.MaxNodesTotal = 3 + config := &ScaleUpTestConfig{ + Nodes: []NodeConfig{ + {Name: "n1", Cpu: 2000, Memory: 100 * utils.MiB, Gpu: 0, Ready: true, Group: "ng1"}, + {Name: "n2", Cpu: 4000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: "ng2"}, + }, + Pods: []PodConfig{ + {Name: 
"p1", Cpu: 1000, Memory: 0, Gpu: 0, Node: "n1", ToleratesGpu: false}, + {Name: "p2", Cpu: 3000, Memory: 0, Gpu: 0, Node: "n2", ToleratesGpu: false}, + }, + ExtraPods: []PodConfig{ + {Name: "p-new-1", Cpu: 4000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false}, + {Name: "p-new-2", Cpu: 4000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false}, + {Name: "p-new-3", Cpu: 4000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false}, + }, + ExpansionOptionToChoose: &GroupSizeChange{GroupName: "ng2", SizeChange: 3}, + Options: &options, + } + results := &ScaleTestResults{ + FinalOption: GroupSizeChange{GroupName: "ng2", SizeChange: 1}, + ScaleUpStatus: ScaleUpStatusInfo{ + PodsTriggeredScaleUp: []string{"p-new-1", "p-new-2", "p-new-3"}, + }, + } + + simpleScaleUpTest(t, config, results) +} + +func TestScaleUpCapToMaxTotalNodesLimitWithNotAutoscaledGroup(t *testing.T) { + options := defaultOptions + options.MaxNodesTotal = 3 + config := &ScaleUpTestConfig{ + Nodes: []NodeConfig{ + {Name: "n1", Cpu: 2000, Memory: 100 * utils.MiB, Gpu: 0, Ready: true, Group: ""}, + {Name: "n2", Cpu: 4000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: "ng2"}, + }, + Pods: []PodConfig{ + {Name: "p1", Cpu: 1000, Memory: 0, Gpu: 0, Node: "n1", ToleratesGpu: false}, + {Name: "p2", Cpu: 3000, Memory: 0, Gpu: 0, Node: "n2", ToleratesGpu: false}, + }, + ExtraPods: []PodConfig{ + {Name: "p-new-1", Cpu: 4000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false}, + {Name: "p-new-2", Cpu: 4000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false}, + {Name: "p-new-3", Cpu: 4000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false}, + }, + ExpansionOptionToChoose: &GroupSizeChange{GroupName: "ng2", SizeChange: 3}, + Options: &options, + } + results := &ScaleTestResults{ + FinalOption: GroupSizeChange{GroupName: "ng2", SizeChange: 1}, + ScaleUpStatus: ScaleUpStatusInfo{ + PodsTriggeredScaleUp: []string{"p-new-1", "p-new-2", "p-new-3"}, + }, + } + + simpleScaleUpTest(t, config, results) +} + +func TestWillConsiderGpuAndStandardPoolForPodWhichDoesNotRequireGpu(t *testing.T) { + options := defaultOptions + options.MaxNodesTotal = 100 + config := &ScaleUpTestConfig{ + Nodes: []NodeConfig{ + {Name: "gpu-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Ready: true, Group: "gpu-pool"}, + {Name: "std-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: "std-pool"}, + }, + Pods: []PodConfig{ + {Name: "gpu-pod-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Node: "gpu-node-1", ToleratesGpu: true}, + {Name: "std-pod-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Node: "std-node-1", ToleratesGpu: false}, + }, + ExtraPods: []PodConfig{ + {Name: "extra-std-pod", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: true}, + }, + ExpansionOptionToChoose: &GroupSizeChange{GroupName: "std-pool", SizeChange: 1}, + Options: &options, + } + results := &ScaleTestResults{ + FinalOption: GroupSizeChange{GroupName: "std-pool", SizeChange: 1}, + ExpansionOptions: []GroupSizeChange{ + {GroupName: "std-pool", SizeChange: 1}, + {GroupName: "gpu-pool", SizeChange: 1}, + }, + ScaleUpStatus: ScaleUpStatusInfo{ + PodsTriggeredScaleUp: []string{"extra-std-pod"}, + }, + } + + simpleScaleUpTest(t, config, results) +} + +func TestWillConsiderOnlyGpuPoolForPodWhichDoesRequiresGpu(t *testing.T) { + options := defaultOptions + options.MaxNodesTotal = 100 + config := &ScaleUpTestConfig{ + Nodes: []NodeConfig{ + {Name: "gpu-node-1", Cpu: 2000, Memory: 
1000 * utils.MiB, Gpu: 1, Ready: true, Group: "gpu-pool"}, + {Name: "std-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: "std-pool"}, + }, + Pods: []PodConfig{ + {Name: "gpu-pod-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Node: "gpu-node-1", ToleratesGpu: true}, + {Name: "std-pod-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Node: "std-node-1", ToleratesGpu: false}, + }, + ExtraPods: []PodConfig{ + {Name: "extra-gpu-pod", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Node: "", ToleratesGpu: true}, + }, + ExpansionOptionToChoose: &GroupSizeChange{GroupName: "gpu-pool", SizeChange: 1}, + Options: &options, + } + results := &ScaleTestResults{ + FinalOption: GroupSizeChange{GroupName: "gpu-pool", SizeChange: 1}, + ExpansionOptions: []GroupSizeChange{ + {GroupName: "gpu-pool", SizeChange: 1}, + }, + ScaleUpStatus: ScaleUpStatusInfo{ + PodsTriggeredScaleUp: []string{"extra-gpu-pod"}, + }, + } + + simpleScaleUpTest(t, config, results) +} + +func TestWillConsiderAllPoolsWhichFitTwoPodsRequiringGpus(t *testing.T) { + options := defaultOptions + options.MaxNodesTotal = 100 + config := &ScaleUpTestConfig{ + Nodes: []NodeConfig{ + {Name: "gpu-1-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Ready: true, Group: "gpu-1-pool"}, + {Name: "gpu-2-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 2, Ready: true, Group: "gpu-2-pool"}, + {Name: "gpu-4-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 4, Ready: true, Group: "gpu-4-pool"}, + {Name: "std-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: "std-pool"}, + }, + Pods: []PodConfig{ + {Name: "gpu-pod-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Node: "gpu-1-node-1", ToleratesGpu: true}, + {Name: "gpu-pod-2", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 2, Node: "gpu-2-node-1", ToleratesGpu: true}, + {Name: "gpu-pod-3", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 4, Node: "gpu-4-node-1", ToleratesGpu: true}, + {Name: "std-pod-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Node: "std-node-1", ToleratesGpu: false}, + }, + ExtraPods: []PodConfig{ + {Name: "extra-gpu-pod-1", Cpu: 1, Memory: 1 * utils.MiB, Gpu: 1, Node: "", ToleratesGpu: true}, // CPU and mem negligible + {Name: "extra-gpu-pod-2", Cpu: 1, Memory: 1 * utils.MiB, Gpu: 1, Node: "", ToleratesGpu: true}, // CPU and mem negligible + {Name: "extra-gpu-pod-3", Cpu: 1, Memory: 1 * utils.MiB, Gpu: 1, Node: "", ToleratesGpu: true}, // CPU and mem negligible + }, + ExpansionOptionToChoose: &GroupSizeChange{GroupName: "gpu-1-pool", SizeChange: 3}, + Options: &options, + } + results := &ScaleTestResults{ + FinalOption: GroupSizeChange{GroupName: "gpu-1-pool", SizeChange: 3}, + ExpansionOptions: []GroupSizeChange{ + {GroupName: "gpu-1-pool", SizeChange: 3}, + {GroupName: "gpu-2-pool", SizeChange: 2}, + {GroupName: "gpu-4-pool", SizeChange: 1}, + }, + ScaleUpStatus: ScaleUpStatusInfo{ + PodsTriggeredScaleUp: []string{"extra-gpu-pod-1", "extra-gpu-pod-2", "extra-gpu-pod-3"}, + }, + } + + simpleScaleUpTest(t, config, results) +} + +// No scale up scenarios. 
+func TestNoScaleUpMaxCoresLimitHit(t *testing.T) { + options := defaultOptions + options.MaxCoresTotal = 7 + options.MaxMemoryTotal = 1150 + config := &ScaleUpTestConfig{ + Nodes: []NodeConfig{ + {Name: "n1", Cpu: 2000, Memory: 100, Gpu: 0, Ready: true, Group: "ng1"}, + {Name: "n2", Cpu: 4000, Memory: 1000, Gpu: 0, Ready: true, Group: "ng2"}, + }, + Pods: []PodConfig{ + {Name: "p1", Cpu: 1000, Memory: 0, Gpu: 0, Node: "n1", ToleratesGpu: false}, + {Name: "p2", Cpu: 3000, Memory: 0, Gpu: 0, Node: "n2", ToleratesGpu: false}, + }, + ExtraPods: []PodConfig{ + {Name: "p-new-1", Cpu: 2000, Memory: 0, Gpu: 0, Node: "", ToleratesGpu: false}, + {Name: "p-new-2", Cpu: 2000, Memory: 0, Gpu: 0, Node: "", ToleratesGpu: false}, + }, + Options: &options, + } + results := &ScaleTestResults{ + NoScaleUpReason: "max cluster cpu, memory limit reached", + ScaleUpStatus: ScaleUpStatusInfo{ + PodsRemainUnschedulable: []string{"p-new-1", "p-new-2"}, + }, + } + + simpleNoScaleUpTest(t, config, results) +} + +func simpleScaleUpTest(t *testing.T, config *ScaleUpTestConfig, expectedResults *ScaleTestResults) { + results := runSimpleScaleUpTest(t, config) + assert.NotNil(t, results.GroupSizeChanges[0], "Expected scale up event") + assert.Equal(t, expectedResults.FinalOption, results.GroupSizeChanges[0]) + assert.True(t, results.ScaleUpStatus.WasSuccessful()) + nodeEventSeen := false + for _, event := range results.Events { + if strings.Contains(event, "TriggeredScaleUp") && strings.Contains(event, expectedResults.FinalOption.GroupName) { + nodeEventSeen = true + } + if len(expectedResults.ScaleUpStatus.PodsRemainUnschedulable) == 0 { + assert.NotRegexp(t, regexp.MustCompile("NotTriggerScaleUp"), event) + } + } + assert.True(t, nodeEventSeen) + if len(expectedResults.ExpansionOptions) > 0 { + // Empty ExpansionOptions means we do not want to do any assertions + // on contents of actual scaleUp options + if config.ExpansionOptionToChoose != nil { + // Check that option to choose is part of expected options. 
+ assert.Contains(t, expectedResults.ExpansionOptions, *config.ExpansionOptionToChoose, "final expected expansion option must be in expected expansion options") + assert.Contains(t, results.ExpansionOptions, *config.ExpansionOptionToChoose, "final expected expansion option must be in expected expansion options") + } + assert.ElementsMatch(t, results.ExpansionOptions, expectedResults.ExpansionOptions, + "actual and expected expansion options should be the same") + } + if expectedResults.GroupTargetSizes != nil { + assert.Equal(t, expectedResults.GroupTargetSizes, results.GroupTargetSizes) + } + assert.ElementsMatch(t, results.ScaleUpStatus.PodsTriggeredScaleUp, expectedResults.ScaleUpStatus.PodsTriggeredScaleUp, + "actual and expected triggering pods should be the same") + assert.ElementsMatch(t, results.ScaleUpStatus.PodsRemainUnschedulable, expectedResults.ScaleUpStatus.PodsRemainUnschedulable, + "actual and expected remaining pods should be the same") + assert.ElementsMatch(t, results.ScaleUpStatus.PodsAwaitEvaluation, expectedResults.ScaleUpStatus.PodsAwaitEvaluation, + "actual and expected awaiting evaluation pods should be the same") +} + +func simpleNoScaleUpTest(t *testing.T, config *ScaleUpTestConfig, expectedResults *ScaleTestResults) { + results := runSimpleScaleUpTest(t, config) + assert.Nil(t, results.GroupSizeChanges) + assert.False(t, results.ScaleUpStatus.WasSuccessful()) + noScaleUpEventSeen := false + for _, event := range results.Events { + if strings.Contains(event, "NotTriggerScaleUp") { + if strings.Contains(event, expectedResults.NoScaleUpReason) { + noScaleUpEventSeen = true + } else { + // Surprisingly useful for debugging. + fmt.Println("Event:", event) + } + } + assert.NotRegexp(t, regexp.MustCompile("TriggeredScaleUp"), event) + } + assert.True(t, noScaleUpEventSeen) + assert.ElementsMatch(t, results.ScaleUpStatus.PodsTriggeredScaleUp, expectedResults.ScaleUpStatus.PodsTriggeredScaleUp, + "actual and expected triggering pods should be the same") + assert.ElementsMatch(t, results.ScaleUpStatus.PodsRemainUnschedulable, expectedResults.ScaleUpStatus.PodsRemainUnschedulable, + "actual and expected remaining pods should be the same") + assert.ElementsMatch(t, results.ScaleUpStatus.PodsAwaitEvaluation, expectedResults.ScaleUpStatus.PodsAwaitEvaluation, + "actual and expected awaiting evaluation pods should be the same") +} + +func runSimpleScaleUpTest(t *testing.T, config *ScaleUpTestConfig) *ScaleUpTestResult { now := time.Now() groupSizeChangesChannel := make(chan GroupSizeChange, 20) groupNodes := make(map[string][]*apiv1.Node) @@ -440,18 +690,22 @@ func scaleUpGroups(t *testing.T, config *GroupsScaleUpTestConfig) *GroupsScaleUp } // build orchestrator - context, _ := NewScaleTestAutoscalingContext(options, &fake.Clientset{}, listers, provider, nil, nil) - nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false). + context, err := NewScaleTestAutoscalingContext(options, &fake.Clientset{}, listers, provider, nil, nil) + assert.NoError(t, err) + nodeInfos, err := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false). Process(&context, nodes, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now) + assert.NoError(t, err) clusterState := clusterstate. 
NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) clusterState.UpdateNodes(nodes, nodeInfos, time.Now()) processors := NewTestProcessors(&context) orchestrator := New() orchestrator.Initialize(&context, processors, clusterState, taints.TaintConfig{}) + expander := NewMockRepotingStrategy(t, config.ExpansionOptionToChoose) + context.ExpanderStrategy = expander // scale up - scaleUpStatus, err := orchestrator.ScaleUp(extraPods, nodes, []*appsv1.DaemonSet{}, nodeInfos) + scaleUpStatus, scaleUpErr := orchestrator.ScaleUp(extraPods, nodes, []*appsv1.DaemonSet{}, nodeInfos) processors.ScaleUpStatusProcessor.Process(&context, scaleUpStatus) // aggregate group size changes @@ -475,418 +729,13 @@ func scaleUpGroups(t *testing.T, config *GroupsScaleUpTestConfig) *GroupsScaleUp targetSizes[group.Id()], _ = group.TargetSize() } - return &GroupsScaleUpTestResult{ - Error: err, + return &ScaleUpTestResult{ + ScaleUpError: scaleUpErr, ScaleUpStatus: simplifyScaleUpStatus(scaleUpStatus), GroupSizeChanges: groupSizeChanges, Events: events, - TargetSizes: targetSizes, - } -} - -func TestScaleUpCapToMaxTotalNodesLimit(t *testing.T) { - options := defaultOptions - options.MaxNodesTotal = 3 - config := &ScaleTestConfig{ - Nodes: []NodeConfig{ - {Name: "n1", Cpu: 2000, Memory: 100 * utils.MiB, Gpu: 0, Ready: true, Group: "ng1"}, - {Name: "n2", Cpu: 4000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: "ng2"}, - }, - Pods: []PodConfig{ - {Name: "p1", Cpu: 1000, Memory: 0, Gpu: 0, Node: "n1", ToleratesGpu: false}, - {Name: "p2", Cpu: 3000, Memory: 0, Gpu: 0, Node: "n2", ToleratesGpu: false}, - }, - ExtraPods: []PodConfig{ - {Name: "p-new-1", Cpu: 4000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false}, - {Name: "p-new-2", Cpu: 4000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false}, - {Name: "p-new-3", Cpu: 4000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false}, - }, - ExpansionOptionToChoose: GroupSizeChange{GroupName: "ng2", SizeChange: 3}, - Options: options, - } - results := &ScaleTestResults{ - FinalOption: GroupSizeChange{GroupName: "ng2", SizeChange: 1}, - ScaleUpStatus: ScaleUpStatusInfo{ - PodsTriggeredScaleUp: []string{"p-new-1", "p-new-2", "p-new-3"}, - }, - } - - simpleScaleUpTest(t, config, results) -} - -func TestScaleUpCapToMaxTotalNodesLimitWithNotAutoscaledGroup(t *testing.T) { - options := defaultOptions - options.MaxNodesTotal = 3 - config := &ScaleTestConfig{ - Nodes: []NodeConfig{ - {Name: "n1", Cpu: 2000, Memory: 100 * utils.MiB, Gpu: 0, Ready: true, Group: ""}, - {Name: "n2", Cpu: 4000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: "ng2"}, - }, - Pods: []PodConfig{ - {Name: "p1", Cpu: 1000, Memory: 0, Gpu: 0, Node: "n1", ToleratesGpu: false}, - {Name: "p2", Cpu: 3000, Memory: 0, Gpu: 0, Node: "n2", ToleratesGpu: false}, - }, - ExtraPods: []PodConfig{ - {Name: "p-new-1", Cpu: 4000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false}, - {Name: "p-new-2", Cpu: 4000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false}, - {Name: "p-new-3", Cpu: 4000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false}, - }, - ExpansionOptionToChoose: GroupSizeChange{GroupName: "ng2", SizeChange: 3}, - Options: options, - } - results := &ScaleTestResults{ - FinalOption: GroupSizeChange{GroupName: "ng2", SizeChange: 1}, - ScaleUpStatus: ScaleUpStatusInfo{ - PodsTriggeredScaleUp: 
[]string{"p-new-1", "p-new-2", "p-new-3"}, - }, - } - - simpleScaleUpTest(t, config, results) -} - -func TestWillConsiderGpuAndStandardPoolForPodWhichDoesNotRequireGpu(t *testing.T) { - options := defaultOptions - options.MaxNodesTotal = 100 - config := &ScaleTestConfig{ - Nodes: []NodeConfig{ - {Name: "gpu-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Ready: true, Group: "gpu-pool"}, - {Name: "std-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: "std-pool"}, - }, - Pods: []PodConfig{ - {Name: "gpu-pod-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Node: "gpu-node-1", ToleratesGpu: true}, - {Name: "std-pod-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Node: "std-node-1", ToleratesGpu: false}, - }, - ExtraPods: []PodConfig{ - {Name: "extra-std-pod", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: true}, - }, - ExpansionOptionToChoose: GroupSizeChange{GroupName: "std-pool", SizeChange: 1}, - Options: options, - } - results := &ScaleTestResults{ - FinalOption: GroupSizeChange{GroupName: "std-pool", SizeChange: 1}, - ExpansionOptions: []GroupSizeChange{ - {GroupName: "std-pool", SizeChange: 1}, - {GroupName: "gpu-pool", SizeChange: 1}, - }, - ScaleUpStatus: ScaleUpStatusInfo{ - PodsTriggeredScaleUp: []string{"extra-std-pod"}, - }, - } - - simpleScaleUpTest(t, config, results) -} - -func TestWillConsiderOnlyGpuPoolForPodWhichDoesRequiresGpu(t *testing.T) { - options := defaultOptions - options.MaxNodesTotal = 100 - config := &ScaleTestConfig{ - Nodes: []NodeConfig{ - {Name: "gpu-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Ready: true, Group: "gpu-pool"}, - {Name: "std-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: "std-pool"}, - }, - Pods: []PodConfig{ - {Name: "gpu-pod-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Node: "gpu-node-1", ToleratesGpu: true}, - {Name: "std-pod-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Node: "std-node-1", ToleratesGpu: false}, - }, - ExtraPods: []PodConfig{ - {Name: "extra-gpu-pod", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Node: "", ToleratesGpu: true}, - }, - ExpansionOptionToChoose: GroupSizeChange{GroupName: "gpu-pool", SizeChange: 1}, - Options: options, - } - results := &ScaleTestResults{ - FinalOption: GroupSizeChange{GroupName: "gpu-pool", SizeChange: 1}, - ExpansionOptions: []GroupSizeChange{ - {GroupName: "gpu-pool", SizeChange: 1}, - }, - ScaleUpStatus: ScaleUpStatusInfo{ - PodsTriggeredScaleUp: []string{"extra-gpu-pod"}, - }, - } - - simpleScaleUpTest(t, config, results) -} - -func TestWillConsiderAllPoolsWhichFitTwoPodsRequiringGpus(t *testing.T) { - options := defaultOptions - options.MaxNodesTotal = 100 - config := &ScaleTestConfig{ - Nodes: []NodeConfig{ - {Name: "gpu-1-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Ready: true, Group: "gpu-1-pool"}, - {Name: "gpu-2-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 2, Ready: true, Group: "gpu-2-pool"}, - {Name: "gpu-4-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 4, Ready: true, Group: "gpu-4-pool"}, - {Name: "std-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: "std-pool"}, - }, - Pods: []PodConfig{ - {Name: "gpu-pod-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Node: "gpu-1-node-1", ToleratesGpu: true}, - {Name: "gpu-pod-2", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 2, Node: "gpu-2-node-1", ToleratesGpu: true}, - {Name: "gpu-pod-3", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 4, Node: "gpu-4-node-1", ToleratesGpu: true}, - {Name: "std-pod-1", Cpu: 2000, 
Memory: 1000 * utils.MiB, Gpu: 0, Node: "std-node-1", ToleratesGpu: false}, - }, - ExtraPods: []PodConfig{ - {Name: "extra-gpu-pod-1", Cpu: 1, Memory: 1 * utils.MiB, Gpu: 1, Node: "", ToleratesGpu: true}, // CPU and mem negligible - {Name: "extra-gpu-pod-2", Cpu: 1, Memory: 1 * utils.MiB, Gpu: 1, Node: "", ToleratesGpu: true}, // CPU and mem negligible - {Name: "extra-gpu-pod-3", Cpu: 1, Memory: 1 * utils.MiB, Gpu: 1, Node: "", ToleratesGpu: true}, // CPU and mem negligible - }, - ExpansionOptionToChoose: GroupSizeChange{GroupName: "gpu-1-pool", SizeChange: 3}, - Options: options, - } - results := &ScaleTestResults{ - FinalOption: GroupSizeChange{GroupName: "gpu-1-pool", SizeChange: 3}, - ExpansionOptions: []GroupSizeChange{ - {GroupName: "gpu-1-pool", SizeChange: 3}, - {GroupName: "gpu-2-pool", SizeChange: 2}, - {GroupName: "gpu-4-pool", SizeChange: 1}, - }, - ScaleUpStatus: ScaleUpStatusInfo{ - PodsTriggeredScaleUp: []string{"extra-gpu-pod-1", "extra-gpu-pod-2", "extra-gpu-pod-3"}, - }, - } - - simpleScaleUpTest(t, config, results) -} - -// No scale up scenarios. -func TestNoScaleUpMaxCoresLimitHit(t *testing.T) { - options := defaultOptions - options.MaxCoresTotal = 7 - options.MaxMemoryTotal = 1150 - config := &ScaleTestConfig{ - Nodes: []NodeConfig{ - {Name: "n1", Cpu: 2000, Memory: 100, Gpu: 0, Ready: true, Group: "ng1"}, - {Name: "n2", Cpu: 4000, Memory: 1000, Gpu: 0, Ready: true, Group: "ng2"}, - }, - Pods: []PodConfig{ - {Name: "p1", Cpu: 1000, Memory: 0, Gpu: 0, Node: "n1", ToleratesGpu: false}, - {Name: "p2", Cpu: 3000, Memory: 0, Gpu: 0, Node: "n2", ToleratesGpu: false}, - }, - ExtraPods: []PodConfig{ - {Name: "p-new-1", Cpu: 2000, Memory: 0, Gpu: 0, Node: "", ToleratesGpu: false}, - {Name: "p-new-2", Cpu: 2000, Memory: 0, Gpu: 0, Node: "", ToleratesGpu: false}, - }, - Options: options, - } - results := &ScaleTestResults{ - NoScaleUpReason: "max cluster cpu, memory limit reached", - ScaleUpStatus: ScaleUpStatusInfo{ - PodsRemainUnschedulable: []string{"p-new-1", "p-new-2"}, - }, - } - - simpleNoScaleUpTest(t, config, results) -} - -// To implement expander.Strategy, BestOption method must have a struct receiver. -// This prevents it from modifying fields of reportingStrategy, so we need a thin -// pointer wrapper for mutable parts. 
-type expanderResults struct { - inputOptions []GroupSizeChange -} - -type reportingStrategy struct { - initialNodeConfigs []NodeConfig - optionToChoose GroupSizeChange - results *expanderResults - t *testing.T -} - -func (r reportingStrategy) BestOption(options []expander.Option, nodeInfo map[string]*schedulerframework.NodeInfo) *expander.Option { - r.results.inputOptions = expanderOptionsToGroupSizeChanges(options) - for _, option := range options { - GroupSizeChange := expanderOptionToGroupSizeChange(option) - if GroupSizeChange == r.optionToChoose { - return &option - } - } - assert.Fail(r.t, "did not find expansionOptionToChoose %s", r.optionToChoose) - return nil -} - -func expanderOptionsToGroupSizeChanges(options []expander.Option) []GroupSizeChange { - groupSizeChanges := make([]GroupSizeChange, 0, len(options)) - for _, option := range options { - GroupSizeChange := expanderOptionToGroupSizeChange(option) - groupSizeChanges = append(groupSizeChanges, GroupSizeChange) - } - return groupSizeChanges -} - -func expanderOptionToGroupSizeChange(option expander.Option) GroupSizeChange { - groupName := option.NodeGroup.Id() - groupSizeIncrement := option.NodeCount - scaleUpOption := GroupSizeChange{GroupName: groupName, SizeChange: groupSizeIncrement} - return scaleUpOption -} - -func runSimpleScaleUpTest(t *testing.T, config *ScaleTestConfig) *ScaleTestResults { - expandedGroups := make(chan GroupSizeChange, 10) - now := time.Now() - - groups := make(map[string][]*apiv1.Node) - nodes := make([]*apiv1.Node, 0, len(config.Nodes)) - for _, n := range config.Nodes { - node := BuildTestNode(n.Name, n.Cpu, n.Memory) - if n.Gpu > 0 { - AddGpusToNode(node, n.Gpu) - } - SetNodeReadyState(node, n.Ready, now.Add(-2*time.Minute)) - nodes = append(nodes, node) - if n.Group != "" { - groups[n.Group] = append(groups[n.Group], node) - } - } - - pods := make([]*apiv1.Pod, 0, len(config.Pods)) - for _, p := range config.Pods { - pod := buildTestPod(p) - pods = append(pods, pod) - } - - podLister := kube_util.NewTestPodLister(pods) - listers := kube_util.NewListerRegistry(nil, nil, podLister, nil, nil, nil, nil, nil, nil, nil) - - provider := testprovider.NewTestCloudProvider(func(nodeGroup string, increase int) error { - expandedGroups <- GroupSizeChange{GroupName: nodeGroup, SizeChange: increase} - return nil - }, nil) - - for name, nodesInGroup := range groups { - provider.AddNodeGroup(name, 1, 10, len(nodesInGroup)) - for _, n := range nodesInGroup { - provider.AddNode(name, n) - } - } - - resourceLimiter := cloudprovider.NewResourceLimiter( - map[string]int64{cloudprovider.ResourceNameCores: config.Options.MinCoresTotal, cloudprovider.ResourceNameMemory: config.Options.MinMemoryTotal}, - map[string]int64{cloudprovider.ResourceNameCores: config.Options.MaxCoresTotal, cloudprovider.ResourceNameMemory: config.Options.MaxMemoryTotal}) - provider.SetResourceLimiter(resourceLimiter) - - assert.NotNil(t, provider) - - // Create context with non-random expander strategy. 
- context, err := NewScaleTestAutoscalingContext(config.Options, &fake.Clientset{}, listers, provider, nil, nil) - assert.NoError(t, err) - - expander := reportingStrategy{ - initialNodeConfigs: config.Nodes, - optionToChoose: config.ExpansionOptionToChoose, - results: &expanderResults{}, - t: t, - } - context.ExpanderStrategy = expander - - nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&context, nodes, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now) - clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) - clusterState.UpdateNodes(nodes, nodeInfos, time.Now()) - - extraPods := make([]*apiv1.Pod, 0, len(config.ExtraPods)) - for _, p := range config.ExtraPods { - pod := buildTestPod(p) - extraPods = append(extraPods, pod) - } - - processors := NewTestProcessors(&context) - suOrchestrator := New() - suOrchestrator.Initialize(&context, processors, clusterState, taints.TaintConfig{}) - scaleUpStatus, err := suOrchestrator.ScaleUp(extraPods, nodes, []*appsv1.DaemonSet{}, nodeInfos) - processors.ScaleUpStatusProcessor.Process(&context, scaleUpStatus) - - assert.NoError(t, err) - - expandedGroup := getGroupSizeChangeFromChan(expandedGroups) - var expandedGroupStruct GroupSizeChange - if expandedGroup != nil { - expandedGroupStruct = *expandedGroup - } - - events := []string{} - for eventsLeft := true; eventsLeft; { - select { - case event := <-context.Recorder.(*kube_record.FakeRecorder).Events: - events = append(events, event) - default: - eventsLeft = false - } - } - - return &ScaleTestResults{ - ExpansionOptions: expander.results.inputOptions, - FinalOption: expandedGroupStruct, - ScaleUpStatus: simplifyScaleUpStatus(scaleUpStatus), - Events: events, - } -} - -func simpleNoScaleUpTest(t *testing.T, config *ScaleTestConfig, expectedResults *ScaleTestResults) { - results := runSimpleScaleUpTest(t, config) - - assert.Equal(t, GroupSizeChange{}, results.FinalOption) - assert.False(t, results.ScaleUpStatus.WasSuccessful()) - noScaleUpEventSeen := false - for _, event := range results.Events { - if strings.Contains(event, "NotTriggerScaleUp") { - if strings.Contains(event, expectedResults.NoScaleUpReason) { - noScaleUpEventSeen = true - } else { - // Surprisingly useful for debugging. 
- fmt.Println("Event:", event) - } - } - assert.NotRegexp(t, regexp.MustCompile("TriggeredScaleUp"), event) - } - assert.True(t, noScaleUpEventSeen) - assert.ElementsMatch(t, results.ScaleUpStatus.PodsTriggeredScaleUp, expectedResults.ScaleUpStatus.PodsTriggeredScaleUp, - "actual and expected triggering pods should be the same") - assert.ElementsMatch(t, results.ScaleUpStatus.PodsRemainUnschedulable, expectedResults.ScaleUpStatus.PodsRemainUnschedulable, - "actual and expected remaining pods should be the same") - assert.ElementsMatch(t, results.ScaleUpStatus.PodsAwaitEvaluation, expectedResults.ScaleUpStatus.PodsAwaitEvaluation, - "actual and expected awaiting evaluation pods should be the same") -} - -func simpleScaleUpTest(t *testing.T, config *ScaleTestConfig, expectedResults *ScaleTestResults) { - results := runSimpleScaleUpTest(t, config) - - assert.NotNil(t, results.FinalOption, "Expected scale up event") - assert.Equal(t, expectedResults.FinalOption, results.FinalOption) - assert.True(t, results.ScaleUpStatus.WasSuccessful()) - nodeEventSeen := false - for _, event := range results.Events { - if strings.Contains(event, "TriggeredScaleUp") && strings.Contains(event, expectedResults.FinalOption.GroupName) { - nodeEventSeen = true - } - if len(expectedResults.ScaleUpStatus.PodsRemainUnschedulable) == 0 { - assert.NotRegexp(t, regexp.MustCompile("NotTriggerScaleUp"), event) - } - } - assert.True(t, nodeEventSeen) - - if len(expectedResults.ExpansionOptions) > 0 { - // Empty ExpansionOptions means we do not want to do any assertions - // on contents of actual scaleUp options - - // Check that option to choose is part of expected options. - assert.Contains(t, expectedResults.ExpansionOptions, config.ExpansionOptionToChoose, "final expected expansion option must be in expected expansion options") - assert.Contains(t, results.ExpansionOptions, config.ExpansionOptionToChoose, "final expected expansion option must be in expected expansion options") - - assert.ElementsMatch(t, results.ExpansionOptions, expectedResults.ExpansionOptions, - "actual and expected expansion options should be the same") - } - - assert.ElementsMatch(t, results.ScaleUpStatus.PodsTriggeredScaleUp, expectedResults.ScaleUpStatus.PodsTriggeredScaleUp, - "actual and expected triggering pods should be the same") - assert.ElementsMatch(t, results.ScaleUpStatus.PodsRemainUnschedulable, expectedResults.ScaleUpStatus.PodsRemainUnschedulable, - "actual and expected remaining pods should be the same") - assert.ElementsMatch(t, results.ScaleUpStatus.PodsAwaitEvaluation, expectedResults.ScaleUpStatus.PodsAwaitEvaluation, - "actual and expected awaiting evaluation pods should be the same") -} - -func getGroupSizeChangeFromChan(c chan GroupSizeChange) *GroupSizeChange { - select { - case val := <-c: - return &val - case <-time.After(100 * time.Millisecond): - return nil + GroupTargetSizes: targetSizes, + ExpansionOptions: expander.LastInputOptions(), } } @@ -1293,7 +1142,7 @@ func TestCheckDeltaWithinLimits(t *testing.T) { func TestAuthErrorHandling(t *testing.T) { metrics.RegisterAll(false) - config := &GroupsScaleUpTestConfig{ + config := &ScaleUpTestConfig{ Groups: []NodeGroupConfig{ {Name: "ng1", MaxSize: 2}, }, @@ -1308,12 +1157,12 @@ func TestAuthErrorHandling(t *testing.T) { }, Options: &defaultOptions, } - results := scaleUpGroups(t, config) + results := runSimpleScaleUpTest(t, config) expected := errors.NewAutoscalerError( errors.AutoscalerErrorType("authError"), "failed to increase node group size: auth error", ) - 
assert.Equal(t, expected, results.Error) + assert.Equal(t, expected, results.ScaleUpError) assertLegacyRegistryEntry(t, "cluster_autoscaler_failed_scale_ups_total{reason=\"authError\"} 1") } @@ -1325,7 +1174,6 @@ func assertLegacyRegistryEntry(t *testing.T, entry string) { rr := httptest.NewRecorder() handler := http.HandlerFunc(legacyregistry.Handler().ServeHTTP) handler.ServeHTTP(rr, req) - // Check that the status code is what we expect. if status := rr.Code; status != http.StatusOK { t.Errorf("handler returned wrong status code: got %v want %v", diff --git a/cluster-autoscaler/core/test/common.go b/cluster-autoscaler/core/test/common.go index 9204523603..5a84f4435b 100644 --- a/cluster-autoscaler/core/test/common.go +++ b/cluster-autoscaler/core/test/common.go @@ -22,10 +22,6 @@ import ( "testing" "time" - "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/pdb" - "k8s.io/autoscaler/cluster-autoscaler/debuggingsnapshot" - - "k8s.io/apimachinery/pkg/api/resource" "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" testcloudprovider "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/test" "k8s.io/autoscaler/cluster-autoscaler/clusterstate/utils" @@ -33,7 +29,10 @@ import ( "k8s.io/autoscaler/cluster-autoscaler/context" "k8s.io/autoscaler/cluster-autoscaler/core/podlistprocessor" "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/deletiontracker" + "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/pdb" + "k8s.io/autoscaler/cluster-autoscaler/debuggingsnapshot" "k8s.io/autoscaler/cluster-autoscaler/estimator" + "k8s.io/autoscaler/cluster-autoscaler/expander" "k8s.io/autoscaler/cluster-autoscaler/expander/random" "k8s.io/autoscaler/cluster-autoscaler/metrics" "k8s.io/autoscaler/cluster-autoscaler/processors" @@ -50,18 +49,17 @@ import ( "k8s.io/autoscaler/cluster-autoscaler/processors/status" "k8s.io/autoscaler/cluster-autoscaler/simulator/clustersnapshot" "k8s.io/autoscaler/cluster-autoscaler/simulator/predicatechecker" + "k8s.io/autoscaler/cluster-autoscaler/utils/backoff" "k8s.io/autoscaler/cluster-autoscaler/utils/errors" kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes" "k8s.io/autoscaler/cluster-autoscaler/utils/labels" "github.com/stretchr/testify/assert" - apiv1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" kube_client "k8s.io/client-go/kubernetes" kube_record "k8s.io/client-go/tools/record" schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework" - - "k8s.io/autoscaler/cluster-autoscaler/utils/backoff" ) // NodeConfig is a node config used in tests @@ -110,28 +108,31 @@ type NodeGroupConfig struct { MaxSize int } -// GroupsScaleUpTestConfig represents a config of a scale test -type GroupsScaleUpTestConfig struct { - Groups []NodeGroupConfig - Nodes []NodeConfig - Pods []PodConfig - ExtraPods []PodConfig - OnScaleUp testcloudprovider.OnScaleUpFunc - Options *config.AutoscalingOptions +// ScaleUpTestConfig represents a config of a scale test +type ScaleUpTestConfig struct { + Groups []NodeGroupConfig + Nodes []NodeConfig + Pods []PodConfig + ExtraPods []PodConfig + OnScaleUp testcloudprovider.OnScaleUpFunc + ExpansionOptionToChoose *GroupSizeChange + Options *config.AutoscalingOptions } -// GroupsScaleUpTestResult represents a node groups scale up result -type GroupsScaleUpTestResult struct { - Error errors.AutoscalerError +// ScaleUpTestResult represents a node groups scale up result +type ScaleUpTestResult struct { + ScaleUpError errors.AutoscalerError ScaleUpStatus ScaleUpStatusInfo GroupSizeChanges []GroupSizeChange + ExpansionOptions 
[]GroupSizeChange Events []string - TargetSizes map[string]int + GroupTargetSizes map[string]int } // ScaleTestResults contains results of a scale test type ScaleTestResults struct { ExpansionOptions []GroupSizeChange + GroupTargetSizes map[string]int FinalOption GroupSizeChange NoScaleUpReason string FinalScaleDowns []string @@ -186,7 +187,8 @@ func NewTestProcessors(context *context.AutoscalingContext) *processors.Autoscal func NewScaleTestAutoscalingContext( options config.AutoscalingOptions, fakeClient kube_client.Interface, listers kube_util.ListerRegistry, provider cloudprovider.CloudProvider, - processorCallbacks processor_callbacks.ProcessorCallbacks, debuggingSnapshotter debuggingsnapshot.DebuggingSnapshotter) (context.AutoscalingContext, error) { + processorCallbacks processor_callbacks.ProcessorCallbacks, debuggingSnapshotter debuggingsnapshot.DebuggingSnapshotter, +) (context.AutoscalingContext, error) { // Not enough buffer space causes the test to hang without printing any logs. // This is not useful. fakeRecorder := kube_record.NewFakeRecorder(100) @@ -306,8 +308,8 @@ type MockAutoprovisioningNodeGroupListProcessor struct { // Process extends the list of node groups func (p *MockAutoprovisioningNodeGroupListProcessor) Process(context *context.AutoscalingContext, nodeGroups []cloudprovider.NodeGroup, nodeInfos map[string]*schedulerframework.NodeInfo, - unschedulablePods []*apiv1.Pod) ([]cloudprovider.NodeGroup, map[string]*schedulerframework.NodeInfo, error) { - + unschedulablePods []*apiv1.Pod, +) ([]cloudprovider.NodeGroup, map[string]*schedulerframework.NodeInfo, error) { machines, err := context.CloudProvider.GetAvailableMachineTypes() assert.NoError(p.T, err) @@ -332,3 +334,67 @@ func NewBackoff() backoff.Backoff { return backoff.NewIdBasedExponentialBackoff(5*time.Minute, /*InitialNodeGroupBackoffDuration*/ 30*time.Minute /*MaxNodeGroupBackoffDuration*/, 3*time.Hour /*NodeGroupBackoffResetTimeout*/) } + +// To implement expander.Strategy, BestOption method must have a struct receiver. +// This prevents it from modifying fields of reportingStrategy, so we need a thin +// pointer wrapper for mutable parts. +type expanderResults struct { + inputOptions []GroupSizeChange +} + +// MockReportingStrategy implements expander.Strategy +type MockReportingStrategy struct { + defaultStrategy expander.Strategy + optionToChoose *GroupSizeChange + t *testing.T + results *expanderResults +} + +// NewMockRepotingStrategy creates an expander strategy with reporting and mocking capabilities. +func NewMockRepotingStrategy(t *testing.T, optionToChoose *GroupSizeChange) *MockReportingStrategy { + return &MockReportingStrategy{ + defaultStrategy: random.NewStrategy(), + results: &expanderResults{}, + optionToChoose: optionToChoose, + t: t, + } +} + +// LastInputOptions provides access to expansion options passed as an input in recent strategy execution +func (r *MockReportingStrategy) LastInputOptions() []GroupSizeChange { + return r.results.inputOptions +} + +// BestOption satisfies the Strategy interface. Picks the best option from those passed as an argument. +// When parameter optionToChoose is defined, it's picked as the best one. +// Otherwise, random option is used. 
+func (r *MockReportingStrategy) BestOption(options []expander.Option, nodeInfo map[string]*schedulerframework.NodeInfo) *expander.Option {
+	r.results.inputOptions = expanderOptionsToGroupSizeChanges(options)
+	if r.optionToChoose == nil {
+		return r.defaultStrategy.BestOption(options, nodeInfo)
+	}
+	for _, option := range options {
+		groupSizeChange := expanderOptionToGroupSizeChange(option)
+		if groupSizeChange == *r.optionToChoose {
+			return &option
+		}
+	}
+	assert.Fail(r.t, "did not find expansionOptionToChoose %+v", r.optionToChoose)
+	return nil
+}
+
+func expanderOptionsToGroupSizeChanges(options []expander.Option) []GroupSizeChange {
+	groupSizeChanges := make([]GroupSizeChange, 0, len(options))
+	for _, option := range options {
+		groupSizeChange := expanderOptionToGroupSizeChange(option)
+		groupSizeChanges = append(groupSizeChanges, groupSizeChange)
+	}
+	return groupSizeChanges
+}
+
+func expanderOptionToGroupSizeChange(option expander.Option) GroupSizeChange {
+	groupName := option.NodeGroup.Id()
+	groupSizeIncrement := option.NodeCount
+	scaleUpOption := GroupSizeChange{GroupName: groupName, SizeChange: groupSizeIncrement}
+	return scaleUpOption
+}
diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go
index d976f9a3ea..073e3fdf1b 100644
--- a/cluster-autoscaler/main.go
+++ b/cluster-autoscaler/main.go
@@ -148,7 +148,7 @@ var (
 	maxTotalUnreadyPercentage = flag.Float64("max-total-unready-percentage", 45, "Maximum percentage of unready nodes in the cluster. After this is exceeded, CA halts operations")
 	okTotalUnreadyCount = flag.Int("ok-total-unready-count", 3, "Number of allowed unready nodes, irrespective of max-total-unready-percentage")
 	scaleUpFromZero = flag.Bool("scale-up-from-zero", true, "Should CA scale up when there are 0 ready nodes.")
-	parallelScaleUp = flag.Bool("parallel-scale-up", false, "Whether to allow parallel node groups scale up")
+	parallelScaleUp = flag.Bool("parallel-scale-up", false, "Whether to allow parallel node groups scale up. Experimental: may not work on some cloud providers, enable at your own risk.")
 	maxNodeProvisionTime = flag.Duration("max-node-provision-time", 15*time.Minute, "The default maximum time CA waits for node to be provisioned - the value can be overridden per node group")
 	maxPodEvictionTime = flag.Duration("max-pod-eviction-time", 2*time.Minute, "Maximum time CA tries to evict a pod before giving up")
 	nodeGroupsFlag = multiStringFlag(