deduplicate scale-up test setup

mendelski 2023-05-05 22:30:35 +00:00
parent 3e2d48d86d
commit 51f1660db7
5 changed files with 454 additions and 532 deletions

@ -93,7 +93,7 @@ func (e *scaleUpExecutor) executeScaleUpsParallel(
nodeInfos map[string]*schedulerframework.NodeInfo,
now time.Time,
) (errors.AutoscalerError, []cloudprovider.NodeGroup) {
if err := assertUniqueNodeGroups(scaleUpInfos); err != nil {
if err := checkUniqueNodeGroups(scaleUpInfos); err != nil {
return err, extractNodeGroups(scaleUpInfos)
}
type errResult struct {
@ -171,6 +171,15 @@ func combineConcurrentScaleUpErrors(errs []errors.AutoscalerError) errors.Autosc
if len(errs) == 1 {
return errs[0]
}
uniqueMessages := make(map[string]bool)
uniqueTypes := make(map[errors.AutoscalerErrorType]bool)
for _, err := range errs {
uniqueTypes[err.Type()] = true
uniqueMessages[err.Error()] = true
}
if len(uniqueTypes) == 1 && len(uniqueMessages) == 1 {
return errs[0]
}
// sort the errors to stabilize the results and make log aggregation easier
sort.Slice(errs, func(i, j int) bool {
errA := errs[i]
@ -180,44 +189,43 @@ func combineConcurrentScaleUpErrors(errs []errors.AutoscalerError) errors.Autosc
}
return errA.Type() < errB.Type()
})
firstErrMsg := errs[0].Error()
firstErrType := errs[0].Type()
singleError := true
singleType := true
for _, err := range errs {
singleType = singleType && firstErrType == err.Type()
singleError = singleError && singleType && firstErrMsg == err.Error()
}
if singleError {
return errs[0]
}
var builder strings.Builder
builder.WriteString(firstErrMsg)
builder.WriteString(" ...other concurrent errors: [")
concurrentErrs := make(map[string]bool)
for _, err := range errs {
var concurrentErr string
if singleType {
concurrentErr = err.Error()
} else {
concurrentErr = fmt.Sprintf("[%s] %s", err.Type(), err.Error())
}
n := len(concurrentErrs)
if !concurrentErrs[concurrentErr] && n > 0 {
if n > 1 {
builder.WriteString(", ")
}
builder.WriteString(fmt.Sprintf("%q", concurrentErr))
}
concurrentErrs[concurrentErr] = true
}
builder.WriteString("]")
return errors.NewAutoscalerError(firstErrType, builder.String())
firstErr := errs[0]
printErrorTypes := len(uniqueTypes) > 1
message := formatMessageFromConcurrentErrors(errs, printErrorTypes)
return errors.NewAutoscalerError(firstErr.Type(), message)
}
// Asserts all groups are scaled only once.
func formatMessageFromConcurrentErrors(errs []errors.AutoscalerError, printErrorTypes bool) string {
firstErr := errs[0]
var builder strings.Builder
builder.WriteString(firstErr.Error())
builder.WriteString(" ...and other concurrent errors: [")
formattedErrs := map[errors.AutoscalerError]bool{
firstErr: true,
}
for _, err := range errs {
if _, has := formattedErrs[err]; has {
continue
}
formattedErrs[err] = true
var message string
if printErrorTypes {
message = fmt.Sprintf("[%s] %s", err.Type(), err.Error())
} else {
message = err.Error()
}
if len(formattedErrs) > 2 {
builder.WriteString(", ")
}
builder.WriteString(fmt.Sprintf("%q", message))
}
builder.WriteString("]")
return builder.String()
}
// Checks if all groups are scaled only once.
// Scaling one group multiple times concurrently may cause problems.
func assertUniqueNodeGroups(scaleUpInfos []nodegroupset.ScaleUpInfo) errors.AutoscalerError {
func checkUniqueNodeGroups(scaleUpInfos []nodegroupset.ScaleUpInfo) errors.AutoscalerError {
uniqueGroups := make(map[string]bool)
for _, info := range scaleUpInfos {
if uniqueGroups[info.Group.Id()] {
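For orientation, a minimal usage sketch of the refactored error combining above (the error types and constructors are the ones used in the tests below; the surrounding scale-up context is assumed):
// Two concurrent scale-up failures of different types are collapsed into a single
// AutoscalerError: after sorting by type and then message, the first error provides
// the type and the leading message, and the remaining unique errors are appended.
combined := combineConcurrentScaleUpErrors([]errors.AutoscalerError{
errors.NewAutoscalerError(errors.CloudProviderError, "provider error"),
errors.NewAutoscalerError(errors.InternalError, "internal error"),
})
// combined.Type()  == errors.CloudProviderError
// combined.Error() == `provider error ...and other concurrent errors: ["[internalError] internal error"]`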

@ -19,9 +19,9 @@ package orchestrator
import (
"testing"
"github.com/stretchr/testify/assert"
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
"github.com/stretchr/testify/assert"
)
func TestCombinedConcurrentScaleUpErrors(t *testing.T) {
@ -58,7 +58,7 @@ func TestCombinedConcurrentScaleUpErrors(t *testing.T) {
},
expectedErr: errors.NewAutoscalerError(
errors.CloudProviderError,
"provider error ...other concurrent errors: [\"[internalError] internal error\"]",
"provider error ...and other concurrent errors: [\"[internalError] internal error\"]",
),
},
{
@ -69,7 +69,7 @@ func TestCombinedConcurrentScaleUpErrors(t *testing.T) {
},
expectedErr: errors.NewAutoscalerError(
errors.CloudProviderError,
"provider error ...other concurrent errors: [\"[internalError] internal error\"]",
"provider error ...and other concurrent errors: [\"[internalError] internal error\"]",
),
},
{
@ -81,7 +81,7 @@ func TestCombinedConcurrentScaleUpErrors(t *testing.T) {
},
expectedErr: errors.NewAutoscalerError(
errors.InternalError,
"A ...other concurrent errors: [\"B\", \"C\"]"),
"A ...and other concurrent errors: [\"B\", \"C\"]"),
},
{
desc: "errors with the same type and some duplicated messages",
@ -92,7 +92,7 @@ func TestCombinedConcurrentScaleUpErrors(t *testing.T) {
},
expectedErr: errors.NewAutoscalerError(
errors.InternalError,
"A ...other concurrent errors: [\"B\"]"),
"A ...and other concurrent errors: [\"B\"]"),
},
{
desc: "some duplicated errors",
@ -104,7 +104,7 @@ func TestCombinedConcurrentScaleUpErrors(t *testing.T) {
},
expectedErr: errors.NewAutoscalerError(
errors.CloudProviderError,
"A ...other concurrent errors: [\"[cloudProviderError] B\", \"[internalError] A\"]"),
"A ...and other concurrent errors: [\"[cloudProviderError] B\", \"[internalError] A\"]"),
},
{
desc: "different errors with quotes in messages",
@ -114,7 +114,7 @@ func TestCombinedConcurrentScaleUpErrors(t *testing.T) {
},
expectedErr: errors.NewAutoscalerError(
errors.InternalError,
"\"first\" ...other concurrent errors: [\"\\\"second\\\"\"]"),
"\"first\" ...and other concurrent errors: [\"\\\"second\\\"\"]"),
},
}

@ -52,8 +52,6 @@ import (
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
"github.com/stretchr/testify/assert"
"k8s.io/autoscaler/cluster-autoscaler/expander"
)
var defaultOptions = config.AutoscalingOptions{
@ -66,7 +64,7 @@ var defaultOptions = config.AutoscalingOptions{
// Scale up scenarios.
func TestScaleUpOK(t *testing.T) {
config := &ScaleTestConfig{
config := &ScaleUpTestConfig{
Nodes: []NodeConfig{
{Name: "n1", Cpu: 100, Memory: 100, Gpu: 0, Ready: true, Group: "ng1"},
{Name: "n2", Cpu: 1000, Memory: 1000, Gpu: 0, Ready: true, Group: "ng2"},
@ -78,8 +76,7 @@ func TestScaleUpOK(t *testing.T) {
ExtraPods: []PodConfig{
{Name: "p-new", Cpu: 500, Memory: 0, Gpu: 0, Node: "", ToleratesGpu: false},
},
Options: defaultOptions,
ExpansionOptionToChoose: GroupSizeChange{GroupName: "ng2", SizeChange: 1},
ExpansionOptionToChoose: &GroupSizeChange{GroupName: "ng2", SizeChange: 1},
}
expectedResults := &ScaleTestResults{
FinalOption: GroupSizeChange{GroupName: "ng2", SizeChange: 1},
@ -93,7 +90,7 @@ func TestScaleUpOK(t *testing.T) {
// There are triggering, remaining & awaiting pods.
func TestMixedScaleUp(t *testing.T) {
config := &ScaleTestConfig{
config := &ScaleUpTestConfig{
Nodes: []NodeConfig{
{Name: "n1", Cpu: 100, Memory: 1000, Gpu: 0, Ready: true, Group: "ng1"},
{Name: "n2", Cpu: 1000, Memory: 100, Gpu: 0, Ready: true, Group: "ng2"},
@ -110,8 +107,7 @@ func TestMixedScaleUp(t *testing.T) {
// can only be scheduled on ng1
{Name: "awaiting", Cpu: 0, Memory: 200, Gpu: 0, Node: "", ToleratesGpu: false},
},
Options: defaultOptions,
ExpansionOptionToChoose: GroupSizeChange{GroupName: "ng2", SizeChange: 1},
ExpansionOptionToChoose: &GroupSizeChange{GroupName: "ng2", SizeChange: 1},
}
expectedResults := &ScaleTestResults{
FinalOption: GroupSizeChange{GroupName: "ng2", SizeChange: 1},
@ -128,7 +124,7 @@ func TestMixedScaleUp(t *testing.T) {
func TestScaleUpMaxCoresLimitHit(t *testing.T) {
options := defaultOptions
options.MaxCoresTotal = 9
config := &ScaleTestConfig{
config := &ScaleUpTestConfig{
Nodes: []NodeConfig{
{Name: "n1", Cpu: 2000, Memory: 100, Gpu: 0, Ready: true, Group: "ng1"},
{Name: "n2", Cpu: 4000, Memory: 1000, Gpu: 0, Ready: true, Group: "ng2"},
@ -141,8 +137,8 @@ func TestScaleUpMaxCoresLimitHit(t *testing.T) {
{Name: "p-new-1", Cpu: 2000, Memory: 0, Gpu: 0, Node: "", ToleratesGpu: false},
{Name: "p-new-2", Cpu: 2000, Memory: 0, Gpu: 0, Node: "", ToleratesGpu: false},
},
ExpansionOptionToChoose: GroupSizeChange{GroupName: "ng1", SizeChange: 2},
Options: options,
ExpansionOptionToChoose: &GroupSizeChange{GroupName: "ng1", SizeChange: 2},
Options: &options,
}
results := &ScaleTestResults{
FinalOption: GroupSizeChange{GroupName: "ng1", SizeChange: 1},
@ -157,7 +153,7 @@ func TestScaleUpMaxCoresLimitHit(t *testing.T) {
func TestScaleUpMaxCoresLimitHitWithNotAutoscaledGroup(t *testing.T) {
options := defaultOptions
options.MaxCoresTotal = 9
config := &ScaleTestConfig{
config := &ScaleUpTestConfig{
Nodes: []NodeConfig{
{Name: "n1", Cpu: 2000, Memory: 100, Gpu: 0, Ready: true, Group: "ng1"},
{Name: "n2", Cpu: 4000, Memory: 1000, Gpu: 0, Ready: true, Group: ""},
@ -170,8 +166,8 @@ func TestScaleUpMaxCoresLimitHitWithNotAutoscaledGroup(t *testing.T) {
{Name: "p-new-1", Cpu: 2000, Memory: 0, Gpu: 0, Node: "", ToleratesGpu: false},
{Name: "p-new-2", Cpu: 2000, Memory: 0, Gpu: 0, Node: "", ToleratesGpu: false},
},
ExpansionOptionToChoose: GroupSizeChange{GroupName: "ng1", SizeChange: 2},
Options: options,
ExpansionOptionToChoose: &GroupSizeChange{GroupName: "ng1", SizeChange: 2},
Options: &options,
}
results := &ScaleTestResults{
FinalOption: GroupSizeChange{GroupName: "ng1", SizeChange: 1},
@ -186,7 +182,7 @@ func TestScaleUpMaxCoresLimitHitWithNotAutoscaledGroup(t *testing.T) {
func TestScaleUpMaxMemoryLimitHit(t *testing.T) {
options := defaultOptions
options.MaxMemoryTotal = 1300 * utils.MiB
config := &ScaleTestConfig{
config := &ScaleUpTestConfig{
Nodes: []NodeConfig{
{Name: "n1", Cpu: 2000, Memory: 100 * utils.MiB, Gpu: 0, Ready: true, Group: "ng1"},
{Name: "n2", Cpu: 4000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: "ng2"},
@ -200,8 +196,8 @@ func TestScaleUpMaxMemoryLimitHit(t *testing.T) {
{Name: "p-new-2", Cpu: 2000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false},
{Name: "p-new-3", Cpu: 2000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false},
},
ExpansionOptionToChoose: GroupSizeChange{GroupName: "ng1", SizeChange: 3},
Options: options,
ExpansionOptionToChoose: &GroupSizeChange{GroupName: "ng1", SizeChange: 3},
Options: &options,
}
results := &ScaleTestResults{
FinalOption: GroupSizeChange{GroupName: "ng1", SizeChange: 2},
@ -216,7 +212,7 @@ func TestScaleUpMaxMemoryLimitHit(t *testing.T) {
func TestScaleUpMaxMemoryLimitHitWithNotAutoscaledGroup(t *testing.T) {
options := defaultOptions
options.MaxMemoryTotal = 1300 * utils.MiB
config := &ScaleTestConfig{
config := &ScaleUpTestConfig{
Nodes: []NodeConfig{
{Name: "n1", Cpu: 2000, Memory: 100 * utils.MiB, Gpu: 0, Ready: true, Group: "ng1"},
{Name: "n2", Cpu: 4000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: ""},
@ -230,8 +226,8 @@ func TestScaleUpMaxMemoryLimitHitWithNotAutoscaledGroup(t *testing.T) {
{Name: "p-new-2", Cpu: 2000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false},
{Name: "p-new-3", Cpu: 2000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false},
},
ExpansionOptionToChoose: GroupSizeChange{GroupName: "ng1", SizeChange: 3},
Options: options,
ExpansionOptionToChoose: &GroupSizeChange{GroupName: "ng1", SizeChange: 3},
Options: &options,
}
results := &ScaleTestResults{
FinalOption: GroupSizeChange{GroupName: "ng1", SizeChange: 2},
@ -247,11 +243,7 @@ func TestScaleUpTwoGroups(t *testing.T) {
options := defaultOptions
options.BalanceSimilarNodeGroups = true
options.ParallelScaleUp = true
config := &GroupsScaleUpTestConfig{
Groups: []NodeGroupConfig{
{Name: "ng1", MaxSize: 2},
{Name: "ng2", MaxSize: 2},
},
config := &ScaleUpTestConfig{
Nodes: []NodeConfig{
{Name: "ng1-n1", Cpu: 1500, Memory: 1000 * utils.MiB, Ready: true, Group: "ng1"},
{Name: "ng2-n1", Cpu: 1500, Memory: 1000 * utils.MiB, Ready: true, Group: "ng2"},
@ -267,25 +259,29 @@ func TestScaleUpTwoGroups(t *testing.T) {
Options: &options,
}
testCases := []struct {
desc string
parallelScaleUp bool
desc string
parallel bool
}{
{
desc: "synchronous scale up",
parallelScaleUp: false,
desc: "synchronous scale up",
parallel: false,
},
{
desc: "parallel scale up",
parallelScaleUp: true,
desc: "parallel scale up",
parallel: true,
},
}
for _, tc := range testCases {
t.Run(tc.desc, func(t *testing.T) {
config.Options.ParallelScaleUp = tc.parallelScaleUp
result := scaleUpGroups(t, config)
assert.NoError(t, result.Error)
assert.Equal(t, 2, result.TargetSizes["ng1"])
assert.Equal(t, 2, result.TargetSizes["ng2"])
config.Options.ParallelScaleUp = tc.parallel
result := runSimpleScaleUpTest(t, config)
assert.True(t, result.ScaleUpStatus.WasSuccessful())
assert.Nil(t, result.ScaleUpError)
assert.Equal(t, result.GroupTargetSizes, map[string]int{
"ng1": 2,
"ng2": 2,
})
assert.ElementsMatch(t, result.ScaleUpStatus.PodsTriggeredScaleUp, []string{"p3", "p4"})
})
}
}
@ -293,7 +289,7 @@ func TestScaleUpTwoGroups(t *testing.T) {
func TestCloudProviderFailingToScaleUpGroups(t *testing.T) {
options := defaultOptions
options.BalanceSimilarNodeGroups = true
config := &GroupsScaleUpTestConfig{
config := &ScaleUpTestConfig{
Groups: []NodeGroupConfig{
{Name: "ng1", MaxSize: 2},
{Name: "ng2", MaxSize: 2},
@ -326,35 +322,35 @@ func TestCloudProviderFailingToScaleUpGroups(t *testing.T) {
}
testCases := []struct {
desc string
parallelScaleUp bool
parallel bool
onScaleUp testprovider.OnScaleUpFunc
expectConcurrentErrors bool
expectedTotalTargetSizes int
}{
{
desc: "synchronous scale up - two failures",
parallelScaleUp: false,
parallel: false,
onScaleUp: failAlwaysScaleUp,
expectConcurrentErrors: false,
expectedTotalTargetSizes: 3, // first error stops scale up process
},
{
desc: "parallel scale up - two failures",
parallelScaleUp: true,
parallel: true,
onScaleUp: failAlwaysScaleUp,
expectConcurrentErrors: true,
expectedTotalTargetSizes: 4,
},
{
desc: "synchronous scale up - one failure",
parallelScaleUp: false,
parallel: false,
onScaleUp: failOnceScaleUp(),
expectConcurrentErrors: false,
expectedTotalTargetSizes: 3,
},
{
desc: "parallel scale up - one failure",
parallelScaleUp: true,
parallel: true,
onScaleUp: failOnceScaleUp(),
expectConcurrentErrors: false,
expectedTotalTargetSizes: 4,
@ -362,18 +358,272 @@ func TestCloudProviderFailingToScaleUpGroups(t *testing.T) {
}
for _, tc := range testCases {
t.Run(tc.desc, func(t *testing.T) {
config.Options.ParallelScaleUp = tc.parallelScaleUp
config.Options.ParallelScaleUp = tc.parallel
config.OnScaleUp = tc.onScaleUp
result := scaleUpGroups(t, config)
assert.Error(t, result.Error)
assert.Equal(t, errors.CloudProviderError, result.Error.Type())
assert.Equal(t, tc.expectConcurrentErrors, strings.Contains(result.Error.Error(), "...other concurrent errors"))
assert.Equal(t, tc.expectedTotalTargetSizes, result.TargetSizes["ng1"]+result.TargetSizes["ng2"])
result := runSimpleScaleUpTest(t, config)
assert.False(t, result.ScaleUpStatus.WasSuccessful())
assert.Equal(t, errors.CloudProviderError, result.ScaleUpError.Type())
assert.Equal(t, tc.expectedTotalTargetSizes, result.GroupTargetSizes["ng1"]+result.GroupTargetSizes["ng2"])
assert.Equal(t, tc.expectConcurrentErrors, strings.Contains(result.ScaleUpError.Error(), "...and other concurrent errors"))
})
}
}
func scaleUpGroups(t *testing.T, config *GroupsScaleUpTestConfig) *GroupsScaleUpTestResult {
func TestScaleUpCapToMaxTotalNodesLimit(t *testing.T) {
options := defaultOptions
options.MaxNodesTotal = 3
config := &ScaleUpTestConfig{
Nodes: []NodeConfig{
{Name: "n1", Cpu: 2000, Memory: 100 * utils.MiB, Gpu: 0, Ready: true, Group: "ng1"},
{Name: "n2", Cpu: 4000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: "ng2"},
},
Pods: []PodConfig{
{Name: "p1", Cpu: 1000, Memory: 0, Gpu: 0, Node: "n1", ToleratesGpu: false},
{Name: "p2", Cpu: 3000, Memory: 0, Gpu: 0, Node: "n2", ToleratesGpu: false},
},
ExtraPods: []PodConfig{
{Name: "p-new-1", Cpu: 4000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false},
{Name: "p-new-2", Cpu: 4000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false},
{Name: "p-new-3", Cpu: 4000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false},
},
ExpansionOptionToChoose: &GroupSizeChange{GroupName: "ng2", SizeChange: 3},
Options: &options,
}
results := &ScaleTestResults{
FinalOption: GroupSizeChange{GroupName: "ng2", SizeChange: 1},
ScaleUpStatus: ScaleUpStatusInfo{
PodsTriggeredScaleUp: []string{"p-new-1", "p-new-2", "p-new-3"},
},
}
simpleScaleUpTest(t, config, results)
}
func TestScaleUpCapToMaxTotalNodesLimitWithNotAutoscaledGroup(t *testing.T) {
options := defaultOptions
options.MaxNodesTotal = 3
config := &ScaleUpTestConfig{
Nodes: []NodeConfig{
{Name: "n1", Cpu: 2000, Memory: 100 * utils.MiB, Gpu: 0, Ready: true, Group: ""},
{Name: "n2", Cpu: 4000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: "ng2"},
},
Pods: []PodConfig{
{Name: "p1", Cpu: 1000, Memory: 0, Gpu: 0, Node: "n1", ToleratesGpu: false},
{Name: "p2", Cpu: 3000, Memory: 0, Gpu: 0, Node: "n2", ToleratesGpu: false},
},
ExtraPods: []PodConfig{
{Name: "p-new-1", Cpu: 4000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false},
{Name: "p-new-2", Cpu: 4000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false},
{Name: "p-new-3", Cpu: 4000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false},
},
ExpansionOptionToChoose: &GroupSizeChange{GroupName: "ng2", SizeChange: 3},
Options: &options,
}
results := &ScaleTestResults{
FinalOption: GroupSizeChange{GroupName: "ng2", SizeChange: 1},
ScaleUpStatus: ScaleUpStatusInfo{
PodsTriggeredScaleUp: []string{"p-new-1", "p-new-2", "p-new-3"},
},
}
simpleScaleUpTest(t, config, results)
}
func TestWillConsiderGpuAndStandardPoolForPodWhichDoesNotRequireGpu(t *testing.T) {
options := defaultOptions
options.MaxNodesTotal = 100
config := &ScaleUpTestConfig{
Nodes: []NodeConfig{
{Name: "gpu-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Ready: true, Group: "gpu-pool"},
{Name: "std-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: "std-pool"},
},
Pods: []PodConfig{
{Name: "gpu-pod-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Node: "gpu-node-1", ToleratesGpu: true},
{Name: "std-pod-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Node: "std-node-1", ToleratesGpu: false},
},
ExtraPods: []PodConfig{
{Name: "extra-std-pod", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: true},
},
ExpansionOptionToChoose: &GroupSizeChange{GroupName: "std-pool", SizeChange: 1},
Options: &options,
}
results := &ScaleTestResults{
FinalOption: GroupSizeChange{GroupName: "std-pool", SizeChange: 1},
ExpansionOptions: []GroupSizeChange{
{GroupName: "std-pool", SizeChange: 1},
{GroupName: "gpu-pool", SizeChange: 1},
},
ScaleUpStatus: ScaleUpStatusInfo{
PodsTriggeredScaleUp: []string{"extra-std-pod"},
},
}
simpleScaleUpTest(t, config, results)
}
func TestWillConsiderOnlyGpuPoolForPodWhichDoesRequiresGpu(t *testing.T) {
options := defaultOptions
options.MaxNodesTotal = 100
config := &ScaleUpTestConfig{
Nodes: []NodeConfig{
{Name: "gpu-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Ready: true, Group: "gpu-pool"},
{Name: "std-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: "std-pool"},
},
Pods: []PodConfig{
{Name: "gpu-pod-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Node: "gpu-node-1", ToleratesGpu: true},
{Name: "std-pod-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Node: "std-node-1", ToleratesGpu: false},
},
ExtraPods: []PodConfig{
{Name: "extra-gpu-pod", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Node: "", ToleratesGpu: true},
},
ExpansionOptionToChoose: &GroupSizeChange{GroupName: "gpu-pool", SizeChange: 1},
Options: &options,
}
results := &ScaleTestResults{
FinalOption: GroupSizeChange{GroupName: "gpu-pool", SizeChange: 1},
ExpansionOptions: []GroupSizeChange{
{GroupName: "gpu-pool", SizeChange: 1},
},
ScaleUpStatus: ScaleUpStatusInfo{
PodsTriggeredScaleUp: []string{"extra-gpu-pod"},
},
}
simpleScaleUpTest(t, config, results)
}
func TestWillConsiderAllPoolsWhichFitTwoPodsRequiringGpus(t *testing.T) {
options := defaultOptions
options.MaxNodesTotal = 100
config := &ScaleUpTestConfig{
Nodes: []NodeConfig{
{Name: "gpu-1-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Ready: true, Group: "gpu-1-pool"},
{Name: "gpu-2-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 2, Ready: true, Group: "gpu-2-pool"},
{Name: "gpu-4-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 4, Ready: true, Group: "gpu-4-pool"},
{Name: "std-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: "std-pool"},
},
Pods: []PodConfig{
{Name: "gpu-pod-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Node: "gpu-1-node-1", ToleratesGpu: true},
{Name: "gpu-pod-2", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 2, Node: "gpu-2-node-1", ToleratesGpu: true},
{Name: "gpu-pod-3", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 4, Node: "gpu-4-node-1", ToleratesGpu: true},
{Name: "std-pod-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Node: "std-node-1", ToleratesGpu: false},
},
ExtraPods: []PodConfig{
{Name: "extra-gpu-pod-1", Cpu: 1, Memory: 1 * utils.MiB, Gpu: 1, Node: "", ToleratesGpu: true}, // CPU and mem negligible
{Name: "extra-gpu-pod-2", Cpu: 1, Memory: 1 * utils.MiB, Gpu: 1, Node: "", ToleratesGpu: true}, // CPU and mem negligible
{Name: "extra-gpu-pod-3", Cpu: 1, Memory: 1 * utils.MiB, Gpu: 1, Node: "", ToleratesGpu: true}, // CPU and mem negligible
},
ExpansionOptionToChoose: &GroupSizeChange{GroupName: "gpu-1-pool", SizeChange: 3},
Options: &options,
}
results := &ScaleTestResults{
FinalOption: GroupSizeChange{GroupName: "gpu-1-pool", SizeChange: 3},
ExpansionOptions: []GroupSizeChange{
{GroupName: "gpu-1-pool", SizeChange: 3},
{GroupName: "gpu-2-pool", SizeChange: 2},
{GroupName: "gpu-4-pool", SizeChange: 1},
},
ScaleUpStatus: ScaleUpStatusInfo{
PodsTriggeredScaleUp: []string{"extra-gpu-pod-1", "extra-gpu-pod-2", "extra-gpu-pod-3"},
},
}
simpleScaleUpTest(t, config, results)
}
// No scale up scenarios.
func TestNoScaleUpMaxCoresLimitHit(t *testing.T) {
options := defaultOptions
options.MaxCoresTotal = 7
options.MaxMemoryTotal = 1150
config := &ScaleUpTestConfig{
Nodes: []NodeConfig{
{Name: "n1", Cpu: 2000, Memory: 100, Gpu: 0, Ready: true, Group: "ng1"},
{Name: "n2", Cpu: 4000, Memory: 1000, Gpu: 0, Ready: true, Group: "ng2"},
},
Pods: []PodConfig{
{Name: "p1", Cpu: 1000, Memory: 0, Gpu: 0, Node: "n1", ToleratesGpu: false},
{Name: "p2", Cpu: 3000, Memory: 0, Gpu: 0, Node: "n2", ToleratesGpu: false},
},
ExtraPods: []PodConfig{
{Name: "p-new-1", Cpu: 2000, Memory: 0, Gpu: 0, Node: "", ToleratesGpu: false},
{Name: "p-new-2", Cpu: 2000, Memory: 0, Gpu: 0, Node: "", ToleratesGpu: false},
},
Options: &options,
}
results := &ScaleTestResults{
NoScaleUpReason: "max cluster cpu, memory limit reached",
ScaleUpStatus: ScaleUpStatusInfo{
PodsRemainUnschedulable: []string{"p-new-1", "p-new-2"},
},
}
simpleNoScaleUpTest(t, config, results)
}
func simpleScaleUpTest(t *testing.T, config *ScaleUpTestConfig, expectedResults *ScaleTestResults) {
results := runSimpleScaleUpTest(t, config)
assert.NotNil(t, results.GroupSizeChanges[0], "Expected scale up event")
assert.Equal(t, expectedResults.FinalOption, results.GroupSizeChanges[0])
assert.True(t, results.ScaleUpStatus.WasSuccessful())
nodeEventSeen := false
for _, event := range results.Events {
if strings.Contains(event, "TriggeredScaleUp") && strings.Contains(event, expectedResults.FinalOption.GroupName) {
nodeEventSeen = true
}
if len(expectedResults.ScaleUpStatus.PodsRemainUnschedulable) == 0 {
assert.NotRegexp(t, regexp.MustCompile("NotTriggerScaleUp"), event)
}
}
assert.True(t, nodeEventSeen)
if len(expectedResults.ExpansionOptions) > 0 {
// Empty ExpansionOptions means we do not want to do any assertions
// on contents of actual scaleUp options
if config.ExpansionOptionToChoose != nil {
// Check that option to choose is part of expected options.
assert.Contains(t, expectedResults.ExpansionOptions, *config.ExpansionOptionToChoose, "final expected expansion option must be in expected expansion options")
assert.Contains(t, results.ExpansionOptions, *config.ExpansionOptionToChoose, "final expected expansion option must be in expected expansion options")
}
assert.ElementsMatch(t, results.ExpansionOptions, expectedResults.ExpansionOptions,
"actual and expected expansion options should be the same")
}
if expectedResults.GroupTargetSizes != nil {
assert.Equal(t, expectedResults.GroupTargetSizes, results.GroupTargetSizes)
}
assert.ElementsMatch(t, results.ScaleUpStatus.PodsTriggeredScaleUp, expectedResults.ScaleUpStatus.PodsTriggeredScaleUp,
"actual and expected triggering pods should be the same")
assert.ElementsMatch(t, results.ScaleUpStatus.PodsRemainUnschedulable, expectedResults.ScaleUpStatus.PodsRemainUnschedulable,
"actual and expected remaining pods should be the same")
assert.ElementsMatch(t, results.ScaleUpStatus.PodsAwaitEvaluation, expectedResults.ScaleUpStatus.PodsAwaitEvaluation,
"actual and expected awaiting evaluation pods should be the same")
}
func simpleNoScaleUpTest(t *testing.T, config *ScaleUpTestConfig, expectedResults *ScaleTestResults) {
results := runSimpleScaleUpTest(t, config)
assert.Nil(t, results.GroupSizeChanges)
assert.False(t, results.ScaleUpStatus.WasSuccessful())
noScaleUpEventSeen := false
for _, event := range results.Events {
if strings.Contains(event, "NotTriggerScaleUp") {
if strings.Contains(event, expectedResults.NoScaleUpReason) {
noScaleUpEventSeen = true
} else {
// Surprisingly useful for debugging.
fmt.Println("Event:", event)
}
}
assert.NotRegexp(t, regexp.MustCompile("TriggeredScaleUp"), event)
}
assert.True(t, noScaleUpEventSeen)
assert.ElementsMatch(t, results.ScaleUpStatus.PodsTriggeredScaleUp, expectedResults.ScaleUpStatus.PodsTriggeredScaleUp,
"actual and expected triggering pods should be the same")
assert.ElementsMatch(t, results.ScaleUpStatus.PodsRemainUnschedulable, expectedResults.ScaleUpStatus.PodsRemainUnschedulable,
"actual and expected remaining pods should be the same")
assert.ElementsMatch(t, results.ScaleUpStatus.PodsAwaitEvaluation, expectedResults.ScaleUpStatus.PodsAwaitEvaluation,
"actual and expected awaiting evaluation pods should be the same")
}
func runSimpleScaleUpTest(t *testing.T, config *ScaleUpTestConfig) *ScaleUpTestResult {
now := time.Now()
groupSizeChangesChannel := make(chan GroupSizeChange, 20)
groupNodes := make(map[string][]*apiv1.Node)
@ -440,18 +690,22 @@ func scaleUpGroups(t *testing.T, config *GroupsScaleUpTestConfig) *GroupsScaleUp
}
// build orchestrator
context, _ := NewScaleTestAutoscalingContext(options, &fake.Clientset{}, listers, provider, nil, nil)
nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).
context, err := NewScaleTestAutoscalingContext(options, &fake.Clientset{}, listers, provider, nil, nil)
assert.NoError(t, err)
nodeInfos, err := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).
Process(&context, nodes, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
assert.NoError(t, err)
clusterState := clusterstate.
NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute))
clusterState.UpdateNodes(nodes, nodeInfos, time.Now())
processors := NewTestProcessors(&context)
orchestrator := New()
orchestrator.Initialize(&context, processors, clusterState, taints.TaintConfig{})
expander := NewMockRepotingStrategy(t, config.ExpansionOptionToChoose)
context.ExpanderStrategy = expander
// scale up
scaleUpStatus, err := orchestrator.ScaleUp(extraPods, nodes, []*appsv1.DaemonSet{}, nodeInfos)
scaleUpStatus, scaleUpErr := orchestrator.ScaleUp(extraPods, nodes, []*appsv1.DaemonSet{}, nodeInfos)
processors.ScaleUpStatusProcessor.Process(&context, scaleUpStatus)
// aggregate group size changes
@ -475,418 +729,13 @@ func scaleUpGroups(t *testing.T, config *GroupsScaleUpTestConfig) *GroupsScaleUp
targetSizes[group.Id()], _ = group.TargetSize()
}
return &GroupsScaleUpTestResult{
Error: err,
return &ScaleUpTestResult{
ScaleUpError: scaleUpErr,
ScaleUpStatus: simplifyScaleUpStatus(scaleUpStatus),
GroupSizeChanges: groupSizeChanges,
Events: events,
TargetSizes: targetSizes,
}
}
func TestScaleUpCapToMaxTotalNodesLimit(t *testing.T) {
options := defaultOptions
options.MaxNodesTotal = 3
config := &ScaleTestConfig{
Nodes: []NodeConfig{
{Name: "n1", Cpu: 2000, Memory: 100 * utils.MiB, Gpu: 0, Ready: true, Group: "ng1"},
{Name: "n2", Cpu: 4000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: "ng2"},
},
Pods: []PodConfig{
{Name: "p1", Cpu: 1000, Memory: 0, Gpu: 0, Node: "n1", ToleratesGpu: false},
{Name: "p2", Cpu: 3000, Memory: 0, Gpu: 0, Node: "n2", ToleratesGpu: false},
},
ExtraPods: []PodConfig{
{Name: "p-new-1", Cpu: 4000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false},
{Name: "p-new-2", Cpu: 4000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false},
{Name: "p-new-3", Cpu: 4000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false},
},
ExpansionOptionToChoose: GroupSizeChange{GroupName: "ng2", SizeChange: 3},
Options: options,
}
results := &ScaleTestResults{
FinalOption: GroupSizeChange{GroupName: "ng2", SizeChange: 1},
ScaleUpStatus: ScaleUpStatusInfo{
PodsTriggeredScaleUp: []string{"p-new-1", "p-new-2", "p-new-3"},
},
}
simpleScaleUpTest(t, config, results)
}
func TestScaleUpCapToMaxTotalNodesLimitWithNotAutoscaledGroup(t *testing.T) {
options := defaultOptions
options.MaxNodesTotal = 3
config := &ScaleTestConfig{
Nodes: []NodeConfig{
{Name: "n1", Cpu: 2000, Memory: 100 * utils.MiB, Gpu: 0, Ready: true, Group: ""},
{Name: "n2", Cpu: 4000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: "ng2"},
},
Pods: []PodConfig{
{Name: "p1", Cpu: 1000, Memory: 0, Gpu: 0, Node: "n1", ToleratesGpu: false},
{Name: "p2", Cpu: 3000, Memory: 0, Gpu: 0, Node: "n2", ToleratesGpu: false},
},
ExtraPods: []PodConfig{
{Name: "p-new-1", Cpu: 4000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false},
{Name: "p-new-2", Cpu: 4000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false},
{Name: "p-new-3", Cpu: 4000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false},
},
ExpansionOptionToChoose: GroupSizeChange{GroupName: "ng2", SizeChange: 3},
Options: options,
}
results := &ScaleTestResults{
FinalOption: GroupSizeChange{GroupName: "ng2", SizeChange: 1},
ScaleUpStatus: ScaleUpStatusInfo{
PodsTriggeredScaleUp: []string{"p-new-1", "p-new-2", "p-new-3"},
},
}
simpleScaleUpTest(t, config, results)
}
func TestWillConsiderGpuAndStandardPoolForPodWhichDoesNotRequireGpu(t *testing.T) {
options := defaultOptions
options.MaxNodesTotal = 100
config := &ScaleTestConfig{
Nodes: []NodeConfig{
{Name: "gpu-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Ready: true, Group: "gpu-pool"},
{Name: "std-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: "std-pool"},
},
Pods: []PodConfig{
{Name: "gpu-pod-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Node: "gpu-node-1", ToleratesGpu: true},
{Name: "std-pod-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Node: "std-node-1", ToleratesGpu: false},
},
ExtraPods: []PodConfig{
{Name: "extra-std-pod", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: true},
},
ExpansionOptionToChoose: GroupSizeChange{GroupName: "std-pool", SizeChange: 1},
Options: options,
}
results := &ScaleTestResults{
FinalOption: GroupSizeChange{GroupName: "std-pool", SizeChange: 1},
ExpansionOptions: []GroupSizeChange{
{GroupName: "std-pool", SizeChange: 1},
{GroupName: "gpu-pool", SizeChange: 1},
},
ScaleUpStatus: ScaleUpStatusInfo{
PodsTriggeredScaleUp: []string{"extra-std-pod"},
},
}
simpleScaleUpTest(t, config, results)
}
func TestWillConsiderOnlyGpuPoolForPodWhichDoesRequiresGpu(t *testing.T) {
options := defaultOptions
options.MaxNodesTotal = 100
config := &ScaleTestConfig{
Nodes: []NodeConfig{
{Name: "gpu-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Ready: true, Group: "gpu-pool"},
{Name: "std-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: "std-pool"},
},
Pods: []PodConfig{
{Name: "gpu-pod-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Node: "gpu-node-1", ToleratesGpu: true},
{Name: "std-pod-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Node: "std-node-1", ToleratesGpu: false},
},
ExtraPods: []PodConfig{
{Name: "extra-gpu-pod", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Node: "", ToleratesGpu: true},
},
ExpansionOptionToChoose: GroupSizeChange{GroupName: "gpu-pool", SizeChange: 1},
Options: options,
}
results := &ScaleTestResults{
FinalOption: GroupSizeChange{GroupName: "gpu-pool", SizeChange: 1},
ExpansionOptions: []GroupSizeChange{
{GroupName: "gpu-pool", SizeChange: 1},
},
ScaleUpStatus: ScaleUpStatusInfo{
PodsTriggeredScaleUp: []string{"extra-gpu-pod"},
},
}
simpleScaleUpTest(t, config, results)
}
func TestWillConsiderAllPoolsWhichFitTwoPodsRequiringGpus(t *testing.T) {
options := defaultOptions
options.MaxNodesTotal = 100
config := &ScaleTestConfig{
Nodes: []NodeConfig{
{Name: "gpu-1-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Ready: true, Group: "gpu-1-pool"},
{Name: "gpu-2-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 2, Ready: true, Group: "gpu-2-pool"},
{Name: "gpu-4-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 4, Ready: true, Group: "gpu-4-pool"},
{Name: "std-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: "std-pool"},
},
Pods: []PodConfig{
{Name: "gpu-pod-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Node: "gpu-1-node-1", ToleratesGpu: true},
{Name: "gpu-pod-2", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 2, Node: "gpu-2-node-1", ToleratesGpu: true},
{Name: "gpu-pod-3", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 4, Node: "gpu-4-node-1", ToleratesGpu: true},
{Name: "std-pod-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Node: "std-node-1", ToleratesGpu: false},
},
ExtraPods: []PodConfig{
{Name: "extra-gpu-pod-1", Cpu: 1, Memory: 1 * utils.MiB, Gpu: 1, Node: "", ToleratesGpu: true}, // CPU and mem negligible
{Name: "extra-gpu-pod-2", Cpu: 1, Memory: 1 * utils.MiB, Gpu: 1, Node: "", ToleratesGpu: true}, // CPU and mem negligible
{Name: "extra-gpu-pod-3", Cpu: 1, Memory: 1 * utils.MiB, Gpu: 1, Node: "", ToleratesGpu: true}, // CPU and mem negligible
},
ExpansionOptionToChoose: GroupSizeChange{GroupName: "gpu-1-pool", SizeChange: 3},
Options: options,
}
results := &ScaleTestResults{
FinalOption: GroupSizeChange{GroupName: "gpu-1-pool", SizeChange: 3},
ExpansionOptions: []GroupSizeChange{
{GroupName: "gpu-1-pool", SizeChange: 3},
{GroupName: "gpu-2-pool", SizeChange: 2},
{GroupName: "gpu-4-pool", SizeChange: 1},
},
ScaleUpStatus: ScaleUpStatusInfo{
PodsTriggeredScaleUp: []string{"extra-gpu-pod-1", "extra-gpu-pod-2", "extra-gpu-pod-3"},
},
}
simpleScaleUpTest(t, config, results)
}
// No scale up scenarios.
func TestNoScaleUpMaxCoresLimitHit(t *testing.T) {
options := defaultOptions
options.MaxCoresTotal = 7
options.MaxMemoryTotal = 1150
config := &ScaleTestConfig{
Nodes: []NodeConfig{
{Name: "n1", Cpu: 2000, Memory: 100, Gpu: 0, Ready: true, Group: "ng1"},
{Name: "n2", Cpu: 4000, Memory: 1000, Gpu: 0, Ready: true, Group: "ng2"},
},
Pods: []PodConfig{
{Name: "p1", Cpu: 1000, Memory: 0, Gpu: 0, Node: "n1", ToleratesGpu: false},
{Name: "p2", Cpu: 3000, Memory: 0, Gpu: 0, Node: "n2", ToleratesGpu: false},
},
ExtraPods: []PodConfig{
{Name: "p-new-1", Cpu: 2000, Memory: 0, Gpu: 0, Node: "", ToleratesGpu: false},
{Name: "p-new-2", Cpu: 2000, Memory: 0, Gpu: 0, Node: "", ToleratesGpu: false},
},
Options: options,
}
results := &ScaleTestResults{
NoScaleUpReason: "max cluster cpu, memory limit reached",
ScaleUpStatus: ScaleUpStatusInfo{
PodsRemainUnschedulable: []string{"p-new-1", "p-new-2"},
},
}
simpleNoScaleUpTest(t, config, results)
}
// To implement expander.Strategy, BestOption method must have a struct receiver.
// This prevents it from modifying fields of reportingStrategy, so we need a thin
// pointer wrapper for mutable parts.
type expanderResults struct {
inputOptions []GroupSizeChange
}
type reportingStrategy struct {
initialNodeConfigs []NodeConfig
optionToChoose GroupSizeChange
results *expanderResults
t *testing.T
}
func (r reportingStrategy) BestOption(options []expander.Option, nodeInfo map[string]*schedulerframework.NodeInfo) *expander.Option {
r.results.inputOptions = expanderOptionsToGroupSizeChanges(options)
for _, option := range options {
GroupSizeChange := expanderOptionToGroupSizeChange(option)
if GroupSizeChange == r.optionToChoose {
return &option
}
}
assert.Fail(r.t, "did not find expansionOptionToChoose %s", r.optionToChoose)
return nil
}
func expanderOptionsToGroupSizeChanges(options []expander.Option) []GroupSizeChange {
groupSizeChanges := make([]GroupSizeChange, 0, len(options))
for _, option := range options {
GroupSizeChange := expanderOptionToGroupSizeChange(option)
groupSizeChanges = append(groupSizeChanges, GroupSizeChange)
}
return groupSizeChanges
}
func expanderOptionToGroupSizeChange(option expander.Option) GroupSizeChange {
groupName := option.NodeGroup.Id()
groupSizeIncrement := option.NodeCount
scaleUpOption := GroupSizeChange{GroupName: groupName, SizeChange: groupSizeIncrement}
return scaleUpOption
}
func runSimpleScaleUpTest(t *testing.T, config *ScaleTestConfig) *ScaleTestResults {
expandedGroups := make(chan GroupSizeChange, 10)
now := time.Now()
groups := make(map[string][]*apiv1.Node)
nodes := make([]*apiv1.Node, 0, len(config.Nodes))
for _, n := range config.Nodes {
node := BuildTestNode(n.Name, n.Cpu, n.Memory)
if n.Gpu > 0 {
AddGpusToNode(node, n.Gpu)
}
SetNodeReadyState(node, n.Ready, now.Add(-2*time.Minute))
nodes = append(nodes, node)
if n.Group != "" {
groups[n.Group] = append(groups[n.Group], node)
}
}
pods := make([]*apiv1.Pod, 0, len(config.Pods))
for _, p := range config.Pods {
pod := buildTestPod(p)
pods = append(pods, pod)
}
podLister := kube_util.NewTestPodLister(pods)
listers := kube_util.NewListerRegistry(nil, nil, podLister, nil, nil, nil, nil, nil, nil, nil)
provider := testprovider.NewTestCloudProvider(func(nodeGroup string, increase int) error {
expandedGroups <- GroupSizeChange{GroupName: nodeGroup, SizeChange: increase}
return nil
}, nil)
for name, nodesInGroup := range groups {
provider.AddNodeGroup(name, 1, 10, len(nodesInGroup))
for _, n := range nodesInGroup {
provider.AddNode(name, n)
}
}
resourceLimiter := cloudprovider.NewResourceLimiter(
map[string]int64{cloudprovider.ResourceNameCores: config.Options.MinCoresTotal, cloudprovider.ResourceNameMemory: config.Options.MinMemoryTotal},
map[string]int64{cloudprovider.ResourceNameCores: config.Options.MaxCoresTotal, cloudprovider.ResourceNameMemory: config.Options.MaxMemoryTotal})
provider.SetResourceLimiter(resourceLimiter)
assert.NotNil(t, provider)
// Create context with non-random expander strategy.
context, err := NewScaleTestAutoscalingContext(config.Options, &fake.Clientset{}, listers, provider, nil, nil)
assert.NoError(t, err)
expander := reportingStrategy{
initialNodeConfigs: config.Nodes,
optionToChoose: config.ExpansionOptionToChoose,
results: &expanderResults{},
t: t,
}
context.ExpanderStrategy = expander
nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&context, nodes, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute))
clusterState.UpdateNodes(nodes, nodeInfos, time.Now())
extraPods := make([]*apiv1.Pod, 0, len(config.ExtraPods))
for _, p := range config.ExtraPods {
pod := buildTestPod(p)
extraPods = append(extraPods, pod)
}
processors := NewTestProcessors(&context)
suOrchestrator := New()
suOrchestrator.Initialize(&context, processors, clusterState, taints.TaintConfig{})
scaleUpStatus, err := suOrchestrator.ScaleUp(extraPods, nodes, []*appsv1.DaemonSet{}, nodeInfos)
processors.ScaleUpStatusProcessor.Process(&context, scaleUpStatus)
assert.NoError(t, err)
expandedGroup := getGroupSizeChangeFromChan(expandedGroups)
var expandedGroupStruct GroupSizeChange
if expandedGroup != nil {
expandedGroupStruct = *expandedGroup
}
events := []string{}
for eventsLeft := true; eventsLeft; {
select {
case event := <-context.Recorder.(*kube_record.FakeRecorder).Events:
events = append(events, event)
default:
eventsLeft = false
}
}
return &ScaleTestResults{
ExpansionOptions: expander.results.inputOptions,
FinalOption: expandedGroupStruct,
ScaleUpStatus: simplifyScaleUpStatus(scaleUpStatus),
Events: events,
}
}
func simpleNoScaleUpTest(t *testing.T, config *ScaleTestConfig, expectedResults *ScaleTestResults) {
results := runSimpleScaleUpTest(t, config)
assert.Equal(t, GroupSizeChange{}, results.FinalOption)
assert.False(t, results.ScaleUpStatus.WasSuccessful())
noScaleUpEventSeen := false
for _, event := range results.Events {
if strings.Contains(event, "NotTriggerScaleUp") {
if strings.Contains(event, expectedResults.NoScaleUpReason) {
noScaleUpEventSeen = true
} else {
// Surprisingly useful for debugging.
fmt.Println("Event:", event)
}
}
assert.NotRegexp(t, regexp.MustCompile("TriggeredScaleUp"), event)
}
assert.True(t, noScaleUpEventSeen)
assert.ElementsMatch(t, results.ScaleUpStatus.PodsTriggeredScaleUp, expectedResults.ScaleUpStatus.PodsTriggeredScaleUp,
"actual and expected triggering pods should be the same")
assert.ElementsMatch(t, results.ScaleUpStatus.PodsRemainUnschedulable, expectedResults.ScaleUpStatus.PodsRemainUnschedulable,
"actual and expected remaining pods should be the same")
assert.ElementsMatch(t, results.ScaleUpStatus.PodsAwaitEvaluation, expectedResults.ScaleUpStatus.PodsAwaitEvaluation,
"actual and expected awaiting evaluation pods should be the same")
}
func simpleScaleUpTest(t *testing.T, config *ScaleTestConfig, expectedResults *ScaleTestResults) {
results := runSimpleScaleUpTest(t, config)
assert.NotNil(t, results.FinalOption, "Expected scale up event")
assert.Equal(t, expectedResults.FinalOption, results.FinalOption)
assert.True(t, results.ScaleUpStatus.WasSuccessful())
nodeEventSeen := false
for _, event := range results.Events {
if strings.Contains(event, "TriggeredScaleUp") && strings.Contains(event, expectedResults.FinalOption.GroupName) {
nodeEventSeen = true
}
if len(expectedResults.ScaleUpStatus.PodsRemainUnschedulable) == 0 {
assert.NotRegexp(t, regexp.MustCompile("NotTriggerScaleUp"), event)
}
}
assert.True(t, nodeEventSeen)
if len(expectedResults.ExpansionOptions) > 0 {
// Empty ExpansionOptions means we do not want to do any assertions
// on contents of actual scaleUp options
// Check that option to choose is part of expected options.
assert.Contains(t, expectedResults.ExpansionOptions, config.ExpansionOptionToChoose, "final expected expansion option must be in expected expansion options")
assert.Contains(t, results.ExpansionOptions, config.ExpansionOptionToChoose, "final expected expansion option must be in expected expansion options")
assert.ElementsMatch(t, results.ExpansionOptions, expectedResults.ExpansionOptions,
"actual and expected expansion options should be the same")
}
assert.ElementsMatch(t, results.ScaleUpStatus.PodsTriggeredScaleUp, expectedResults.ScaleUpStatus.PodsTriggeredScaleUp,
"actual and expected triggering pods should be the same")
assert.ElementsMatch(t, results.ScaleUpStatus.PodsRemainUnschedulable, expectedResults.ScaleUpStatus.PodsRemainUnschedulable,
"actual and expected remaining pods should be the same")
assert.ElementsMatch(t, results.ScaleUpStatus.PodsAwaitEvaluation, expectedResults.ScaleUpStatus.PodsAwaitEvaluation,
"actual and expected awaiting evaluation pods should be the same")
}
func getGroupSizeChangeFromChan(c chan GroupSizeChange) *GroupSizeChange {
select {
case val := <-c:
return &val
case <-time.After(100 * time.Millisecond):
return nil
GroupTargetSizes: targetSizes,
ExpansionOptions: expander.LastInputOptions(),
}
}
@ -1293,7 +1142,7 @@ func TestCheckDeltaWithinLimits(t *testing.T) {
func TestAuthErrorHandling(t *testing.T) {
metrics.RegisterAll(false)
config := &GroupsScaleUpTestConfig{
config := &ScaleUpTestConfig{
Groups: []NodeGroupConfig{
{Name: "ng1", MaxSize: 2},
},
@ -1308,12 +1157,12 @@ func TestAuthErrorHandling(t *testing.T) {
},
Options: &defaultOptions,
}
results := scaleUpGroups(t, config)
results := runSimpleScaleUpTest(t, config)
expected := errors.NewAutoscalerError(
errors.AutoscalerErrorType("authError"),
"failed to increase node group size: auth error",
)
assert.Equal(t, expected, results.Error)
assert.Equal(t, expected, results.ScaleUpError)
assertLegacyRegistryEntry(t, "cluster_autoscaler_failed_scale_ups_total{reason=\"authError\"} 1")
}
@ -1325,7 +1174,6 @@ func assertLegacyRegistryEntry(t *testing.T, entry string) {
rr := httptest.NewRecorder()
handler := http.HandlerFunc(legacyregistry.Handler().ServeHTTP)
handler.ServeHTTP(rr, req)
// Check that the status code is what we expect.
if status := rr.Code; status != http.StatusOK {
t.Errorf("handler returned wrong status code: got %v want %v",

@ -22,10 +22,6 @@ import (
"testing"
"time"
"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/pdb"
"k8s.io/autoscaler/cluster-autoscaler/debuggingsnapshot"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
testcloudprovider "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/test"
"k8s.io/autoscaler/cluster-autoscaler/clusterstate/utils"
@ -33,7 +29,10 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/context"
"k8s.io/autoscaler/cluster-autoscaler/core/podlistprocessor"
"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/deletiontracker"
"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/pdb"
"k8s.io/autoscaler/cluster-autoscaler/debuggingsnapshot"
"k8s.io/autoscaler/cluster-autoscaler/estimator"
"k8s.io/autoscaler/cluster-autoscaler/expander"
"k8s.io/autoscaler/cluster-autoscaler/expander/random"
"k8s.io/autoscaler/cluster-autoscaler/metrics"
"k8s.io/autoscaler/cluster-autoscaler/processors"
@ -50,18 +49,17 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/processors/status"
"k8s.io/autoscaler/cluster-autoscaler/simulator/clustersnapshot"
"k8s.io/autoscaler/cluster-autoscaler/simulator/predicatechecker"
"k8s.io/autoscaler/cluster-autoscaler/utils/backoff"
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
"k8s.io/autoscaler/cluster-autoscaler/utils/labels"
"github.com/stretchr/testify/assert"
apiv1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
kube_client "k8s.io/client-go/kubernetes"
kube_record "k8s.io/client-go/tools/record"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/autoscaler/cluster-autoscaler/utils/backoff"
)
// NodeConfig is a node config used in tests
@ -110,28 +108,31 @@ type NodeGroupConfig struct {
MaxSize int
}
// GroupsScaleUpTestConfig represents a config of a scale test
type GroupsScaleUpTestConfig struct {
Groups []NodeGroupConfig
Nodes []NodeConfig
Pods []PodConfig
ExtraPods []PodConfig
OnScaleUp testcloudprovider.OnScaleUpFunc
Options *config.AutoscalingOptions
// ScaleUpTestConfig represents a config of a scale test
type ScaleUpTestConfig struct {
Groups []NodeGroupConfig
Nodes []NodeConfig
Pods []PodConfig
ExtraPods []PodConfig
OnScaleUp testcloudprovider.OnScaleUpFunc
ExpansionOptionToChoose *GroupSizeChange
Options *config.AutoscalingOptions
}
// GroupsScaleUpTestResult represents a node groups scale up result
type GroupsScaleUpTestResult struct {
Error errors.AutoscalerError
// ScaleUpTestResult represents a node groups scale up result
type ScaleUpTestResult struct {
ScaleUpError errors.AutoscalerError
ScaleUpStatus ScaleUpStatusInfo
GroupSizeChanges []GroupSizeChange
ExpansionOptions []GroupSizeChange
Events []string
TargetSizes map[string]int
GroupTargetSizes map[string]int
}
// ScaleTestResults contains results of a scale test
type ScaleTestResults struct {
ExpansionOptions []GroupSizeChange
GroupTargetSizes map[string]int
FinalOption GroupSizeChange
NoScaleUpReason string
FinalScaleDowns []string
@ -186,7 +187,8 @@ func NewTestProcessors(context *context.AutoscalingContext) *processors.Autoscal
func NewScaleTestAutoscalingContext(
options config.AutoscalingOptions, fakeClient kube_client.Interface,
listers kube_util.ListerRegistry, provider cloudprovider.CloudProvider,
processorCallbacks processor_callbacks.ProcessorCallbacks, debuggingSnapshotter debuggingsnapshot.DebuggingSnapshotter) (context.AutoscalingContext, error) {
processorCallbacks processor_callbacks.ProcessorCallbacks, debuggingSnapshotter debuggingsnapshot.DebuggingSnapshotter,
) (context.AutoscalingContext, error) {
// Not enough buffer space causes the test to hang without printing any logs.
// This is not useful.
fakeRecorder := kube_record.NewFakeRecorder(100)
@ -306,8 +308,8 @@ type MockAutoprovisioningNodeGroupListProcessor struct {
// Process extends the list of node groups
func (p *MockAutoprovisioningNodeGroupListProcessor) Process(context *context.AutoscalingContext, nodeGroups []cloudprovider.NodeGroup, nodeInfos map[string]*schedulerframework.NodeInfo,
unschedulablePods []*apiv1.Pod) ([]cloudprovider.NodeGroup, map[string]*schedulerframework.NodeInfo, error) {
unschedulablePods []*apiv1.Pod,
) ([]cloudprovider.NodeGroup, map[string]*schedulerframework.NodeInfo, error) {
machines, err := context.CloudProvider.GetAvailableMachineTypes()
assert.NoError(p.T, err)
@ -332,3 +334,67 @@ func NewBackoff() backoff.Backoff {
return backoff.NewIdBasedExponentialBackoff(5*time.Minute, /*InitialNodeGroupBackoffDuration*/
30*time.Minute /*MaxNodeGroupBackoffDuration*/, 3*time.Hour /*NodeGroupBackoffResetTimeout*/)
}
// To implement expander.Strategy, BestOption method must have a struct receiver.
// This prevents it from modifying fields of reportingStrategy, so we need a thin
// pointer wrapper for mutable parts.
type expanderResults struct {
inputOptions []GroupSizeChange
}
// MockReportingStrategy implements expander.Strategy
type MockReportingStrategy struct {
defaultStrategy expander.Strategy
optionToChoose *GroupSizeChange
t *testing.T
results *expanderResults
}
// NewMockRepotingStrategy creates an expander strategy with reporting and mocking capabilities.
func NewMockRepotingStrategy(t *testing.T, optionToChoose *GroupSizeChange) *MockReportingStrategy {
return &MockReportingStrategy{
defaultStrategy: random.NewStrategy(),
results: &expanderResults{},
optionToChoose: optionToChoose,
t: t,
}
}
// LastInputOptions provides access to the expansion options passed as input to the most recent strategy execution
func (r *MockReportingStrategy) LastInputOptions() []GroupSizeChange {
return r.results.inputOptions
}
// BestOption satisfies the Strategy interface. Picks the best option from those passed as an argument.
// When the optionToChoose parameter is defined, it is picked as the best one.
// Otherwise, a random option is used.
func (r *MockReportingStrategy) BestOption(options []expander.Option, nodeInfo map[string]*schedulerframework.NodeInfo) *expander.Option {
r.results.inputOptions = expanderOptionsToGroupSizeChanges(options)
if r.optionToChoose == nil {
return r.defaultStrategy.BestOption(options, nodeInfo)
}
for _, option := range options {
groupSizeChange := expanderOptionToGroupSizeChange(option)
if groupSizeChange == *r.optionToChoose {
return &option
}
}
assert.Fail(r.t, "did not find expansionOptionToChoose %+v", r.optionToChoose)
return nil
}
func expanderOptionsToGroupSizeChanges(options []expander.Option) []GroupSizeChange {
groupSizeChanges := make([]GroupSizeChange, 0, len(options))
for _, option := range options {
groupSizeChange := expanderOptionToGroupSizeChange(option)
groupSizeChanges = append(groupSizeChanges, groupSizeChange)
}
return groupSizeChanges
}
func expanderOptionToGroupSizeChange(option expander.Option) GroupSizeChange {
groupName := option.NodeGroup.Id()
groupSizeIncrement := option.NodeCount
scaleUpOption := GroupSizeChange{GroupName: groupName, SizeChange: groupSizeIncrement}
return scaleUpOption
}
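For orientation, a minimal sketch of how a test wires the mock expander (mirroring runSimpleScaleUpTest above; the autoscaling context and the rest of the test setup are assumed):
// Force the expander to choose a specific option and record what it was offered.
expander := NewMockRepotingStrategy(t, &GroupSizeChange{GroupName: "ng1", SizeChange: 2})
context.ExpanderStrategy = expander
// ... run the scale-up orchestrator ...
inputOptions := expander.LastInputOptions() // every option the orchestrator offered, as GroupSizeChange values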

@ -148,7 +148,7 @@ var (
maxTotalUnreadyPercentage = flag.Float64("max-total-unready-percentage", 45, "Maximum percentage of unready nodes in the cluster. After this is exceeded, CA halts operations")
okTotalUnreadyCount = flag.Int("ok-total-unready-count", 3, "Number of allowed unready nodes, irrespective of max-total-unready-percentage")
scaleUpFromZero = flag.Bool("scale-up-from-zero", true, "Should CA scale up when there are 0 ready nodes.")
parallelScaleUp = flag.Bool("parallel-scale-up", false, "Whether to allow parallel node groups scale up")
parallelScaleUp = flag.Bool("parallel-scale-up", false, "Whether to allow parallel node groups scale up. Experimental: may not work on some cloud providers, enable at your own risk.")
maxNodeProvisionTime = flag.Duration("max-node-provision-time", 15*time.Minute, "The default maximum time CA waits for node to be provisioned - the value can be overridden per node group")
maxPodEvictionTime = flag.Duration("max-pod-eviction-time", 2*time.Minute, "Maximum time CA tries to evict a pod before giving up")
nodeGroupsFlag = multiStringFlag(