Merge pull request #5731 from pmendelski/feature-parallel-scale-up

Feature: parallel scale up
Kubernetes Prow Robot 2023-05-11 06:27:44 -07:00 committed by GitHub
commit ac8253f366
7 changed files with 830 additions and 304 deletions

View File

@ -128,6 +128,8 @@ type AutoscalingOptions struct {
OkTotalUnreadyCount int
// ScaleUpFromZero defines if CA should scale up when there are 0 ready nodes.
ScaleUpFromZero bool
// ParallelScaleUp defines whether CA can scale up node groups in parallel.
ParallelScaleUp bool
// CloudConfig is the path to the cloud provider configuration file. Empty string for no configuration file.
CloudConfig string
// CloudProviderName sets the type of the cloud provider CA is about to run in. Allowed values: gce, aws
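A minimal sketch of consuming the new option from Go code, assuming only what this hunk shows (the AutoscalingOptions struct in the cluster-autoscaler config package and its ScaleUpFromZero and ParallelScaleUp fields); everything else is illustrative:

package main

import (
	"fmt"

	"k8s.io/autoscaler/cluster-autoscaler/config"
)

func main() {
	// Illustrative options literal; a real deployment builds this from its flags.
	opts := config.AutoscalingOptions{
		ScaleUpFromZero: true,
		// New in this PR: allow node groups to be scaled up concurrently.
		ParallelScaleUp: true,
	}
	fmt.Printf("parallel scale-up enabled: %v\n", opts.ParallelScaleUp)
}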

View File

@ -0,0 +1,248 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package orchestrator
import (
"fmt"
"sort"
"strings"
"sync"
"time"
apiv1 "k8s.io/api/core/v1"
"k8s.io/klog/v2"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/clusterstate"
"k8s.io/autoscaler/cluster-autoscaler/context"
"k8s.io/autoscaler/cluster-autoscaler/metrics"
"k8s.io/autoscaler/cluster-autoscaler/processors/nodegroupset"
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
)
// ScaleUpExecutor scales up node groups.
type scaleUpExecutor struct {
autoscalingContext *context.AutoscalingContext
clusterStateRegistry *clusterstate.ClusterStateRegistry
}
// newScaleUpExecutor returns a new instance of scale up executor.
func newScaleUpExecutor(
autoscalingContext *context.AutoscalingContext,
clusterStateRegistry *clusterstate.ClusterStateRegistry,
) *scaleUpExecutor {
return &scaleUpExecutor{
autoscalingContext: autoscalingContext,
clusterStateRegistry: clusterStateRegistry,
}
}
// ExecuteScaleUps executes the scale ups, based on the provided scale up infos and options.
// May scale up groups concurrently when the ParallelScaleUp autoscaling option is enabled.
// In case of issues it returns an error and the node groups that failed to scale up.
// If there were multiple concurrent errors, one combined error is returned.
func (e *scaleUpExecutor) ExecuteScaleUps(
scaleUpInfos []nodegroupset.ScaleUpInfo,
nodeInfos map[string]*schedulerframework.NodeInfo,
now time.Time,
) (errors.AutoscalerError, []cloudprovider.NodeGroup) {
options := e.autoscalingContext.AutoscalingOptions
if options.ParallelScaleUp {
return e.executeScaleUpsParallel(scaleUpInfos, nodeInfos, now)
}
return e.executeScaleUpsSync(scaleUpInfos, nodeInfos, now)
}
func (e *scaleUpExecutor) executeScaleUpsSync(
scaleUpInfos []nodegroupset.ScaleUpInfo,
nodeInfos map[string]*schedulerframework.NodeInfo,
now time.Time,
) (errors.AutoscalerError, []cloudprovider.NodeGroup) {
availableGPUTypes := e.autoscalingContext.CloudProvider.GetAvailableGPUTypes()
for _, scaleUpInfo := range scaleUpInfos {
nodeInfo, ok := nodeInfos[scaleUpInfo.Group.Id()]
if !ok {
klog.Errorf("ExecuteScaleUp: failed to get node info for node group %s", scaleUpInfo.Group.Id())
continue
}
if aErr := e.executeScaleUp(scaleUpInfo, nodeInfo, availableGPUTypes, now); aErr != nil {
return aErr, []cloudprovider.NodeGroup{scaleUpInfo.Group}
}
}
return nil, nil
}
func (e *scaleUpExecutor) executeScaleUpsParallel(
scaleUpInfos []nodegroupset.ScaleUpInfo,
nodeInfos map[string]*schedulerframework.NodeInfo,
now time.Time,
) (errors.AutoscalerError, []cloudprovider.NodeGroup) {
if err := checkUniqueNodeGroups(scaleUpInfos); err != nil {
return err, extractNodeGroups(scaleUpInfos)
}
type errResult struct {
err errors.AutoscalerError
info *nodegroupset.ScaleUpInfo
}
scaleUpsLen := len(scaleUpInfos)
errResults := make(chan errResult, scaleUpsLen)
var wg sync.WaitGroup
wg.Add(scaleUpsLen)
availableGPUTypes := e.autoscalingContext.CloudProvider.GetAvailableGPUTypes()
for _, scaleUpInfo := range scaleUpInfos {
go func(info nodegroupset.ScaleUpInfo) {
defer wg.Done()
nodeInfo, ok := nodeInfos[info.Group.Id()]
if !ok {
klog.Errorf("ExecuteScaleUp: failed to get node info for node group %s", info.Group.Id())
return
}
if aErr := e.executeScaleUp(info, nodeInfo, availableGPUTypes, now); aErr != nil {
errResults <- errResult{err: aErr, info: &info}
}
}(scaleUpInfo)
}
wg.Wait()
close(errResults)
var results []errResult
for err := range errResults {
results = append(results, err)
}
if len(results) > 0 {
failedNodeGroups := make([]cloudprovider.NodeGroup, len(results))
scaleUpErrors := make([]errors.AutoscalerError, len(results))
for i, result := range results {
failedNodeGroups[i] = result.info.Group
scaleUpErrors[i] = result.err
}
return combineConcurrentScaleUpErrors(scaleUpErrors), failedNodeGroups
}
return nil, nil
}
func (e *scaleUpExecutor) executeScaleUp(
info nodegroupset.ScaleUpInfo,
nodeInfo *schedulerframework.NodeInfo,
availableGPUTypes map[string]struct{},
now time.Time,
) errors.AutoscalerError {
gpuConfig := e.autoscalingContext.CloudProvider.GetNodeGpuConfig(nodeInfo.Node())
gpuResourceName, gpuType := gpu.GetGpuInfoForMetrics(gpuConfig, availableGPUTypes, nodeInfo.Node(), nil)
klog.V(0).Infof("Scale-up: setting group %s size to %d", info.Group.Id(), info.NewSize)
e.autoscalingContext.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaledUpGroup",
"Scale-up: setting group %s size to %d instead of %d (max: %d)", info.Group.Id(), info.NewSize, info.CurrentSize, info.MaxSize)
increase := info.NewSize - info.CurrentSize
if err := info.Group.IncreaseSize(increase); err != nil {
e.autoscalingContext.LogRecorder.Eventf(apiv1.EventTypeWarning, "FailedToScaleUpGroup", "Scale-up failed for group %s: %v", info.Group.Id(), err)
aerr := errors.ToAutoscalerError(errors.CloudProviderError, err).AddPrefix("failed to increase node group size: ")
e.clusterStateRegistry.RegisterFailedScaleUp(info.Group, metrics.FailedScaleUpReason(string(aerr.Type())), now)
return aerr
}
e.clusterStateRegistry.RegisterOrUpdateScaleUp(
info.Group,
increase,
time.Now())
metrics.RegisterScaleUp(increase, gpuResourceName, gpuType)
e.autoscalingContext.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaledUpGroup",
"Scale-up: group %s size set to %d instead of %d (max: %d)", info.Group.Id(), info.NewSize, info.CurrentSize, info.MaxSize)
return nil
}
func combineConcurrentScaleUpErrors(errs []errors.AutoscalerError) errors.AutoscalerError {
if len(errs) == 0 {
return nil
}
if len(errs) == 1 {
return errs[0]
}
uniqueMessages := make(map[string]bool)
uniqueTypes := make(map[errors.AutoscalerErrorType]bool)
for _, err := range errs {
uniqueTypes[err.Type()] = true
uniqueMessages[err.Error()] = true
}
if len(uniqueTypes) == 1 && len(uniqueMessages) == 1 {
return errs[0]
}
// Sort to stabilize the results and make log aggregation easier.
sort.Slice(errs, func(i, j int) bool {
errA := errs[i]
errB := errs[j]
if errA.Type() == errB.Type() {
return errs[i].Error() < errs[j].Error()
}
return errA.Type() < errB.Type()
})
firstErr := errs[0]
printErrorTypes := len(uniqueTypes) > 1
message := formatMessageFromConcurrentErrors(errs, printErrorTypes)
return errors.NewAutoscalerError(firstErr.Type(), message)
}
func formatMessageFromConcurrentErrors(errs []errors.AutoscalerError, printErrorTypes bool) string {
firstErr := errs[0]
var builder strings.Builder
builder.WriteString(firstErr.Error())
builder.WriteString(" ...and other concurrent errors: [")
formattedErrs := map[errors.AutoscalerError]bool{
firstErr: true,
}
for _, err := range errs {
if _, has := formattedErrs[err]; has {
continue
}
formattedErrs[err] = true
var message string
if printErrorTypes {
message = fmt.Sprintf("[%s] %s", err.Type(), err.Error())
} else {
message = err.Error()
}
if len(formattedErrs) > 2 {
builder.WriteString(", ")
}
builder.WriteString(fmt.Sprintf("%q", message))
}
builder.WriteString("]")
return builder.String()
}
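// For example, given a cloudProviderError "provider error" and an internalError
// "internal error", the errors are sorted by type, the combined error keeps the first
// type (cloudProviderError) and its message becomes:
//   provider error ...and other concurrent errors: ["[internalError] internal error"]
// This exact case is exercised by TestCombinedConcurrentScaleUpErrors in the next file.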
// Checks if all groups are scaled only once.
// Scaling one group multiple times concurrently may cause problems.
func checkUniqueNodeGroups(scaleUpInfos []nodegroupset.ScaleUpInfo) errors.AutoscalerError {
uniqueGroups := make(map[string]bool)
for _, info := range scaleUpInfos {
if uniqueGroups[info.Group.Id()] {
return errors.NewAutoscalerError(
errors.InternalError,
"assertion failure: detected group double scaling: %s", info.Group.Id(),
)
}
uniqueGroups[info.Group.Id()] = true
}
return nil
}
func extractNodeGroups(scaleUpInfos []nodegroupset.ScaleUpInfo) []cloudprovider.NodeGroup {
groups := make([]cloudprovider.NodeGroup, len(scaleUpInfos))
for i, info := range scaleUpInfos {
groups[i] = info.Group
}
return groups
}
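executeScaleUpsParallel above fans out one goroutine per node group and collects failures through an error channel buffered to the number of scale-ups, so a failing worker never blocks. A standalone sketch of the same fan-out/fan-in pattern, with an illustrative scaleOneGroup standing in for executeScaleUp:

package main

import (
	"fmt"
	"sync"
)

// scaleOneGroup stands in for executeScaleUp; it fails for one group to show error collection.
func scaleOneGroup(group string) error {
	if group == "ng2" {
		return fmt.Errorf("provider error for: %s", group)
	}
	return nil
}

func main() {
	groups := []string{"ng1", "ng2", "ng3"}
	// Buffer the channel to len(groups) so no goroutine blocks on send.
	errs := make(chan error, len(groups))
	var wg sync.WaitGroup
	wg.Add(len(groups))
	for _, g := range groups {
		go func(group string) {
			defer wg.Done()
			if err := scaleOneGroup(group); err != nil {
				errs <- err
			}
		}(g)
	}
	// Wait for all workers, then close the channel so the collection loop terminates.
	wg.Wait()
	close(errs)
	var failed []error
	for err := range errs {
		failed = append(failed, err)
	}
	fmt.Printf("%d of %d scale-ups failed: %v\n", len(failed), len(groups), failed)
}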

View File

@ -0,0 +1,127 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package orchestrator
import (
"testing"
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
"github.com/stretchr/testify/assert"
)
func TestCombinedConcurrentScaleUpErrors(t *testing.T) {
cloudProviderErr := errors.NewAutoscalerError(errors.CloudProviderError, "provider error")
internalErr := errors.NewAutoscalerError(errors.InternalError, "internal error")
testCases := []struct {
desc string
errors []errors.AutoscalerError
expectedErr errors.AutoscalerError
}{
{
desc: "no errors",
errors: []errors.AutoscalerError{},
expectedErr: nil,
},
{
desc: "single error",
errors: []errors.AutoscalerError{internalErr},
expectedErr: internalErr,
},
{
desc: "two duplicated errors",
errors: []errors.AutoscalerError{
internalErr,
internalErr,
},
expectedErr: internalErr,
},
{
desc: "two different errors",
errors: []errors.AutoscalerError{
cloudProviderErr,
internalErr,
},
expectedErr: errors.NewAutoscalerError(
errors.CloudProviderError,
"provider error ...and other concurrent errors: [\"[internalError] internal error\"]",
),
},
{
desc: "two different errors - reverse alphabetical order",
errors: []errors.AutoscalerError{
internalErr,
cloudProviderErr,
},
expectedErr: errors.NewAutoscalerError(
errors.CloudProviderError,
"provider error ...and other concurrent errors: [\"[internalError] internal error\"]",
),
},
{
desc: "errors with the same type and different messages",
errors: []errors.AutoscalerError{
errors.NewAutoscalerError(errors.InternalError, "A"),
errors.NewAutoscalerError(errors.InternalError, "B"),
errors.NewAutoscalerError(errors.InternalError, "C"),
},
expectedErr: errors.NewAutoscalerError(
errors.InternalError,
"A ...and other concurrent errors: [\"B\", \"C\"]"),
},
{
desc: "errors with the same type and some duplicated messages",
errors: []errors.AutoscalerError{
errors.NewAutoscalerError(errors.InternalError, "A"),
errors.NewAutoscalerError(errors.InternalError, "B"),
errors.NewAutoscalerError(errors.InternalError, "A"),
},
expectedErr: errors.NewAutoscalerError(
errors.InternalError,
"A ...and other concurrent errors: [\"B\"]"),
},
{
desc: "some duplicated errors",
errors: []errors.AutoscalerError{
errors.NewAutoscalerError(errors.CloudProviderError, "A"),
errors.NewAutoscalerError(errors.CloudProviderError, "A"),
errors.NewAutoscalerError(errors.CloudProviderError, "B"),
errors.NewAutoscalerError(errors.InternalError, "A"),
},
expectedErr: errors.NewAutoscalerError(
errors.CloudProviderError,
"A ...and other concurrent errors: [\"[cloudProviderError] B\", \"[internalError] A\"]"),
},
{
desc: "different errors with quotes in messages",
errors: []errors.AutoscalerError{
errors.NewAutoscalerError(errors.InternalError, "\"first\""),
errors.NewAutoscalerError(errors.InternalError, "\"second\""),
},
expectedErr: errors.NewAutoscalerError(
errors.InternalError,
"\"first\" ...and other concurrent errors: [\"\\\"second\\\"\"]"),
},
}
for _, testCase := range testCases {
t.Run(testCase.desc, func(t *testing.T) {
combinedErr := combineConcurrentScaleUpErrors(testCase.errors)
assert.Equal(t, testCase.expectedErr, combinedErr)
})
}
}

View File

@ -22,6 +22,9 @@ import (
appsv1 "k8s.io/api/apps/v1" appsv1 "k8s.io/api/apps/v1"
apiv1 "k8s.io/api/core/v1" apiv1 "k8s.io/api/core/v1"
"k8s.io/klog/v2"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider" "k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/clusterstate" "k8s.io/autoscaler/cluster-autoscaler/clusterstate"
"k8s.io/autoscaler/cluster-autoscaler/context" "k8s.io/autoscaler/cluster-autoscaler/context"
@ -36,11 +39,8 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/processors/nodegroupset" "k8s.io/autoscaler/cluster-autoscaler/processors/nodegroupset"
"k8s.io/autoscaler/cluster-autoscaler/processors/status" "k8s.io/autoscaler/cluster-autoscaler/processors/status"
"k8s.io/autoscaler/cluster-autoscaler/utils/errors" "k8s.io/autoscaler/cluster-autoscaler/utils/errors"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
"k8s.io/autoscaler/cluster-autoscaler/utils/klogx" "k8s.io/autoscaler/cluster-autoscaler/utils/klogx"
"k8s.io/autoscaler/cluster-autoscaler/utils/taints" "k8s.io/autoscaler/cluster-autoscaler/utils/taints"
"k8s.io/klog/v2"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
) )
// ScaleUpOrchestrator implements scaleup.Orchestrator interface. // ScaleUpOrchestrator implements scaleup.Orchestrator interface.
@ -49,6 +49,7 @@ type ScaleUpOrchestrator struct {
processors *ca_processors.AutoscalingProcessors
resourceManager *resource.Manager
clusterStateRegistry *clusterstate.ClusterStateRegistry
scaleUpExecutor *scaleUpExecutor
taintConfig taints.TaintConfig
initialized bool
}
@ -72,6 +73,7 @@ func (o *ScaleUpOrchestrator) Initialize(
o.clusterStateRegistry = clusterStateRegistry
o.taintConfig = taintConfig
o.resourceManager = resource.NewManager(processors.CustomResourcesProcessor)
o.scaleUpExecutor = newScaleUpExecutor(autoscalingContext, clusterStateRegistry)
o.initialized = true
}
@ -284,7 +286,7 @@ func (o *ScaleUpOrchestrator) ScaleUp(
}
if len(targetNodeGroups) > 1 {
var names = []string{}
names := []string{}
for _, ng := range targetNodeGroups {
names = append(names, ng.Id())
}
@ -300,11 +302,12 @@ func (o *ScaleUpOrchestrator) ScaleUp(
}
klog.V(1).Infof("Final scale-up plan: %v", scaleUpInfos)
if aErr, failedInfo := o.ExecuteScaleUps(scaleUpInfos, nodeInfos, now); aErr != nil {
aErr, failedNodeGroups := o.scaleUpExecutor.ExecuteScaleUps(scaleUpInfos, nodeInfos, now)
if aErr != nil {
return scaleUpError(
&status.ScaleUpStatus{
CreateNodeGroupResults: createNodeGroupResults,
FailedResizeNodeGroups: []cloudprovider.NodeGroup{failedInfo.Group},
FailedResizeNodeGroups: failedNodeGroups,
PodsTriggeredScaleUp: bestOption.Pods,
},
aErr,
@ -405,10 +408,11 @@ func (o *ScaleUpOrchestrator) ScaleUpToNodeGroupMinSize(
}
klog.V(1).Infof("ScaleUpToNodeGroupMinSize: final scale-up plan: %v", scaleUpInfos)
if aErr, failedInfo := o.ExecuteScaleUps(scaleUpInfos, nodeInfos, now); aErr != nil {
aErr, failedNodeGroups := o.scaleUpExecutor.ExecuteScaleUps(scaleUpInfos, nodeInfos, now)
if aErr != nil {
return scaleUpError(
&status.ScaleUpStatus{
FailedResizeNodeGroups: []cloudprovider.NodeGroup{failedInfo.Group},
FailedResizeNodeGroups: failedNodeGroups,
},
aErr,
)
@ -553,60 +557,11 @@ func (o *ScaleUpOrchestrator) GetCappedNewNodeCount(newNodeCount, currentNodeCou
return newNodeCount, nil
}
// ExecuteScaleUps executes the scale ups, based on the provided scale up infos.
// In case of issues returns an error and a scale up info which failed to execute.
func (o *ScaleUpOrchestrator) ExecuteScaleUps(
scaleUpInfos []nodegroupset.ScaleUpInfo,
nodeInfos map[string]*schedulerframework.NodeInfo,
now time.Time,
) (errors.AutoscalerError, *nodegroupset.ScaleUpInfo) {
availableGPUTypes := o.autoscalingContext.CloudProvider.GetAvailableGPUTypes()
for _, info := range scaleUpInfos {
nodeInfo, ok := nodeInfos[info.Group.Id()]
if !ok {
klog.Errorf("ExecuteScaleUp: failed to get node info for node group %s", info.Group.Id())
continue
}
gpuConfig := o.autoscalingContext.CloudProvider.GetNodeGpuConfig(nodeInfo.Node())
gpuResourceName, gpuType := gpu.GetGpuInfoForMetrics(gpuConfig, availableGPUTypes, nodeInfo.Node(), nil)
if aErr := o.executeScaleUp(info, gpuResourceName, gpuType, now); aErr != nil {
return aErr, &info
}
}
return nil, nil
}
func (o *ScaleUpOrchestrator) executeScaleUp(
info nodegroupset.ScaleUpInfo,
gpuResourceName, gpuType string,
now time.Time,
) errors.AutoscalerError {
klog.V(0).Infof("Scale-up: setting group %s size to %d", info.Group.Id(), info.NewSize)
o.autoscalingContext.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaledUpGroup",
"Scale-up: setting group %s size to %d instead of %d (max: %d)", info.Group.Id(), info.NewSize, info.CurrentSize, info.MaxSize)
increase := info.NewSize - info.CurrentSize
if err := info.Group.IncreaseSize(increase); err != nil {
o.autoscalingContext.LogRecorder.Eventf(apiv1.EventTypeWarning, "FailedToScaleUpGroup", "Scale-up failed for group %s: %v", info.Group.Id(), err)
aerr := errors.ToAutoscalerError(errors.CloudProviderError, err).AddPrefix("failed to increase node group size: ")
o.clusterStateRegistry.RegisterFailedScaleUp(info.Group, metrics.FailedScaleUpReason(string(aerr.Type())), now)
return aerr
}
o.clusterStateRegistry.RegisterOrUpdateScaleUp(
info.Group,
increase,
time.Now())
metrics.RegisterScaleUp(increase, gpuResourceName, gpuType)
o.autoscalingContext.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaledUpGroup",
"Scale-up: group %s size set to %d instead of %d (max: %d)", info.Group.Id(), info.NewSize, info.CurrentSize, info.MaxSize)
return nil
}
func filterNodeGroupsByPods(
groups []cloudprovider.NodeGroup,
podsRequiredToFit []*apiv1.Pod,
expansionOptions map[string]expander.Option,
) []cloudprovider.NodeGroup {
result := make([]cloudprovider.NodeGroup, 0)
for _, group := range groups {

View File

@ -22,11 +22,14 @@ import (
"net/http/httptest" "net/http/httptest"
"regexp" "regexp"
"strings" "strings"
"sync/atomic"
"testing" "testing"
"time" "time"
kube_record "k8s.io/client-go/tools/record"
"k8s.io/component-base/metrics/legacyregistry"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider" "k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
mockprovider "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/mocks"
testprovider "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/test" testprovider "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/test"
"k8s.io/autoscaler/cluster-autoscaler/clusterstate" "k8s.io/autoscaler/cluster-autoscaler/clusterstate"
"k8s.io/autoscaler/cluster-autoscaler/config" "k8s.io/autoscaler/cluster-autoscaler/config"
@ -35,7 +38,6 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/core/utils" "k8s.io/autoscaler/cluster-autoscaler/core/utils"
"k8s.io/autoscaler/cluster-autoscaler/estimator" "k8s.io/autoscaler/cluster-autoscaler/estimator"
"k8s.io/autoscaler/cluster-autoscaler/metrics" "k8s.io/autoscaler/cluster-autoscaler/metrics"
"k8s.io/autoscaler/cluster-autoscaler/processors/nodegroupset"
"k8s.io/autoscaler/cluster-autoscaler/processors/nodeinfosprovider" "k8s.io/autoscaler/cluster-autoscaler/processors/nodeinfosprovider"
"k8s.io/autoscaler/cluster-autoscaler/processors/status" "k8s.io/autoscaler/cluster-autoscaler/processors/status"
"k8s.io/autoscaler/cluster-autoscaler/utils/errors" "k8s.io/autoscaler/cluster-autoscaler/utils/errors"
@ -43,8 +45,6 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/utils/taints" "k8s.io/autoscaler/cluster-autoscaler/utils/taints"
. "k8s.io/autoscaler/cluster-autoscaler/utils/test" . "k8s.io/autoscaler/cluster-autoscaler/utils/test"
"k8s.io/autoscaler/cluster-autoscaler/utils/units" "k8s.io/autoscaler/cluster-autoscaler/utils/units"
kube_record "k8s.io/client-go/tools/record"
"k8s.io/component-base/metrics/legacyregistry"
appsv1 "k8s.io/api/apps/v1" appsv1 "k8s.io/api/apps/v1"
apiv1 "k8s.io/api/core/v1" apiv1 "k8s.io/api/core/v1"
@ -52,7 +52,6 @@ import (
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework" schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"k8s.io/autoscaler/cluster-autoscaler/expander"
) )
var defaultOptions = config.AutoscalingOptions{ var defaultOptions = config.AutoscalingOptions{
@ -65,7 +64,7 @@ var defaultOptions = config.AutoscalingOptions{
// Scale up scenarios.
func TestScaleUpOK(t *testing.T) {
config := &ScaleTestConfig{
config := &ScaleUpTestConfig{
Nodes: []NodeConfig{
{Name: "n1", Cpu: 100, Memory: 100, Gpu: 0, Ready: true, Group: "ng1"},
{Name: "n2", Cpu: 1000, Memory: 1000, Gpu: 0, Ready: true, Group: "ng2"},
@ -77,8 +76,7 @@ func TestScaleUpOK(t *testing.T) {
ExtraPods: []PodConfig{
{Name: "p-new", Cpu: 500, Memory: 0, Gpu: 0, Node: "", ToleratesGpu: false},
},
Options: defaultOptions,
ExpansionOptionToChoose: GroupSizeChange{GroupName: "ng2", SizeChange: 1},
ExpansionOptionToChoose: &GroupSizeChange{GroupName: "ng2", SizeChange: 1},
}
expectedResults := &ScaleTestResults{
FinalOption: GroupSizeChange{GroupName: "ng2", SizeChange: 1},
@ -92,7 +90,7 @@ func TestScaleUpOK(t *testing.T) {
// There are triggering, remaining & awaiting pods. // There are triggering, remaining & awaiting pods.
func TestMixedScaleUp(t *testing.T) { func TestMixedScaleUp(t *testing.T) {
config := &ScaleTestConfig{ config := &ScaleUpTestConfig{
Nodes: []NodeConfig{ Nodes: []NodeConfig{
{Name: "n1", Cpu: 100, Memory: 1000, Gpu: 0, Ready: true, Group: "ng1"}, {Name: "n1", Cpu: 100, Memory: 1000, Gpu: 0, Ready: true, Group: "ng1"},
{Name: "n2", Cpu: 1000, Memory: 100, Gpu: 0, Ready: true, Group: "ng2"}, {Name: "n2", Cpu: 1000, Memory: 100, Gpu: 0, Ready: true, Group: "ng2"},
@ -109,8 +107,7 @@ func TestMixedScaleUp(t *testing.T) {
// can only be scheduled on ng1 // can only be scheduled on ng1
{Name: "awaiting", Cpu: 0, Memory: 200, Gpu: 0, Node: "", ToleratesGpu: false}, {Name: "awaiting", Cpu: 0, Memory: 200, Gpu: 0, Node: "", ToleratesGpu: false},
}, },
Options: defaultOptions, ExpansionOptionToChoose: &GroupSizeChange{GroupName: "ng2", SizeChange: 1},
ExpansionOptionToChoose: GroupSizeChange{GroupName: "ng2", SizeChange: 1},
} }
expectedResults := &ScaleTestResults{ expectedResults := &ScaleTestResults{
FinalOption: GroupSizeChange{GroupName: "ng2", SizeChange: 1}, FinalOption: GroupSizeChange{GroupName: "ng2", SizeChange: 1},
@ -127,7 +124,7 @@ func TestMixedScaleUp(t *testing.T) {
func TestScaleUpMaxCoresLimitHit(t *testing.T) {
options := defaultOptions
options.MaxCoresTotal = 9
config := &ScaleTestConfig{
config := &ScaleUpTestConfig{
Nodes: []NodeConfig{
{Name: "n1", Cpu: 2000, Memory: 100, Gpu: 0, Ready: true, Group: "ng1"},
{Name: "n2", Cpu: 4000, Memory: 1000, Gpu: 0, Ready: true, Group: "ng2"},
@ -140,8 +137,8 @@ func TestScaleUpMaxCoresLimitHit(t *testing.T) {
{Name: "p-new-1", Cpu: 2000, Memory: 0, Gpu: 0, Node: "", ToleratesGpu: false},
{Name: "p-new-2", Cpu: 2000, Memory: 0, Gpu: 0, Node: "", ToleratesGpu: false},
},
ExpansionOptionToChoose: GroupSizeChange{GroupName: "ng1", SizeChange: 2},
Options: options,
ExpansionOptionToChoose: &GroupSizeChange{GroupName: "ng1", SizeChange: 2},
Options: &options,
}
results := &ScaleTestResults{
FinalOption: GroupSizeChange{GroupName: "ng1", SizeChange: 1},
@ -156,7 +153,7 @@ func TestScaleUpMaxCoresLimitHit(t *testing.T) {
func TestScaleUpMaxCoresLimitHitWithNotAutoscaledGroup(t *testing.T) {
options := defaultOptions
options.MaxCoresTotal = 9
config := &ScaleTestConfig{
config := &ScaleUpTestConfig{
Nodes: []NodeConfig{
{Name: "n1", Cpu: 2000, Memory: 100, Gpu: 0, Ready: true, Group: "ng1"},
{Name: "n2", Cpu: 4000, Memory: 1000, Gpu: 0, Ready: true, Group: ""},
@ -169,8 +166,8 @@ func TestScaleUpMaxCoresLimitHitWithNotAutoscaledGroup(t *testing.T) {
{Name: "p-new-1", Cpu: 2000, Memory: 0, Gpu: 0, Node: "", ToleratesGpu: false},
{Name: "p-new-2", Cpu: 2000, Memory: 0, Gpu: 0, Node: "", ToleratesGpu: false},
},
ExpansionOptionToChoose: GroupSizeChange{GroupName: "ng1", SizeChange: 2},
Options: options,
ExpansionOptionToChoose: &GroupSizeChange{GroupName: "ng1", SizeChange: 2},
Options: &options,
}
results := &ScaleTestResults{
FinalOption: GroupSizeChange{GroupName: "ng1", SizeChange: 1},
@ -185,7 +182,7 @@ func TestScaleUpMaxCoresLimitHitWithNotAutoscaledGroup(t *testing.T) {
func TestScaleUpMaxMemoryLimitHit(t *testing.T) {
options := defaultOptions
options.MaxMemoryTotal = 1300 * utils.MiB
config := &ScaleTestConfig{
config := &ScaleUpTestConfig{
Nodes: []NodeConfig{
{Name: "n1", Cpu: 2000, Memory: 100 * utils.MiB, Gpu: 0, Ready: true, Group: "ng1"},
{Name: "n2", Cpu: 4000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: "ng2"},
@ -199,8 +196,8 @@ func TestScaleUpMaxMemoryLimitHit(t *testing.T) {
{Name: "p-new-2", Cpu: 2000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false},
{Name: "p-new-3", Cpu: 2000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false},
},
ExpansionOptionToChoose: GroupSizeChange{GroupName: "ng1", SizeChange: 3},
Options: options,
ExpansionOptionToChoose: &GroupSizeChange{GroupName: "ng1", SizeChange: 3},
Options: &options,
}
results := &ScaleTestResults{
FinalOption: GroupSizeChange{GroupName: "ng1", SizeChange: 2},
@ -215,7 +212,7 @@ func TestScaleUpMaxMemoryLimitHit(t *testing.T) {
func TestScaleUpMaxMemoryLimitHitWithNotAutoscaledGroup(t *testing.T) {
options := defaultOptions
options.MaxMemoryTotal = 1300 * utils.MiB
config := &ScaleTestConfig{
config := &ScaleUpTestConfig{
Nodes: []NodeConfig{
{Name: "n1", Cpu: 2000, Memory: 100 * utils.MiB, Gpu: 0, Ready: true, Group: "ng1"},
{Name: "n2", Cpu: 4000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: ""},
@ -229,8 +226,8 @@ func TestScaleUpMaxMemoryLimitHitWithNotAutoscaledGroup(t *testing.T) {
{Name: "p-new-2", Cpu: 2000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false},
{Name: "p-new-3", Cpu: 2000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false},
},
ExpansionOptionToChoose: GroupSizeChange{GroupName: "ng1", SizeChange: 3},
Options: options,
ExpansionOptionToChoose: &GroupSizeChange{GroupName: "ng1", SizeChange: 3},
Options: &options,
}
results := &ScaleTestResults{
FinalOption: GroupSizeChange{GroupName: "ng1", SizeChange: 2},
@ -242,10 +239,140 @@ func TestScaleUpMaxMemoryLimitHitWithNotAutoscaledGroup(t *testing.T) {
simpleScaleUpTest(t, config, results)
}
func TestScaleUpTwoGroups(t *testing.T) {
options := defaultOptions
options.BalanceSimilarNodeGroups = true
options.ParallelScaleUp = true
config := &ScaleUpTestConfig{
Nodes: []NodeConfig{
{Name: "ng1-n1", Cpu: 1500, Memory: 1000 * utils.MiB, Ready: true, Group: "ng1"},
{Name: "ng2-n1", Cpu: 1500, Memory: 1000 * utils.MiB, Ready: true, Group: "ng2"},
},
Pods: []PodConfig{
{Name: "p1", Cpu: 1400, Node: "ng1-n1"},
{Name: "p2", Cpu: 1400, Node: "ng2-n1"},
},
ExtraPods: []PodConfig{
{Name: "p3", Cpu: 1400},
{Name: "p4", Cpu: 1400},
},
Options: &options,
}
testCases := []struct {
desc string
parallel bool
}{
{
desc: "synchronous scale up",
parallel: false,
},
{
desc: "parallel scale up",
parallel: true,
},
}
for _, tc := range testCases {
t.Run(tc.desc, func(t *testing.T) {
config.Options.ParallelScaleUp = tc.parallel
result := runSimpleScaleUpTest(t, config)
assert.True(t, result.ScaleUpStatus.WasSuccessful())
assert.Nil(t, result.ScaleUpError)
assert.Equal(t, result.GroupTargetSizes, map[string]int{
"ng1": 2,
"ng2": 2,
})
assert.ElementsMatch(t, result.ScaleUpStatus.PodsTriggeredScaleUp, []string{"p3", "p4"})
})
}
}
func TestCloudProviderFailingToScaleUpGroups(t *testing.T) {
options := defaultOptions
options.BalanceSimilarNodeGroups = true
config := &ScaleUpTestConfig{
Groups: []NodeGroupConfig{
{Name: "ng1", MaxSize: 2},
{Name: "ng2", MaxSize: 2},
},
Nodes: []NodeConfig{
{Name: "ng1-n1", Cpu: 1500, Memory: 1000 * utils.MiB, Ready: true, Group: "ng1"},
{Name: "ng2-n1", Cpu: 1500, Memory: 1000 * utils.MiB, Ready: true, Group: "ng2"},
},
Pods: []PodConfig{
{Name: "p1", Cpu: 1400, Node: "ng1-n1"},
{Name: "p2", Cpu: 1400, Node: "ng2-n1"},
},
ExtraPods: []PodConfig{
{Name: "p3", Cpu: 1400},
{Name: "p4", Cpu: 1400},
},
Options: &options,
}
failAlwaysScaleUp := func(group string, i int) error {
return fmt.Errorf("provider error for: %s", group)
}
failOnceScaleUp := func() testprovider.OnScaleUpFunc {
var executed atomic.Bool
return func(group string, _ int) error {
if !executed.Swap(true) {
return fmt.Errorf("provider error for: %s", group)
}
return nil
}
}
testCases := []struct {
desc string
parallel bool
onScaleUp testprovider.OnScaleUpFunc
expectConcurrentErrors bool
expectedTotalTargetSizes int
}{
{
desc: "synchronous scale up - two failures",
parallel: false,
onScaleUp: failAlwaysScaleUp,
expectConcurrentErrors: false,
expectedTotalTargetSizes: 3, // first error stops scale up process
},
{
desc: "parallel scale up - two failures",
parallel: true,
onScaleUp: failAlwaysScaleUp,
expectConcurrentErrors: true,
expectedTotalTargetSizes: 4,
},
{
desc: "synchronous scale up - one failure",
parallel: false,
onScaleUp: failOnceScaleUp(),
expectConcurrentErrors: false,
expectedTotalTargetSizes: 3,
},
{
desc: "parallel scale up - one failure",
parallel: true,
onScaleUp: failOnceScaleUp(),
expectConcurrentErrors: false,
expectedTotalTargetSizes: 4,
},
}
for _, tc := range testCases {
t.Run(tc.desc, func(t *testing.T) {
config.Options.ParallelScaleUp = tc.parallel
config.OnScaleUp = tc.onScaleUp
result := runSimpleScaleUpTest(t, config)
assert.False(t, result.ScaleUpStatus.WasSuccessful())
assert.Equal(t, errors.CloudProviderError, result.ScaleUpError.Type())
assert.Equal(t, tc.expectedTotalTargetSizes, result.GroupTargetSizes["ng1"]+result.GroupTargetSizes["ng2"])
assert.Equal(t, tc.expectConcurrentErrors, strings.Contains(result.ScaleUpError.Error(), "...and other concurrent errors"))
})
}
}
func TestScaleUpCapToMaxTotalNodesLimit(t *testing.T) {
options := defaultOptions
options.MaxNodesTotal = 3
config := &ScaleTestConfig{
config := &ScaleUpTestConfig{
Nodes: []NodeConfig{
{Name: "n1", Cpu: 2000, Memory: 100 * utils.MiB, Gpu: 0, Ready: true, Group: "ng1"},
{Name: "n2", Cpu: 4000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: "ng2"},
@ -259,8 +386,8 @@ func TestScaleUpCapToMaxTotalNodesLimit(t *testing.T) {
{Name: "p-new-2", Cpu: 4000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false},
{Name: "p-new-3", Cpu: 4000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false},
},
ExpansionOptionToChoose: GroupSizeChange{GroupName: "ng2", SizeChange: 3},
Options: options,
ExpansionOptionToChoose: &GroupSizeChange{GroupName: "ng2", SizeChange: 3},
Options: &options,
}
results := &ScaleTestResults{
FinalOption: GroupSizeChange{GroupName: "ng2", SizeChange: 1},
@ -275,7 +402,7 @@ func TestScaleUpCapToMaxTotalNodesLimit(t *testing.T) {
func TestScaleUpCapToMaxTotalNodesLimitWithNotAutoscaledGroup(t *testing.T) {
options := defaultOptions
options.MaxNodesTotal = 3
config := &ScaleTestConfig{
config := &ScaleUpTestConfig{
Nodes: []NodeConfig{
{Name: "n1", Cpu: 2000, Memory: 100 * utils.MiB, Gpu: 0, Ready: true, Group: ""},
{Name: "n2", Cpu: 4000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: "ng2"},
@ -289,8 +416,8 @@ func TestScaleUpCapToMaxTotalNodesLimitWithNotAutoscaledGroup(t *testing.T) {
{Name: "p-new-2", Cpu: 4000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false},
{Name: "p-new-3", Cpu: 4000, Memory: 100 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: false},
},
ExpansionOptionToChoose: GroupSizeChange{GroupName: "ng2", SizeChange: 3},
Options: options,
ExpansionOptionToChoose: &GroupSizeChange{GroupName: "ng2", SizeChange: 3},
Options: &options,
}
results := &ScaleTestResults{
FinalOption: GroupSizeChange{GroupName: "ng2", SizeChange: 1},
@ -305,7 +432,7 @@ func TestScaleUpCapToMaxTotalNodesLimitWithNotAutoscaledGroup(t *testing.T) {
func TestWillConsiderGpuAndStandardPoolForPodWhichDoesNotRequireGpu(t *testing.T) {
options := defaultOptions
options.MaxNodesTotal = 100
config := &ScaleTestConfig{
config := &ScaleUpTestConfig{
Nodes: []NodeConfig{
{Name: "gpu-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Ready: true, Group: "gpu-pool"},
{Name: "std-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: "std-pool"},
@ -317,8 +444,8 @@ func TestWillConsiderGpuAndStandardPoolForPodWhichDoesNotRequireGpu(t *testing.T
ExtraPods: []PodConfig{
{Name: "extra-std-pod", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Node: "", ToleratesGpu: true},
},
ExpansionOptionToChoose: GroupSizeChange{GroupName: "std-pool", SizeChange: 1},
Options: options,
ExpansionOptionToChoose: &GroupSizeChange{GroupName: "std-pool", SizeChange: 1},
Options: &options,
}
results := &ScaleTestResults{
FinalOption: GroupSizeChange{GroupName: "std-pool", SizeChange: 1},
@ -337,7 +464,7 @@ func TestWillConsiderGpuAndStandardPoolForPodWhichDoesNotRequireGpu(t *testing.T
func TestWillConsiderOnlyGpuPoolForPodWhichDoesRequiresGpu(t *testing.T) {
options := defaultOptions
options.MaxNodesTotal = 100
config := &ScaleTestConfig{
config := &ScaleUpTestConfig{
Nodes: []NodeConfig{
{Name: "gpu-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Ready: true, Group: "gpu-pool"},
{Name: "std-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 0, Ready: true, Group: "std-pool"},
@ -349,8 +476,8 @@ func TestWillConsiderOnlyGpuPoolForPodWhichDoesRequiresGpu(t *testing.T) {
ExtraPods: []PodConfig{
{Name: "extra-gpu-pod", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Node: "", ToleratesGpu: true},
},
ExpansionOptionToChoose: GroupSizeChange{GroupName: "gpu-pool", SizeChange: 1},
Options: options,
ExpansionOptionToChoose: &GroupSizeChange{GroupName: "gpu-pool", SizeChange: 1},
Options: &options,
}
results := &ScaleTestResults{
FinalOption: GroupSizeChange{GroupName: "gpu-pool", SizeChange: 1},
@ -368,7 +495,7 @@ func TestWillConsiderOnlyGpuPoolForPodWhichDoesRequiresGpu(t *testing.T) {
func TestWillConsiderAllPoolsWhichFitTwoPodsRequiringGpus(t *testing.T) {
options := defaultOptions
options.MaxNodesTotal = 100
config := &ScaleTestConfig{
config := &ScaleUpTestConfig{
Nodes: []NodeConfig{
{Name: "gpu-1-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 1, Ready: true, Group: "gpu-1-pool"},
{Name: "gpu-2-node-1", Cpu: 2000, Memory: 1000 * utils.MiB, Gpu: 2, Ready: true, Group: "gpu-2-pool"},
@ -386,8 +513,8 @@ func TestWillConsiderAllPoolsWhichFitTwoPodsRequiringGpus(t *testing.T) {
{Name: "extra-gpu-pod-2", Cpu: 1, Memory: 1 * utils.MiB, Gpu: 1, Node: "", ToleratesGpu: true}, // CPU and mem negligible
{Name: "extra-gpu-pod-3", Cpu: 1, Memory: 1 * utils.MiB, Gpu: 1, Node: "", ToleratesGpu: true}, // CPU and mem negligible
},
ExpansionOptionToChoose: GroupSizeChange{GroupName: "gpu-1-pool", SizeChange: 3},
Options: options,
ExpansionOptionToChoose: &GroupSizeChange{GroupName: "gpu-1-pool", SizeChange: 3},
Options: &options,
}
results := &ScaleTestResults{
FinalOption: GroupSizeChange{GroupName: "gpu-1-pool", SizeChange: 3},
@ -409,7 +536,7 @@ func TestNoScaleUpMaxCoresLimitHit(t *testing.T) {
options := defaultOptions
options.MaxCoresTotal = 7
options.MaxMemoryTotal = 1150
config := &ScaleTestConfig{
config := &ScaleUpTestConfig{
Nodes: []NodeConfig{
{Name: "n1", Cpu: 2000, Memory: 100, Gpu: 0, Ready: true, Group: "ng1"},
{Name: "n2", Cpu: 4000, Memory: 1000, Gpu: 0, Ready: true, Group: "ng2"},
@ -422,7 +549,7 @@ func TestNoScaleUpMaxCoresLimitHit(t *testing.T) {
{Name: "p-new-1", Cpu: 2000, Memory: 0, Gpu: 0, Node: "", ToleratesGpu: false},
{Name: "p-new-2", Cpu: 2000, Memory: 0, Gpu: 0, Node: "", ToleratesGpu: false},
},
Options: options,
Options: &options,
}
results := &ScaleTestResults{
NoScaleUpReason: "max cluster cpu, memory limit reached",
@ -434,152 +561,46 @@ func TestNoScaleUpMaxCoresLimitHit(t *testing.T) {
simpleNoScaleUpTest(t, config, results)
}
// To implement expander.Strategy, BestOption method must have a struct receiver. func simpleScaleUpTest(t *testing.T, config *ScaleUpTestConfig, expectedResults *ScaleTestResults) {
// This prevents it from modifying fields of reportingStrategy, so we need a thin
// pointer wrapper for mutable parts.
type expanderResults struct {
inputOptions []GroupSizeChange
}
type reportingStrategy struct {
initialNodeConfigs []NodeConfig
optionToChoose GroupSizeChange
results *expanderResults
t *testing.T
}
func (r reportingStrategy) BestOption(options []expander.Option, nodeInfo map[string]*schedulerframework.NodeInfo) *expander.Option {
r.results.inputOptions = expanderOptionsToGroupSizeChanges(options)
for _, option := range options {
GroupSizeChange := expanderOptionToGroupSizeChange(option)
if GroupSizeChange == r.optionToChoose {
return &option
}
}
assert.Fail(r.t, "did not find expansionOptionToChoose %s", r.optionToChoose)
return nil
}
func expanderOptionsToGroupSizeChanges(options []expander.Option) []GroupSizeChange {
groupSizeChanges := make([]GroupSizeChange, 0, len(options))
for _, option := range options {
GroupSizeChange := expanderOptionToGroupSizeChange(option)
groupSizeChanges = append(groupSizeChanges, GroupSizeChange)
}
return groupSizeChanges
}
func expanderOptionToGroupSizeChange(option expander.Option) GroupSizeChange {
groupName := option.NodeGroup.Id()
groupSizeIncrement := option.NodeCount
scaleUpOption := GroupSizeChange{GroupName: groupName, SizeChange: groupSizeIncrement}
return scaleUpOption
}
func runSimpleScaleUpTest(t *testing.T, config *ScaleTestConfig) *ScaleTestResults {
expandedGroups := make(chan GroupSizeChange, 10)
now := time.Now()
groups := make(map[string][]*apiv1.Node)
nodes := make([]*apiv1.Node, 0, len(config.Nodes))
for _, n := range config.Nodes {
node := BuildTestNode(n.Name, n.Cpu, n.Memory)
if n.Gpu > 0 {
AddGpusToNode(node, n.Gpu)
}
SetNodeReadyState(node, n.Ready, now.Add(-2*time.Minute))
nodes = append(nodes, node)
if n.Group != "" {
groups[n.Group] = append(groups[n.Group], node)
}
}
pods := make([]*apiv1.Pod, 0, len(config.Pods))
for _, p := range config.Pods {
pod := buildTestPod(p)
pods = append(pods, pod)
}
podLister := kube_util.NewTestPodLister(pods)
listers := kube_util.NewListerRegistry(nil, nil, podLister, nil, nil, nil, nil, nil, nil, nil)
provider := testprovider.NewTestCloudProvider(func(nodeGroup string, increase int) error {
expandedGroups <- GroupSizeChange{GroupName: nodeGroup, SizeChange: increase}
return nil
}, nil)
for name, nodesInGroup := range groups {
provider.AddNodeGroup(name, 1, 10, len(nodesInGroup))
for _, n := range nodesInGroup {
provider.AddNode(name, n)
}
}
resourceLimiter := cloudprovider.NewResourceLimiter(
map[string]int64{cloudprovider.ResourceNameCores: config.Options.MinCoresTotal, cloudprovider.ResourceNameMemory: config.Options.MinMemoryTotal},
map[string]int64{cloudprovider.ResourceNameCores: config.Options.MaxCoresTotal, cloudprovider.ResourceNameMemory: config.Options.MaxMemoryTotal})
provider.SetResourceLimiter(resourceLimiter)
assert.NotNil(t, provider)
// Create context with non-random expander strategy.
context, err := NewScaleTestAutoscalingContext(config.Options, &fake.Clientset{}, listers, provider, nil, nil)
assert.NoError(t, err)
expander := reportingStrategy{
initialNodeConfigs: config.Nodes,
optionToChoose: config.ExpansionOptionToChoose,
results: &expanderResults{},
t: t,
}
context.ExpanderStrategy = expander
nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&context, nodes, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute))
clusterState.UpdateNodes(nodes, nodeInfos, time.Now())
extraPods := make([]*apiv1.Pod, 0, len(config.ExtraPods))
for _, p := range config.ExtraPods {
pod := buildTestPod(p)
extraPods = append(extraPods, pod)
}
processors := NewTestProcessors(&context)
suOrchestrator := New()
suOrchestrator.Initialize(&context, processors, clusterState, taints.TaintConfig{})
scaleUpStatus, err := suOrchestrator.ScaleUp(extraPods, nodes, []*appsv1.DaemonSet{}, nodeInfos)
processors.ScaleUpStatusProcessor.Process(&context, scaleUpStatus)
assert.NoError(t, err)
expandedGroup := getGroupSizeChangeFromChan(expandedGroups)
var expandedGroupStruct GroupSizeChange
if expandedGroup != nil {
expandedGroupStruct = *expandedGroup
}
events := []string{}
for eventsLeft := true; eventsLeft; {
select {
case event := <-context.Recorder.(*kube_record.FakeRecorder).Events:
events = append(events, event)
default:
eventsLeft = false
}
}
return &ScaleTestResults{
ExpansionOptions: expander.results.inputOptions,
FinalOption: expandedGroupStruct,
ScaleUpStatus: simplifyScaleUpStatus(scaleUpStatus),
Events: events,
}
}
func simpleNoScaleUpTest(t *testing.T, config *ScaleTestConfig, expectedResults *ScaleTestResults) {
results := runSimpleScaleUpTest(t, config) results := runSimpleScaleUpTest(t, config)
assert.NotNil(t, results.GroupSizeChanges[0], "Expected scale up event")
assert.Equal(t, expectedResults.FinalOption, results.GroupSizeChanges[0])
assert.True(t, results.ScaleUpStatus.WasSuccessful())
nodeEventSeen := false
for _, event := range results.Events {
if strings.Contains(event, "TriggeredScaleUp") && strings.Contains(event, expectedResults.FinalOption.GroupName) {
nodeEventSeen = true
}
if len(expectedResults.ScaleUpStatus.PodsRemainUnschedulable) == 0 {
assert.NotRegexp(t, regexp.MustCompile("NotTriggerScaleUp"), event)
}
}
assert.True(t, nodeEventSeen)
if len(expectedResults.ExpansionOptions) > 0 {
// Empty ExpansionOptions means we do not want to do any assertions
// on contents of actual scaleUp options
if config.ExpansionOptionToChoose != nil {
// Check that option to choose is part of expected options.
assert.Contains(t, expectedResults.ExpansionOptions, *config.ExpansionOptionToChoose, "final expected expansion option must be in expected expansion options")
assert.Contains(t, results.ExpansionOptions, *config.ExpansionOptionToChoose, "final expected expansion option must be in expected expansion options")
}
assert.ElementsMatch(t, results.ExpansionOptions, expectedResults.ExpansionOptions,
"actual and expected expansion options should be the same")
}
if expectedResults.GroupTargetSizes != nil {
assert.Equal(t, expectedResults.GroupTargetSizes, results.GroupTargetSizes)
}
assert.ElementsMatch(t, results.ScaleUpStatus.PodsTriggeredScaleUp, expectedResults.ScaleUpStatus.PodsTriggeredScaleUp,
"actual and expected triggering pods should be the same")
assert.ElementsMatch(t, results.ScaleUpStatus.PodsRemainUnschedulable, expectedResults.ScaleUpStatus.PodsRemainUnschedulable,
"actual and expected remaining pods should be the same")
assert.ElementsMatch(t, results.ScaleUpStatus.PodsAwaitEvaluation, expectedResults.ScaleUpStatus.PodsAwaitEvaluation,
"actual and expected awaiting evaluation pods should be the same")
}
assert.Equal(t, GroupSizeChange{}, results.FinalOption) func simpleNoScaleUpTest(t *testing.T, config *ScaleUpTestConfig, expectedResults *ScaleTestResults) {
results := runSimpleScaleUpTest(t, config)
assert.Nil(t, results.GroupSizeChanges)
assert.False(t, results.ScaleUpStatus.WasSuccessful()) assert.False(t, results.ScaleUpStatus.WasSuccessful())
noScaleUpEventSeen := false noScaleUpEventSeen := false
for _, event := range results.Events { for _, event := range results.Events {
@ -602,49 +623,119 @@ func simpleNoScaleUpTest(t *testing.T, config *ScaleTestConfig, expectedResults
"actual and expected awaiting evaluation pods should be the same") "actual and expected awaiting evaluation pods should be the same")
} }
func simpleScaleUpTest(t *testing.T, config *ScaleTestConfig, expectedResults *ScaleTestResults) { func runSimpleScaleUpTest(t *testing.T, config *ScaleUpTestConfig) *ScaleUpTestResult {
results := runSimpleScaleUpTest(t, config) now := time.Now()
groupSizeChangesChannel := make(chan GroupSizeChange, 20)
groupNodes := make(map[string][]*apiv1.Node)
assert.NotNil(t, results.FinalOption, "Expected scale up event") // build nodes
assert.Equal(t, expectedResults.FinalOption, results.FinalOption) nodes := make([]*apiv1.Node, 0, len(config.Nodes))
assert.True(t, results.ScaleUpStatus.WasSuccessful()) for _, n := range config.Nodes {
nodeEventSeen := false node := BuildTestNode(n.Name, n.Cpu, n.Memory)
for _, event := range results.Events { if n.Gpu > 0 {
if strings.Contains(event, "TriggeredScaleUp") && strings.Contains(event, expectedResults.FinalOption.GroupName) { AddGpusToNode(node, n.Gpu)
nodeEventSeen = true
} }
if len(expectedResults.ScaleUpStatus.PodsRemainUnschedulable) == 0 { SetNodeReadyState(node, n.Ready, now.Add(-2*time.Minute))
assert.NotRegexp(t, regexp.MustCompile("NotTriggerScaleUp"), event) nodes = append(nodes, node)
if n.Group != "" {
groupNodes[n.Group] = append(groupNodes[n.Group], node)
} }
} }
assert.True(t, nodeEventSeen)
if len(expectedResults.ExpansionOptions) > 0 { // build and setup pods
// Empty ExpansionOptions means we do not want to do any assertions pods := make([]*apiv1.Pod, len(config.Pods))
// on contents of actual scaleUp options for i, p := range config.Pods {
pods[i] = buildTestPod(p)
// Check that option to choose is part of expected options.
assert.Contains(t, expectedResults.ExpansionOptions, config.ExpansionOptionToChoose, "final expected expansion option must be in expected expansion options")
assert.Contains(t, results.ExpansionOptions, config.ExpansionOptionToChoose, "final expected expansion option must be in expected expansion options")
assert.ElementsMatch(t, results.ExpansionOptions, expectedResults.ExpansionOptions,
"actual and expected expansion options should be the same")
} }
extraPods := make([]*apiv1.Pod, len(config.ExtraPods))
for i, p := range config.ExtraPods {
extraPods[i] = buildTestPod(p)
}
podLister := kube_util.NewTestPodLister(pods)
listers := kube_util.NewListerRegistry(nil, nil, podLister, nil, nil, nil, nil, nil, nil, nil)
assert.ElementsMatch(t, results.ScaleUpStatus.PodsTriggeredScaleUp, expectedResults.ScaleUpStatus.PodsTriggeredScaleUp, // setup node groups
"actual and expected triggering pods should be the same") provider := testprovider.NewTestCloudProvider(func(nodeGroup string, increase int) error {
assert.ElementsMatch(t, results.ScaleUpStatus.PodsRemainUnschedulable, expectedResults.ScaleUpStatus.PodsRemainUnschedulable, groupSizeChangesChannel <- GroupSizeChange{GroupName: nodeGroup, SizeChange: increase}
"actual and expected remaining pods should be the same") if config.OnScaleUp != nil {
assert.ElementsMatch(t, results.ScaleUpStatus.PodsAwaitEvaluation, expectedResults.ScaleUpStatus.PodsAwaitEvaluation, return config.OnScaleUp(nodeGroup, increase)
"actual and expected awaiting evaluation pods should be the same") }
}
func getGroupSizeChangeFromChan(c chan GroupSizeChange) *GroupSizeChange {
select {
case val := <-c:
return &val
case <-time.After(100 * time.Millisecond):
return nil return nil
}, nil)
options := defaultOptions
if config.Options != nil {
options = *config.Options
}
resourceLimiter := cloudprovider.NewResourceLimiter(
map[string]int64{cloudprovider.ResourceNameCores: options.MinCoresTotal, cloudprovider.ResourceNameMemory: options.MinMemoryTotal},
map[string]int64{cloudprovider.ResourceNameCores: options.MaxCoresTotal, cloudprovider.ResourceNameMemory: options.MaxMemoryTotal})
provider.SetResourceLimiter(resourceLimiter)
groupConfigs := make(map[string]*NodeGroupConfig)
for _, group := range config.Groups {
groupConfigs[group.Name] = &group
}
for name, nodesInGroup := range groupNodes {
groupConfig := groupConfigs[name]
if groupConfig == nil {
groupConfig = &NodeGroupConfig{
Name: name,
MinSize: 1,
MaxSize: 10,
}
}
provider.AddNodeGroup(name, groupConfig.MinSize, groupConfig.MaxSize, len(nodesInGroup))
for _, n := range nodesInGroup {
provider.AddNode(name, n)
}
}
// build orchestrator
context, err := NewScaleTestAutoscalingContext(options, &fake.Clientset{}, listers, provider, nil, nil)
assert.NoError(t, err)
nodeInfos, err := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).
Process(&context, nodes, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
assert.NoError(t, err)
clusterState := clusterstate.
NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute))
clusterState.UpdateNodes(nodes, nodeInfos, time.Now())
processors := NewTestProcessors(&context)
orchestrator := New()
orchestrator.Initialize(&context, processors, clusterState, taints.TaintConfig{})
expander := NewMockRepotingStrategy(t, config.ExpansionOptionToChoose)
context.ExpanderStrategy = expander
// scale up
scaleUpStatus, scaleUpErr := orchestrator.ScaleUp(extraPods, nodes, []*appsv1.DaemonSet{}, nodeInfos)
processors.ScaleUpStatusProcessor.Process(&context, scaleUpStatus)
// aggregate group size changes
close(groupSizeChangesChannel)
var groupSizeChanges []GroupSizeChange
for change := range groupSizeChangesChannel {
groupSizeChanges = append(groupSizeChanges, change)
}
// aggregate events
eventsChannel := context.Recorder.(*kube_record.FakeRecorder).Events
close(eventsChannel)
events := []string{}
for event := range eventsChannel {
events = append(events, event)
}
// build target sizes
targetSizes := make(map[string]int)
for _, group := range provider.NodeGroups() {
targetSizes[group.Id()], _ = group.TargetSize()
}
return &ScaleUpTestResult{
ScaleUpError: scaleUpErr,
ScaleUpStatus: simplifyScaleUpStatus(scaleUpStatus),
GroupSizeChanges: groupSizeChanges,
Events: events,
GroupTargetSizes: targetSizes,
ExpansionOptions: expander.LastInputOptions(),
} }
} }
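The rebuilt test helper above records every group size increase through a buffered channel from the cloud provider's scale-up callback and drains the channel only after the scale up has finished, which stays correct when several node groups are scaled concurrently. Below is a minimal standalone sketch of that record-then-drain pattern; all names in it are illustrative and not taken from the diff.

package main

import (
    "fmt"
    "sync"
)

// groupSizeChange mirrors the shape of a recorded scale-up: which group grew and by how much.
type groupSizeChange struct {
    group string
    delta int
}

func main() {
    // Buffered so concurrent callbacks never block while recording.
    changes := make(chan groupSizeChange, 10)

    var wg sync.WaitGroup
    for _, g := range []string{"ng1", "ng2", "ng3"} {
        wg.Add(1)
        go func(group string) {
            defer wg.Done()
            // Stand-in for the provider's scale-up callback.
            changes <- groupSizeChange{group: group, delta: 1}
        }(g)
    }

    // Close only after all writers are done, then drain.
    wg.Wait()
    close(changes)
    for c := range changes {
        fmt.Printf("%s grew by %d\n", c.group, c.delta)
    }
}

Because the channel is buffered for at least the number of expected changes and closed only after every writer has finished, the drain loop terminates deterministically.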
@ -1049,24 +1140,33 @@ func TestCheckDeltaWithinLimits(t *testing.T) {
} }
} }
func TestAuthError(t *testing.T) {
metrics.RegisterAll(false)
context, err := NewScaleTestAutoscalingContext(config.AutoscalingOptions{}, &fake.Clientset{}, nil, nil, nil, nil)
assert.NoError(t, err)
nodeGroup := &mockprovider.NodeGroup{}
info := nodegroupset.ScaleUpInfo{Group: nodeGroup}
nodeGroup.On("Id").Return("A")
nodeGroup.On("IncreaseSize", 0).Return(errors.NewAutoscalerError(errors.AutoscalerErrorType("abcd"), ""))
processors := NewTestProcessors(&context)
clusterStateRegistry := clusterstate.NewClusterStateRegistry(nil, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute))
suOrchestrator := New()
suOrchestrator.Initialize(&context, processors, clusterStateRegistry, taints.TaintConfig{})
scaleUpOrchestrator := suOrchestrator.(*ScaleUpOrchestrator)
aerr := scaleUpOrchestrator.executeScaleUp(info, "", "", time.Now())
assert.Error(t, aerr)
func TestAuthErrorHandling(t *testing.T) {
metrics.RegisterAll(false)
config := &ScaleUpTestConfig{
Groups: []NodeGroupConfig{
{Name: "ng1", MaxSize: 2},
},
Nodes: []NodeConfig{
{Name: "ng1-n1", Cpu: 1500, Memory: 1000 * utils.MiB, Ready: true, Group: "ng1"},
},
ExtraPods: []PodConfig{
{Name: "p1", Cpu: 1000},
},
OnScaleUp: func(group string, i int) error {
return errors.NewAutoscalerError(errors.AutoscalerErrorType("authError"), "auth error")
},
Options: &defaultOptions,
}
results := runSimpleScaleUpTest(t, config)
expected := errors.NewAutoscalerError(
errors.AutoscalerErrorType("authError"),
"failed to increase node group size: auth error",
)
assert.Equal(t, expected, results.ScaleUpError)
assertLegacyRegistryEntry(t, "cluster_autoscaler_failed_scale_ups_total{reason=\"authError\"} 1")
}
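TestAuthErrorHandling above expects a failed group increase to surface as one AutoscalerError with a descriptive prefix, even when scale ups for several groups run concurrently (see the new parallel-scale-up flag later in this diff). The sketch below is a generic illustration of gathering errors from concurrent workers and folding them into a single error; it is an assumption-level sketch with made-up names (scaleUpGroup, runAll), not the orchestrator's actual code.

package main

import (
    "errors"
    "fmt"
    "strings"
    "sync"
)

// scaleUpGroup is a stand-in for a per-group scale-up call that may fail.
func scaleUpGroup(name string) error {
    if name == "ng2" {
        return fmt.Errorf("failed to increase node group size: auth error")
    }
    return nil
}

// runAll fans out one goroutine per group and combines any failures into one error.
func runAll(groups []string) error {
    var (
        mu   sync.Mutex
        errs []string
        wg   sync.WaitGroup
    )
    for _, g := range groups {
        wg.Add(1)
        go func(group string) {
            defer wg.Done()
            if err := scaleUpGroup(group); err != nil {
                mu.Lock()
                errs = append(errs, fmt.Sprintf("%s: %v", group, err))
                mu.Unlock()
            }
        }(g)
    }
    wg.Wait()
    if len(errs) == 0 {
        return nil
    }
    // Report all concurrent failures as a single combined error.
    return errors.New(strings.Join(errs, "; "))
}

func main() {
    fmt.Println(runAll([]string{"ng1", "ng2", "ng3"}))
}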
func assertLegacyRegistryEntry(t *testing.T, entry string) {
req, err := http.NewRequest("GET", "/", nil) req, err := http.NewRequest("GET", "/", nil)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
@ -1074,14 +1174,13 @@ func TestAuthError(t *testing.T) {
rr := httptest.NewRecorder() rr := httptest.NewRecorder()
handler := http.HandlerFunc(legacyregistry.Handler().ServeHTTP) handler := http.HandlerFunc(legacyregistry.Handler().ServeHTTP)
handler.ServeHTTP(rr, req) handler.ServeHTTP(rr, req)
// Check that the status code is what we expect. // Check that the status code is what we expect.
if status := rr.Code; status != http.StatusOK { if status := rr.Code; status != http.StatusOK {
t.Errorf("handler returned wrong status code: got %v want %v", t.Errorf("handler returned wrong status code: got %v want %v",
status, http.StatusOK) status, http.StatusOK)
} }
// Check that the failed scale up reason is set correctly. // Check that the failed scale up reason is set correctly.
assert.Contains(t, rr.Body.String(), "cluster_autoscaler_failed_scale_ups_total{reason=\"abcd\"} 1")
assert.Contains(t, rr.Body.String(), entry)
} }
func simplifyScaleUpStatus(scaleUpStatus *status.ScaleUpStatus) ScaleUpStatusInfo { func simplifyScaleUpStatus(scaleUpStatus *status.ScaleUpStatus) ScaleUpStatusInfo {
View File
@ -22,10 +22,6 @@ import (
"testing" "testing"
"time" "time"
"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/pdb"
"k8s.io/autoscaler/cluster-autoscaler/debuggingsnapshot"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider" "k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
testcloudprovider "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/test" testcloudprovider "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/test"
"k8s.io/autoscaler/cluster-autoscaler/clusterstate/utils" "k8s.io/autoscaler/cluster-autoscaler/clusterstate/utils"
@ -33,7 +29,10 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/context" "k8s.io/autoscaler/cluster-autoscaler/context"
"k8s.io/autoscaler/cluster-autoscaler/core/podlistprocessor" "k8s.io/autoscaler/cluster-autoscaler/core/podlistprocessor"
"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/deletiontracker" "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/deletiontracker"
"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/pdb"
"k8s.io/autoscaler/cluster-autoscaler/debuggingsnapshot"
"k8s.io/autoscaler/cluster-autoscaler/estimator" "k8s.io/autoscaler/cluster-autoscaler/estimator"
"k8s.io/autoscaler/cluster-autoscaler/expander"
"k8s.io/autoscaler/cluster-autoscaler/expander/random" "k8s.io/autoscaler/cluster-autoscaler/expander/random"
"k8s.io/autoscaler/cluster-autoscaler/metrics" "k8s.io/autoscaler/cluster-autoscaler/metrics"
"k8s.io/autoscaler/cluster-autoscaler/processors" "k8s.io/autoscaler/cluster-autoscaler/processors"
@ -50,14 +49,14 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/processors/status" "k8s.io/autoscaler/cluster-autoscaler/processors/status"
"k8s.io/autoscaler/cluster-autoscaler/simulator/clustersnapshot" "k8s.io/autoscaler/cluster-autoscaler/simulator/clustersnapshot"
"k8s.io/autoscaler/cluster-autoscaler/simulator/predicatechecker" "k8s.io/autoscaler/cluster-autoscaler/simulator/predicatechecker"
"k8s.io/autoscaler/cluster-autoscaler/utils/backoff"
"k8s.io/autoscaler/cluster-autoscaler/utils/errors" "k8s.io/autoscaler/cluster-autoscaler/utils/errors"
kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes" kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
"k8s.io/autoscaler/cluster-autoscaler/utils/labels" "k8s.io/autoscaler/cluster-autoscaler/utils/labels"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
apiv1 "k8s.io/api/core/v1" apiv1 "k8s.io/api/core/v1"
"k8s.io/autoscaler/cluster-autoscaler/utils/backoff" "k8s.io/apimachinery/pkg/api/resource"
kube_client "k8s.io/client-go/kubernetes" kube_client "k8s.io/client-go/kubernetes"
kube_record "k8s.io/client-go/tools/record" kube_record "k8s.io/client-go/tools/record"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework" schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
@ -102,9 +101,38 @@ type ScaleTestConfig struct {
ExpectedScaleDownCount int ExpectedScaleDownCount int
} }
// NodeGroupConfig is a node group config used in tests
type NodeGroupConfig struct {
Name string
MinSize int
MaxSize int
}
// ScaleUpTestConfig represents the configuration of a scale-up test
type ScaleUpTestConfig struct {
Groups []NodeGroupConfig
Nodes []NodeConfig
Pods []PodConfig
ExtraPods []PodConfig
OnScaleUp testcloudprovider.OnScaleUpFunc
ExpansionOptionToChoose *GroupSizeChange
Options *config.AutoscalingOptions
}
// ScaleUpTestResult represents the result of a node group scale-up test
type ScaleUpTestResult struct {
ScaleUpError errors.AutoscalerError
ScaleUpStatus ScaleUpStatusInfo
GroupSizeChanges []GroupSizeChange
ExpansionOptions []GroupSizeChange
Events []string
GroupTargetSizes map[string]int
}
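For illustration, a test built on these types wires a ScaleUpTestConfig into the helper and inspects the aggregated result. The sketch below assumes it sits in the same test package as the helpers in this diff (runSimpleScaleUpTest, defaultOptions, utils.MiB, assert); the test name and all sizes are hypothetical.

// Hypothetical sketch; the test name and all values are illustrative only.
func TestScaleUpResultSketch(t *testing.T) {
    config := &ScaleUpTestConfig{
        Groups: []NodeGroupConfig{
            {Name: "ng1", MinSize: 1, MaxSize: 5},
        },
        Nodes: []NodeConfig{
            {Name: "ng1-n1", Cpu: 2000, Memory: 1000 * utils.MiB, Ready: true, Group: "ng1"},
        },
        ExtraPods: []PodConfig{
            {Name: "p1", Cpu: 1500},
            {Name: "p2", Cpu: 1500},
        },
        Options: &defaultOptions,
    }
    results := runSimpleScaleUpTest(t, config)
    assert.Nil(t, results.ScaleUpError)
    // GroupTargetSizes maps each group to its final target size; GroupSizeChanges lists
    // every recorded increase, e.g. GroupSizeChange{GroupName: "ng1", SizeChange: 2}.
    t.Logf("target sizes: %v, changes: %v", results.GroupTargetSizes, results.GroupSizeChanges)
}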
// ScaleTestResults contains results of a scale test // ScaleTestResults contains results of a scale test
type ScaleTestResults struct { type ScaleTestResults struct {
ExpansionOptions []GroupSizeChange ExpansionOptions []GroupSizeChange
GroupTargetSizes map[string]int
FinalOption GroupSizeChange FinalOption GroupSizeChange
NoScaleUpReason string NoScaleUpReason string
FinalScaleDowns []string FinalScaleDowns []string
@ -159,7 +187,8 @@ func NewTestProcessors(context *context.AutoscalingContext) *processors.Autoscal
func NewScaleTestAutoscalingContext( func NewScaleTestAutoscalingContext(
options config.AutoscalingOptions, fakeClient kube_client.Interface, options config.AutoscalingOptions, fakeClient kube_client.Interface,
listers kube_util.ListerRegistry, provider cloudprovider.CloudProvider, listers kube_util.ListerRegistry, provider cloudprovider.CloudProvider,
processorCallbacks processor_callbacks.ProcessorCallbacks, debuggingSnapshotter debuggingsnapshot.DebuggingSnapshotter) (context.AutoscalingContext, error) {
processorCallbacks processor_callbacks.ProcessorCallbacks, debuggingSnapshotter debuggingsnapshot.DebuggingSnapshotter,
) (context.AutoscalingContext, error) {
// Not enough buffer space causes the test to hang without printing any logs. // Not enough buffer space causes the test to hang without printing any logs.
// This is not useful. // This is not useful.
fakeRecorder := kube_record.NewFakeRecorder(100) fakeRecorder := kube_record.NewFakeRecorder(100)
@ -279,8 +308,8 @@ type MockAutoprovisioningNodeGroupListProcessor struct {
// Process extends the list of node groups // Process extends the list of node groups
func (p *MockAutoprovisioningNodeGroupListProcessor) Process(context *context.AutoscalingContext, nodeGroups []cloudprovider.NodeGroup, nodeInfos map[string]*schedulerframework.NodeInfo, func (p *MockAutoprovisioningNodeGroupListProcessor) Process(context *context.AutoscalingContext, nodeGroups []cloudprovider.NodeGroup, nodeInfos map[string]*schedulerframework.NodeInfo,
unschedulablePods []*apiv1.Pod) ([]cloudprovider.NodeGroup, map[string]*schedulerframework.NodeInfo, error) {
unschedulablePods []*apiv1.Pod,
) ([]cloudprovider.NodeGroup, map[string]*schedulerframework.NodeInfo, error) {
machines, err := context.CloudProvider.GetAvailableMachineTypes() machines, err := context.CloudProvider.GetAvailableMachineTypes()
assert.NoError(p.T, err) assert.NoError(p.T, err)
@ -305,3 +334,67 @@ func NewBackoff() backoff.Backoff {
return backoff.NewIdBasedExponentialBackoff(5*time.Minute, /*InitialNodeGroupBackoffDuration*/ return backoff.NewIdBasedExponentialBackoff(5*time.Minute, /*InitialNodeGroupBackoffDuration*/
30*time.Minute /*MaxNodeGroupBackoffDuration*/, 3*time.Hour /*NodeGroupBackoffResetTimeout*/) 30*time.Minute /*MaxNodeGroupBackoffDuration*/, 3*time.Hour /*NodeGroupBackoffResetTimeout*/)
} }
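NewBackoff above configures an ID-based exponential backoff with a 5-minute initial duration, a 30-minute cap, and a 3-hour reset timeout. As a rough illustration of how a capped exponential backoff grows between failed attempts, here is a simplified standalone sketch; it is not the backoff package's implementation, and the doubling factor is an assumption.

package main

import (
    "fmt"
    "time"
)

// nextBackoff doubles the previous backoff duration, capped at limit.
func nextBackoff(prev, limit time.Duration) time.Duration {
    next := prev * 2
    if next > limit {
        return limit
    }
    return next
}

func main() {
    d := 5 * time.Minute // assumed initial node group backoff
    for i := 0; i < 5; i++ {
        fmt.Println(d) // prints 5m0s, 10m0s, 20m0s, 30m0s, 30m0s
        d = nextBackoff(d, 30*time.Minute)
    }
}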
// expanderResults holds the mutable state recorded by MockReportingStrategy, kept behind
// a pointer so tests can read back the options passed to the strategy (via LastInputOptions)
// after the strategy has been handed over to the autoscaling context.
type expanderResults struct {
inputOptions []GroupSizeChange
}
// MockReportingStrategy implements expander.Strategy
type MockReportingStrategy struct {
defaultStrategy expander.Strategy
optionToChoose *GroupSizeChange
t *testing.T
results *expanderResults
}
// NewMockRepotingStrategy creates an expander strategy with reporting and mocking capabilities.
func NewMockRepotingStrategy(t *testing.T, optionToChoose *GroupSizeChange) *MockReportingStrategy {
return &MockReportingStrategy{
defaultStrategy: random.NewStrategy(),
results: &expanderResults{},
optionToChoose: optionToChoose,
t: t,
}
}
// LastInputOptions returns the expansion options passed as input to the most recent strategy execution.
func (r *MockReportingStrategy) LastInputOptions() []GroupSizeChange {
return r.results.inputOptions
}
// BestOption satisfies the expander.Strategy interface. It records the input options and picks the best one.
// When optionToChoose is set, that option is returned as the best one.
// Otherwise, the choice is delegated to the default (random) strategy.
func (r *MockReportingStrategy) BestOption(options []expander.Option, nodeInfo map[string]*schedulerframework.NodeInfo) *expander.Option {
r.results.inputOptions = expanderOptionsToGroupSizeChanges(options)
if r.optionToChoose == nil {
return r.defaultStrategy.BestOption(options, nodeInfo)
}
for _, option := range options {
groupSizeChange := expanderOptionToGroupSizeChange(option)
if groupSizeChange == *r.optionToChoose {
return &option
}
}
assert.Fail(r.t, "did not find expansionOptionToChoose %+v", r.optionToChoose)
return nil
}
func expanderOptionsToGroupSizeChanges(options []expander.Option) []GroupSizeChange {
groupSizeChanges := make([]GroupSizeChange, 0, len(options))
for _, option := range options {
groupSizeChange := expanderOptionToGroupSizeChange(option)
groupSizeChanges = append(groupSizeChanges, groupSizeChange)
}
return groupSizeChanges
}
func expanderOptionToGroupSizeChange(option expander.Option) GroupSizeChange {
groupName := option.NodeGroup.Id()
groupSizeIncrement := option.NodeCount
scaleUpOption := GroupSizeChange{GroupName: groupName, SizeChange: groupSizeIncrement}
return scaleUpOption
}
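MockReportingStrategy records whatever options it is asked to choose from into a separate results struct, so a test can hand the strategy to the autoscaling context and still read the recorded input back later through LastInputOptions. A stripped-down standalone sketch of that record-and-read-back pattern follows; every name in it is illustrative.

package main

import "fmt"

// recordedInputs is the mutable part, kept separate so it can be read back later.
type recordedInputs struct {
    last []string
}

// reportingPicker records whatever it is asked to pick from, then makes a trivial choice.
type reportingPicker struct {
    results *recordedInputs
}

func newReportingPicker() *reportingPicker {
    return &reportingPicker{results: &recordedInputs{}}
}

// Pick records the input for later inspection and returns a stand-in "best" option.
func (p *reportingPicker) Pick(options []string) string {
    p.results.last = options
    return options[0]
}

// LastInput returns what the picker saw most recently, like LastInputOptions above.
func (p *reportingPicker) LastInput() []string {
    return p.results.last
}

func main() {
    picker := newReportingPicker()
    _ = picker.Pick([]string{"ng1 +1", "ng2 +1"}) // other code exercises the picker...
    fmt.Println(picker.LastInput())               // ...and the test reads back the input: [ng1 +1 ng2 +1]
}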
View File
@ -147,7 +147,8 @@ var (
maxGracefulTerminationFlag = flag.Int("max-graceful-termination-sec", 10*60, "Maximum number of seconds CA waits for pod termination when trying to scale down a node.") maxGracefulTerminationFlag = flag.Int("max-graceful-termination-sec", 10*60, "Maximum number of seconds CA waits for pod termination when trying to scale down a node.")
maxTotalUnreadyPercentage = flag.Float64("max-total-unready-percentage", 45, "Maximum percentage of unready nodes in the cluster. After this is exceeded, CA halts operations") maxTotalUnreadyPercentage = flag.Float64("max-total-unready-percentage", 45, "Maximum percentage of unready nodes in the cluster. After this is exceeded, CA halts operations")
okTotalUnreadyCount = flag.Int("ok-total-unready-count", 3, "Number of allowed unready nodes, irrespective of max-total-unready-percentage") okTotalUnreadyCount = flag.Int("ok-total-unready-count", 3, "Number of allowed unready nodes, irrespective of max-total-unready-percentage")
scaleUpFromZero = flag.Bool("scale-up-from-zero", true, "Should CA scale up when there 0 ready nodes.")
scaleUpFromZero = flag.Bool("scale-up-from-zero", true, "Should CA scale up when there are 0 ready nodes.")
parallelScaleUp = flag.Bool("parallel-scale-up", false, "Whether to allow parallel node groups scale up. Experimental: may not work on some cloud providers, enable at your own risk.")
maxNodeProvisionTime = flag.Duration("max-node-provision-time", 15*time.Minute, "The default maximum time CA waits for node to be provisioned - the value can be overridden per node group") maxNodeProvisionTime = flag.Duration("max-node-provision-time", 15*time.Minute, "The default maximum time CA waits for node to be provisioned - the value can be overridden per node group")
maxPodEvictionTime = flag.Duration("max-pod-eviction-time", 2*time.Minute, "Maximum time CA tries to evict a pod before giving up") maxPodEvictionTime = flag.Duration("max-pod-eviction-time", 2*time.Minute, "Maximum time CA tries to evict a pod before giving up")
nodeGroupsFlag = multiStringFlag( nodeGroupsFlag = multiStringFlag(
@ -265,6 +266,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
MaxTotalUnreadyPercentage: *maxTotalUnreadyPercentage, MaxTotalUnreadyPercentage: *maxTotalUnreadyPercentage,
OkTotalUnreadyCount: *okTotalUnreadyCount, OkTotalUnreadyCount: *okTotalUnreadyCount,
ScaleUpFromZero: *scaleUpFromZero, ScaleUpFromZero: *scaleUpFromZero,
ParallelScaleUp: *parallelScaleUp,
EstimatorName: *estimatorFlag, EstimatorName: *estimatorFlag,
ExpanderNames: *expanderFlag, ExpanderNames: *expanderFlag,
GRPCExpanderCert: *grpcExpanderCert, GRPCExpanderCert: *grpcExpanderCert,
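The new parallel-scale-up option follows the same pattern as the surrounding flags: a package-level flag.Bool whose parsed value is copied into the autoscaling options at startup, so the feature is enabled by passing --parallel-scale-up=true on the command line. A minimal standalone sketch of that flag-to-options wiring follows; the options struct and program here are simplified stand-ins, not the autoscaler's own main package.

package main

import (
    "flag"
    "fmt"
)

// options is a simplified stand-in for a struct holding parsed autoscaling settings.
type options struct {
    ParallelScaleUp bool
}

var parallelScaleUp = flag.Bool("parallel-scale-up", false,
    "Whether to allow parallel node groups scale up. Experimental: may not work on some cloud providers, enable at your own risk.")

// createOptions copies parsed flag values into the options struct, mirroring the wiring above.
func createOptions() options {
    return options{ParallelScaleUp: *parallelScaleUp}
}

func main() {
    flag.Parse()
    fmt.Printf("parallel scale up enabled: %v\n", createOptions().ParallelScaleUp)
}

Keeping the default at false matches the experimental note in the flag's help text.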