Merge pull request #1464 from losipiuk/lo/stockouts2
Better quota-exceeded/stockout handling
commit ab7f1e69be
@@ -0,0 +1,203 @@
/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package mocks

import cloudprovider "k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
import errors "k8s.io/autoscaler/cluster-autoscaler/utils/errors"
import mock "github.com/stretchr/testify/mock"
import resource "k8s.io/apimachinery/pkg/api/resource"
import v1 "k8s.io/api/core/v1"

// CloudProvider is an autogenerated mock type for the CloudProvider type
type CloudProvider struct {
	mock.Mock
}

// Cleanup provides a mock function with given fields:
func (_m *CloudProvider) Cleanup() error {
	ret := _m.Called()

	var r0 error
	if rf, ok := ret.Get(0).(func() error); ok {
		r0 = rf()
	} else {
		r0 = ret.Error(0)
	}

	return r0
}

// GetAvailableMachineTypes provides a mock function with given fields:
func (_m *CloudProvider) GetAvailableMachineTypes() ([]string, error) {
	ret := _m.Called()

	var r0 []string
	if rf, ok := ret.Get(0).(func() []string); ok {
		r0 = rf()
	} else {
		if ret.Get(0) != nil {
			r0 = ret.Get(0).([]string)
		}
	}

	var r1 error
	if rf, ok := ret.Get(1).(func() error); ok {
		r1 = rf()
	} else {
		r1 = ret.Error(1)
	}

	return r0, r1
}

// GetResourceLimiter provides a mock function with given fields:
func (_m *CloudProvider) GetResourceLimiter() (*cloudprovider.ResourceLimiter, error) {
	ret := _m.Called()

	var r0 *cloudprovider.ResourceLimiter
	if rf, ok := ret.Get(0).(func() *cloudprovider.ResourceLimiter); ok {
		r0 = rf()
	} else {
		if ret.Get(0) != nil {
			r0 = ret.Get(0).(*cloudprovider.ResourceLimiter)
		}
	}

	var r1 error
	if rf, ok := ret.Get(1).(func() error); ok {
		r1 = rf()
	} else {
		r1 = ret.Error(1)
	}

	return r0, r1
}

// Name provides a mock function with given fields:
func (_m *CloudProvider) Name() string {
	ret := _m.Called()

	var r0 string
	if rf, ok := ret.Get(0).(func() string); ok {
		r0 = rf()
	} else {
		r0 = ret.Get(0).(string)
	}

	return r0
}

// NewNodeGroup provides a mock function with given fields: machineType, labels, systemLabels, taints, extraResources
func (_m *CloudProvider) NewNodeGroup(machineType string, labels map[string]string, systemLabels map[string]string, taints []v1.Taint, extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
	ret := _m.Called(machineType, labels, systemLabels, taints, extraResources)

	var r0 cloudprovider.NodeGroup
	if rf, ok := ret.Get(0).(func(string, map[string]string, map[string]string, []v1.Taint, map[string]resource.Quantity) cloudprovider.NodeGroup); ok {
		r0 = rf(machineType, labels, systemLabels, taints, extraResources)
	} else {
		if ret.Get(0) != nil {
			r0 = ret.Get(0).(cloudprovider.NodeGroup)
		}
	}

	var r1 error
	if rf, ok := ret.Get(1).(func(string, map[string]string, map[string]string, []v1.Taint, map[string]resource.Quantity) error); ok {
		r1 = rf(machineType, labels, systemLabels, taints, extraResources)
	} else {
		r1 = ret.Error(1)
	}

	return r0, r1
}

// NodeGroupForNode provides a mock function with given fields: _a0
func (_m *CloudProvider) NodeGroupForNode(_a0 *v1.Node) (cloudprovider.NodeGroup, error) {
	ret := _m.Called(_a0)

	var r0 cloudprovider.NodeGroup
	if rf, ok := ret.Get(0).(func(*v1.Node) cloudprovider.NodeGroup); ok {
		r0 = rf(_a0)
	} else {
		if ret.Get(0) != nil {
			r0 = ret.Get(0).(cloudprovider.NodeGroup)
		}
	}

	var r1 error
	if rf, ok := ret.Get(1).(func(*v1.Node) error); ok {
		r1 = rf(_a0)
	} else {
		r1 = ret.Error(1)
	}

	return r0, r1
}

// NodeGroups provides a mock function with given fields:
func (_m *CloudProvider) NodeGroups() []cloudprovider.NodeGroup {
	ret := _m.Called()

	var r0 []cloudprovider.NodeGroup
	if rf, ok := ret.Get(0).(func() []cloudprovider.NodeGroup); ok {
		r0 = rf()
	} else {
		if ret.Get(0) != nil {
			r0 = ret.Get(0).([]cloudprovider.NodeGroup)
		}
	}

	return r0
}

// Pricing provides a mock function with given fields:
func (_m *CloudProvider) Pricing() (cloudprovider.PricingModel, errors.AutoscalerError) {
	ret := _m.Called()

	var r0 cloudprovider.PricingModel
	if rf, ok := ret.Get(0).(func() cloudprovider.PricingModel); ok {
		r0 = rf()
	} else {
		if ret.Get(0) != nil {
			r0 = ret.Get(0).(cloudprovider.PricingModel)
		}
	}

	var r1 errors.AutoscalerError
	if rf, ok := ret.Get(1).(func() errors.AutoscalerError); ok {
		r1 = rf()
	} else {
		if ret.Get(1) != nil {
			r1 = ret.Get(1).(errors.AutoscalerError)
		}
	}

	return r0, r1
}

// Refresh provides a mock function with given fields:
func (_m *CloudProvider) Refresh() error {
	ret := _m.Called()

	var r0 error
	if rf, ok := ret.Get(0).(func() error); ok {
		r0 = rf()
	} else {
		r0 = ret.Error(0)
	}

	return r0
}
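The generated methods above all follow mockery's standard shape: each checks whether the stubbed return value is a function (and calls it) before falling back to a plain value. A minimal usage sketch, assuming only the testify mock API (this test is an editor's illustration, not part of the commit):

// mocks_sketch_test.go — editor's illustration of driving the generated mock.
package mocks_test

import (
	"testing"

	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/mocks"
)

func TestCloudProviderMockSketch(t *testing.T) {
	provider := &mocks.CloudProvider{}
	// A plain value exercises the ret.Get(0) / ret.Error(0) fallback branch.
	provider.On("Name").Return("test-provider")
	// A function value exercises the rf() branch and is re-evaluated per call.
	provider.On("GetAvailableMachineTypes").Return(func() []string {
		return []string{"n1-standard-1"}
	}, nil)

	if provider.Name() != "test-provider" {
		t.Fatalf("unexpected provider name %q", provider.Name())
	}
	machineTypes, err := provider.GetAvailableMachineTypes()
	if err != nil || len(machineTypes) != 1 {
		t.Fatalf("unexpected result: %v, %v", machineTypes, err)
	}
}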
@@ -0,0 +1,257 @@
/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package mocks

import cache "k8s.io/kubernetes/pkg/scheduler/cache"
import cloudprovider "k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
import mock "github.com/stretchr/testify/mock"
import v1 "k8s.io/api/core/v1"

// NodeGroup is an autogenerated mock type for the NodeGroup type
type NodeGroup struct {
	mock.Mock
}

// Autoprovisioned provides a mock function with given fields:
func (_m *NodeGroup) Autoprovisioned() bool {
	ret := _m.Called()

	var r0 bool
	if rf, ok := ret.Get(0).(func() bool); ok {
		r0 = rf()
	} else {
		r0 = ret.Get(0).(bool)
	}

	return r0
}

// Create provides a mock function with given fields:
func (_m *NodeGroup) Create() (cloudprovider.NodeGroup, error) {
	ret := _m.Called()

	var r0 cloudprovider.NodeGroup
	if rf, ok := ret.Get(0).(func() cloudprovider.NodeGroup); ok {
		r0 = rf()
	} else {
		if ret.Get(0) != nil {
			r0 = ret.Get(0).(cloudprovider.NodeGroup)
		}
	}

	var r1 error
	if rf, ok := ret.Get(1).(func() error); ok {
		r1 = rf()
	} else {
		r1 = ret.Error(1)
	}

	return r0, r1
}

// Debug provides a mock function with given fields:
func (_m *NodeGroup) Debug() string {
	ret := _m.Called()

	var r0 string
	if rf, ok := ret.Get(0).(func() string); ok {
		r0 = rf()
	} else {
		r0 = ret.Get(0).(string)
	}

	return r0
}

// DecreaseTargetSize provides a mock function with given fields: delta
func (_m *NodeGroup) DecreaseTargetSize(delta int) error {
	ret := _m.Called(delta)

	var r0 error
	if rf, ok := ret.Get(0).(func(int) error); ok {
		r0 = rf(delta)
	} else {
		r0 = ret.Error(0)
	}

	return r0
}

// Delete provides a mock function with given fields:
func (_m *NodeGroup) Delete() error {
	ret := _m.Called()

	var r0 error
	if rf, ok := ret.Get(0).(func() error); ok {
		r0 = rf()
	} else {
		r0 = ret.Error(0)
	}

	return r0
}

// DeleteNodes provides a mock function with given fields: _a0
func (_m *NodeGroup) DeleteNodes(_a0 []*v1.Node) error {
	ret := _m.Called(_a0)

	var r0 error
	if rf, ok := ret.Get(0).(func([]*v1.Node) error); ok {
		r0 = rf(_a0)
	} else {
		r0 = ret.Error(0)
	}

	return r0
}

// Exist provides a mock function with given fields:
func (_m *NodeGroup) Exist() bool {
	ret := _m.Called()

	var r0 bool
	if rf, ok := ret.Get(0).(func() bool); ok {
		r0 = rf()
	} else {
		r0 = ret.Get(0).(bool)
	}

	return r0
}

// Id provides a mock function with given fields:
func (_m *NodeGroup) Id() string {
	ret := _m.Called()

	var r0 string
	if rf, ok := ret.Get(0).(func() string); ok {
		r0 = rf()
	} else {
		r0 = ret.Get(0).(string)
	}

	return r0
}

// IncreaseSize provides a mock function with given fields: delta
func (_m *NodeGroup) IncreaseSize(delta int) error {
	ret := _m.Called(delta)

	var r0 error
	if rf, ok := ret.Get(0).(func(int) error); ok {
		r0 = rf(delta)
	} else {
		r0 = ret.Error(0)
	}

	return r0
}

// MaxSize provides a mock function with given fields:
func (_m *NodeGroup) MaxSize() int {
	ret := _m.Called()

	var r0 int
	if rf, ok := ret.Get(0).(func() int); ok {
		r0 = rf()
	} else {
		r0 = ret.Get(0).(int)
	}

	return r0
}

// MinSize provides a mock function with given fields:
func (_m *NodeGroup) MinSize() int {
	ret := _m.Called()

	var r0 int
	if rf, ok := ret.Get(0).(func() int); ok {
		r0 = rf()
	} else {
		r0 = ret.Get(0).(int)
	}

	return r0
}

// Nodes provides a mock function with given fields:
func (_m *NodeGroup) Nodes() ([]cloudprovider.Instance, error) {
	ret := _m.Called()

	var r0 []cloudprovider.Instance
	if rf, ok := ret.Get(0).(func() []cloudprovider.Instance); ok {
		r0 = rf()
	} else {
		if ret.Get(0) != nil {
			r0 = ret.Get(0).([]cloudprovider.Instance)
		}
	}

	var r1 error
	if rf, ok := ret.Get(1).(func() error); ok {
		r1 = rf()
	} else {
		r1 = ret.Error(1)
	}

	return r0, r1
}

// TargetSize provides a mock function with given fields:
func (_m *NodeGroup) TargetSize() (int, error) {
	ret := _m.Called()

	var r0 int
	if rf, ok := ret.Get(0).(func() int); ok {
		r0 = rf()
	} else {
		r0 = ret.Get(0).(int)
	}

	var r1 error
	if rf, ok := ret.Get(1).(func() error); ok {
		r1 = rf()
	} else {
		r1 = ret.Error(1)
	}

	return r0, r1
}

// TemplateNodeInfo provides a mock function with given fields:
func (_m *NodeGroup) TemplateNodeInfo() (*cache.NodeInfo, error) {
	ret := _m.Called()

	var r0 *cache.NodeInfo
	if rf, ok := ret.Get(0).(func() *cache.NodeInfo); ok {
		r0 = rf()
	} else {
		if ret.Get(0) != nil {
			r0 = ret.Get(0).(*cache.NodeInfo)
		}
	}

	var r1 error
	if rf, ok := ret.Get(1).(func() error); ok {
		r1 = rf()
	} else {
		r1 = ret.Error(1)
	}

	return r0, r1
}
@@ -0,0 +1,68 @@
/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package mocks

import mock "github.com/stretchr/testify/mock"
import time "time"
import v1 "k8s.io/api/core/v1"

// PricingModel is an autogenerated mock type for the PricingModel type
type PricingModel struct {
	mock.Mock
}

// NodePrice provides a mock function with given fields: node, startTime, endTime
func (_m *PricingModel) NodePrice(node *v1.Node, startTime time.Time, endTime time.Time) (float64, error) {
	ret := _m.Called(node, startTime, endTime)

	var r0 float64
	if rf, ok := ret.Get(0).(func(*v1.Node, time.Time, time.Time) float64); ok {
		r0 = rf(node, startTime, endTime)
	} else {
		r0 = ret.Get(0).(float64)
	}

	var r1 error
	if rf, ok := ret.Get(1).(func(*v1.Node, time.Time, time.Time) error); ok {
		r1 = rf(node, startTime, endTime)
	} else {
		r1 = ret.Error(1)
	}

	return r0, r1
}

// PodPrice provides a mock function with given fields: pod, startTime, endTime
func (_m *PricingModel) PodPrice(pod *v1.Pod, startTime time.Time, endTime time.Time) (float64, error) {
	ret := _m.Called(pod, startTime, endTime)

	var r0 float64
	if rf, ok := ret.Get(0).(func(*v1.Pod, time.Time, time.Time) float64); ok {
		r0 = rf(pod, startTime, endTime)
	} else {
		r0 = ret.Get(0).(float64)
	}

	var r1 error
	if rf, ok := ret.Get(1).(func(*v1.Pod, time.Time, time.Time) error); ok {
		r1 = rf(pod, startTime, endTime)
	} else {
		r1 = ret.Error(1)
	}

	return r0, r1
}
@@ -19,6 +19,7 @@ package clusterstate
 import (
 	"fmt"
 	"reflect"
+	"strings"
 	"sync"
 	"time"
@@ -129,6 +130,8 @@ type ClusterStateRegistry struct {
 	lastStatus                         *api.ClusterAutoscalerStatus
 	lastScaleDownUpdateTime            time.Time
 	logRecorder                        *utils.LogEventRecorder
+	cloudProviderNodeInstances         map[string][]cloudprovider.Instance
+	previousCloudProviderNodeInstances map[string][]cloudprovider.Instance
 }
 
 // NewClusterStateRegistry creates new ClusterStateRegistry.
@@ -154,21 +157,49 @@ func NewClusterStateRegistry(cloudProvider cloudprovider.CloudProvider, config C
 	}
 }
 
-// RegisterScaleUp registers scale up.
-func (csr *ClusterStateRegistry) RegisterScaleUp(request *ScaleUpRequest) {
+// RegisterOrUpdateScaleUp registers scale-up for a given node group or changes the requested node
+// increase count.
+// If delta is positive then the number of new nodes requested is increased; Time and expectedAddTime
+// are reset.
+// If delta is negative the number of new nodes requested is decreased; Time and expectedAddTime are
+// left intact.
+func (csr *ClusterStateRegistry) RegisterOrUpdateScaleUp(nodeGroup cloudprovider.NodeGroup, delta int, currentTime time.Time) {
 	csr.Lock()
 	defer csr.Unlock()
+	csr.registerOrUpdateScaleUpNoLock(nodeGroup, delta, currentTime)
+}
 
-	oldScaleUpRequest, found := csr.scaleUpRequests[request.NodeGroup.Id()]
-	if !found {
-		csr.scaleUpRequests[request.NodeGroup.Id()] = request
+func (csr *ClusterStateRegistry) registerOrUpdateScaleUpNoLock(nodeGroup cloudprovider.NodeGroup, delta int, currentTime time.Time) {
+	scaleUpRequest, found := csr.scaleUpRequests[nodeGroup.Id()]
+	if !found && delta > 0 {
+		scaleUpRequest = &ScaleUpRequest{
+			NodeGroup:       nodeGroup,
+			Increase:        delta,
+			Time:            currentTime,
+			ExpectedAddTime: currentTime.Add(csr.config.MaxNodeProvisionTime),
+		}
+		csr.scaleUpRequests[nodeGroup.Id()] = scaleUpRequest
+		return
+	}
+
+	if !found {
+		// delta <= 0, and there is no request to update
 		return
 	}
 
-	// update the old request
-	oldScaleUpRequest.Time = request.Time
-	oldScaleUpRequest.ExpectedAddTime = request.ExpectedAddTime
-	oldScaleUpRequest.Increase += request.Increase
+	if scaleUpRequest.Increase+delta <= 0 {
+		// increase <= 0 means that there is no scale-up intent really
+		delete(csr.scaleUpRequests, nodeGroup.Id())
+		return
+	}
+
+	scaleUpRequest.Increase += delta
+	if delta > 0 {
+		// if we are actually adding new nodes shift Time and ExpectedAddTime
+		scaleUpRequest.Time = currentTime
+		scaleUpRequest.ExpectedAddTime = currentTime.Add(csr.config.MaxNodeProvisionTime)
+	}
+}
 
 // RegisterScaleDown registers node scale down.
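A quick sketch of the resulting delta semantics (editor's illustration; ng and the times t0 < t1 < t2 are hypothetical):

	csr.RegisterOrUpdateScaleUp(ng, 4, t0)  // new request: Increase=4, Time=t0, ExpectedAddTime=t0+MaxNodeProvisionTime
	csr.RegisterOrUpdateScaleUp(ng, -1, t1) // Increase=3; Time and ExpectedAddTime keep their t0-based values
	csr.RegisterOrUpdateScaleUp(ng, 2, t2)  // Increase=5; Time and ExpectedAddTime reset against t2
	csr.RegisterOrUpdateScaleUp(ng, -5, t2) // Increase would drop to 0, so the request is removed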
@@ -224,12 +255,15 @@ func (csr *ClusterStateRegistry) backoffNodeGroup(nodeGroup cloudprovider.NodeGr
 // RegisterFailedScaleUp should be called after getting error from cloudprovider
 // when trying to scale-up node group. It will mark this group as not safe to autoscale
 // for some time.
-func (csr *ClusterStateRegistry) RegisterFailedScaleUp(nodeGroup cloudprovider.NodeGroup, reason metrics.FailedScaleUpReason) {
+func (csr *ClusterStateRegistry) RegisterFailedScaleUp(nodeGroup cloudprovider.NodeGroup, reason metrics.FailedScaleUpReason, currentTime time.Time) {
 	csr.Lock()
 	defer csr.Unlock()
+	csr.registerFailedScaleUpNoLock(nodeGroup, reason, currentTime)
+}
+
+func (csr *ClusterStateRegistry) registerFailedScaleUpNoLock(nodeGroup cloudprovider.NodeGroup, reason metrics.FailedScaleUpReason, currentTime time.Time) {
 	metrics.RegisterFailedScaleUp(reason)
-	csr.backoffNodeGroup(nodeGroup, time.Now())
+	csr.backoffNodeGroup(nodeGroup, currentTime)
 }
 
 // UpdateNodes updates the state of the nodes in the ClusterStateRegistry and recalculates the stats
@@ -239,15 +273,19 @@ func (csr *ClusterStateRegistry) UpdateNodes(nodes []*apiv1.Node, currentTime ti
 	if err != nil {
 		return err
 	}
-	notRegistered, err := getNotRegisteredNodes(nodes, csr.cloudProvider, currentTime)
+
+	cloudProviderNodeInstances, err := getCloudProviderNodeInstances(csr.cloudProvider)
 	if err != nil {
 		return err
 	}
+	notRegistered := getNotRegisteredNodes(nodes, cloudProviderNodeInstances, currentTime)
 
 	csr.Lock()
 	defer csr.Unlock()
 
 	csr.nodes = nodes
+	csr.previousCloudProviderNodeInstances = csr.cloudProviderNodeInstances
+	csr.cloudProviderNodeInstances = cloudProviderNodeInstances
 
 	csr.updateUnregisteredNodes(notRegistered)
 	csr.updateReadinessStats(currentTime)
@@ -256,6 +294,7 @@ func (csr *ClusterStateRegistry) UpdateNodes(nodes []*apiv1.Node, currentTime ti
 	// updateScaleRequests relies on acceptableRanges being up to date
 	csr.updateAcceptableRanges(targetSizes)
 	csr.updateScaleRequests(currentTime)
+	csr.handleOutOfResourcesErrors(currentTime)
 	// recalculate acceptable ranges after removing timed out requests
 	csr.updateAcceptableRanges(targetSizes)
 	csr.updateIncorrectNodeGroupSizes(currentTime)
@@ -874,35 +913,38 @@ func (csr *ClusterStateRegistry) GetUpcomingNodes() map[string]int {
 	return result
 }
 
+// getCloudProviderNodeInstances returns map keyed on node group id where value is list of node instances
+// as returned by NodeGroup.Nodes().
+func getCloudProviderNodeInstances(cloudProvider cloudprovider.CloudProvider) (map[string][]cloudprovider.Instance, error) {
+	allInstances := make(map[string][]cloudprovider.Instance)
+	for _, nodeGroup := range cloudProvider.NodeGroups() {
+		nodeGroupInstances, err := nodeGroup.Nodes()
+		if err != nil {
+			return nil, err
+		}
+		allInstances[nodeGroup.Id()] = nodeGroupInstances
+	}
+	return allInstances, nil
+}
+
 // Calculates which of the existing cloud provider nodes are not registered in Kubernetes.
-func getNotRegisteredNodes(allNodes []*apiv1.Node, cloudProvider cloudprovider.CloudProvider, time time.Time) ([]UnregisteredNode, error) {
+func getNotRegisteredNodes(allNodes []*apiv1.Node, cloudProviderNodeInstances map[string][]cloudprovider.Instance, time time.Time) []UnregisteredNode {
 	registered := sets.NewString()
 	for _, node := range allNodes {
 		registered.Insert(node.Spec.ProviderID)
 	}
 	notRegistered := make([]UnregisteredNode, 0)
-	for _, nodeGroup := range cloudProvider.NodeGroups() {
-		nodes, err := nodeGroup.Nodes()
-		if err != nil {
-			return []UnregisteredNode{}, err
-		}
-		for _, node := range nodes {
-			if !registered.Has(node.Id) {
+	for _, instances := range cloudProviderNodeInstances {
+		for _, instance := range instances {
+			if !registered.Has(instance.Id) {
 				notRegistered = append(notRegistered, UnregisteredNode{
-					Node: &apiv1.Node{
-						ObjectMeta: metav1.ObjectMeta{
-							Name: node.Id,
-						},
-						Spec: apiv1.NodeSpec{
-							ProviderID: node.Id,
-						},
-					},
+					Node:              fakeNode(instance),
 					UnregisteredSince: time,
 				})
 			}
 		}
 	}
-	return notRegistered, nil
+	return notRegistered
 }
 
 // GetClusterSize calculates and returns cluster's current size and target size. The current size is the
@@ -917,3 +959,130 @@ func (csr *ClusterStateRegistry) GetClusterSize() (currentSize, targetSize int)
 	currentSize = csr.totalReadiness.Registered - csr.totalReadiness.NotStarted - csr.totalReadiness.LongNotStarted
 	return currentSize, targetSize
 }
+
+func (csr *ClusterStateRegistry) handleOutOfResourcesErrors(currentTime time.Time) {
+	nodeGroups := csr.cloudProvider.NodeGroups()
+
+	for _, nodeGroup := range nodeGroups {
+		csr.handleOutOfResourcesErrorsForNodeGroup(
+			nodeGroup,
+			csr.cloudProviderNodeInstances[nodeGroup.Id()],
+			csr.previousCloudProviderNodeInstances[nodeGroup.Id()],
+			currentTime)
+	}
+}
+
+func (csr *ClusterStateRegistry) handleOutOfResourcesErrorsForNodeGroup(
+	nodeGroup cloudprovider.NodeGroup,
+	currentInstances []cloudprovider.Instance,
+	previousInstances []cloudprovider.Instance,
+	currentTime time.Time) {
+
+	_, currentUniqueErrorMessagesForErrorCode, currentErrorCodeToInstance := csr.buildInstanceToOutOfResourcesErrorCodeMappings(currentInstances)
+	previousInstanceToErrorCode, _, _ := csr.buildInstanceToOutOfResourcesErrorCodeMappings(previousInstances)
+
+	// If node group is scaling up and there are new node-create requests which cannot be satisfied because of
+	// out-of-resources errors we:
+	//  - emit event
+	//  - alter the scale-up
+	//  - increase scale-up failure metric
+	//  - backoff the node group
+	for errorCode, instances := range currentErrorCodeToInstance {
+		unseenInstanceIds := make([]string, 0)
+		for _, instance := range instances {
+			if _, seen := previousInstanceToErrorCode[instance.Id]; !seen {
+				unseenInstanceIds = append(unseenInstanceIds, instance.Id)
+			}
+		}
+
+		klog.V(1).Infof("Failed adding %v nodes (%v unseen previously) to group %v due to %v", len(instances), len(unseenInstanceIds), nodeGroup.Id(), errorCode)
+		if len(unseenInstanceIds) > 0 && csr.IsNodeGroupScalingUp(nodeGroup.Id()) {
+			csr.logRecorder.Eventf(
+				apiv1.EventTypeWarning,
+				"ScaleUpFailed",
+				"Failed adding %v nodes to group %v due to %v; source errors: %v",
+				len(unseenInstanceIds),
+				nodeGroup.Id(),
+				errorCode,
+				csr.buildErrorMessageEventString(currentUniqueErrorMessagesForErrorCode[errorCode]))
+
+			// Decrease the scale-up request by the number of deleted nodes
+			csr.registerOrUpdateScaleUpNoLock(nodeGroup, -len(unseenInstanceIds), currentTime)
+			csr.registerFailedScaleUpNoLock(nodeGroup, metrics.FailedScaleUpReason(errorCode), currentTime)
+		}
+	}
+}
+
+func (csr *ClusterStateRegistry) buildErrorMessageEventString(uniqErrorMessages []string) string {
+	var sb strings.Builder
+	maxErrors := 3
+	for i, errorMessage := range uniqErrorMessages {
+		if i >= maxErrors {
+			break
+		}
+		if i > 0 {
+			sb.WriteString(", ")
+		}
+		sb.WriteString(errorMessage)
+	}
+	if len(uniqErrorMessages) > maxErrors {
+		sb.WriteString(", ...")
+	}
+	return sb.String()
+}
+
+func (csr *ClusterStateRegistry) buildInstanceToOutOfResourcesErrorCodeMappings(instances []cloudprovider.Instance) (instanceToErrorCode map[string]string, uniqueErrorMessagesForErrorCode map[string][]string, errorCodeToInstance map[string][]cloudprovider.Instance) {
+	instanceToErrorCode = make(map[string]string)
+	uniqueErrorMessagesForErrorCode = make(map[string][]string)
+	errorCodeToInstance = make(map[string][]cloudprovider.Instance)
+
+	uniqErrorMessagesForErrorCodeTmp := make(map[string]map[string]bool)
+	for _, instance := range instances {
+		if instance.Status != nil && instance.Status.State == cloudprovider.InstanceCreating && instance.Status.ErrorInfo != nil {
+			errorInfo := instance.Status.ErrorInfo
+			if errorInfo.ErrorClass == cloudprovider.OutOfResourcesErrorClass {
+				if _, found := uniqErrorMessagesForErrorCodeTmp[errorInfo.ErrorCode]; !found {
+					uniqErrorMessagesForErrorCodeTmp[errorInfo.ErrorCode] = make(map[string]bool)
+				}
+				instanceToErrorCode[instance.Id] = errorInfo.ErrorCode
+				uniqErrorMessagesForErrorCodeTmp[errorInfo.ErrorCode][errorInfo.ErrorMessage] = true
+				errorCodeToInstance[errorInfo.ErrorCode] = append(errorCodeToInstance[errorInfo.ErrorCode], instance)
+			}
+		}
+	}
+
+	for errorCode, uniqueErrorMessages := range uniqErrorMessagesForErrorCodeTmp {
+		for errorMessage := range uniqueErrorMessages {
+			uniqueErrorMessagesForErrorCode[errorCode] = append(uniqueErrorMessagesForErrorCode[errorCode], errorMessage)
+		}
+	}
+
+	return
+}
+
+// GetCreatedNodesWithOutOfResourcesErrors returns list of nodes being created which reported create error of "out of resources" class.
+func (csr *ClusterStateRegistry) GetCreatedNodesWithOutOfResourcesErrors() []*apiv1.Node {
+	csr.Lock()
+	defer csr.Unlock()
+
+	nodesWithCreateErrors := make([]*apiv1.Node, 0)
+	for _, nodeGroupInstances := range csr.cloudProviderNodeInstances {
+		_, _, instancesByErrorCode := csr.buildInstanceToOutOfResourcesErrorCodeMappings(nodeGroupInstances)
+		for _, instances := range instancesByErrorCode {
+			for _, instance := range instances {
+				nodesWithCreateErrors = append(nodesWithCreateErrors, fakeNode(instance))
+			}
+		}
+	}
+	return nodesWithCreateErrors
+}
+
+func fakeNode(instance cloudprovider.Instance) *apiv1.Node {
+	return &apiv1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: instance.Id,
+		},
+		Spec: apiv1.NodeSpec{
+			ProviderID: instance.Id,
+		},
+	}
+}
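To make the three mappings concrete (editor's illustration with hypothetical instances): suppose creating instances "A3" and "A4" report OutOfResourcesErrorClass code "STOCKOUT" and "A5" reports "QUOTA". Then buildInstanceToOutOfResourcesErrorCodeMappings yields roughly:

	instanceToErrorCode             // {"A3": "STOCKOUT", "A4": "STOCKOUT", "A5": "QUOTA"}
	uniqueErrorMessagesForErrorCode // one deduplicated message list per code, e.g. {"STOCKOUT": [...], "QUOTA": [...]}
	errorCodeToInstance             // {"STOCKOUT": [A3, A4], "QUOTA": [A5]}, consumed by the per-code loop above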
@@ -54,13 +54,9 @@ func TestOKWithScaleUp(t *testing.T) {
 	clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{
 		MaxTotalUnreadyPercentage: 10,
 		OkTotalUnreadyCount:       1,
+		MaxNodeProvisionTime:      time.Minute,
 	}, fakeLogRecorder, newBackoff())
-	clusterstate.RegisterScaleUp(&ScaleUpRequest{
-		NodeGroup:       provider.GetNodeGroup("ng1"),
-		Increase:        4,
-		Time:            now,
-		ExpectedAddTime: now.Add(time.Minute),
-	})
+	clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 4, time.Now())
 	err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1}, now)
 	assert.NoError(t, err)
 	assert.True(t, clusterstate.IsClusterHealthy())
@@ -99,6 +95,7 @@ func TestEmptyOK(t *testing.T) {
 	clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{
 		MaxTotalUnreadyPercentage: 10,
 		OkTotalUnreadyCount:       1,
+		MaxNodeProvisionTime:      time.Minute,
 	}, fakeLogRecorder, newBackoff())
 	err := clusterstate.UpdateNodes([]*apiv1.Node{}, now.Add(-5*time.Second))
 	assert.NoError(t, err)
@@ -107,12 +104,9 @@ func TestEmptyOK(t *testing.T) {
 	assert.False(t, clusterstate.IsNodeGroupScalingUp("ng1"))
 
 	provider.AddNodeGroup("ng1", 0, 10, 3)
-	clusterstate.RegisterScaleUp(&ScaleUpRequest{
-		NodeGroup:       provider.GetNodeGroup("ng1"),
-		Increase:        3,
-		Time:            now.Add(-3 * time.Second),
-		ExpectedAddTime: now.Add(1 * time.Minute),
-	})
+	clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 3, now.Add(-3*time.Second))
+	// clusterstate.scaleUpRequests["ng1"].Time = now.Add(-3 * time.Second)
+	// clusterstate.scaleUpRequests["ng1"].ExpectedAddTime = now.Add(1 * time.Minute)
 	err = clusterstate.UpdateNodes([]*apiv1.Node{}, now)
 
 	assert.NoError(t, err)
@@ -332,13 +326,9 @@ func TestExpiredScaleUp(t *testing.T) {
 	clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{
 		MaxTotalUnreadyPercentage: 10,
 		OkTotalUnreadyCount:       1,
+		MaxNodeProvisionTime:      2 * time.Minute,
 	}, fakeLogRecorder, newBackoff())
-	clusterstate.RegisterScaleUp(&ScaleUpRequest{
-		NodeGroup:       provider.GetNodeGroup("ng1"),
-		Increase:        4,
-		Time:            now.Add(-3 * time.Minute),
-		ExpectedAddTime: now.Add(-1 * time.Minute),
-	})
+	clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 4, now.Add(-3*time.Minute))
 	err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, now)
 	assert.NoError(t, err)
 	assert.True(t, clusterstate.IsClusterHealthy())
@@ -619,15 +609,11 @@ func TestScaleUpBackoff(t *testing.T) {
 	clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{
 		MaxTotalUnreadyPercentage: 10,
 		OkTotalUnreadyCount:       1,
+		MaxNodeProvisionTime:      120 * time.Second,
 	}, fakeLogRecorder, newBackoff())
 
 	// After failed scale-up, node group should be still healthy, but should backoff from scale-ups
-	clusterstate.RegisterScaleUp(&ScaleUpRequest{
-		NodeGroup:       provider.GetNodeGroup("ng1"),
-		Increase:        1,
-		Time:            now.Add(-3 * time.Minute),
-		ExpectedAddTime: now.Add(-1 * time.Minute),
-	})
+	clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 1, now.Add(-180*time.Second))
 	err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng1_2, ng1_3}, now)
 	assert.NoError(t, err)
 	assert.True(t, clusterstate.IsClusterHealthy())
@@ -641,12 +627,8 @@ func TestScaleUpBackoff(t *testing.T) {
 	assert.True(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
 
 	// Another failed scale up should cause longer backoff
-	clusterstate.RegisterScaleUp(&ScaleUpRequest{
-		NodeGroup:       provider.GetNodeGroup("ng1"),
-		Increase:        1,
-		Time:            now.Add(-2 * time.Minute),
-		ExpectedAddTime: now.Add(-1 * time.Second),
-	})
+	clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 1, now.Add(-121*time.Second))
 
 	err = clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng1_2, ng1_3}, now)
 	assert.NoError(t, err)
 	assert.True(t, clusterstate.IsClusterHealthy())
@@ -657,12 +639,7 @@ func TestScaleUpBackoff(t *testing.T) {
 	assert.False(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
 
 	// The backoff should be cleared after a successful scale-up
-	clusterstate.RegisterScaleUp(&ScaleUpRequest{
-		NodeGroup:       provider.GetNodeGroup("ng1"),
-		Increase:        1,
-		Time:            now,
-		ExpectedAddTime: now.Add(time.Second),
-	})
+	clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 1, now)
 	ng1_4 := BuildTestNode("ng1-4", 1000, 1000)
 	SetNodeReadyState(ng1_4, true, now.Add(-1*time.Minute))
 	provider.AddNode("ng1", ng1_4)
@@ -725,6 +702,50 @@ func TestGetClusterSize(t *testing.T) {
 	assert.Equal(t, 30, targetSize)
 }
 
+func TestUpdateScaleUp(t *testing.T) {
+	now := time.Now()
+	later := now.Add(time.Minute)
+
+	provider := testprovider.NewTestCloudProvider(nil, nil)
+	provider.AddNodeGroup("ng1", 1, 10, 5)
+	fakeClient := &fake.Clientset{}
+	fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false)
+	clusterstate := NewClusterStateRegistry(
+		provider,
+		ClusterStateRegistryConfig{
+			MaxTotalUnreadyPercentage: 10,
+			OkTotalUnreadyCount:       1,
+			MaxNodeProvisionTime:      10 * time.Second,
+		},
+		fakeLogRecorder,
+		newBackoff())
+
+	clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 100, now)
+	assert.Equal(t, clusterstate.scaleUpRequests["ng1"].Increase, 100)
+	assert.Equal(t, clusterstate.scaleUpRequests["ng1"].Time, now)
+	assert.Equal(t, clusterstate.scaleUpRequests["ng1"].ExpectedAddTime, now.Add(10*time.Second))
+
+	// expect no change of times on negative delta
+	clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), -20, later)
+	assert.Equal(t, clusterstate.scaleUpRequests["ng1"].Increase, 80)
+	assert.Equal(t, clusterstate.scaleUpRequests["ng1"].Time, now)
+	assert.Equal(t, clusterstate.scaleUpRequests["ng1"].ExpectedAddTime, now.Add(10*time.Second))
+
+	// update times on positive delta
+	clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 30, later)
+	assert.Equal(t, clusterstate.scaleUpRequests["ng1"].Increase, 110)
+	assert.Equal(t, clusterstate.scaleUpRequests["ng1"].Time, later)
+	assert.Equal(t, clusterstate.scaleUpRequests["ng1"].ExpectedAddTime, later.Add(10*time.Second))
+
+	// if we get below 0 the scale-up request is deleted
+	clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), -200, now)
+	assert.Nil(t, clusterstate.scaleUpRequests["ng1"])
+
+	// if a new scale-up is registered with negative delta nothing should happen
+	clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), -200, now)
+	assert.Nil(t, clusterstate.scaleUpRequests["ng1"])
+}
+
 func TestIsNodeStillStarting(t *testing.T) {
 	testCases := []struct {
 		desc string
@@ -532,7 +532,7 @@ func ScaleUp(context *context.AutoscalingContext, processors *ca_processors.Auto
 	}
 	klog.V(1).Infof("Final scale-up plan: %v", scaleUpInfos)
 	for _, info := range scaleUpInfos {
-		typedErr := executeScaleUp(context, clusterStateRegistry, info, gpu.GetGpuTypeForMetrics(nodeInfo.Node(), nil))
+		typedErr := executeScaleUp(context, clusterStateRegistry, info, gpu.GetGpuTypeForMetrics(nodeInfo.Node(), nil), now)
 		if typedErr != nil {
 			return &status.ScaleUpStatus{Result: status.ScaleUpError}, typedErr
 		}
@@ -686,24 +686,21 @@ groupsloop:
 	return result
 }
 
-func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *clusterstate.ClusterStateRegistry, info nodegroupset.ScaleUpInfo, gpuType string) errors.AutoscalerError {
+func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *clusterstate.ClusterStateRegistry, info nodegroupset.ScaleUpInfo, gpuType string, now time.Time) errors.AutoscalerError {
 	klog.V(0).Infof("Scale-up: setting group %s size to %d", info.Group.Id(), info.NewSize)
 	context.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaledUpGroup",
 		"Scale-up: setting group %s size to %d", info.Group.Id(), info.NewSize)
 	increase := info.NewSize - info.CurrentSize
 	if err := info.Group.IncreaseSize(increase); err != nil {
 		context.LogRecorder.Eventf(apiv1.EventTypeWarning, "FailedToScaleUpGroup", "Scale-up failed for group %s: %v", info.Group.Id(), err)
-		clusterStateRegistry.RegisterFailedScaleUp(info.Group, metrics.APIError)
+		clusterStateRegistry.RegisterFailedScaleUp(info.Group, metrics.APIError, now)
 		return errors.NewAutoscalerError(errors.CloudProviderError,
 			"failed to increase node group size: %v", err)
 	}
-	clusterStateRegistry.RegisterScaleUp(
-		&clusterstate.ScaleUpRequest{
-			NodeGroup:       info.Group,
-			Increase:        increase,
-			Time:            time.Now(),
-			ExpectedAddTime: time.Now().Add(context.MaxNodeProvisionTime),
-		})
+	clusterStateRegistry.RegisterOrUpdateScaleUp(
+		info.Group,
+		increase,
+		time.Now())
 	metrics.RegisterScaleUp(increase, gpuType)
 	context.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaledUpGroup",
 		"Scale-up: group %s size set to %d", info.Group.Id(), info.NewSize)
@@ -520,13 +520,12 @@ func TestScaleUpNodeComingNoScale(t *testing.T) {
 	}
 	context := NewScaleTestAutoscalingContext(options, fakeClient, provider)
 
-	clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, newBackoff())
-	clusterState.RegisterScaleUp(&clusterstate.ScaleUpRequest{
-		NodeGroup:       provider.GetNodeGroup("ng2"),
-		Increase:        1,
-		Time:            time.Now(),
-		ExpectedAddTime: time.Now().Add(5 * time.Minute),
-	})
+	clusterState := clusterstate.NewClusterStateRegistry(
+		provider,
+		clusterstate.ClusterStateRegistryConfig{MaxNodeProvisionTime: 5 * time.Minute},
+		context.LogRecorder,
+		newBackoff())
+	clusterState.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng2"), 1, time.Now())
 	clusterState.UpdateNodes([]*apiv1.Node{n1, n2}, time.Now())
 
 	p3 := BuildTestPod("p-new", 550, 0)
@@ -575,13 +574,14 @@ func TestScaleUpNodeComingHasScale(t *testing.T) {
 
 	context := NewScaleTestAutoscalingContext(defaultOptions, fakeClient, provider)
 
-	clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, newBackoff())
-	clusterState.RegisterScaleUp(&clusterstate.ScaleUpRequest{
-		NodeGroup:       provider.GetNodeGroup("ng2"),
-		Increase:        1,
-		Time:            time.Now(),
-		ExpectedAddTime: time.Now().Add(5 * time.Minute),
-	})
+	clusterState := clusterstate.NewClusterStateRegistry(
+		provider,
+		clusterstate.ClusterStateRegistryConfig{
+			MaxNodeProvisionTime: 5 * time.Minute,
+		},
+		context.LogRecorder,
+		newBackoff())
+	clusterState.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng2"), 1, time.Now())
 	clusterState.UpdateNodes([]*apiv1.Node{n1, n2}, time.Now())
 
 	p3 := BuildTestPod("p-new", 550, 0)
@@ -17,6 +17,7 @@ limitations under the License.
 package core
 
 import (
+	"fmt"
 	"time"
 
 	apiv1 "k8s.io/api/core/v1"
@@ -192,6 +193,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
 			return nil
 		}
 	}
 
 	if !a.clusterStateRegistry.IsClusterHealthy() {
 		klog.Warning("Cluster is not ready for autoscaling")
 		scaleDown.CleanUpUnneededNodes()
@@ -199,6 +201,8 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
 		return nil
 	}
 
+	a.deleteCreatedNodesWithErrors()
+
 	// Check if there has been a constant difference between the number of nodes in k8s and
 	// the number of nodes on the cloud provider side.
 	// TODO: andrewskim - add protection for ready AWS nodes.
@@ -398,6 +402,52 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
 	return nil
 }
 
+func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() {
+	// We always schedule deleting of incoming erroneous nodes
+	// TODO[lukaszos] Consider adding logic to not retry delete every loop iteration
+	nodes := a.clusterStateRegistry.GetCreatedNodesWithOutOfResourcesErrors()
+
+	nodeGroups := a.nodeGroupsById()
+	nodesToBeDeletedByNodeGroupId := make(map[string][]*apiv1.Node)
+
+	for _, node := range nodes {
+		nodeGroup, err := a.CloudProvider.NodeGroupForNode(node)
+		if err != nil {
+			id := "<nil>"
+			if node != nil {
+				id = node.Spec.ProviderID
+			}
+			klog.Warningf("Cannot determine nodeGroup for node %v; %v", id, err)
+			continue
+		}
+		nodesToBeDeletedByNodeGroupId[nodeGroup.Id()] = append(nodesToBeDeletedByNodeGroupId[nodeGroup.Id()], node)
+	}
+
+	for nodeGroupId, nodesToBeDeleted := range nodesToBeDeletedByNodeGroupId {
+		var err error
+		klog.V(1).Infof("Deleting %v from %v node group because of create errors", len(nodesToBeDeleted), nodeGroupId)
+
+		nodeGroup := nodeGroups[nodeGroupId]
+		if nodeGroup == nil {
+			err = fmt.Errorf("node group %s not found", nodeGroupId)
+		} else {
+			err = nodeGroup.DeleteNodes(nodesToBeDeleted)
+		}
+
+		if err != nil {
+			klog.Warningf("Error while trying to delete nodes from %v: %v", nodeGroupId, err)
+		}
+	}
+}
+
+func (a *StaticAutoscaler) nodeGroupsById() map[string]cloudprovider.NodeGroup {
+	nodeGroups := make(map[string]cloudprovider.NodeGroup)
+	for _, nodeGroup := range a.CloudProvider.NodeGroups() {
+		nodeGroups[nodeGroup.Id()] = nodeGroup
+	}
+	return nodeGroups
+}
+
 // don't consider pods newer than newPodScaleUpDelay seconds old as unschedulable
 func (a *StaticAutoscaler) filterOutYoungPods(allUnschedulablePods []*apiv1.Pod, currentTime time.Time) []*apiv1.Pod {
 	var oldUnschedulablePods []*apiv1.Pod
@@ -17,10 +17,13 @@ limitations under the License.
 package core
 
 import (
+	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
 	"reflect"
+	"strings"
 	"testing"
 	"time"
 
+	mockprovider "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/mocks"
 	testprovider "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/test"
 	"k8s.io/autoscaler/cluster-autoscaler/clusterstate"
 	"k8s.io/autoscaler/cluster-autoscaler/config"
@@ -692,3 +695,204 @@ func TestStaticAutoscalerRunOncePodsWithPriorities(t *testing.T) {
 		podDisruptionBudgetListerMock, daemonSetListerMock, onScaleUpMock, onScaleDownMock)
 
 }
+
+func TestStaticAutoscalerOutOfResources(t *testing.T) {
+
+	// setup
+	provider := &mockprovider.CloudProvider{}
+
+	// Create context with mocked lister registry.
+	options := config.AutoscalingOptions{
+		EstimatorName:                 estimator.BinpackingEstimatorName,
+		ScaleDownEnabled:              true,
+		ScaleDownUtilizationThreshold: 0.5,
+		MaxNodesTotal:                 10,
+		MaxCoresTotal:                 10,
+		MaxMemoryTotal:                100000,
+		ScaleDownUnreadyTime:          time.Minute,
+		ScaleDownUnneededTime:         time.Minute,
+		ExpendablePodsPriorityCutoff:  10,
+	}
+	context := NewScaleTestAutoscalingContext(options, &fake.Clientset{}, provider)
+
+	clusterStateConfig := clusterstate.ClusterStateRegistryConfig{
+		OkTotalUnreadyCount:  1,
+		MaxNodeProvisionTime: 10 * time.Second,
+	}
+
+	clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, newBackoff())
+	autoscaler := &StaticAutoscaler{
+		AutoscalingContext:    &context,
+		clusterStateRegistry:  clusterState,
+		lastScaleUpTime:       time.Now(),
+		lastScaleDownFailTime: time.Now(),
+	}
+
+	nodeGroupA := &mockprovider.NodeGroup{}
+	nodeGroupB := &mockprovider.NodeGroup{}
+
+	// Three nodes with out-of-resources errors
+	nodeGroupA.On("Exist").Return(true)
+	nodeGroupA.On("Autoprovisioned").Return(false)
+	nodeGroupA.On("TargetSize").Return(5, nil)
+	nodeGroupA.On("Id").Return("A")
+	nodeGroupA.On("DeleteNodes", mock.Anything).Return(nil)
+	nodeGroupA.On("Nodes").Return([]cloudprovider.Instance{
+		{
+			"A1",
+			&cloudprovider.InstanceStatus{
+				State: cloudprovider.InstanceRunning,
+			},
+		},
+		{
+			"A2",
+			&cloudprovider.InstanceStatus{
+				State: cloudprovider.InstanceCreating,
+			},
+		},
+		{
+			"A3",
+			&cloudprovider.InstanceStatus{
+				State: cloudprovider.InstanceCreating,
+				ErrorInfo: &cloudprovider.InstanceErrorInfo{
+					ErrorClass: cloudprovider.OutOfResourcesErrorClass,
+					ErrorCode:  "STOCKOUT",
+				},
+			},
+		},
+		{
+			"A4",
+			&cloudprovider.InstanceStatus{
+				State: cloudprovider.InstanceCreating,
+				ErrorInfo: &cloudprovider.InstanceErrorInfo{
+					ErrorClass: cloudprovider.OutOfResourcesErrorClass,
+					ErrorCode:  "STOCKOUT",
+				},
+			},
+		},
+		{
+			"A5",
+			&cloudprovider.InstanceStatus{
+				State: cloudprovider.InstanceCreating,
+				ErrorInfo: &cloudprovider.InstanceErrorInfo{
+					ErrorClass: cloudprovider.OutOfResourcesErrorClass,
+					ErrorCode:  "QUOTA",
+				},
+			},
+		},
+	}, nil).Twice()
+
+	nodeGroupB.On("Exist").Return(true)
+	nodeGroupB.On("Autoprovisioned").Return(false)
+	nodeGroupB.On("TargetSize").Return(5, nil)
+	nodeGroupB.On("Id").Return("B")
+	nodeGroupB.On("DeleteNodes", mock.Anything).Return(nil)
+	nodeGroupB.On("Nodes").Return([]cloudprovider.Instance{
+		{
+			"B1",
+			&cloudprovider.InstanceStatus{
+				State: cloudprovider.InstanceRunning,
+			},
+		},
+	}, nil)
+
+	provider.On("NodeGroups").Return([]cloudprovider.NodeGroup{nodeGroupA})
+	provider.On("NodeGroupForNode", mock.Anything).Return(
+		func(node *apiv1.Node) cloudprovider.NodeGroup {
+			if strings.HasPrefix(node.Spec.ProviderID, "A") {
+				return nodeGroupA
+			}
+			if strings.HasPrefix(node.Spec.ProviderID, "B") {
+				return nodeGroupB
+			}
+			return nil
+		}, nil)
+
+	now := time.Now()
+
+	// propagate nodes info in cluster state
+	clusterState.UpdateNodes([]*apiv1.Node{}, now)
+
+	// delete nodes with create errors
+	autoscaler.deleteCreatedNodesWithErrors()
+
+	// check delete was called on correct nodes
+	nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
+		func(nodes []*apiv1.Node) bool {
+			if len(nodes) != 3 {
+				return false
+			}
+			names := make(map[string]bool)
+			for _, node := range nodes {
+				names[node.Spec.ProviderID] = true
+			}
+			return names["A3"] && names["A4"] && names["A5"]
+		}))
+
+	// TODO assert that the scale-up failed (separately for QUOTA and STOCKOUT)
+
+	// propagate nodes info in cluster state again
+	// no changes in what provider returns
+	clusterState.UpdateNodes([]*apiv1.Node{}, now)
+
+	// delete nodes with create errors
+	autoscaler.deleteCreatedNodesWithErrors()
+
+	// nodes should be deleted again
+	nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
+		func(nodes []*apiv1.Node) bool {
+			if len(nodes) != 3 {
+				return false
+			}
+			names := make(map[string]bool)
+			for _, node := range nodes {
+				names[node.Spec.ProviderID] = true
+			}
+			return names["A3"] && names["A4"] && names["A5"]
+		}))
+
+	// TODO assert that the scale-up is not marked failed again
+
+	// restub node group A so nodes are no longer reporting errors
+	nodeGroupA.On("Nodes").Return([]cloudprovider.Instance{
+		{
+			"A1",
+			&cloudprovider.InstanceStatus{
+				State: cloudprovider.InstanceRunning,
+			},
+		},
+		{
+			"A2",
+			&cloudprovider.InstanceStatus{
+				State: cloudprovider.InstanceCreating,
+			},
+		},
+		{
+			"A3",
+			&cloudprovider.InstanceStatus{
+				State: cloudprovider.InstanceDeleting,
+			},
+		},
+		{
+			"A4",
+			&cloudprovider.InstanceStatus{
+				State: cloudprovider.InstanceDeleting,
+			},
+		},
+		{
+			"A5",
+			&cloudprovider.InstanceStatus{
+				State: cloudprovider.InstanceDeleting,
+			},
+		},
+	}, nil)
+
+	// update cluster state
+	clusterState.UpdateNodes([]*apiv1.Node{}, now)
+
+	// delete nodes with create errors
+	autoscaler.deleteCreatedNodesWithErrors()
+
+	// we expect no more DeleteNodes calls
+	nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", 2)
+}
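For reference, a hedged sketch of the provider-side contract this test exercises: a cloud provider participates in the new stockout/quota handling simply by reporting failed creations through Instance.Status in NodeGroup.Nodes(). The identifier and message below are hypothetical:

	// Editor's sketch, not part of the commit.
	func stockedOutInstance() cloudprovider.Instance {
		return cloudprovider.Instance{
			Id: "provider://zone/instance-1", // matches what the node's ProviderID would be
			Status: &cloudprovider.InstanceStatus{
				State: cloudprovider.InstanceCreating, // still pending, never became a node
				ErrorInfo: &cloudprovider.InstanceErrorInfo{
					ErrorClass:   cloudprovider.OutOfResourcesErrorClass,
					ErrorCode:    "STOCKOUT",
					ErrorMessage: "zone does not have enough resources",
				},
			},
		}
	}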