Filter upcoming nodes in clusterstate and scale-up executor
parent bb94d270d7
commit 72ec806382
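In short, the change swaps direct csr.cloudProvider.NodeGroups() iteration for a getRunningNodeGroups() helper that drops node groups whose asynchronous creation is still in flight, and teaches the scale-up paths to skip those upcoming groups. A minimal, self-contained sketch of that filtering pattern follows; the NodeGroup struct and isUpcoming callback are simplified stand-ins, not the real cloudprovider or asyncnodegroups APIs.

package main

import "fmt"

// NodeGroup is a simplified stand-in for cloudprovider.NodeGroup.
type NodeGroup struct {
	ID       string
	Upcoming bool // true while the group is still being created asynchronously
}

// runningNodeGroups mirrors the idea of getRunningNodeGroups: keep only node
// groups that are not upcoming, so bookkeeping loops never see half-created groups.
func runningNodeGroups(all []NodeGroup, isUpcoming func(NodeGroup) bool) []NodeGroup {
	result := make([]NodeGroup, 0, len(all))
	for _, ng := range all {
		if !isUpcoming(ng) {
			result = append(result, ng)
		}
	}
	return result
}

func main() {
	groups := []NodeGroup{
		{ID: "ng-ready"},
		{ID: "ng-being-created", Upcoming: true},
	}
	running := runningNodeGroups(groups, func(ng NodeGroup) bool { return ng.Upcoming })
	for _, ng := range running {
		fmt.Println("tracking", ng.ID) // prints only ng-ready
	}
}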
@@ -263,6 +263,9 @@ func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) {
 	csr.backoff.RemoveStaleBackoffData(currentTime)
 
 	for nodeGroupName, scaleUpRequest := range csr.scaleUpRequests {
+		if csr.asyncNodeGroupStateChecker.IsUpcoming(scaleUpRequest.NodeGroup) {
+			continue
+		}
 		if !csr.areThereUpcomingNodesInNodeGroup(nodeGroupName) {
			// scale up finished successfully, remove request
			delete(csr.scaleUpRequests, nodeGroupName)
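The hunk above keeps a scale-up request alive while its node group is still upcoming, so the request is not treated as finished before the group even materializes. A rough standalone sketch of that skip-while-upcoming loop, with a simplified request type rather than the registry's real ScaleUpRequest:

package main

import "fmt"

// scaleUpRequest is a simplified stand-in for the registry's ScaleUpRequest.
type scaleUpRequest struct {
	upcomingGroup bool // the node group itself is still being created asynchronously
	upcomingNodes int  // nodes requested but not yet registered
}

// pruneFinishedScaleUps mirrors the new guard: requests whose node group is
// still upcoming are left alone; other requests are dropped once no upcoming
// nodes remain (the scale-up finished).
func pruneFinishedScaleUps(requests map[string]*scaleUpRequest) {
	for name, req := range requests {
		if req.upcomingGroup {
			continue
		}
		if req.upcomingNodes == 0 {
			delete(requests, name)
		}
	}
}

func main() {
	reqs := map[string]*scaleUpRequest{
		"upcoming-ng": {upcomingGroup: true},
		"finished-ng": {upcomingNodes: 0},
		"pending-ng":  {upcomingNodes: 2},
	}
	pruneFinishedScaleUps(reqs)
	fmt.Println(len(reqs)) // 2: upcoming-ng and pending-ng survive
}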
@@ -450,10 +453,7 @@ func (csr *ClusterStateRegistry) IsNodeGroupHealthy(nodeGroupName string) bool {
 func (csr *ClusterStateRegistry) updateNodeGroupMetrics() {
 	autoscaled := 0
 	autoprovisioned := 0
-	for _, nodeGroup := range csr.cloudProvider.NodeGroups() {
-		if !nodeGroup.Exist() {
-			continue
-		}
+	for _, nodeGroup := range csr.getRunningNodeGroups() {
 		if nodeGroup.Autoprovisioned() {
 			autoprovisioned++
 		} else {
@@ -509,6 +509,12 @@ func (csr *ClusterStateRegistry) areThereUpcomingNodesInNodeGroup(nodeGroupName
 	return target > provisioned
 }
 
+// IsNodeGroupRegistered returns true if the node group is registered in cluster state.
+func (csr *ClusterStateRegistry) IsNodeGroupRegistered(nodeGroupName string) bool {
+	_, found := csr.acceptableRanges[nodeGroupName]
+	return found
+}
+
 // IsNodeGroupAtTargetSize returns true if the number of nodes provisioned in the group is equal to the target number of nodes.
 func (csr *ClusterStateRegistry) IsNodeGroupAtTargetSize(nodeGroupName string) bool {
 	provisioned, target, ok := csr.getProvisionedAndTargetSizesForNodeGroup(nodeGroupName)
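The new IsNodeGroupRegistered treats presence in acceptableRanges as the registration signal, which only holds after updateAcceptableRanges has run (for example via Recalculate, as the new test further down does). A toy illustration of that membership check; the AcceptableRange shape here is made up for the example:

package main

import "fmt"

// AcceptableRange is a made-up shape for the registry's per-group size range.
type AcceptableRange struct{ MinNodes, MaxNodes int }

func main() {
	// updateAcceptableRanges populates one entry per running node group, so
	// IsNodeGroupRegistered reduces to a map-membership check.
	acceptableRanges := map[string]AcceptableRange{
		"registered-node-group": {MinNodes: 1, MaxNodes: 13},
	}

	_, found := acceptableRanges["registered-node-group"]
	fmt.Println(found) // true

	_, found = acceptableRanges["unregistered-node-group"]
	fmt.Println(found) // false
}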
@@ -555,7 +561,7 @@ type AcceptableRange struct {
 // the expected number of ready nodes is between targetSize and targetSize + 3.
 func (csr *ClusterStateRegistry) updateAcceptableRanges(targetSize map[string]int) {
 	result := make(map[string]AcceptableRange)
-	for _, nodeGroup := range csr.cloudProvider.NodeGroups() {
+	for _, nodeGroup := range csr.getRunningNodeGroups() {
 		size := targetSize[nodeGroup.Id()]
 		readiness := csr.perNodeGroupReadiness[nodeGroup.Id()]
 		result[nodeGroup.Id()] = AcceptableRange{
@@ -681,17 +687,12 @@ func (csr *ClusterStateRegistry) updateReadinessStats(currentTime time.Time) {
 // Calculates which node groups have incorrect size.
 func (csr *ClusterStateRegistry) updateIncorrectNodeGroupSizes(currentTime time.Time) {
 	result := make(map[string]IncorrectNodeGroupSize)
-	for _, nodeGroup := range csr.cloudProvider.NodeGroups() {
+	for _, nodeGroup := range csr.getRunningNodeGroups() {
 		acceptableRange, found := csr.acceptableRanges[nodeGroup.Id()]
 		if !found {
 			klog.Warningf("Acceptable range for node group %s not found", nodeGroup.Id())
 			continue
 		}
-		if csr.asyncNodeGroupStateChecker.IsUpcoming(nodeGroup) {
-			// Nodes for upcoming node groups reside in-memory and wait for node group to be fully
-			// created. There is no need to mark their sizes incorrect.
-			continue
-		}
 		readiness, found := csr.perNodeGroupReadiness[nodeGroup.Id()]
 		if !found {
 			// if MinNodes == 0 node group has been scaled to 0 and everything's fine
@@ -781,7 +782,7 @@ func (csr *ClusterStateRegistry) GetStatus(now time.Time) *api.ClusterAutoscaler
 	for _, nodeGroup := range csr.lastStatus.NodeGroups {
 		nodeGroupsLastStatus[nodeGroup.Name] = nodeGroup
 	}
-	for _, nodeGroup := range csr.cloudProvider.NodeGroups() {
+	for _, nodeGroup := range csr.getRunningNodeGroups() {
 		nodeGroupStatus := api.NodeGroupStatus{
 			Name: nodeGroup.Id(),
 		}
@@ -1014,10 +1015,22 @@ func (csr *ClusterStateRegistry) GetUpcomingNodes() (upcomingCounts map[string]i
 	return upcomingCounts, registeredNodeNames
 }
 
+// getRunningNodeGroups returns running node groups, filters out upcoming ones.
+func (csr *ClusterStateRegistry) getRunningNodeGroups() []cloudprovider.NodeGroup {
+	nodeGroups := csr.cloudProvider.NodeGroups()
+	result := make([]cloudprovider.NodeGroup, 0, len(nodeGroups))
+	for _, nodeGroup := range nodeGroups {
+		if !csr.asyncNodeGroupStateChecker.IsUpcoming(nodeGroup) {
+			result = append(result, nodeGroup)
+		}
+	}
+	return result
+}
+
 // getCloudProviderNodeInstances returns map keyed on node group id where value is list of node instances
 // as returned by NodeGroup.Nodes().
 func (csr *ClusterStateRegistry) getCloudProviderNodeInstances() (map[string][]cloudprovider.Instance, error) {
-	for _, nodeGroup := range csr.cloudProvider.NodeGroups() {
+	for _, nodeGroup := range csr.getRunningNodeGroups() {
 		if csr.IsNodeGroupScalingUp(nodeGroup.Id()) {
 			csr.cloudProviderNodeInstancesCache.InvalidateCacheEntry(nodeGroup)
 		}
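One way to pin down the helper's contract is a small test against a stub upcoming-checker. The sketch below assumes the simplified NodeGroup type and runningNodeGroups helper from the standalone example near the top of this page (it would sit beside it as a _test.go file); it does not exercise the real ClusterStateRegistry wiring.

package main

import "testing"

// TestRunningNodeGroupsFiltersUpcoming checks that only groups the stub
// checker does not report as upcoming survive the filter.
func TestRunningNodeGroupsFiltersUpcoming(t *testing.T) {
	groups := []NodeGroup{
		{ID: "running-1"},
		{ID: "upcoming-1", Upcoming: true},
		{ID: "running-2"},
	}
	got := runningNodeGroups(groups, func(ng NodeGroup) bool { return ng.Upcoming })
	if len(got) != 2 || got[0].ID != "running-1" || got[1].ID != "running-2" {
		t.Fatalf("unexpected filtered groups: %+v", got)
	}
}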
@@ -1089,7 +1102,7 @@ func (csr *ClusterStateRegistry) GetAutoscaledNodesCount() (currentSize, targetS
 }
 
 func (csr *ClusterStateRegistry) handleInstanceCreationErrors(currentTime time.Time) {
-	nodeGroups := csr.cloudProvider.NodeGroups()
+	nodeGroups := csr.getRunningNodeGroups()
 
 	for _, nodeGroup := range nodeGroups {
 		csr.handleInstanceCreationErrorsForNodeGroup(
@@ -1225,10 +1225,11 @@ func TestUpdateAcceptableRanges(t *testing.T) {
 			}
 
 			clusterState := &ClusterStateRegistry{
 				cloudProvider:         provider,
 				perNodeGroupReadiness: tc.readiness,
 				scaleUpRequests:       tc.scaleUpRequests,
 				scaleDownRequests:     scaleDownRequests,
+				asyncNodeGroupStateChecker: asyncnodegroups.NewDefaultAsyncNodeGroupStateChecker(),
 			}
 
 			clusterState.updateAcceptableRanges(tc.targetSizes)
@@ -1456,6 +1457,43 @@ func TestTruncateIfExceedMaxSize(t *testing.T) {
 	}
 }
 
+func TestIsNodeGroupRegistered(t *testing.T) {
+	provider := testprovider.NewTestCloudProvider(nil, nil)
+	registeredNodeGroupName := "registered-node-group"
+	provider.AddNodeGroup(registeredNodeGroupName, 1, 10, 1)
+	fakeClient := &fake.Clientset{}
+	fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "some-map")
+	clusterstate := NewClusterStateRegistry(
+		provider,
+		ClusterStateRegistryConfig{MaxTotalUnreadyPercentage: 10, OkTotalUnreadyCount: 1},
+		fakeLogRecorder,
+		newBackoff(),
+		nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}),
+		asyncnodegroups.NewDefaultAsyncNodeGroupStateChecker(),
+	)
+	clusterstate.Recalculate()
+
+	testCases := []struct {
+		nodeGroupName string
+		want          bool
+	}{
+		{
+			nodeGroupName: registeredNodeGroupName,
+			want:          true,
+		},
+		{
+			nodeGroupName: "unregistered-node-group",
+			want:          false,
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.nodeGroupName, func(t *testing.T) {
+			registered := clusterstate.IsNodeGroupRegistered(tc.nodeGroupName)
+			assert.Equal(t, tc.want, registered)
+		})
+	}
+}
+
 func TestUpcomingNodesFromUpcomingNodeGroups(t *testing.T) {
 
 	testCases := []struct {
@@ -176,10 +176,9 @@ func (e *scaleUpExecutor) executeScaleUp(
 	if increase < 0 {
 		return errors.NewAutoscalerError(errors.InternalError, fmt.Sprintf("increase in number of nodes cannot be negative, got: %v", increase))
 	}
-	if e.asyncNodeGroupStateChecker.IsUpcoming(info.Group) {
+	if !info.Group.Exist() && e.asyncNodeGroupStateChecker.IsUpcoming(info.Group) {
 		// Don't emit scale up event for upcoming node group as it will be generated after
 		// the node group is created, during initial scale up.
-		klog.V(0).Infof("Scale-up: group %s is an upcoming node group, skipping emit scale up event", info.Group.Id())
 		return nil
 	}
 	e.scaleStateNotifier.RegisterScaleUp(info.Group, increase, time.Now())
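The executor hunk tightens the skip condition: the scale-up event is deferred only when the group has not been created yet and is still tracked as upcoming, and the old V(0) log line goes away. A hedged reading of that guard as a standalone predicate, with stub fields standing in for NodeGroup.Exist() and the async state checker:

package main

import "fmt"

// groupInfo is a stub carrying just the two signals the new guard combines.
type groupInfo struct {
	exists   bool // what NodeGroup.Exist() would report
	upcoming bool // what asyncNodeGroupStateChecker.IsUpcoming would report
}

// skipScaleUpEvent mirrors the tightened condition: defer the event only for
// a group that has not been created yet and is still tracked as upcoming.
func skipScaleUpEvent(g groupInfo) bool {
	return !g.exists && g.upcoming
}

func main() {
	fmt.Println(skipScaleUpEvent(groupInfo{exists: false, upcoming: true})) // true: defer to initial scale-up after creation
	fmt.Println(skipScaleUpEvent(groupInfo{exists: true, upcoming: true}))  // false: group already exists, emit the event
	fmt.Println(skipScaleUpEvent(groupInfo{exists: true, upcoming: false})) // false
}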