From bf8cfef10b98657c37ce0ba93f68bcb47c1f7194 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Osipiuk?= Date: Thu, 13 Sep 2018 16:59:44 +0200 Subject: [PATCH] NodeGroupManager.CreateNodeGroup can return extra created node groups. --- cluster-autoscaler/core/scale_test_common.go | 8 +++-- cluster-autoscaler/core/scale_up.go | 13 ++++++- cluster-autoscaler/core/utils.go | 34 ++++++++++++------- .../nodegroups/nodegroup_manager.go | 17 ++++++++-- 4 files changed, 54 insertions(+), 18 deletions(-) diff --git a/cluster-autoscaler/core/scale_test_common.go b/cluster-autoscaler/core/scale_test_common.go index 57fd9215c1..7c406d588b 100644 --- a/cluster-autoscaler/core/scale_test_common.go +++ b/cluster-autoscaler/core/scale_test_common.go @@ -26,6 +26,7 @@ import ( "k8s.io/autoscaler/cluster-autoscaler/context" "k8s.io/autoscaler/cluster-autoscaler/expander/random" "k8s.io/autoscaler/cluster-autoscaler/metrics" + "k8s.io/autoscaler/cluster-autoscaler/processors/nodegroups" "k8s.io/autoscaler/cluster-autoscaler/simulator" "k8s.io/autoscaler/cluster-autoscaler/utils/errors" "k8s.io/autoscaler/cluster-autoscaler/utils/labels" @@ -92,11 +93,14 @@ type mockAutoprovisioningNodeGroupManager struct { t *testing.T } -func (p *mockAutoprovisioningNodeGroupManager) CreateNodeGroup(context *context.AutoscalingContext, nodeGroup cloudprovider.NodeGroup) (cloudprovider.NodeGroup, errors.AutoscalerError) { +func (p *mockAutoprovisioningNodeGroupManager) CreateNodeGroup(context *context.AutoscalingContext, nodeGroup cloudprovider.NodeGroup) (nodegroups.CreateNodeGroupResult, errors.AutoscalerError) { newNodeGroup, err := nodeGroup.Create() assert.NoError(p.t, err) metrics.RegisterNodeGroupCreation() - return newNodeGroup, nil + result := nodegroups.CreateNodeGroupResult{ + MainCreatedNodeGroup: newNodeGroup, + } + return result, nil } func (p *mockAutoprovisioningNodeGroupManager) RemoveUnneededNodeGroups(context *context.AutoscalingContext) error { diff --git a/cluster-autoscaler/core/scale_up.go b/cluster-autoscaler/core/scale_up.go index faea9a4651..519b5667ef 100644 --- a/cluster-autoscaler/core/scale_up.go +++ b/cluster-autoscaler/core/scale_up.go @@ -455,7 +455,8 @@ func ScaleUp(context *context.AutoscalingContext, processors *ca_processors.Auto if !bestOption.NodeGroup.Exist() { oldId := bestOption.NodeGroup.Id() - bestOption.NodeGroup, err = processors.NodeGroupManager.CreateNodeGroup(context, bestOption.NodeGroup) + createNodeGroupResult, err := processors.NodeGroupManager.CreateNodeGroup(context, bestOption.NodeGroup) + bestOption.NodeGroup = createNodeGroupResult.MainCreatedNodeGroup if err != nil { return nil, err } @@ -466,6 +467,16 @@ func ScaleUp(context *context.AutoscalingContext, processors *ca_processors.Auto nodeInfos[bestOption.NodeGroup.Id()] = nodeInfos[oldId] delete(nodeInfos, oldId) } + + for _, nodeGroup := range createNodeGroupResult.ExtraCreatedNodeGroups { + nodeInfo, err := GetNodeInfoFromTemplate(nodeGroup, daemonSets, context.PredicateChecker) + + if err != nil { + glog.Warning("Cannot build node info for newly created extra node group %v; balancing similar node groups will not work; err=%v", nodeGroup.Id(), err) + continue + } + nodeInfos[nodeGroup.Id()] = nodeInfo + } } nodeInfo, found := nodeInfos[bestOption.NodeGroup.Id()] diff --git a/cluster-autoscaler/core/utils.go b/cluster-autoscaler/core/utils.go index 3a20710a00..8dd6b3030b 100644 --- a/cluster-autoscaler/core/utils.go +++ b/cluster-autoscaler/core/utils.go @@ -266,25 +266,16 @@ func GetNodeInfosForGroups(nodes []*apiv1.Node, cloudProvider cloudprovider.Clou // No good template, trying to generate one. This is called only if there are no // working nodes in the node groups. By default CA tries to use a real-world example. - baseNodeInfo, err := nodeGroup.TemplateNodeInfo() + nodeInfo, err := GetNodeInfoFromTemplate(nodeGroup, daemonsets, predicateChecker) if err != nil { if err == cloudprovider.ErrNotImplemented { continue } else { glog.Errorf("Unable to build proper template node for %s: %v", id, err) - return map[string]*schedulercache.NodeInfo{}, errors.ToAutoscalerError( - errors.CloudProviderError, err) + return map[string]*schedulercache.NodeInfo{}, errors.ToAutoscalerError(errors.CloudProviderError, err) } } - pods := daemonset.GetDaemonSetPodsForNode(baseNodeInfo, daemonsets, predicateChecker) - pods = append(pods, baseNodeInfo.Pods()...) - fullNodeInfo := schedulercache.NewNodeInfo(pods...) - fullNodeInfo.SetNode(baseNodeInfo.Node()) - sanitizedNodeInfo, typedErr := sanitizeNodeInfo(fullNodeInfo, id) - if typedErr != nil { - return map[string]*schedulercache.NodeInfo{}, typedErr - } - result[id] = sanitizedNodeInfo + result[id] = nodeInfo } // Last resort - unready/unschedulable nodes. @@ -309,6 +300,25 @@ func GetNodeInfosForGroups(nodes []*apiv1.Node, cloudProvider cloudprovider.Clou return result, nil } +// GetNodeInfoFromTemplate returns NodeInfo object built base on TemplateNodeInfo returned by NodeGroup.TemplateNodeInfo(). +func GetNodeInfoFromTemplate(nodeGroup cloudprovider.NodeGroup, daemonsets []*extensionsv1.DaemonSet, predicateChecker *simulator.PredicateChecker) (*schedulercache.NodeInfo, errors.AutoscalerError) { + id := nodeGroup.Id() + baseNodeInfo, err := nodeGroup.TemplateNodeInfo() + if err != nil { + return nil, errors.ToAutoscalerError(errors.CloudProviderError, err) + } + + pods := daemonset.GetDaemonSetPodsForNode(baseNodeInfo, daemonsets, predicateChecker) + pods = append(pods, baseNodeInfo.Pods()...) + fullNodeInfo := schedulercache.NewNodeInfo(pods...) + fullNodeInfo.SetNode(baseNodeInfo.Node()) + sanitizedNodeInfo, typedErr := sanitizeNodeInfo(fullNodeInfo, id) + if typedErr != nil { + return nil, typedErr + } + return sanitizedNodeInfo, nil +} + // FilterOutNodesFromNotAutoscaledGroups return subset of input nodes for which cloud provider does not // return autoscaled node group. func FilterOutNodesFromNotAutoscaledGroups(nodes []*apiv1.Node, cloudProvider cloudprovider.CloudProvider) ([]*apiv1.Node, errors.AutoscalerError) { diff --git a/cluster-autoscaler/processors/nodegroups/nodegroup_manager.go b/cluster-autoscaler/processors/nodegroups/nodegroup_manager.go index c918ff22ef..baedfe1734 100644 --- a/cluster-autoscaler/processors/nodegroups/nodegroup_manager.go +++ b/cluster-autoscaler/processors/nodegroups/nodegroup_manager.go @@ -24,7 +24,7 @@ import ( // NodeGroupManager is responsible for creating/deleting node groups. type NodeGroupManager interface { - CreateNodeGroup(context *context.AutoscalingContext, nodeGroup cloudprovider.NodeGroup) (cloudprovider.NodeGroup, errors.AutoscalerError) + CreateNodeGroup(context *context.AutoscalingContext, nodeGroup cloudprovider.NodeGroup) (CreateNodeGroupResult, errors.AutoscalerError) RemoveUnneededNodeGroups(context *context.AutoscalingContext) error CleanUp() } @@ -35,9 +35,20 @@ type NodeGroupManager interface { type NoOpNodeGroupManager struct { } +// CreateNodeGroupResult result captures result of successful NodeGroupManager.CreateNodeGroup call. +type CreateNodeGroupResult struct { + // Main created node group, matching the requested node group passed to CreateNodeGroup call + MainCreatedNodeGroup cloudprovider.NodeGroup + + // List of extra node groups created by CreateNodeGroup call. Non-empty if due manager specific + // constraints creating one node group requires creating other ones (e.g. matching node group + // must exist in each zone for multizonal deployments) + ExtraCreatedNodeGroups []cloudprovider.NodeGroup +} + // CreateNodeGroup always returns internal error. It must not be called on NoOpNodeGroupManager. -func (*NoOpNodeGroupManager) CreateNodeGroup(context *context.AutoscalingContext, nodeGroup cloudprovider.NodeGroup) (cloudprovider.NodeGroup, errors.AutoscalerError) { - return nil, errors.NewAutoscalerError(errors.InternalError, "not implemented") +func (*NoOpNodeGroupManager) CreateNodeGroup(context *context.AutoscalingContext, nodeGroup cloudprovider.NodeGroup) (CreateNodeGroupResult, errors.AutoscalerError) { + return CreateNodeGroupResult{}, errors.NewAutoscalerError(errors.InternalError, "not implemented") } // RemoveUnneededNodeGroups does nothing in NoOpNodeGroupManager