From 982f9e41a39e6f73bc845cc0141c3885ea7a36d3 Mon Sep 17 00:00:00 2001 From: Nic Cope Date: Fri, 10 Nov 2017 18:44:31 -0800 Subject: [PATCH] Support autodetection of GCE managed instance groups by name prefix This commit adds a new usage of the --node-group-auto-discovery flag intended for use with the GCE cloud provider. GCE instance groups can be automatically discovered based on a prefix of their group name. Example usage: --node-group-auto-discovery=mig:prefix=k8s-mig,minNodes=0,maxNodes=10 Note that unlike the existing AWS ASG autodetection functionality we must specify the min and max nodes in the flag. This is because MIGs store only a target size in the GCE API - they do not have a min and max size we can infer via the API. In order to alleviate this limitation a little we allow multiple uses of the autodiscovery flag. For example to discover two classes (big and small) of instance groups with different size limits: ./cluster-autoscaler \ --node-group-auto-discovery=mig:prefix=k8s-a-small,minNodes=1,maxNodes=10 \ --node-group-auto-discovery=mig:prefix=k8s-a-big,minNodes=1,maxNodes=100 Zonal clusters (i.e. multizone = false in the cloud config) will detect all managed instance groups within the cluster's zone. Regional clusters will detect all matching (zonal) managed instance groups within any of that region's zones. --- .../cloudprovider/aws/aws_cloud_provider.go | 36 +++-- .../aws/aws_cloud_provider_test.go | 19 +++ .../builder/cloud_provider_builder.go | 2 +- .../cloudprovider/gce/gce_cloud_provider.go | 100 +++++++++++++- .../gce/gce_cloud_provider_test.go | 126 +++++++++++++++--- .../cloudprovider/gce/gce_manager.go | 61 ++++++++- .../cloudprovider/gce/gce_manager_test.go | 109 ++++++++++++++- .../node_group_discovery_options.go | 16 ++- .../node_group_discovery_options_test.go | 11 +- cluster-autoscaler/core/autoscaler.go | 4 +- .../core/autoscaling_context.go | 6 +- cluster-autoscaler/main.go | 12 +- 12 files changed, 450 insertions(+), 52 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider.go b/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider.go index cd18cad2f1..5c701460e4 100644 --- a/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider.go @@ -45,12 +45,12 @@ func BuildAwsCloudProvider(awsManager *AwsManager, discoveryOpts cloudprovider.N return buildStaticallyDiscoveringProvider(awsManager, discoveryOpts.NodeGroupSpecs, resourceLimiter) } if discoveryOpts.AutoDiscoverySpecified() { - return buildAutoDiscoveringProvider(awsManager, discoveryOpts.NodeGroupAutoDiscoverySpec, resourceLimiter) + return buildAutoDiscoveringProvider(awsManager, discoveryOpts.NodeGroupAutoDiscoverySpecs, resourceLimiter) } return nil, fmt.Errorf("Failed to build an aws cloud provider: Either node group specs or node group auto discovery spec must be specified") } -func buildAutoDiscoveringProvider(awsManager *AwsManager, spec string, resourceLimiter *cloudprovider.ResourceLimiter) (*awsCloudProvider, error) { +func parseAutoDiscoverySpec(spec string) ([]string, error) { tokens := strings.Split(spec, ":") if len(tokens) != 2 { return nil, fmt.Errorf("Invalid node group auto discovery spec specified via --node-group-auto-discovery: %s", spec) @@ -72,20 +72,36 @@ func buildAutoDiscoveringProvider(awsManager *AwsManager, spec string, resourceL // Use the k8s cluster name tag to only discover asgs of the cluster denoted by clusterName // See https://github.com/kubernetes/kubernetes/blob/9ef85a7/pkg/cloudprovider/providers/aws/tags.go#L30-L34 // for more information about the tag - tags := strings.Split(tag, ",") - asgs, err := awsManager.getAutoscalingGroupsByTags(tags) - if err != nil { - return nil, fmt.Errorf("Failed to get ASGs: %v", err) - } - + return strings.Split(tag, ","), nil +} +func buildAutoDiscoveringProvider(awsManager *AwsManager, specs []string, resourceLimiter *cloudprovider.ResourceLimiter) (*awsCloudProvider, error) { aws := &awsCloudProvider{ awsManager: awsManager, asgs: make([]*Asg, 0), resourceLimiter: resourceLimiter, } - for _, asg := range asgs { - aws.addAsg(buildAsg(aws.awsManager, int(*asg.MinSize), int(*asg.MaxSize), *asg.AutoScalingGroupName)) + + seen := make(map[string]bool) + for _, spec := range specs { + tags, err := parseAutoDiscoverySpec(spec) + if err != nil { + return nil, err + } + asgs, err := awsManager.getAutoscalingGroupsByTags(tags) + if err != nil { + return nil, fmt.Errorf("Failed to get ASGs: %v", err) + } + for _, asg := range asgs { + // An ASG might match more than one provided spec, but we only ever + // want to add it once. + if seen[*asg.AutoScalingGroupARN] { + continue + } + seen[*asg.AutoScalingGroupARN] = true + aws.addAsg(buildAsg(aws.awsManager, int(*asg.MinSize), int(*asg.MaxSize), *asg.AutoScalingGroupName)) + } } + return aws, nil } diff --git a/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider_test.go b/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider_test.go index 0d35c44f9a..a66c61c0e4 100644 --- a/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider_test.go +++ b/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider_test.go @@ -131,6 +131,25 @@ func TestBuildAwsCloudProvider(t *testing.T) { assert.NoError(t, err) } +func TestParseAutoDiscoverySpec(t *testing.T) { + want := []string{"coolTag", "anotherTag"} + got, err := parseAutoDiscoverySpec("asg:tag=coolTag,anotherTag") + assert.NoError(t, err) + assert.Equal(t, want, got) + + badSpecs := []string{ + "asg", + "tag=coolTag,anotherTag", + "mig:tag=coolTag,anotherTag", + "asg:notatag=coolTag,anotherTag", + } + + for _, spec := range badSpecs { + _, err = parseAutoDiscoverySpec(spec) + assert.Error(t, err) + } +} + func TestAddNodeGroup(t *testing.T) { provider := testProvider(t, testAwsManager) err := provider.addNodeGroup("bad spec") diff --git a/cluster-autoscaler/cloudprovider/builder/cloud_provider_builder.go b/cluster-autoscaler/cloudprovider/builder/cloud_provider_builder.go index 62e83d4e3d..d233a2b98f 100644 --- a/cluster-autoscaler/cloudprovider/builder/cloud_provider_builder.go +++ b/cluster-autoscaler/cloudprovider/builder/cloud_provider_builder.go @@ -85,7 +85,7 @@ func (b CloudProviderBuilder) Build(discoveryOpts cloudprovider.NodeGroupDiscove if gceError != nil { glog.Fatalf("Failed to create GCE Manager: %v", gceError) } - cloudProvider, err = gce.BuildGceCloudProvider(gceManager, nodeGroupsFlag, resourceLimiter) + cloudProvider, err = gce.BuildGceCloudProvider(gceManager, discoveryOpts, resourceLimiter) if err != nil { glog.Fatalf("Failed to create GCE cloud provider: %v", err) } diff --git a/cluster-autoscaler/cloudprovider/gce/gce_cloud_provider.go b/cluster-autoscaler/cloudprovider/gce/gce_cloud_provider.go index d22f955d6f..0f58709726 100644 --- a/cluster-autoscaler/cloudprovider/gce/gce_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/gce/gce_cloud_provider.go @@ -18,6 +18,8 @@ package gce import ( "fmt" + "regexp" + "strconv" "strings" "time" @@ -33,8 +35,19 @@ import ( const ( maxAutoprovisionedSize = 1000 minAutoprovisionedSize = 0 + + autoDiscovererTypeMIG = "mig" + autoDiscovererKeyPrefix = "prefix" + autoDiscovererKeyMinNodes = "min" + autoDiscovererKeyMaxNodes = "max" ) +var validAutoDiscovererKeys = strings.Join([]string{ + autoDiscovererKeyPrefix, + autoDiscovererKeyMinNodes, + autoDiscovererKeyMaxNodes, +}, ", ") + // Big machines are temporarily commented out. // TODO(mwielgus): get this list programatically var autoprovisionedMachineTypes = []string{ @@ -67,11 +80,20 @@ type GceCloudProvider struct { } // BuildGceCloudProvider builds CloudProvider implementation for GCE. -func BuildGceCloudProvider(gceManager GceManager, specs []string, resourceLimiter *cloudprovider.ResourceLimiter) (*GceCloudProvider, error) { - if gceManager.getMode() == ModeGKE && len(specs) != 0 { +func BuildGceCloudProvider(gceManager GceManager, do cloudprovider.NodeGroupDiscoveryOptions, resourceLimiter *cloudprovider.ResourceLimiter) (*GceCloudProvider, error) { + if err := do.Validate(); err != nil { + return nil, fmt.Errorf("Failed to build a GCE cloud provider: %v", err) + } + if gceManager.getMode() == ModeGKE && !do.NoDiscoverySpecified() { return nil, fmt.Errorf("GKE gets nodegroup specification via API, command line specs are not allowed") } + if do.AutoDiscoverySpecified() { + return buildAutoDiscoveringProvider(gceManager, do.NodeGroupAutoDiscoverySpecs, resourceLimiter) + } + return buildStaticallyDiscoveringProvider(gceManager, do.NodeGroupSpecs, resourceLimiter) +} +func buildStaticallyDiscoveringProvider(gceManager GceManager, specs []string, resourceLimiter *cloudprovider.ResourceLimiter) (*GceCloudProvider, error) { gce := &GceCloudProvider{ gceManager: gceManager, resourceLimiterFromFlags: resourceLimiter, @@ -84,6 +106,80 @@ func BuildGceCloudProvider(gceManager GceManager, specs []string, resourceLimite return gce, nil } +type autoDiscovererConfig struct { + migRe *regexp.Regexp + minNodes string + maxNodes string +} + +func parseAutoDiscoverySpec(spec string) (autoDiscovererConfig, error) { + cfg := autoDiscovererConfig{} + + tokens := strings.Split(spec, ":") + if len(tokens) != 2 { + return cfg, fmt.Errorf("spec \"%s\" should be discoverer:key=value,key=value", spec) + } + discoverer := tokens[0] + if discoverer != autoDiscovererTypeMIG { + return cfg, fmt.Errorf("unsupported discoverer specified: %s", discoverer) + } + + for _, arg := range strings.Split(tokens[1], ",") { + kv := strings.Split(arg, "=") + k, v := kv[0], kv[1] + + switch k { + case autoDiscovererKeyPrefix: + var err error + if cfg.migRe, err = regexp.Compile(fmt.Sprintf("^%s.+", v)); err != nil { + return cfg, fmt.Errorf("invalid instance group name prefix \"%s\" - \"^%s.+\" must be a valid RE2 regexp", v, v) + } + case autoDiscovererKeyMinNodes: + if _, err := strconv.Atoi(v); err != nil { + return cfg, fmt.Errorf("invalid minimum nodes: %s", v) + } + cfg.minNodes = v + case autoDiscovererKeyMaxNodes: + if _, err := strconv.Atoi(v); err != nil { + return cfg, fmt.Errorf("invalid maximum nodes: %s", v) + } + cfg.maxNodes = v + default: + return cfg, fmt.Errorf("unsupported key \"%s\" is specified for discoverer \"%s\". Supported keys are \"%s\"", k, discoverer, validAutoDiscovererKeys) + } + } + return cfg, nil +} + +func buildAutoDiscoveringProvider(gceManager GceManager, specs []string, resourceLimiter *cloudprovider.ResourceLimiter) (*GceCloudProvider, error) { + gce := &GceCloudProvider{gceManager: gceManager, resourceLimiterFromFlags: resourceLimiter} + + seen := make(map[string]bool) + for _, spec := range specs { + cfg, err := parseAutoDiscoverySpec(spec) + if err != nil { + return nil, fmt.Errorf("invalid node group auto discovery spec \"%s\": %v", spec, err) + } + links, err := gceManager.findMigsNamed(cfg.migRe) + if err != nil { + return nil, fmt.Errorf("cannot autodiscover managed instance groups: %s", err) + } + for _, link := range links { + // A MIG might match more than one provided spec, but we only ever + // want to add it once. + if seen[link] { + continue + } + seen[link] = true + spec := fmt.Sprintf("%s:%s:%s", cfg.minNodes, cfg.maxNodes, link) + if err := gce.addNodeGroup(spec); err != nil { + return nil, err + } + } + } + return gce, nil +} + // Cleanup cleans up all resources before the cloud provider is removed func (gce *GceCloudProvider) Cleanup() error { gce.gceManager.Cleanup() diff --git a/cluster-autoscaler/cloudprovider/gce/gce_cloud_provider_test.go b/cluster-autoscaler/cloudprovider/gce/gce_cloud_provider_test.go index e756a207ff..26c5ea4401 100644 --- a/cluster-autoscaler/cloudprovider/gce/gce_cloud_provider_test.go +++ b/cluster-autoscaler/cloudprovider/gce/gce_cloud_provider_test.go @@ -17,9 +17,11 @@ limitations under the License. package gce import ( + "errors" "fmt" "net/http" "reflect" + "regexp" "strings" "testing" @@ -122,7 +124,12 @@ func (m *gceManagerMock) GetResourceLimiter() (*cloudprovider.ResourceLimiter, e return args.Get(0).(*cloudprovider.ResourceLimiter), args.Error(1) } -func TestBuildGceCloudProvider(t *testing.T) { +func (m *gceManagerMock) findMigsNamed(name *regexp.Regexp) ([]string, error) { + args := m.Called() + return args.Get(0).([]string), args.Error(1) +} + +func TestBuildStaticGceCloudProvider(t *testing.T) { gceManagerMock := &gceManagerMock{} ng1Name := "https://content.googleapis.com/compute/v1/projects/project1/zones/us-central1-b/instanceGroups/ng1" @@ -132,37 +139,122 @@ func TestBuildGceCloudProvider(t *testing.T) { map[string]int64{cloudprovider.ResourceNameCores: 1, cloudprovider.ResourceNameMemory: 10000000}, map[string]int64{cloudprovider.ResourceNameCores: 10, cloudprovider.ResourceNameMemory: 100000000}) - // GCE mode. + // GCE mode with explicit node groups. gceManagerMock.On("getMode").Return(ModeGCE).Once() gceManagerMock.On("RegisterMig", mock.MatchedBy(func(mig *Mig) bool { return mig.Name == "ng1" || mig.Name == "ng2" })).Return(true).Times(2) - provider, err := BuildGceCloudProvider(gceManagerMock, - []string{"0:10:" + ng1Name, "0:5:https:" + ng2Name}, - resourceLimiter) - assert.NoError(t, err) - assert.NotNil(t, provider) - mock.AssertExpectationsForObjects(t, gceManagerMock) - - // GKE mode. - gceManagerMock.On("getMode").Return(ModeGKE).Once() - - provider, err = BuildGceCloudProvider(gceManagerMock, []string{}, resourceLimiter) + do := cloudprovider.NodeGroupDiscoveryOptions{ + NodeGroupSpecs: []string{"0:10:" + ng1Name, "0:5:https:" + ng2Name}, + } + provider, err := BuildGceCloudProvider(gceManagerMock, do, resourceLimiter) assert.NoError(t, err) assert.NotNil(t, provider) mock.AssertExpectationsForObjects(t, gceManagerMock) // Error on GKE mode with specs. gceManagerMock.On("getMode").Return(ModeGKE).Once() - - provider, err = BuildGceCloudProvider(gceManagerMock, - []string{"0:10:" + ng1Name, "0:5:https:" + ng2Name}, - resourceLimiter) + _, err = BuildGceCloudProvider(gceManagerMock, do, resourceLimiter) assert.Error(t, err) assert.Equal(t, "GKE gets nodegroup specification via API, command line specs are not allowed", err.Error()) mock.AssertExpectationsForObjects(t, gceManagerMock) + + // Ensure GKE mode works with no specs. + gceManagerMock.On("getMode").Return(ModeGKE).Once() + do = cloudprovider.NodeGroupDiscoveryOptions{} + provider, err = BuildGceCloudProvider(gceManagerMock, do, resourceLimiter) + assert.NoError(t, err) + assert.NotNil(t, provider) + mock.AssertExpectationsForObjects(t, gceManagerMock) + + // Error with both explicit and autodiscovery specs. + do = cloudprovider.NodeGroupDiscoveryOptions{ + NodeGroupSpecs: []string{"0:10:" + ng1Name, "0:5:https:" + ng2Name}, + NodeGroupAutoDiscoverySpecs: []string{"mig:prefix=pfx,min=0,max=10"}, + } + _, err = BuildGceCloudProvider(gceManagerMock, do, resourceLimiter) + assert.Error(t, err) + mock.AssertExpectationsForObjects(t, gceManagerMock) +} +func TestBuildAutodiscoveringGceCloudProvider(t *testing.T) { + gceManagerMock := &gceManagerMock{} + + ng1Name := "https://content.googleapis.com/compute/v1/projects/project1/zones/us-central1-b/instanceGroups/ng1" + ng2Name := "https://content.googleapis.com/compute/v1/projects/project1/zones/us-central1-b/instanceGroups/ng2" + + resourceLimiter := cloudprovider.NewResourceLimiter( + map[string]int64{cloudprovider.ResourceNameCores: 1, cloudprovider.ResourceNameMemory: 10000000}, + map[string]int64{cloudprovider.ResourceNameCores: 10, cloudprovider.ResourceNameMemory: 100000000}) + + // GCE mode with autodiscovery. + gceManagerMock.On("getMode").Return(ModeGCE).Once() + gceManagerMock.On("findMigsNamed").Return([]string{ng1Name, ng2Name}, nil).Twice() + gceManagerMock.On("RegisterMig", + mock.MatchedBy(func(mig *Mig) bool { + return mig.Name == "ng1" || mig.Name == "ng2" + })).Return(true).Times(2) + + do := cloudprovider.NodeGroupDiscoveryOptions{ + NodeGroupAutoDiscoverySpecs: []string{ + "mig:prefix=ng,min=0,max=10", + "mig:prefix=n,min=1,max=2", + }, + } + provider, err := BuildGceCloudProvider(gceManagerMock, do, resourceLimiter) + assert.NoError(t, err) + assert.NotNil(t, provider) + mock.AssertExpectationsForObjects(t, gceManagerMock) + + // Error finding instance groups + gceManagerMock.On("getMode").Return(ModeGCE).Once() + gceManagerMock.On("findMigsNamed").Return([]string{}, errors.New("nope")).Once() + _, err = BuildGceCloudProvider(gceManagerMock, do, resourceLimiter) + assert.Error(t, err) + assert.Equal(t, "cannot autodiscover managed instance groups: nope", err.Error()) + mock.AssertExpectationsForObjects(t, gceManagerMock) + + // Error on GKE mode with autodiscovery specs. + gceManagerMock.On("getMode").Return(ModeGKE).Once() + _, err = BuildGceCloudProvider(gceManagerMock, do, resourceLimiter) + assert.Error(t, err) + assert.Equal(t, "GKE gets nodegroup specification via API, command line specs are not allowed", err.Error()) + mock.AssertExpectationsForObjects(t, gceManagerMock) + + // Bad autodiscovery spec + do = cloudprovider.NodeGroupDiscoveryOptions{ + NodeGroupAutoDiscoverySpecs: []string{"mig"}, + } + gceManagerMock.On("getMode").Return(ModeGCE).Once() + _, err = BuildGceCloudProvider(gceManagerMock, do, resourceLimiter) + assert.Error(t, err) + mock.AssertExpectationsForObjects(t, gceManagerMock) +} + +func TestParseAutoDiscoverySpec(t *testing.T) { + want := autoDiscovererConfig{ + migRe: regexp.MustCompile("^pfx.+"), + minNodes: "0", + maxNodes: "10", + } + got, err := parseAutoDiscoverySpec("mig:prefix=pfx,min=0,max=10") + assert.NoError(t, err) + assert.Equal(t, want, got) + + badSpecs := []string{ + "prefix=pfx,min=0,max=10", + "asg:prefix=pfx,min=0,max=10", + "mig:prefix=pfx,min=0,max=10,unknown=hi", + "mig:prefix=pfx,min=a,max=10", + "mig:prefix=pfx,min=10,max=donkey", + "mig:prefix=(a,min=1,max=10", + } + + for _, spec := range badSpecs { + _, err = parseAutoDiscoverySpec(spec) + assert.Error(t, err) + } } func TestNodeGroups(t *testing.T) { diff --git a/cluster-autoscaler/cloudprovider/gce/gce_manager.go b/cluster-autoscaler/cloudprovider/gce/gce_manager.go index 9e9e7d7e87..ea3c88caab 100644 --- a/cluster-autoscaler/cloudprovider/gce/gce_manager.go +++ b/cluster-autoscaler/cloudprovider/gce/gce_manager.go @@ -17,11 +17,15 @@ limitations under the License. package gce import ( + "context" + "errors" "flag" "fmt" "io" "os" + "path" "reflect" + "regexp" "strings" "sync" "time" @@ -125,6 +129,7 @@ type GceManager interface { getProjectId() string getMode() GcpCloudProviderMode getTemplates() *templateBuilder + findMigsNamed(name *regexp.Regexp) ([]string, error) } // gceManagerImpl handles gce communication and data caching. @@ -730,7 +735,7 @@ func (m *gceManagerImpl) DeleteInstances(instances []*GceRef) error { return err } if mig != commonMig { - return fmt.Errorf("Connot delete instances which don't belong to the same MIG.") + return errors.New("Cannot delete instances which don't belong to the same MIG.") } } @@ -946,3 +951,57 @@ func getProjectAndLocation(isRegional bool) (string, string, error) { } return projectID, location, nil } + +func (m *gceManagerImpl) findMigsNamed(name *regexp.Regexp) ([]string, error) { + if m.isRegional { + return m.findMigsInRegion(m.location, name) + } + return m.findMigsInZone(m.location, name) +} + +func (m *gceManagerImpl) getZones(region string) ([]string, error) { + r, err := m.gceService.Regions.Get(m.getProjectId(), region).Do() + if err != nil { + return nil, fmt.Errorf("cannot get zones for GCE region %s: %v", region, err) + } + zones := make([]string, len(r.Zones)) + for i, link := range r.Zones { + zones[i] = path.Base(link) + } + return zones, nil +} + +func (m *gceManagerImpl) findMigsInRegion(region string, name *regexp.Regexp) ([]string, error) { + links := make([]string, 0) + zones, err := m.getZones(region) + if err != nil { + return nil, err + } + for _, z := range zones { + zl, err := m.findMigsInZone(z, name) + if err != nil { + return nil, err + } + for _, link := range zl { + links = append(links, link) + } + } + + return links, nil +} + +func (m *gceManagerImpl) findMigsInZone(zone string, name *regexp.Regexp) ([]string, error) { + filter := fmt.Sprintf("name eq %s", name) + links := make([]string, 0) + req := m.gceService.InstanceGroups.List(m.getProjectId(), zone).Filter(filter) + if err := req.Pages(context.TODO(), func(page *gce.InstanceGroupList) error { + for _, ig := range page.Items { + links = append(links, ig.SelfLink) + glog.V(3).Infof("autodiscovered managed instance group %s using regexp %s", ig.Name, name) + } + return nil + }); err != nil { + return nil, fmt.Errorf("cannot list managed instance groups: %v", err) + } + return links, nil +} diff --git a/cluster-autoscaler/cloudprovider/gce/gce_manager_test.go b/cluster-autoscaler/cloudprovider/gce/gce_manager_test.go index 97153cfb97..9bbe76c774 100644 --- a/cluster-autoscaler/cloudprovider/gce/gce_manager_test.go +++ b/cluster-autoscaler/cloudprovider/gce/gce_manager_test.go @@ -19,6 +19,7 @@ package gce import ( "fmt" "net/http" + "regexp" "testing" . "k8s.io/autoscaler/cluster-autoscaler/utils/test" @@ -215,7 +216,7 @@ const instanceGroupManager = `{ "kind": "compute#instanceGroupManager", "id": "3213213219", "creationTimestamp": "2017-09-15T04:47:24.687-07:00", - "name": "gke-cluster-1-default-pool", + "name": "%s", "zone": "https://www.googleapis.com/compute/v1/projects/project1/zones/%s", "instanceTemplate": "https://www.googleapis.com/compute/v1/projects/project1/global/instanceTemplates/gke-cluster-1-default-pool", "instanceGroup": "https://www.googleapis.com/compute/v1/projects/project1/zones/%s/instanceGroups/gke-cluster-1-default-pool", @@ -232,9 +233,10 @@ const instanceGroupManager = `{ "refreshing": 0 }, "targetSize": 3, - "selfLink": "https://www.googleapis.com/compute/v1/projects/project1/zones/%s/instanceGroupManagers/gke-cluster-1-default-pool" + "selfLink": "https://www.googleapis.com/compute/v1/projects/project1/zones/%s/instanceGroupManagers/%s" } ` + const instanceTemplate = ` { "kind": "compute#instanceTemplate", @@ -492,7 +494,11 @@ const getClusterResponse = `{ }` func getInstanceGroupManager(zone string) string { - return fmt.Sprintf(instanceGroupManager, zone, zone, zone) + return getInstanceGroupManagerNamed("gke-cluster-1-default-pool", zone) +} + +func getInstanceGroupManagerNamed(name, zone string) string { + return fmt.Sprintf(instanceGroupManager, name, zone, zone, zone, name) } func getMachineType(zone string) string { @@ -936,7 +942,7 @@ func TestDeleteInstances(t *testing.T) { err = g.DeleteInstances(instances) assert.Error(t, err) - assert.Equal(t, "Connot delete instances which don't belong to the same MIG.", err.Error()) + assert.Equal(t, "Cannot delete instances which don't belong to the same MIG.", err.Error()) mock.AssertExpectationsForObjects(t, server) } @@ -1116,3 +1122,98 @@ func TestFetchResourceLimiter(t *testing.T) { mock.AssertExpectationsForObjects(t, server) } + +const instanceGroup = `{ + "kind": "compute#instanceGroup", + "id": "1121230570947910218", + "name": "%s", + "selfLink": "https://www.googleapis.com/compute/v1/projects/project1/zones/%s/instanceGroups/%s", + "size": 1 +}` + +func getInstanceGroup(zone string) string { + return getInstanceGroupNamed("gke-cluster-1-default-pool", zone) +} + +func getInstanceGroupNamed(name, zone string) string { + return fmt.Sprintf(instanceGroup, name, zone, name) +} + +const instanceGroupList = `{ + "kind": "compute#instanceGroupList", + "id": "projects/project1a/zones/%s/instanceGroups", + "items": [%s, %s], + "selfLink": "https://www.googleapis.com/compute/v1/projects/project1/zones/%s/instanceGroups" +}` + +func listInstanceGroups(zone string) string { + return fmt.Sprintf(instanceGroupList, + zone, + getInstanceGroupNamed("gce-pool-a", zone), + getInstanceGroupNamed("gce-pool-b", zone), + zone, + ) +} + +func TestFindMigsNamedZonal(t *testing.T) { + server := NewHttpServerMock() + defer server.Close() + + server.On("handle", "/project1/zones/us-central1-b/instanceGroups").Return(listInstanceGroups("us-central1-b")).Once() + + regional := false + g := newTestGceManager(t, server.URL, ModeGCE, regional) + links, err := g.findMigsNamed(regexp.MustCompile("^UNUSED")) + assert.NoError(t, err) + + assert.Equal(t, 2, len(links)) + assert.Equal(t, "https://www.googleapis.com/compute/v1/projects/project1/zones/us-central1-b/instanceGroups/gce-pool-a", links[0]) + assert.Equal(t, "https://www.googleapis.com/compute/v1/projects/project1/zones/us-central1-b/instanceGroups/gce-pool-b", links[1]) + mock.AssertExpectationsForObjects(t, server) +} + +const getRegion = `{ + "kind": "compute#region", + "id": "1000", + "creationTimestamp": "1969-12-31T16:00:00.000-08:00", + "name": "us-central1", + "description": "us-central1", + "status": "UP", + "zones": [ + "https://www.googleapis.com/compute/v1/projects/project1/zones/us-central1-a", + "https://www.googleapis.com/compute/v1/projects/project1/zones/us-central1-b", + "https://www.googleapis.com/compute/v1/projects/project1/zones/us-central1-c", + "https://www.googleapis.com/compute/v1/projects/project1/zones/us-central1-f" + ], + "quotas": [], + "selfLink": "https://www.googleapis.com/compute/v1/projects/project1/regions/us-central1" +}` + +func TestFindMigsNamedRegional(t *testing.T) { + server := NewHttpServerMock() + defer server.Close() + + server.On("handle", "/project1/regions/us-central1").Return(getRegion).Once() + server.On("handle", "/project1/zones/us-central1-a/instanceGroups").Return(listInstanceGroups("us-central1-a")).Once() + server.On("handle", "/project1/zones/us-central1-b/instanceGroups").Return(listInstanceGroups("us-central1-b")).Once() + server.On("handle", "/project1/zones/us-central1-c/instanceGroups").Return(listInstanceGroups("us-central1-c")).Once() + server.On("handle", "/project1/zones/us-central1-f/instanceGroups").Return(listInstanceGroups("us-central1-f")).Once() + + regional := true + g := newTestGceManager(t, server.URL, ModeGCE, regional) + got, err := g.findMigsNamed(regexp.MustCompile("^UNUSED")) + assert.NoError(t, err) + + want := []string{ + "https://www.googleapis.com/compute/v1/projects/project1/zones/us-central1-a/instanceGroups/gce-pool-a", + "https://www.googleapis.com/compute/v1/projects/project1/zones/us-central1-a/instanceGroups/gce-pool-b", + "https://www.googleapis.com/compute/v1/projects/project1/zones/us-central1-b/instanceGroups/gce-pool-a", + "https://www.googleapis.com/compute/v1/projects/project1/zones/us-central1-b/instanceGroups/gce-pool-b", + "https://www.googleapis.com/compute/v1/projects/project1/zones/us-central1-c/instanceGroups/gce-pool-a", + "https://www.googleapis.com/compute/v1/projects/project1/zones/us-central1-c/instanceGroups/gce-pool-b", + "https://www.googleapis.com/compute/v1/projects/project1/zones/us-central1-f/instanceGroups/gce-pool-a", + "https://www.googleapis.com/compute/v1/projects/project1/zones/us-central1-f/instanceGroups/gce-pool-b", + } + assert.Equal(t, want, got) + mock.AssertExpectationsForObjects(t, server) +} diff --git a/cluster-autoscaler/cloudprovider/node_group_discovery_options.go b/cluster-autoscaler/cloudprovider/node_group_discovery_options.go index ab98324323..56db3017d7 100644 --- a/cluster-autoscaler/cloudprovider/node_group_discovery_options.go +++ b/cluster-autoscaler/cloudprovider/node_group_discovery_options.go @@ -23,23 +23,29 @@ type NodeGroupDiscoveryOptions struct { // NodeGroupSpecs is specified to statically discover node groups listed in it NodeGroupSpecs []string // NodeGroupAutoDiscoverySpec is specified for automatically discovering node groups according to the specs - NodeGroupAutoDiscoverySpec string + NodeGroupAutoDiscoverySpecs []string } -// StaticDiscoverySpecified returns true only when there are 1 or more --nodes flags are specified +// StaticDiscoverySpecified returns true only when there are 1 or more --nodes flags specified func (o NodeGroupDiscoveryOptions) StaticDiscoverySpecified() bool { return len(o.NodeGroupSpecs) > 0 } -// AutoDiscoverySpecified returns true only when there is --node-group-auto-discovery specified +// AutoDiscoverySpecified returns true only when there are 1 or more --node-group-auto-discovery flags specified func (o NodeGroupDiscoveryOptions) AutoDiscoverySpecified() bool { - return o.NodeGroupAutoDiscoverySpec != "" + return len(o.NodeGroupAutoDiscoverySpecs) > 0 +} + +// NoDiscoverySpecified returns true expected nly when there were no --nodes or +// --node-group-auto-discovery flags specified. This is expected in GKE. +func (o NodeGroupDiscoveryOptions) NoDiscoverySpecified() bool { + return !o.StaticDiscoverySpecified() && !o.AutoDiscoverySpecified() } // Validate returns and error when both --nodes and --node-group-auto-discovery are specified func (o NodeGroupDiscoveryOptions) Validate() error { if o.StaticDiscoverySpecified() && o.AutoDiscoverySpecified() { - return fmt.Errorf("Either node group specs(%v) or node group auto discovery spec(%v) can be specified but not both", o.NodeGroupSpecs, o.NodeGroupAutoDiscoverySpec) + return fmt.Errorf("Either node group specs(%v) or node group auto discovery spec(%v) can be specified but not both", o.NodeGroupSpecs, o.NodeGroupAutoDiscoverySpecs) } return nil } diff --git a/cluster-autoscaler/cloudprovider/node_group_discovery_options_test.go b/cluster-autoscaler/cloudprovider/node_group_discovery_options_test.go index bbe1631783..c8af369f93 100644 --- a/cluster-autoscaler/cloudprovider/node_group_discovery_options_test.go +++ b/cluster-autoscaler/cloudprovider/node_group_discovery_options_test.go @@ -25,8 +25,8 @@ import ( func TestNodeGroupDiscoveryOptionsValidate(t *testing.T) { o := NodeGroupDiscoveryOptions{ - NodeGroupAutoDiscoverySpec: "asg:tag=foobar", - NodeGroupSpecs: []string{"myasg:0:10"}, + NodeGroupAutoDiscoverySpecs: []string{"asg:tag=foobar"}, + NodeGroupSpecs: []string{"myasg:0:10"}, } err := o.Validate() @@ -34,12 +34,15 @@ func TestNodeGroupDiscoveryOptionsValidate(t *testing.T) { t.Errorf("Expected validation error didn't occur with NodeGroupDiscoveryOptions: %+v", o) t.FailNow() } - if msg := fmt.Sprintf("%v", err); msg != `Either node group specs([myasg:0:10]) or node group auto discovery spec(asg:tag=foobar) can be specified but not both` { + if msg := fmt.Sprintf("%v", err); msg != `Either node group specs([myasg:0:10]) or node group auto discovery spec([asg:tag=foobar]) can be specified but not both` { t.Errorf("Unexpected validation error message: %s", msg) } o = NodeGroupDiscoveryOptions{ - NodeGroupAutoDiscoverySpec: "asg:tag=foobar", + NodeGroupAutoDiscoverySpecs: []string{ + "mig:prefix=iga,min=0,max=10", + "mig:prefix=igb,min=0,max=20", + }, } assert.NoError(t, o.Validate()) diff --git a/cluster-autoscaler/core/autoscaler.go b/cluster-autoscaler/core/autoscaler.go index f23d1da331..5e241fa163 100644 --- a/cluster-autoscaler/core/autoscaler.go +++ b/cluster-autoscaler/core/autoscaler.go @@ -54,13 +54,13 @@ func NewAutoscaler(opts AutoscalerOptions, predicateChecker *simulator.Predicate autoscalerBuilder := NewAutoscalerBuilder(opts.AutoscalingOptions, predicateChecker, kubeClient, kubeEventRecorder, listerRegistry) if opts.ConfigMapName != "" { - if opts.NodeGroupAutoDiscovery != "" { + if len(opts.NodeGroupAutoDiscovery) > 0 { glog.Warning("Both --configmap and --node-group-auto-discovery were specified but only the former is going to take effect") } configFetcher := dynamic.NewConfigFetcher(opts.ConfigFetcherOptions, kubeClient, kubeEventRecorder) return NewDynamicAutoscaler(autoscalerBuilder, configFetcher) } - if opts.NodeGroupAutoDiscovery != "" { + if len(opts.NodeGroupAutoDiscovery) > 0 { return NewPollingAutoscaler(autoscalerBuilder) } return autoscalerBuilder.Build() diff --git a/cluster-autoscaler/core/autoscaling_context.go b/cluster-autoscaler/core/autoscaling_context.go index fed2339b6a..0c6dd011da 100644 --- a/cluster-autoscaler/core/autoscaling_context.go +++ b/cluster-autoscaler/core/autoscaling_context.go @@ -76,7 +76,7 @@ type AutoscalingOptions struct { // MinMemoryTotal sets the maximum memory (in megabytes) in the whole cluster MinMemoryTotal int64 // NodeGroupAutoDiscovery represents one or more definition(s) of node group auto-discovery - NodeGroupAutoDiscovery string + NodeGroupAutoDiscovery []string // EstimatorName is the estimator used to estimate the number of needed nodes in scale up. EstimatorName string // ExpanderName sets the type of node group expander to be used in scale up @@ -141,8 +141,8 @@ func NewAutoscalingContext(options AutoscalingOptions, predicateChecker *simulat cloudProviderBuilder := builder.NewCloudProviderBuilder(options.CloudProviderName, options.CloudConfig, options.ClusterName, options.NodeAutoprovisioningEnabled) cloudProvider := cloudProviderBuilder.Build(cloudprovider.NodeGroupDiscoveryOptions{ - NodeGroupSpecs: options.NodeGroups, - NodeGroupAutoDiscoverySpec: options.NodeGroupAutoDiscovery}, + NodeGroupSpecs: options.NodeGroups, + NodeGroupAutoDiscoverySpecs: options.NodeGroupAutoDiscovery}, cloudprovider.NewResourceLimiter( map[string]int64{cloudprovider.ResourceNameCores: int64(options.MinCoresTotal), cloudprovider.ResourceNameMemory: options.MinMemoryTotal}, map[string]int64{cloudprovider.ResourceNameCores: options.MaxCoresTotal, cloudprovider.ResourceNameMemory: options.MaxMemoryTotal})) diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go index 57f00c2974..1bfdef30db 100644 --- a/cluster-autoscaler/main.go +++ b/cluster-autoscaler/main.go @@ -65,7 +65,9 @@ func (flag *MultiStringFlag) Set(value string) error { } var ( - nodeGroupsFlag MultiStringFlag + nodeGroupsFlag MultiStringFlag + nodeGroupAutoDiscoveryFlag MultiStringFlag + clusterName = flag.String("cluster-name", "", "Autoscaled cluster name, if available") address = flag.String("address", ":8085", "The address to expose prometheus metrics.") kubernetes = flag.String("kubernetes", "", "Kubernetes master location. Leave blank for default") @@ -73,7 +75,6 @@ var ( cloudConfig = flag.String("cloud-config", "", "The path to the cloud provider configuration file. Empty string for no configuration file.") configMapName = flag.String("configmap", "", "The name of the ConfigMap containing settings used for dynamic reconfiguration. Empty string for no ConfigMap.") namespace = flag.String("namespace", "kube-system", "Namespace in which cluster-autoscaler run. If a --configmap flag is also provided, ensure that the configmap exists in this namespace before CA runs.") - nodeGroupAutoDiscovery = flag.String("node-group-auto-discovery", "", "One or more definition(s) of node group auto-discovery. A definition is expressed `:[[=]]`. Only the `aws` cloud provider is currently supported. The only valid discoverer for it is `asg` and the valid key is `tag`. For example, specifying `--cloud-provider aws` and `--node-group-auto-discovery asg:tag=cluster-autoscaler/auto-discovery/enabled,kubernetes.io/cluster/` results in ASGs tagged with `cluster-autoscaler/auto-discovery/enabled` and `kubernetes.io/cluster/` to be considered as target node groups") scaleDownEnabled = flag.Bool("scale-down-enabled", true, "Should CA scale down the cluster") scaleDownDelayAfterAdd = flag.Duration("scale-down-delay-after-add", 10*time.Minute, "How long after scale up that scale down evaluation resumes") @@ -146,7 +147,7 @@ func createAutoscalerOptions() core.AutoscalerOptions { autoscalingOpts := core.AutoscalingOptions{ CloudConfig: *cloudConfig, CloudProviderName: *cloudProviderFlag, - NodeGroupAutoDiscovery: *nodeGroupAutoDiscovery, + NodeGroupAutoDiscovery: nodeGroupAutoDiscoveryFlag, MaxTotalUnreadyPercentage: *maxTotalUnreadyPercentage, OkTotalUnreadyCount: *okTotalUnreadyCount, EstimatorName: *estimatorFlag, @@ -280,6 +281,11 @@ func main() { bindFlags(&leaderElection, pflag.CommandLine) flag.Var(&nodeGroupsFlag, "nodes", "sets min,max size and other configuration data for a node group in a format accepted by cloud provider."+ "Can be used multiple times. Format: ::") + flag.Var(&nodeGroupAutoDiscoveryFlag, "node-group-auto-discovery", "One or more definition(s) of node group auto-discovery. "+ + "A definition is expressed `:[[=]]`. "+ + "The `aws` and `gce` cloud providers are currently supported. AWS matches by ASG tags, e.g. `asg:tag=tagKey,anotherTagKey`. "+ + "GCE matches by IG prefix, and requires you to specify min and max nodes per IG, e.g. `mig:prefix=pfx,min=0,max=10` "+ + "Can be used multiple times.") kube_flag.InitFlags() healthCheck := metrics.NewHealthCheck(*maxInactivityTimeFlag, *maxFailingTimeFlag)