diff --git a/cluster-autoscaler/FAQ.md b/cluster-autoscaler/FAQ.md index 0bb03d1bfd..5891dc92cc 100644 --- a/cluster-autoscaler/FAQ.md +++ b/cluster-autoscaler/FAQ.md @@ -704,6 +704,7 @@ The following startup parameters are supported for cluster autoscaler: | `max-node-provision-time` | Maximum time CA waits for node to be provisioned | 15 minutes | `nodes` | sets min,max size and other configuration data for a node group in a format accepted by cloud provider. Can be used multiple times. Format: `<min>:<max>:<other...>` | "" | `node-group-auto-discovery` | One or more definition(s) of node group auto-discovery.
A definition is expressed `<name of discoverer>:[<key>[=<value>]]`
The `aws`, `gce`, and `azure` cloud providers are currently supported. AWS matches by ASG tags, e.g. `asg:tag=tagKey,anotherTagKey`
GCE matches by IG name prefix, and requires you to specify min and max nodes per IG, e.g. `mig:namePrefix=pfx,min=0,max=10`
Azure matches by tags on VMSS, e.g. `label:foo=bar`, and will auto-detect `min` and `max` tags on the VMSS to set scaling limits.
Can be used multiple times | "" +| `emit-per-nodegroup-metrics` | If true, emit per node group metrics. | false | `estimator` | Type of resource estimator to be used in scale up | binpacking | `expander` | Type of node group expander to be used in scale up. | random | `write-status-configmap` | Should CA write status information to a configmap | true diff --git a/cluster-autoscaler/core/scale_up_test.go b/cluster-autoscaler/core/scale_up_test.go index 6ff0902bbe..6da8cb6d50 100644 --- a/cluster-autoscaler/core/scale_up_test.go +++ b/cluster-autoscaler/core/scale_up_test.go @@ -978,7 +978,7 @@ func TestCheckScaleUpDeltaWithinLimits(t *testing.T) { } func TestAuthError(t *testing.T) { - metrics.RegisterAll() + metrics.RegisterAll(false) context, err := NewScaleTestAutoscalingContext(config.AutoscalingOptions{}, &fake.Clientset{}, nil, nil, nil) assert.NoError(t, err) diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index a0f54f960d..30f8202c8d 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -268,6 +268,12 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError return errors.ToAutoscalerError(errors.CloudProviderError, err) } + // Update node groups min/max after cloud provider refresh + for _, nodeGroup := range a.AutoscalingContext.CloudProvider.NodeGroups() { + metrics.UpdateNodeGroupMin(nodeGroup.Id(), nodeGroup.MinSize()) + metrics.UpdateNodeGroupMax(nodeGroup.Id(), nodeGroup.MaxSize()) + } + nonExpendableScheduledPods := core_utils.FilterOutExpendablePods(originalScheduledPods, a.ExpendablePodsPriorityCutoff) // Initialize cluster state to ClusterSnapshot if typedErr := a.initializeClusterSnapshot(allNodes, nonExpendableScheduledPods); typedErr != nil { diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go index 37459a175b..18dc182e52 100644 --- a/cluster-autoscaler/main.go +++ 
b/cluster-autoscaler/main.go @@ -180,6 +180,8 @@ var ( daemonSetEvictionForEmptyNodes = flag.Bool("daemonset-eviction-for-empty-nodes", false, "DaemonSet pods will be gracefully terminated from empty nodes") daemonSetEvictionForOccupiedNodes = flag.Bool("daemonset-eviction-for-occupied-nodes", true, "DaemonSet pods will be gracefully terminated from non-empty nodes") userAgent = flag.String("user-agent", "cluster-autoscaler", "User agent used for HTTP calls.") + + emitPerNodeGroupMetrics = flag.Bool("emit-per-nodegroup-metrics", false, "If true, emit per node group metrics.") ) func createAutoscalingOptions() config.AutoscalingOptions { @@ -342,7 +344,7 @@ func buildAutoscaler() (core.Autoscaler, error) { } func run(healthCheck *metrics.HealthCheck) { - metrics.RegisterAll() + metrics.RegisterAll(*emitPerNodeGroupMetrics) autoscaler, err := buildAutoscaler() if err != nil { diff --git a/cluster-autoscaler/metrics/metrics.go b/cluster-autoscaler/metrics/metrics.go index 41b7fb7a9d..7c8b643ac7 100644 --- a/cluster-autoscaler/metrics/metrics.go +++ b/cluster-autoscaler/metrics/metrics.go @@ -171,6 +171,22 @@ var ( }, []string{"direction"}, ) + nodesGroupMinNodes = k8smetrics.NewGaugeVec( + &k8smetrics.GaugeOpts{ + Namespace: caNamespace, + Name: "node_group_min_count", + Help: "Minimum number of nodes in the node group", + }, []string{"node_group"}, + ) + + nodesGroupMaxNodes = k8smetrics.NewGaugeVec( + &k8smetrics.GaugeOpts{ + Namespace: caNamespace, + Name: "node_group_max_count", + Help: "Maximum number of nodes in the node group", + }, []string{"node_group"}, + ) + /**** Metrics related to autoscaler execution ****/ lastActivity = k8smetrics.NewGaugeVec( &k8smetrics.GaugeOpts{ @@ -315,7 +331,7 @@ var ( ) // RegisterAll registers all metrics. 
-func RegisterAll() { +func RegisterAll(emitPerNodeGroupMetrics bool) { legacyregistry.MustRegister(clusterSafeToAutoscale) legacyregistry.MustRegister(nodesCount) legacyregistry.MustRegister(nodeGroupsCount) @@ -342,6 +358,11 @@ func RegisterAll() { legacyregistry.MustRegister(napEnabled) legacyregistry.MustRegister(nodeGroupCreationCount) legacyregistry.MustRegister(nodeGroupDeletionCount) + + if emitPerNodeGroupMetrics { + legacyregistry.MustRegister(nodesGroupMinNodes) + legacyregistry.MustRegister(nodesGroupMaxNodes) + } } // UpdateDurationFromStart records the duration of the step identified by the @@ -423,6 +444,16 @@ func UpdateMemoryLimitsBytes(minMemoryCount int64, maxMemoryCount int64) { memoryLimitsBytes.WithLabelValues("maximum").Set(float64(maxMemoryCount)) } +// UpdateNodeGroupMin records the node group minimum allowed number of nodes +func UpdateNodeGroupMin(nodeGroup string, minNodes int) { + nodesGroupMinNodes.WithLabelValues(nodeGroup).Set(float64(minNodes)) +} + +// UpdateNodeGroupMax records the node group maximum allowed number of nodes +func UpdateNodeGroupMax(nodeGroup string, maxNodes int) { + nodesGroupMaxNodes.WithLabelValues(nodeGroup).Set(float64(maxNodes)) +} + // RegisterError records any errors preventing Cluster Autoscaler from working. // No more than one error should be recorded per loop. func RegisterError(err errors.AutoscalerError) { diff --git a/cluster-autoscaler/metrics/metrics_test.go b/cluster-autoscaler/metrics/metrics_test.go new file mode 100644 index 0000000000..71789d2c7a --- /dev/null +++ b/cluster-autoscaler/metrics/metrics_test.go @@ -0,0 +1,42 @@ +/* +Copyright 2021 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +import ( + "testing" + + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/assert" +) + +func TestDisabledPerNodeGroupMetrics(t *testing.T) { + RegisterAll(false) + assert.False(t, nodesGroupMinNodes.IsCreated()) + assert.False(t, nodesGroupMaxNodes.IsCreated()) +} + +func TestEnabledPerNodeGroupMetrics(t *testing.T) { + RegisterAll(true) + assert.True(t, nodesGroupMinNodes.IsCreated()) + assert.True(t, nodesGroupMaxNodes.IsCreated()) + + UpdateNodeGroupMin("foo", 2) + UpdateNodeGroupMax("foo", 100) + + assert.Equal(t, 2, int(testutil.ToFloat64(nodesGroupMinNodes.GaugeVec.WithLabelValues("foo")))) + assert.Equal(t, 100, int(testutil.ToFloat64(nodesGroupMaxNodes.GaugeVec.WithLabelValues("foo")))) +}