Merge pull request #4022 from amrmahdi/amrh/nodegroupminmaxmetrics

[cluster-autoscaler] Publish node group min/max metrics
This commit is contained in:
Kubernetes Prow Robot 2021-07-05 07:38:54 -07:00 committed by GitHub
commit 9f84d391f6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 85 additions and 3 deletions

View File

@ -704,6 +704,7 @@ The following startup parameters are supported for cluster autoscaler:
| `max-node-provision-time` | Maximum time CA waits for node to be provisioned | 15 minutes
| `nodes` | Sets min, max size and other configuration data for a node group in a format accepted by the cloud provider. Can be used multiple times. Format: <min>:<max>:<other...> | ""
| `node-group-auto-discovery` | One or more definition(s) of node group auto-discovery.<br>A definition is expressed `<name of discoverer>:[<key>[=<value>]]`<br>The `aws`, `gce`, and `azure` cloud providers are currently supported. AWS matches by ASG tags, e.g. `asg:tag=tagKey,anotherTagKey`<br>GCE matches by IG name prefix, and requires you to specify min and max nodes per IG, e.g. `mig:namePrefix=pfx,min=0,max=10`<br> Azure matches by tags on VMSS, e.g. `label:foo=bar`, and will auto-detect `min` and `max` tags on the VMSS to set scaling limits.<br>Can be used multiple times | ""
| `emit-per-nodegroup-metrics` | If true, emit per node group metrics. | false
| `estimator` | Type of resource estimator to be used in scale up | binpacking
| `expander` | Type of node group expander to be used in scale up. | random
| `write-status-configmap` | Should CA write status information to a configmap | true

View File

@ -978,7 +978,7 @@ func TestCheckScaleUpDeltaWithinLimits(t *testing.T) {
}
func TestAuthError(t *testing.T) {
metrics.RegisterAll()
metrics.RegisterAll(false)
context, err := NewScaleTestAutoscalingContext(config.AutoscalingOptions{}, &fake.Clientset{}, nil, nil, nil)
assert.NoError(t, err)

View File

@ -268,6 +268,12 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
return errors.ToAutoscalerError(errors.CloudProviderError, err)
}
// Update node groups min/max after cloud provider refresh
for _, nodeGroup := range a.AutoscalingContext.CloudProvider.NodeGroups() {
metrics.UpdateNodeGroupMin(nodeGroup.Id(), nodeGroup.MinSize())
metrics.UpdateNodeGroupMax(nodeGroup.Id(), nodeGroup.MaxSize())
}
nonExpendableScheduledPods := core_utils.FilterOutExpendablePods(originalScheduledPods, a.ExpendablePodsPriorityCutoff)
// Initialize cluster state to ClusterSnapshot
if typedErr := a.initializeClusterSnapshot(allNodes, nonExpendableScheduledPods); typedErr != nil {

View File

@ -180,6 +180,8 @@ var (
daemonSetEvictionForEmptyNodes = flag.Bool("daemonset-eviction-for-empty-nodes", false, "DaemonSet pods will be gracefully terminated from empty nodes")
daemonSetEvictionForOccupiedNodes = flag.Bool("daemonset-eviction-for-occupied-nodes", true, "DaemonSet pods will be gracefully terminated from non-empty nodes")
userAgent = flag.String("user-agent", "cluster-autoscaler", "User agent used for HTTP calls.")
emitPerNodeGroupMetrics = flag.Bool("emit-per-nodegroup-metrics", false, "If true, emit per node group metrics.")
)
func createAutoscalingOptions() config.AutoscalingOptions {
@ -342,7 +344,7 @@ func buildAutoscaler() (core.Autoscaler, error) {
}
func run(healthCheck *metrics.HealthCheck) {
metrics.RegisterAll()
metrics.RegisterAll(*emitPerNodeGroupMetrics)
autoscaler, err := buildAutoscaler()
if err != nil {

View File

@ -171,6 +171,22 @@ var (
}, []string{"direction"},
)
// nodesGroupMinNodes is a gauge vector exposing the minimum number of nodes
// of a node group. It carries a single "node_group" label, so one series is
// kept per node group.
nodesGroupMinNodes = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "node_group_min_count",
Help: "Minimum number of nodes in the node group",
}, []string{"node_group"},
)
// nodesGroupMaxNodes is a gauge vector exposing the maximum number of nodes
// of a node group. It carries a single "node_group" label, so one series is
// kept per node group.
nodesGroupMaxNodes = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "node_group_max_count",
Help: "Maximum number of nodes in the node group",
}, []string{"node_group"},
)
/**** Metrics related to autoscaler execution ****/
lastActivity = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
@ -315,7 +331,7 @@ var (
)
// RegisterAll registers all metrics.
func RegisterAll() {
func RegisterAll(emitPerNodeGroupMetrics bool) {
legacyregistry.MustRegister(clusterSafeToAutoscale)
legacyregistry.MustRegister(nodesCount)
legacyregistry.MustRegister(nodeGroupsCount)
@ -342,6 +358,11 @@ func RegisterAll() {
legacyregistry.MustRegister(napEnabled)
legacyregistry.MustRegister(nodeGroupCreationCount)
legacyregistry.MustRegister(nodeGroupDeletionCount)
if emitPerNodeGroupMetrics {
legacyregistry.MustRegister(nodesGroupMinNodes)
legacyregistry.MustRegister(nodesGroupMaxNodes)
}
}
// UpdateDurationFromStart records the duration of the step identified by the
@ -423,6 +444,16 @@ func UpdateMemoryLimitsBytes(minMemoryCount int64, maxMemoryCount int64) {
memoryLimitsBytes.WithLabelValues("maximum").Set(float64(maxMemoryCount))
}
// UpdateNodeGroupMin sets the minimum-node-count gauge of the given node
// group to minNodes. The node group name is used as the gauge's label value.
func UpdateNodeGroupMin(nodeGroup string, minNodes int) {
	gauge := nodesGroupMinNodes.WithLabelValues(nodeGroup)
	gauge.Set(float64(minNodes))
}
// UpdateNodeGroupMax sets the maximum-node-count gauge of the given node
// group to maxNodes. The node group name is used as the gauge's label value.
func UpdateNodeGroupMax(nodeGroup string, maxNodes int) {
	gauge := nodesGroupMaxNodes.WithLabelValues(nodeGroup)
	gauge.Set(float64(maxNodes))
}
// RegisterError records any errors preventing Cluster Autoscaler from working.
// No more than one error should be recorded per loop.
func RegisterError(err errors.AutoscalerError) {

View File

@ -0,0 +1,42 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
import (
"testing"
"github.com/prometheus/client_golang/prometheus/testutil"
"github.com/stretchr/testify/assert"
)
// TestDisabledPerNodeGroupMetrics calls RegisterAll with per node group
// metrics turned off and expects both node group gauges to report that they
// have not been created.
func TestDisabledPerNodeGroupMetrics(t *testing.T) {
	RegisterAll(false)

	minCreated := nodesGroupMinNodes.IsCreated()
	assert.False(t, minCreated)

	maxCreated := nodesGroupMaxNodes.IsCreated()
	assert.False(t, maxCreated)
}
// TestEnabledPerNodeGroupMetrics calls RegisterAll with per node group
// metrics turned on, expects both node group gauges to report that they have
// been created, and then checks that values written through the update
// helpers can be read back from the gauges.
func TestEnabledPerNodeGroupMetrics(t *testing.T) {
	RegisterAll(true)

	minCreated := nodesGroupMinNodes.IsCreated()
	assert.True(t, minCreated)
	maxCreated := nodesGroupMaxNodes.IsCreated()
	assert.True(t, maxCreated)

	// Write one sample per gauge for the same node group label.
	const group = "foo"
	UpdateNodeGroupMin(group, 2)
	UpdateNodeGroupMax(group, 100)

	gotMin := testutil.ToFloat64(nodesGroupMinNodes.GaugeVec.WithLabelValues(group))
	assert.Equal(t, 2, int(gotMin))

	gotMax := testutil.ToFloat64(nodesGroupMaxNodes.GaugeVec.WithLabelValues(group))
	assert.Equal(t, 100, int(gotMax))
}