Merge pull request #4022 from amrmahdi/amrh/nodegroupminmaxmetrics
[cluster-autoscaler] Publish node group min/max metrics
This commit is contained in:
commit
9f84d391f6
|
|
@ -704,6 +704,7 @@ The following startup parameters are supported for cluster autoscaler:
|
|||
| `max-node-provision-time` | Maximum time CA waits for node to be provisioned | 15 minutes
|
||||
| `nodes` | Sets min, max size and other configuration data for a node group in a format accepted by the cloud provider. Can be used multiple times. Format: `<min>:<max>:<other...>` | ""
|
||||
| `node-group-auto-discovery` | One or more definition(s) of node group auto-discovery.<br>A definition is expressed `<name of discoverer>:[<key>[=<value>]]`<br>The `aws`, `gce`, and `azure` cloud providers are currently supported. AWS matches by ASG tags, e.g. `asg:tag=tagKey,anotherTagKey`<br>GCE matches by IG name prefix, and requires you to specify min and max nodes per IG, e.g. `mig:namePrefix=pfx,min=0,max=10`<br> Azure matches by tags on VMSS, e.g. `label:foo=bar`, and will auto-detect `min` and `max` tags on the VMSS to set scaling limits.<br>Can be used multiple times | ""
|
||||
| `emit-per-nodegroup-metrics` | If true, emit per node group metrics. | false
|
||||
| `estimator` | Type of resource estimator to be used in scale up | binpacking
|
||||
| `expander` | Type of node group expander to be used in scale up. | random
|
||||
| `write-status-configmap` | Should CA write status information to a configmap | true
|
||||
|
|
|
|||
|
|
@ -978,7 +978,7 @@ func TestCheckScaleUpDeltaWithinLimits(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestAuthError(t *testing.T) {
|
||||
metrics.RegisterAll()
|
||||
metrics.RegisterAll(false)
|
||||
context, err := NewScaleTestAutoscalingContext(config.AutoscalingOptions{}, &fake.Clientset{}, nil, nil, nil)
|
||||
assert.NoError(t, err)
|
||||
|
||||
|
|
|
|||
|
|
@ -268,6 +268,12 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
|
|||
return errors.ToAutoscalerError(errors.CloudProviderError, err)
|
||||
}
|
||||
|
||||
// Update node groups min/max after cloud provider refresh
|
||||
for _, nodeGroup := range a.AutoscalingContext.CloudProvider.NodeGroups() {
|
||||
metrics.UpdateNodeGroupMin(nodeGroup.Id(), nodeGroup.MinSize())
|
||||
metrics.UpdateNodeGroupMax(nodeGroup.Id(), nodeGroup.MaxSize())
|
||||
}
|
||||
|
||||
nonExpendableScheduledPods := core_utils.FilterOutExpendablePods(originalScheduledPods, a.ExpendablePodsPriorityCutoff)
|
||||
// Initialize cluster state to ClusterSnapshot
|
||||
if typedErr := a.initializeClusterSnapshot(allNodes, nonExpendableScheduledPods); typedErr != nil {
|
||||
|
|
|
|||
|
|
@ -180,6 +180,8 @@ var (
|
|||
daemonSetEvictionForEmptyNodes = flag.Bool("daemonset-eviction-for-empty-nodes", false, "DaemonSet pods will be gracefully terminated from empty nodes")
|
||||
daemonSetEvictionForOccupiedNodes = flag.Bool("daemonset-eviction-for-occupied-nodes", true, "DaemonSet pods will be gracefully terminated from non-empty nodes")
|
||||
userAgent = flag.String("user-agent", "cluster-autoscaler", "User agent used for HTTP calls.")
|
||||
|
||||
emitPerNodeGroupMetrics = flag.Bool("emit-per-nodegroup-metrics", false, "If true, emit per node group metrics.")
|
||||
)
|
||||
|
||||
func createAutoscalingOptions() config.AutoscalingOptions {
|
||||
|
|
@ -342,7 +344,7 @@ func buildAutoscaler() (core.Autoscaler, error) {
|
|||
}
|
||||
|
||||
func run(healthCheck *metrics.HealthCheck) {
|
||||
metrics.RegisterAll()
|
||||
metrics.RegisterAll(*emitPerNodeGroupMetrics)
|
||||
|
||||
autoscaler, err := buildAutoscaler()
|
||||
if err != nil {
|
||||
|
|
|
|||
|
|
@ -171,6 +171,22 @@ var (
|
|||
}, []string{"direction"},
|
||||
)
|
||||
|
||||
nodesGroupMinNodes = k8smetrics.NewGaugeVec(
|
||||
&k8smetrics.GaugeOpts{
|
||||
Namespace: caNamespace,
|
||||
Name: "node_group_min_count",
|
||||
Help: "Minimum number of nodes in the node group",
|
||||
}, []string{"node_group"},
|
||||
)
|
||||
|
||||
nodesGroupMaxNodes = k8smetrics.NewGaugeVec(
|
||||
&k8smetrics.GaugeOpts{
|
||||
Namespace: caNamespace,
|
||||
Name: "node_group_max_count",
|
||||
Help: "Maximum number of nodes in the node group",
|
||||
}, []string{"node_group"},
|
||||
)
|
||||
|
||||
/**** Metrics related to autoscaler execution ****/
|
||||
lastActivity = k8smetrics.NewGaugeVec(
|
||||
&k8smetrics.GaugeOpts{
|
||||
|
|
@ -315,7 +331,7 @@ var (
|
|||
)
|
||||
|
||||
// RegisterAll registers all metrics; the per-node-group min/max gauges are registered only when emitPerNodeGroupMetrics is true.
|
||||
func RegisterAll() {
|
||||
func RegisterAll(emitPerNodeGroupMetrics bool) {
|
||||
legacyregistry.MustRegister(clusterSafeToAutoscale)
|
||||
legacyregistry.MustRegister(nodesCount)
|
||||
legacyregistry.MustRegister(nodeGroupsCount)
|
||||
|
|
@ -342,6 +358,11 @@ func RegisterAll() {
|
|||
legacyregistry.MustRegister(napEnabled)
|
||||
legacyregistry.MustRegister(nodeGroupCreationCount)
|
||||
legacyregistry.MustRegister(nodeGroupDeletionCount)
|
||||
|
||||
if emitPerNodeGroupMetrics {
|
||||
legacyregistry.MustRegister(nodesGroupMinNodes)
|
||||
legacyregistry.MustRegister(nodesGroupMaxNodes)
|
||||
}
|
||||
}
|
||||
|
||||
// UpdateDurationFromStart records the duration of the step identified by the
|
||||
|
|
@ -423,6 +444,16 @@ func UpdateMemoryLimitsBytes(minMemoryCount int64, maxMemoryCount int64) {
|
|||
memoryLimitsBytes.WithLabelValues("maximum").Set(float64(maxMemoryCount))
|
||||
}
|
||||
|
||||
// UpdateNodeGroupMin records the minimum number of nodes allowed in a node group.
// It sets the nodesGroupMinNodes gauge for the "node_group" label value nodeGroup
// to minNodes. RegisterAll registers that gauge only when called with
// emitPerNodeGroupMetrics set to true; this function sets the value either way.
|
||||
func UpdateNodeGroupMin(nodeGroup string, minNodes int) {
|
||||
nodesGroupMinNodes.WithLabelValues(nodeGroup).Set(float64(minNodes))
|
||||
}
|
||||
|
||||
// UpdateNodeGroupMax records the maximum number of nodes allowed in a node group.
// It sets the nodesGroupMaxNodes gauge for the "node_group" label value nodeGroup
// to maxNodes. RegisterAll registers that gauge only when called with
// emitPerNodeGroupMetrics set to true; this function sets the value either way.
|
||||
func UpdateNodeGroupMax(nodeGroup string, maxNodes int) {
|
||||
nodesGroupMaxNodes.WithLabelValues(nodeGroup).Set(float64(maxNodes))
|
||||
}
|
||||
|
||||
// RegisterError records any errors preventing Cluster Autoscaler from working.
|
||||
// No more than one error should be recorded per loop.
|
||||
func RegisterError(err errors.AutoscalerError) {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,42 @@
|
|||
/*
|
||||
Copyright 2021 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus/testutil"
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
// TestDisabledPerNodeGroupMetrics verifies that RegisterAll(false) leaves the
// per-node-group min/max gauges uncreated.
// NOTE(review): both gauges are package-level, so this presumably has to run
// before TestEnabledPerNodeGroupMetrics creates them — confirm test ordering.
func TestDisabledPerNodeGroupMetrics(t *testing.T) {
|
||||
RegisterAll(false)
|
||||
assert.False(t, nodesGroupMinNodes.IsCreated())
|
||||
assert.False(t, nodesGroupMaxNodes.IsCreated())
|
||||
}
|
||||
|
||||
// TestEnabledPerNodeGroupMetrics verifies that RegisterAll(true) creates the
// per-node-group min/max gauges, and that UpdateNodeGroupMin/UpdateNodeGroupMax
// store the given values under the node group's label.
func TestEnabledPerNodeGroupMetrics(t *testing.T) {
|
||||
RegisterAll(true)
|
||||
assert.True(t, nodesGroupMinNodes.IsCreated())
|
||||
assert.True(t, nodesGroupMaxNodes.IsCreated())
|
||||
|
||||
UpdateNodeGroupMin("foo", 2)
|
||||
UpdateNodeGroupMax("foo", 100)
|
||||
|
||||
// Read each gauge back for the "foo" label and compare it as an int with the value set above.
assert.Equal(t, 2, int(testutil.ToFloat64(nodesGroupMinNodes.GaugeVec.WithLabelValues("foo"))))
|
||||
assert.Equal(t, 100, int(testutil.ToFloat64(nodesGroupMaxNodes.GaugeVec.WithLabelValues("foo"))))
|
||||
}
|
||||
Loading…
Reference in New Issue