From 2bd7f0efa39023b7d66764ede20f26ce98f690ec Mon Sep 17 00:00:00 2001
From: "Amr Hanafi (MAHDI))" <amrh@microsoft.com>
Date: Wed, 21 Apr 2021 00:26:43 -0700
Subject: [PATCH 1/4] [cluster-autoscaler] Publish node group min/max metrics

---
 cluster-autoscaler/core/autoscaler.go |  8 ++++++++
 cluster-autoscaler/metrics/metrics.go | 28 +++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/cluster-autoscaler/core/autoscaler.go b/cluster-autoscaler/core/autoscaler.go
index 3735baaac0..bf243eaf8c 100644
--- a/cluster-autoscaler/core/autoscaler.go
+++ b/cluster-autoscaler/core/autoscaler.go
@@ -27,6 +27,7 @@ import (
 	"k8s.io/autoscaler/cluster-autoscaler/estimator"
 	"k8s.io/autoscaler/cluster-autoscaler/expander"
 	"k8s.io/autoscaler/cluster-autoscaler/expander/factory"
+	"k8s.io/autoscaler/cluster-autoscaler/metrics"
 	ca_processors "k8s.io/autoscaler/cluster-autoscaler/processors"
 	"k8s.io/autoscaler/cluster-autoscaler/simulator"
 	"k8s.io/autoscaler/cluster-autoscaler/utils/backoff"
@@ -66,6 +67,13 @@ func NewAutoscaler(opts AutoscalerOptions) (Autoscaler, errors.AutoscalerError)
 	if err != nil {
 		return nil, errors.ToAutoscalerError(errors.InternalError, err)
 	}
+
+	// These metrics should be published only once.
+	for _, nodeGroup := range opts.CloudProvider.NodeGroups() {
+		metrics.UpdateNodeGroupMin(nodeGroup.Id(), nodeGroup.MinSize())
+		metrics.UpdateNodeGroupMax(nodeGroup.Id(), nodeGroup.MaxSize())
+	}
+
 	return NewStaticAutoscaler(
 		opts.AutoscalingOptions,
 		opts.PredicateChecker,
diff --git a/cluster-autoscaler/metrics/metrics.go b/cluster-autoscaler/metrics/metrics.go
index 9580ee3344..713af534ea 100644
--- a/cluster-autoscaler/metrics/metrics.go
+++ b/cluster-autoscaler/metrics/metrics.go
@@ -170,6 +170,22 @@ var (
 		}, []string{"direction"},
 	)
 
+	nodesGroupMinNodes = k8smetrics.NewGaugeVec(
+		&k8smetrics.GaugeOpts{
+			Namespace: caNamespace,
+			Name:      "node_group_min_count",
+			Help:      "Minimum number of nodes in the node group",
+		}, []string{"node_group"},
+	)
+
+	nodesGroupMaxNodes = k8smetrics.NewGaugeVec(
+		&k8smetrics.GaugeOpts{
+			Namespace: caNamespace,
+			Name:      "node_group_max_count",
+			Help:      "Maximum number of nodes in the node group",
+		}, []string{"node_group"},
+	)
+
 	/**** Metrics related to autoscaler execution ****/
 	lastActivity = k8smetrics.NewGaugeVec(
 		&k8smetrics.GaugeOpts{
@@ -324,6 +340,8 @@ func RegisterAll() {
 	legacyregistry.MustRegister(cpuLimitsCores)
 	legacyregistry.MustRegister(memoryCurrentBytes)
 	legacyregistry.MustRegister(memoryLimitsBytes)
+	legacyregistry.MustRegister(nodesGroupMinNodes)
+	legacyregistry.MustRegister(nodesGroupMaxNodes)
 	legacyregistry.MustRegister(lastActivity)
 	legacyregistry.MustRegister(functionDuration)
 	legacyregistry.MustRegister(functionDurationSummary)
@@ -422,6 +440,16 @@ func UpdateMemoryLimitsBytes(minMemoryCount int64, maxMemoryCount int64) {
 	memoryLimitsBytes.WithLabelValues("maximum").Set(float64(maxMemoryCount))
 }
 
+// UpdateNodeGroupMin sets the minimum-node-count gauge of the node group identified by nodeGroup to minNodes.
+func UpdateNodeGroupMin(nodeGroup string, minNodes int) {
+	nodesGroupMinNodes.WithLabelValues(nodeGroup).Set(float64(minNodes)) // nodeGroup is the value of the gauge's "node_group" label
+}
+
+// UpdateNodeGroupMax sets the maximum-node-count gauge of the node group identified by nodeGroup to maxNodes.
+func UpdateNodeGroupMax(nodeGroup string, maxNodes int) {
+	nodesGroupMaxNodes.WithLabelValues(nodeGroup).Set(float64(maxNodes)) // nodeGroup is the value of the gauge's "node_group" label
+}
+
 // RegisterError records any errors preventing Cluster Autoscaler from working.
 // No more than one error should be recorded per loop.
 func RegisterError(err errors.AutoscalerError) {

From f5c2ab73287d6d7226a040416121bbf140eb2bda Mon Sep 17 00:00:00 2001
From: "Amr Hanafi (MAHDI))" <amrh@microsoft.com>
Date: Thu, 20 May 2021 16:49:39 -0700
Subject: [PATCH 2/4] Emit the node group metrics behind a flag

---
 cluster-autoscaler/core/scale_up_test.go   |  2 +-
 cluster-autoscaler/main.go                 |  4 ++-
 cluster-autoscaler/metrics/metrics.go      |  9 +++--
 cluster-autoscaler/metrics/metrics_test.go | 42 ++++++++++++++++++++++
 4 files changed, 52 insertions(+), 5 deletions(-)
 create mode 100644 cluster-autoscaler/metrics/metrics_test.go

diff --git a/cluster-autoscaler/core/scale_up_test.go b/cluster-autoscaler/core/scale_up_test.go
index 6ff0902bbe..6da8cb6d50 100644
--- a/cluster-autoscaler/core/scale_up_test.go
+++ b/cluster-autoscaler/core/scale_up_test.go
@@ -978,7 +978,7 @@ func TestCheckScaleUpDeltaWithinLimits(t *testing.T) {
 }
 
 func TestAuthError(t *testing.T) {
-	metrics.RegisterAll()
+	metrics.RegisterAll(false)
 	context, err := NewScaleTestAutoscalingContext(config.AutoscalingOptions{}, &fake.Clientset{}, nil, nil, nil)
 	assert.NoError(t, err)
 
diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go
index c7c18d22b1..7feb0a41ea 100644
--- a/cluster-autoscaler/main.go
+++ b/cluster-autoscaler/main.go
@@ -179,6 +179,8 @@ var (
 	cordonNodeBeforeTerminate          = flag.Bool("cordon-node-before-terminating", false, "Should CA cordon nodes before terminating during downscale process")
 	daemonSetEvictionForEmptyNodes     = flag.Bool("daemonset-eviction-for-empty-nodes", false, "DaemonSet pods will be gracefully terminated from empty nodes")
 	userAgent                          = flag.String("user-agent", "cluster-autoscaler", "User agent used for HTTP calls.")
+
+	emitPerNodeGroupMetrics = flag.Bool("emit-per-nodegroup-metrics", false, "If true, emit per node group metrics.")
 )
 
 func createAutoscalingOptions() config.AutoscalingOptions {
@@ -340,7 +342,7 @@ func buildAutoscaler() (core.Autoscaler, error) {
 }
 
 func run(healthCheck *metrics.HealthCheck) {
-	metrics.RegisterAll()
+	metrics.RegisterAll(*emitPerNodeGroupMetrics)
 
 	autoscaler, err := buildAutoscaler()
 	if err != nil {
diff --git a/cluster-autoscaler/metrics/metrics.go b/cluster-autoscaler/metrics/metrics.go
index 713af534ea..e0b9e64e92 100644
--- a/cluster-autoscaler/metrics/metrics.go
+++ b/cluster-autoscaler/metrics/metrics.go
@@ -330,7 +330,7 @@ var (
 )
 
 // RegisterAll registers all metrics.
-func RegisterAll() {
+func RegisterAll(emitPerNodeGroupMetrics bool) {
 	legacyregistry.MustRegister(clusterSafeToAutoscale)
 	legacyregistry.MustRegister(nodesCount)
 	legacyregistry.MustRegister(nodeGroupsCount)
@@ -340,8 +340,6 @@ func RegisterAll() {
 	legacyregistry.MustRegister(cpuLimitsCores)
 	legacyregistry.MustRegister(memoryCurrentBytes)
 	legacyregistry.MustRegister(memoryLimitsBytes)
-	legacyregistry.MustRegister(nodesGroupMinNodes)
-	legacyregistry.MustRegister(nodesGroupMaxNodes)
 	legacyregistry.MustRegister(lastActivity)
 	legacyregistry.MustRegister(functionDuration)
 	legacyregistry.MustRegister(functionDurationSummary)
@@ -359,6 +357,11 @@ func RegisterAll() {
 	legacyregistry.MustRegister(napEnabled)
 	legacyregistry.MustRegister(nodeGroupCreationCount)
 	legacyregistry.MustRegister(nodeGroupDeletionCount)
+
+	if emitPerNodeGroupMetrics {
+		legacyregistry.MustRegister(nodesGroupMinNodes)
+		legacyregistry.MustRegister(nodesGroupMaxNodes)
+	}
 }
 
 // UpdateDurationFromStart records the duration of the step identified by the
diff --git a/cluster-autoscaler/metrics/metrics_test.go b/cluster-autoscaler/metrics/metrics_test.go
new file mode 100644
index 0000000000..71789d2c7a
--- /dev/null
+++ b/cluster-autoscaler/metrics/metrics_test.go
@@ -0,0 +1,42 @@
+/*
+Copyright 2021 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package metrics
+
+import (
+	"testing"
+
+	"github.com/prometheus/client_golang/prometheus/testutil"
+	"github.com/stretchr/testify/assert"
+)
+
+func TestDisabledPerNodeGroupMetrics(t *testing.T) {
+	RegisterAll(false) // flag off: RegisterAll does not pass the per-node-group gauges to MustRegister
+	assert.False(t, nodesGroupMinNodes.IsCreated()) // presumably IsCreated only turns true once the gauge has been registered; confirm against component-base
+	assert.False(t, nodesGroupMaxNodes.IsCreated())
+}
+
+func TestEnabledPerNodeGroupMetrics(t *testing.T) {
+	RegisterAll(true) // flag on: per-node-group gauges are registered too. NOTE(review): TestDisabledPerNodeGroupMetrics also calls RegisterAll; confirm repeated registration is safe
+	assert.True(t, nodesGroupMinNodes.IsCreated())
+	assert.True(t, nodesGroupMaxNodes.IsCreated())
+
+	UpdateNodeGroupMin("foo", 2) // record limits under the label value "foo"
+	UpdateNodeGroupMax("foo", 100)
+
+	assert.Equal(t, 2, int(testutil.ToFloat64(nodesGroupMinNodes.GaugeVec.WithLabelValues("foo")))) // read the stored gauge values back for the same label
+	assert.Equal(t, 100, int(testutil.ToFloat64(nodesGroupMaxNodes.GaugeVec.WithLabelValues("foo"))))
+}

From 3ac32b817ce8e598f55342b014724badbca3a930 Mon Sep 17 00:00:00 2001
From: "Amr Hanafi (MAHDI))" <amrh@microsoft.com>
Date: Thu, 20 May 2021 17:35:08 -0700
Subject: [PATCH 3/4] Update node group min/max on cloud provider refresh

---
 cluster-autoscaler/core/autoscaler.go        | 8 --------
 cluster-autoscaler/core/static_autoscaler.go | 6 ++++++
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/cluster-autoscaler/core/autoscaler.go b/cluster-autoscaler/core/autoscaler.go
index bf243eaf8c..3735baaac0 100644
--- a/cluster-autoscaler/core/autoscaler.go
+++ b/cluster-autoscaler/core/autoscaler.go
@@ -27,7 +27,6 @@ import (
 	"k8s.io/autoscaler/cluster-autoscaler/estimator"
 	"k8s.io/autoscaler/cluster-autoscaler/expander"
 	"k8s.io/autoscaler/cluster-autoscaler/expander/factory"
-	"k8s.io/autoscaler/cluster-autoscaler/metrics"
 	ca_processors "k8s.io/autoscaler/cluster-autoscaler/processors"
 	"k8s.io/autoscaler/cluster-autoscaler/simulator"
 	"k8s.io/autoscaler/cluster-autoscaler/utils/backoff"
@@ -67,13 +66,6 @@ func NewAutoscaler(opts AutoscalerOptions) (Autoscaler, errors.AutoscalerError)
 	if err != nil {
 		return nil, errors.ToAutoscalerError(errors.InternalError, err)
 	}
-
-	// These metrics should be published only once.
-	for _, nodeGroup := range opts.CloudProvider.NodeGroups() {
-		metrics.UpdateNodeGroupMin(nodeGroup.Id(), nodeGroup.MinSize())
-		metrics.UpdateNodeGroupMax(nodeGroup.Id(), nodeGroup.MaxSize())
-	}
-
 	return NewStaticAutoscaler(
 		opts.AutoscalingOptions,
 		opts.PredicateChecker,
diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go
index 83d840265a..c9cde41645 100644
--- a/cluster-autoscaler/core/static_autoscaler.go
+++ b/cluster-autoscaler/core/static_autoscaler.go
@@ -263,6 +263,12 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
 		return errors.ToAutoscalerError(errors.CloudProviderError, err)
 	}
 
+	// Update node groups min/max after cloud provider refresh
+	for _, nodeGroup := range a.AutoscalingContext.CloudProvider.NodeGroups() {
+		metrics.UpdateNodeGroupMin(nodeGroup.Id(), nodeGroup.MinSize())
+		metrics.UpdateNodeGroupMax(nodeGroup.Id(), nodeGroup.MaxSize())
+	}
+
 	nonExpendableScheduledPods := core_utils.FilterOutExpendablePods(originalScheduledPods, a.ExpendablePodsPriorityCutoff)
 	// Initialize cluster state to ClusterSnapshot
 	if typedErr := a.initializeClusterSnapshot(allNodes, nonExpendableScheduledPods); typedErr != nil {

From 8b2aee01e4a1c145064b8084896d4b6b39652b97 Mon Sep 17 00:00:00 2001
From: "Amr Hanafi (MAHDI))" <amrh@microsoft.com>
Date: Fri, 21 May 2021 08:25:16 -0700
Subject: [PATCH 4/4] Update FAQ to mention the new flag

---
 cluster-autoscaler/FAQ.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cluster-autoscaler/FAQ.md b/cluster-autoscaler/FAQ.md
index 1125b54ec1..cb6e02f19c 100644
--- a/cluster-autoscaler/FAQ.md
+++ b/cluster-autoscaler/FAQ.md
@@ -662,6 +662,7 @@ The following startup parameters are supported for cluster autoscaler:
 | `max-node-provision-time` | Maximum time CA waits for node to be provisioned | 15 minutes
 | `nodes` | sets min,max size and other configuration data for a node group in a format accepted by cloud provider. Can be used multiple times. Format: <min>:<max>:<other...> | ""
 | `node-group-auto-discovery` | One or more definition(s) of node group auto-discovery.<br>A definition is expressed `<name of discoverer>:[<key>[=<value>]]`<br>The `aws`, `gce`, and `azure` cloud providers are currently supported. AWS matches by ASG tags, e.g. `asg:tag=tagKey,anotherTagKey`<br>GCE matches by IG name prefix, and requires you to specify min and max nodes per IG, e.g. `mig:namePrefix=pfx,min=0,max=10`<br> Azure matches by tags on VMSS, e.g. `label:foo=bar`, and will auto-detect `min` and `max` tags on the VMSS to set scaling limits.<br>Can be used multiple times | ""
+| `emit-per-nodegroup-metrics` | If true, publish per node group metrics (the minimum and maximum node count of each node group). | false
 | `estimator` | Type of resource estimator to be used in scale up | binpacking
 | `expander` | Type of node group expander to be used in scale up.  | random
 | `write-status-configmap` | Should CA write status information to a configmap  | true