add cluster cores and memory bytes count metrics

This change adds 4 metrics that can be used to monitor the minimum and
maximum limits for CPU and memory, as well as the current counts in
cores and bytes, respectively.

The four metrics added are:
* `cluster_autoscaler_cpu_limits_cores`
* `cluster_autoscaler_cluster_cpu_current_cores`
* `cluster_autoscaler_memory_limits_bytes`
* `cluster_autoscaler_cluster_memory_current_bytes`

This change also adds the `max_cores_total` metric to the metrics
proposal doc, as it was previously not recorded there.

User story: As a cluster autoscaler user, I would like to monitor my
cluster through metrics to determine when the cluster is nearing its
limits for cores and memory usage.
This commit is contained in:
Michael McCune 2021-03-23 17:00:52 -04:00
parent 6dcda9d580
commit a24ea6c66b
4 changed files with 88 additions and 0 deletions

View File

@ -246,6 +246,11 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
return nil return nil
} }
// Update cluster resource usage metrics
coresTotal, memoryTotal := calculateCoresMemoryTotal(allNodes, currentTime)
metrics.UpdateClusterCPUCurrentCores(coresTotal)
metrics.UpdateClusterMemoryCurrentBytes(memoryTotal)
daemonsets, err := a.ListerRegistry.DaemonSetLister().List(labels.Everything()) daemonsets, err := a.ListerRegistry.DaemonSetLister().List(labels.Everything())
if err != nil { if err != nil {
klog.Errorf("Failed to get daemonset list: %v", err) klog.Errorf("Failed to get daemonset list: %v", err)
@ -800,3 +805,21 @@ func getUpcomingNodeInfos(registry *clusterstate.ClusterStateRegistry, nodeInfos
} }
return upcomingNodes return upcomingNodes
} }
// calculateCoresMemoryTotal returns the total number of CPU cores and bytes
// of memory across the given nodes, skipping nodes that are being deleted.
func calculateCoresMemoryTotal(nodes []*apiv1.Node, timestamp time.Time) (int64, int64) {
	// Mirrors calculateScaleDownCoresMemoryTotal: every node contributes to
	// the cluster totals except those in the process of being deleted.
	var coresTotal int64
	var memoryTotal int64
	for _, node := range nodes {
		if isNodeBeingDeleted(node, timestamp) {
			// Deleting nodes do not count towards cluster resources.
			continue
		}
		nodeCores, nodeMemory := core_utils.GetNodeCoresAndMemory(node)
		coresTotal += nodeCores
		memoryTotal += nodeMemory
	}
	return coresTotal, memoryTotal
}

View File

@ -330,6 +330,8 @@ func buildAutoscaler() (core.Autoscaler, error) {
// These metrics should be published only once. // These metrics should be published only once.
metrics.UpdateNapEnabled(autoscalingOptions.NodeAutoprovisioningEnabled) metrics.UpdateNapEnabled(autoscalingOptions.NodeAutoprovisioningEnabled)
metrics.UpdateMaxNodesCount(autoscalingOptions.MaxNodesTotal) metrics.UpdateMaxNodesCount(autoscalingOptions.MaxNodesTotal)
metrics.UpdateCPULimitsCores(autoscalingOptions.MinCoresTotal, autoscalingOptions.MaxCoresTotal)
metrics.UpdateMemoryLimitsBytes(autoscalingOptions.MinMemoryTotal, autoscalingOptions.MaxMemoryTotal)
// Create autoscaler. // Create autoscaler.
return core.NewAutoscaler(opts) return core.NewAutoscaler(opts)

View File

@ -138,6 +138,38 @@ var (
}, },
) )
// cpuCurrentCores reports the current total number of CPU cores in the
// cluster; nodes being deleted are excluded by the caller (see
// calculateCoresMemoryTotal).
cpuCurrentCores = k8smetrics.NewGauge(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "cluster_cpu_current_cores",
Help: "Current number of cores in the cluster, minus deleting nodes.",
},
)
// cpuLimitsCores reports the configured cluster-wide core limits, one
// series per "direction" label value ("minimum" or "maximum").
cpuLimitsCores = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "cpu_limits_cores",
Help: "Minimum and maximum number of cores in the cluster.",
}, []string{"direction"},
)
// memoryCurrentBytes reports the current total bytes of memory in the
// cluster; nodes being deleted are excluded by the caller.
memoryCurrentBytes = k8smetrics.NewGauge(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "cluster_memory_current_bytes",
Help: "Current number of bytes of memory in the cluster, minus deleting nodes.",
},
)
// memoryLimitsBytes reports the configured cluster-wide memory limits, one
// series per "direction" label value ("minimum" or "maximum").
memoryLimitsBytes = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "memory_limits_bytes",
Help: "Minimum and maximum number of bytes of memory in cluster.",
}, []string{"direction"},
)
/**** Metrics related to autoscaler execution ****/ /**** Metrics related to autoscaler execution ****/
lastActivity = k8smetrics.NewGaugeVec( lastActivity = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{ &k8smetrics.GaugeOpts{
@ -288,6 +320,10 @@ func RegisterAll() {
legacyregistry.MustRegister(nodeGroupsCount) legacyregistry.MustRegister(nodeGroupsCount)
legacyregistry.MustRegister(unschedulablePodsCount) legacyregistry.MustRegister(unschedulablePodsCount)
legacyregistry.MustRegister(maxNodesCount) legacyregistry.MustRegister(maxNodesCount)
legacyregistry.MustRegister(cpuCurrentCores)
legacyregistry.MustRegister(cpuLimitsCores)
legacyregistry.MustRegister(memoryCurrentBytes)
legacyregistry.MustRegister(memoryLimitsBytes)
legacyregistry.MustRegister(lastActivity) legacyregistry.MustRegister(lastActivity)
legacyregistry.MustRegister(functionDuration) legacyregistry.MustRegister(functionDuration)
legacyregistry.MustRegister(functionDurationSummary) legacyregistry.MustRegister(functionDurationSummary)
@ -364,6 +400,28 @@ func UpdateMaxNodesCount(nodesCount int) {
maxNodesCount.Set(float64(nodesCount)) maxNodesCount.Set(float64(nodesCount))
} }
// UpdateClusterCPUCurrentCores records the number of cores in the cluster, minus deleting nodes
func UpdateClusterCPUCurrentCores(coresCount int64) {
	cores := float64(coresCount)
	cpuCurrentCores.Set(cores)
}
// UpdateCPULimitsCores records the minimum and maximum number of cores in the cluster
func UpdateCPULimitsCores(minCoresCount int64, maxCoresCount int64) {
	// Each bound is published under its own "direction" label value.
	minGauge := cpuLimitsCores.WithLabelValues("minimum")
	maxGauge := cpuLimitsCores.WithLabelValues("maximum")
	minGauge.Set(float64(minCoresCount))
	maxGauge.Set(float64(maxCoresCount))
}
// UpdateClusterMemoryCurrentBytes records the number of bytes of memory in the cluster, minus deleting nodes
func UpdateClusterMemoryCurrentBytes(memoryCount int64) {
	memBytes := float64(memoryCount)
	memoryCurrentBytes.Set(memBytes)
}
// UpdateMemoryLimitsBytes records the minimum and maximum bytes of memory in the cluster
func UpdateMemoryLimitsBytes(minMemoryCount int64, maxMemoryCount int64) {
	// Each bound is published under its own "direction" label value.
	minGauge := memoryLimitsBytes.WithLabelValues("minimum")
	maxGauge := memoryLimitsBytes.WithLabelValues("maximum")
	minGauge.Set(float64(minMemoryCount))
	maxGauge.Set(float64(maxMemoryCount))
}
// RegisterError records any errors preventing Cluster Autoscaler from working. // RegisterError records any errors preventing Cluster Autoscaler from working.
// No more than one error should be recorded per loop. // No more than one error should be recorded per loop.
func RegisterError(err errors.AutoscalerError) { func RegisterError(err errors.AutoscalerError) {

View File

@ -27,6 +27,11 @@ All the metrics are prefixed with `cluster_autoscaler_`.
| nodes_count | Gauge | `state`=<node-state> | Number of nodes in cluster. | | nodes_count | Gauge | `state`=<node-state> | Number of nodes in cluster. |
| unschedulable_pods_count | Gauge | | Number of unschedulable ("Pending") pods in the cluster. | | unschedulable_pods_count | Gauge | | Number of unschedulable ("Pending") pods in the cluster. |
| node_groups_count | Gauge | `node_group_type`=<node-group-type> | Number of node groups managed by CA. | | node_groups_count | Gauge | `node_group_type`=<node-group-type> | Number of node groups managed by CA. |
| max_nodes_count | Gauge | | Maximum number of nodes in all node groups. |
| cluster_cpu_current_cores | Gauge | | Current number of cores in the cluster, minus deleting nodes. |
| cpu_limits_cores | Gauge | `direction`=<`minimum` or `maximum`> | Minimum and maximum number of cores in the cluster. |
| cluster_memory_current_bytes | Gauge | | Current number of bytes of memory in the cluster, minus deleting nodes. |
| memory_limits_bytes | Gauge | `direction`=<`minimum` or `maximum`> | Minimum and maximum number of bytes of memory in cluster. |
* `cluster_safe_to_autoscale` indicates whether cluster is healthy enough for autoscaling. CA stops all operations if significant number of nodes are unready (by default 33% as of CA 0.5.4). * `cluster_safe_to_autoscale` indicates whether cluster is healthy enough for autoscaling. CA stops all operations if significant number of nodes are unready (by default 33% as of CA 0.5.4).
* `nodes_count` records the total number of nodes, labeled by node state. Possible * `nodes_count` records the total number of nodes, labeled by node state. Possible