Merge pull request #3983 from elmiko/cluster-resource-consumption-metrics

Cluster resource consumption metrics
This commit is contained in:
Kubernetes Prow Robot 2021-05-13 15:32:04 -07:00 committed by GitHub
commit 2beea02a29
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 88 additions and 0 deletions

View File

@ -245,6 +245,11 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
return nil
}
// Update cluster resource usage metrics
coresTotal, memoryTotal := calculateCoresMemoryTotal(allNodes, currentTime)
metrics.UpdateClusterCPUCurrentCores(coresTotal)
metrics.UpdateClusterMemoryCurrentBytes(memoryTotal)
daemonsets, err := a.ListerRegistry.DaemonSetLister().List(labels.Everything())
if err != nil {
klog.Errorf("Failed to get daemonset list: %v", err)
@ -799,3 +804,21 @@ func getUpcomingNodeInfos(registry *clusterstate.ClusterStateRegistry, nodeInfos
}
return upcomingNodes
}
// calculateCoresMemoryTotal adds up the values reported by
// core_utils.GetNodeCoresAndMemory for every node in nodes that
// isNodeBeingDeleted does not flag at the given timestamp. The first result
// is the core total and the second is the memory total.
//
// It mirrors calculateScaleDownCoresMemoryTotal: nodes that are in the
// process of being deleted are left out of the cluster-wide sums.
func calculateCoresMemoryTotal(nodes []*apiv1.Node, timestamp time.Time) (int64, int64) {
	var totalCores, totalMemory int64
	for i := range nodes {
		candidate := nodes[i]
		if !isNodeBeingDeleted(candidate, timestamp) {
			// Only nodes that are staying contribute to the totals.
			nodeCores, nodeMemory := core_utils.GetNodeCoresAndMemory(candidate)
			totalCores, totalMemory = totalCores+nodeCores, totalMemory+nodeMemory
		}
	}
	return totalCores, totalMemory
}

View File

@ -332,6 +332,8 @@ func buildAutoscaler() (core.Autoscaler, error) {
// These metrics should be published only once.
metrics.UpdateNapEnabled(autoscalingOptions.NodeAutoprovisioningEnabled)
metrics.UpdateMaxNodesCount(autoscalingOptions.MaxNodesTotal)
metrics.UpdateCPULimitsCores(autoscalingOptions.MinCoresTotal, autoscalingOptions.MaxCoresTotal)
metrics.UpdateMemoryLimitsBytes(autoscalingOptions.MinMemoryTotal, autoscalingOptions.MaxMemoryTotal)
// Create autoscaler.
return core.NewAutoscaler(opts)

View File

@ -138,6 +138,38 @@ var (
},
)
// cpuCurrentCores is the unlabeled cluster_cpu_current_cores gauge under
// caNamespace. Its value is written by UpdateClusterCPUCurrentCores.
cpuCurrentCores = k8smetrics.NewGauge(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "cluster_cpu_current_cores",
Help: "Current number of cores in the cluster, minus deleting nodes.",
},
)
// cpuLimitsCores is the cpu_limits_cores gauge vector under caNamespace,
// keyed by a single "direction" label. UpdateCPULimitsCores writes the
// "minimum" and "maximum" series.
cpuLimitsCores = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "cpu_limits_cores",
Help: "Minimum and maximum number of cores in the cluster.",
}, []string{"direction"},
)
// memoryCurrentBytes is the unlabeled cluster_memory_current_bytes gauge
// under caNamespace. Its value is written by UpdateClusterMemoryCurrentBytes.
memoryCurrentBytes = k8smetrics.NewGauge(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "cluster_memory_current_bytes",
Help: "Current number of bytes of memory in the cluster, minus deleting nodes.",
},
)
// memoryLimitsBytes is the memory_limits_bytes gauge vector under
// caNamespace, keyed by a single "direction" label. UpdateMemoryLimitsBytes
// writes the "minimum" and "maximum" series.
memoryLimitsBytes = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "memory_limits_bytes",
Help: "Minimum and maximum number of bytes of memory in cluster.",
}, []string{"direction"},
)
/**** Metrics related to autoscaler execution ****/
lastActivity = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
@ -288,6 +320,10 @@ func RegisterAll() {
legacyregistry.MustRegister(nodeGroupsCount)
legacyregistry.MustRegister(unschedulablePodsCount)
legacyregistry.MustRegister(maxNodesCount)
legacyregistry.MustRegister(cpuCurrentCores)
legacyregistry.MustRegister(cpuLimitsCores)
legacyregistry.MustRegister(memoryCurrentBytes)
legacyregistry.MustRegister(memoryLimitsBytes)
legacyregistry.MustRegister(lastActivity)
legacyregistry.MustRegister(functionDuration)
legacyregistry.MustRegister(functionDurationSummary)
@ -364,6 +400,28 @@ func UpdateMaxNodesCount(nodesCount int) {
maxNodesCount.Set(float64(nodesCount))
}
// UpdateClusterCPUCurrentCores sets the cluster_cpu_current_cores gauge to
// coresCount. Callers are expected to pass the cluster core total with nodes
// under deletion already excluded.
func UpdateClusterCPUCurrentCores(coresCount int64) {
	current := float64(coresCount)
	cpuCurrentCores.Set(current)
}
// UpdateCPULimitsCores publishes the configured core limits on the
// cpu_limits_cores gauge: minCoresCount goes to the series labeled
// "minimum" and maxCoresCount to the series labeled "maximum".
func UpdateCPULimitsCores(minCoresCount int64, maxCoresCount int64) {
	lower, upper := float64(minCoresCount), float64(maxCoresCount)
	cpuLimitsCores.WithLabelValues("minimum").Set(lower)
	cpuLimitsCores.WithLabelValues("maximum").Set(upper)
}
// UpdateClusterMemoryCurrentBytes sets the cluster_memory_current_bytes gauge
// to memoryCount. Callers are expected to pass the cluster memory total with
// nodes under deletion already excluded.
func UpdateClusterMemoryCurrentBytes(memoryCount int64) {
	current := float64(memoryCount)
	memoryCurrentBytes.Set(current)
}
// UpdateMemoryLimitsBytes publishes the configured memory limits on the
// memory_limits_bytes gauge: minMemoryCount goes to the series labeled
// "minimum" and maxMemoryCount to the series labeled "maximum".
func UpdateMemoryLimitsBytes(minMemoryCount int64, maxMemoryCount int64) {
	lower, upper := float64(minMemoryCount), float64(maxMemoryCount)
	memoryLimitsBytes.WithLabelValues("minimum").Set(lower)
	memoryLimitsBytes.WithLabelValues("maximum").Set(upper)
}
// RegisterError records any errors preventing Cluster Autoscaler from working.
// No more than one error should be recorded per loop.
func RegisterError(err errors.AutoscalerError) {

View File

@ -27,6 +27,11 @@ All the metrics are prefixed with `cluster_autoscaler_`.
| nodes_count | Gauge | `state`=<node-state> | Number of nodes in cluster. |
| unschedulable_pods_count | Gauge | | Number of unschedulable ("Pending") pods in the cluster. |
| node_groups_count | Gauge | `node_group_type`=<node-group-type> | Number of node groups managed by CA. |
| max_nodes_count | Gauge | | Maximum number of nodes in all node groups. |
| cluster_cpu_current_cores | Gauge | | Current number of cores in the cluster, minus deleting nodes. |
| cpu_limits_cores | Gauge | `direction`=<`minimum` or `maximum`> | Minimum and maximum number of cores in the cluster. |
| cluster_memory_current_bytes | Gauge | | Current number of bytes of memory in the cluster, minus deleting nodes. |
| memory_limits_bytes | Gauge | `direction`=<`minimum` or `maximum`> | Minimum and maximum number of bytes of memory in cluster. |
* `cluster_safe_to_autoscale` indicates whether cluster is healthy enough for autoscaling. CA stops all operations if significant number of nodes are unready (by default 33% as of CA 0.5.4).
* `nodes_count` records the total number of nodes, labeled by node state. Possible