diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index 8643bdddc7..83d840265a 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -245,6 +245,11 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError return nil } + // Update cluster resource usage metrics + coresTotal, memoryTotal := calculateCoresMemoryTotal(allNodes, currentTime) + metrics.UpdateClusterCPUCurrentCores(coresTotal) + metrics.UpdateClusterMemoryCurrentBytes(memoryTotal) + daemonsets, err := a.ListerRegistry.DaemonSetLister().List(labels.Everything()) if err != nil { klog.Errorf("Failed to get daemonset list: %v", err) @@ -799,3 +804,21 @@ func getUpcomingNodeInfos(registry *clusterstate.ClusterStateRegistry, nodeInfos } return upcomingNodes } + +func calculateCoresMemoryTotal(nodes []*apiv1.Node, timestamp time.Time) (int64, int64) { + // this function is essentially similar to the calculateScaleDownCoresMemoryTotal + // we want to check all nodes, aside from those deleting, to sum the cluster resource usage. + var coresTotal, memoryTotal int64 + for _, node := range nodes { + if isNodeBeingDeleted(node, timestamp) { + // Nodes being deleted do not count towards total cluster resources + continue + } + cores, memory := core_utils.GetNodeCoresAndMemory(node) + + coresTotal += cores + memoryTotal += memory + } + + return coresTotal, memoryTotal +} diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go index 7ce5725734..c7c18d22b1 100644 --- a/cluster-autoscaler/main.go +++ b/cluster-autoscaler/main.go @@ -332,6 +332,8 @@ func buildAutoscaler() (core.Autoscaler, error) { // These metrics should be published only once. 
metrics.UpdateNapEnabled(autoscalingOptions.NodeAutoprovisioningEnabled) metrics.UpdateMaxNodesCount(autoscalingOptions.MaxNodesTotal) + metrics.UpdateCPULimitsCores(autoscalingOptions.MinCoresTotal, autoscalingOptions.MaxCoresTotal) + metrics.UpdateMemoryLimitsBytes(autoscalingOptions.MinMemoryTotal, autoscalingOptions.MaxMemoryTotal) // Create autoscaler. return core.NewAutoscaler(opts) diff --git a/cluster-autoscaler/metrics/metrics.go b/cluster-autoscaler/metrics/metrics.go index e3b5ed22ed..9580ee3344 100644 --- a/cluster-autoscaler/metrics/metrics.go +++ b/cluster-autoscaler/metrics/metrics.go @@ -138,6 +138,38 @@ var ( }, ) + cpuCurrentCores = k8smetrics.NewGauge( + &k8smetrics.GaugeOpts{ + Namespace: caNamespace, + Name: "cluster_cpu_current_cores", + Help: "Current number of cores in the cluster, minus deleting nodes.", + }, + ) + + cpuLimitsCores = k8smetrics.NewGaugeVec( + &k8smetrics.GaugeOpts{ + Namespace: caNamespace, + Name: "cpu_limits_cores", + Help: "Minimum and maximum number of cores in the cluster.", + }, []string{"direction"}, + ) + + memoryCurrentBytes = k8smetrics.NewGauge( + &k8smetrics.GaugeOpts{ + Namespace: caNamespace, + Name: "cluster_memory_current_bytes", + Help: "Current number of bytes of memory in the cluster, minus deleting nodes.", + }, + ) + + memoryLimitsBytes = k8smetrics.NewGaugeVec( + &k8smetrics.GaugeOpts{ + Namespace: caNamespace, + Name: "memory_limits_bytes", + Help: "Minimum and maximum number of bytes of memory in cluster.", + }, []string{"direction"}, + ) + /**** Metrics related to autoscaler execution ****/ lastActivity = k8smetrics.NewGaugeVec( &k8smetrics.GaugeOpts{ @@ -288,6 +320,10 @@ func RegisterAll() { legacyregistry.MustRegister(nodeGroupsCount) legacyregistry.MustRegister(unschedulablePodsCount) legacyregistry.MustRegister(maxNodesCount) + legacyregistry.MustRegister(cpuCurrentCores) + legacyregistry.MustRegister(cpuLimitsCores) + legacyregistry.MustRegister(memoryCurrentBytes) + 
legacyregistry.MustRegister(memoryLimitsBytes) legacyregistry.MustRegister(lastActivity) legacyregistry.MustRegister(functionDuration) legacyregistry.MustRegister(functionDurationSummary) @@ -364,6 +400,28 @@ func UpdateMaxNodesCount(nodesCount int) { maxNodesCount.Set(float64(nodesCount)) } +// UpdateClusterCPUCurrentCores records the number of cores in the cluster, minus deleting nodes +func UpdateClusterCPUCurrentCores(coresCount int64) { + cpuCurrentCores.Set(float64(coresCount)) +} + +// UpdateCPULimitsCores records the minimum and maximum number of cores in the cluster +func UpdateCPULimitsCores(minCoresCount int64, maxCoresCount int64) { + cpuLimitsCores.WithLabelValues("minimum").Set(float64(minCoresCount)) + cpuLimitsCores.WithLabelValues("maximum").Set(float64(maxCoresCount)) +} + +// UpdateClusterMemoryCurrentBytes records the number of bytes of memory in the cluster, minus deleting nodes +func UpdateClusterMemoryCurrentBytes(memoryCount int64) { + memoryCurrentBytes.Set(float64(memoryCount)) +} + +// UpdateMemoryLimitsBytes records the minimum and maximum bytes of memory in the cluster +func UpdateMemoryLimitsBytes(minMemoryCount int64, maxMemoryCount int64) { + memoryLimitsBytes.WithLabelValues("minimum").Set(float64(minMemoryCount)) + memoryLimitsBytes.WithLabelValues("maximum").Set(float64(maxMemoryCount)) +} + // RegisterError records any errors preventing Cluster Autoscaler from working. // No more than one error should be recorded per loop. func RegisterError(err errors.AutoscalerError) { diff --git a/cluster-autoscaler/proposals/metrics.md b/cluster-autoscaler/proposals/metrics.md index 6cb3b5ac0d..d21eb1a2f6 100644 --- a/cluster-autoscaler/proposals/metrics.md +++ b/cluster-autoscaler/proposals/metrics.md @@ -27,6 +27,11 @@ All the metrics are prefixed with `cluster_autoscaler_`. | nodes_count | Gauge | `state`=<node-state> | Number of nodes in cluster. 
| | unschedulable_pods_count | Gauge | | Number of unschedulable ("Pending") pods in the cluster. | | node_groups_count | Gauge | `node_group_type`=<node-group-type> | Number of node groups managed by CA. | +| max_nodes_count | Gauge | | Maximum number of nodes in all node groups. | +| cluster_cpu_current_cores | Gauge | | Current number of cores in the cluster, minus deleting nodes. | +| cpu_limits_cores | Gauge | `direction`=<`minimum` or `maximum`> | Minimum and maximum number of cores in the cluster. | +| cluster_memory_current_bytes | Gauge | | Current number of bytes of memory in the cluster, minus deleting nodes. | +| memory_limits_bytes | Gauge | `direction`=<`minimum` or `maximum`> | Minimum and maximum number of bytes of memory in cluster. | * `cluster_safe_to_autoscale` indicates whether cluster is healthy enough for autoscaling. CA stops all operations if significant number of nodes are unready (by default 33% as of CA 0.5.4). * `nodes_count` records the total number of nodes, labeled by node state. Possible