Merge pull request #3983 from elmiko/cluster-resource-consumption-metrics
Cluster resource consumption metrics
This commit is contained in:
		
						commit
						2beea02a29
					
				| 
						 | 
				
			
			@ -245,6 +245,11 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
 | 
			
		|||
		return nil
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	// Update cluster resource usage metrics
 | 
			
		||||
	coresTotal, memoryTotal := calculateCoresMemoryTotal(allNodes, currentTime)
 | 
			
		||||
	metrics.UpdateClusterCPUCurrentCores(coresTotal)
 | 
			
		||||
	metrics.UpdateClusterMemoryCurrentBytes(memoryTotal)
 | 
			
		||||
 | 
			
		||||
	daemonsets, err := a.ListerRegistry.DaemonSetLister().List(labels.Everything())
 | 
			
		||||
	if err != nil {
 | 
			
		||||
		klog.Errorf("Failed to get daemonset list: %v", err)
 | 
			
		||||
| 
						 | 
				
			
			@ -799,3 +804,21 @@ func getUpcomingNodeInfos(registry *clusterstate.ClusterStateRegistry, nodeInfos
 | 
			
		|||
	}
 | 
			
		||||
	return upcomingNodes
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func calculateCoresMemoryTotal(nodes []*apiv1.Node, timestamp time.Time) (int64, int64) {
 | 
			
		||||
	// this function is essentially similar to the calculateScaleDownCoresMemoryTotal
 | 
			
		||||
	// we want to check all nodes, aside from those deleting, to sum the cluster resource usage.
 | 
			
		||||
	var coresTotal, memoryTotal int64
 | 
			
		||||
	for _, node := range nodes {
 | 
			
		||||
		if isNodeBeingDeleted(node, timestamp) {
 | 
			
		||||
			// Nodes being deleted do not count towards total cluster resources
 | 
			
		||||
			continue
 | 
			
		||||
		}
 | 
			
		||||
		cores, memory := core_utils.GetNodeCoresAndMemory(node)
 | 
			
		||||
 | 
			
		||||
		coresTotal += cores
 | 
			
		||||
		memoryTotal += memory
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	return coresTotal, memoryTotal
 | 
			
		||||
}
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -332,6 +332,8 @@ func buildAutoscaler() (core.Autoscaler, error) {
 | 
			
		|||
	// These metrics should be published only once.
 | 
			
		||||
	metrics.UpdateNapEnabled(autoscalingOptions.NodeAutoprovisioningEnabled)
 | 
			
		||||
	metrics.UpdateMaxNodesCount(autoscalingOptions.MaxNodesTotal)
 | 
			
		||||
	metrics.UpdateCPULimitsCores(autoscalingOptions.MinCoresTotal, autoscalingOptions.MaxCoresTotal)
 | 
			
		||||
	metrics.UpdateMemoryLimitsBytes(autoscalingOptions.MinMemoryTotal, autoscalingOptions.MaxMemoryTotal)
 | 
			
		||||
 | 
			
		||||
	// Create autoscaler.
 | 
			
		||||
	return core.NewAutoscaler(opts)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -138,6 +138,38 @@ var (
 | 
			
		|||
		},
 | 
			
		||||
	)
 | 
			
		||||
 | 
			
		||||
	// cpuCurrentCores is the gauge published as "cluster_cpu_current_cores"
	// in the caNamespace namespace; it is written by
	// UpdateClusterCPUCurrentCores.
	cpuCurrentCores = k8smetrics.NewGauge(&k8smetrics.GaugeOpts{
		Help:      "Current number of cores in the cluster, minus deleting nodes.",
		Name:      "cluster_cpu_current_cores",
		Namespace: caNamespace,
	})
 | 
			
		||||
 | 
			
		||||
	// cpuLimitsCores is the gauge vector published as "cpu_limits_cores" in
	// the caNamespace namespace. Its single "direction" label distinguishes
	// the series written by UpdateCPULimitsCores.
	cpuLimitsCores = k8smetrics.NewGaugeVec(
		&k8smetrics.GaugeOpts{
			Help:      "Minimum and maximum number of cores in the cluster.",
			Name:      "cpu_limits_cores",
			Namespace: caNamespace,
		},
		[]string{"direction"},
	)
 | 
			
		||||
 | 
			
		||||
	// memoryCurrentBytes is the gauge published as
	// "cluster_memory_current_bytes" in the caNamespace namespace; it is
	// written by UpdateClusterMemoryCurrentBytes.
	memoryCurrentBytes = k8smetrics.NewGauge(&k8smetrics.GaugeOpts{
		Help:      "Current number of bytes of memory in the cluster, minus deleting nodes.",
		Name:      "cluster_memory_current_bytes",
		Namespace: caNamespace,
	})
 | 
			
		||||
 | 
			
		||||
	// memoryLimitsBytes is the gauge vector published as
	// "memory_limits_bytes" in the caNamespace namespace. Its single
	// "direction" label distinguishes the series written by
	// UpdateMemoryLimitsBytes.
	memoryLimitsBytes = k8smetrics.NewGaugeVec(
		&k8smetrics.GaugeOpts{
			Help:      "Minimum and maximum number of bytes of memory in cluster.",
			Name:      "memory_limits_bytes",
			Namespace: caNamespace,
		},
		[]string{"direction"},
	)
 | 
			
		||||
 | 
			
		||||
	/**** Metrics related to autoscaler execution ****/
 | 
			
		||||
	lastActivity = k8smetrics.NewGaugeVec(
 | 
			
		||||
		&k8smetrics.GaugeOpts{
 | 
			
		||||
| 
						 | 
				
			
			@ -288,6 +320,10 @@ func RegisterAll() {
 | 
			
		|||
	legacyregistry.MustRegister(nodeGroupsCount)
 | 
			
		||||
	legacyregistry.MustRegister(unschedulablePodsCount)
 | 
			
		||||
	legacyregistry.MustRegister(maxNodesCount)
 | 
			
		||||
	legacyregistry.MustRegister(cpuCurrentCores)
 | 
			
		||||
	legacyregistry.MustRegister(cpuLimitsCores)
 | 
			
		||||
	legacyregistry.MustRegister(memoryCurrentBytes)
 | 
			
		||||
	legacyregistry.MustRegister(memoryLimitsBytes)
 | 
			
		||||
	legacyregistry.MustRegister(lastActivity)
 | 
			
		||||
	legacyregistry.MustRegister(functionDuration)
 | 
			
		||||
	legacyregistry.MustRegister(functionDurationSummary)
 | 
			
		||||
| 
						 | 
				
			
			@ -364,6 +400,28 @@ func UpdateMaxNodesCount(nodesCount int) {
 | 
			
		|||
	maxNodesCount.Set(float64(nodesCount))
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// UpdateClusterCPUCurrentCores records the number of cores in the cluster, minus deleting nodes
 | 
			
		||||
func UpdateClusterCPUCurrentCores(coresCount int64) {
 | 
			
		||||
	cpuCurrentCores.Set(float64(coresCount))
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// UpdateCPULimitsCores records the minimum and maximum number of cores in the cluster
 | 
			
		||||
func UpdateCPULimitsCores(minCoresCount int64, maxCoresCount int64) {
 | 
			
		||||
	cpuLimitsCores.WithLabelValues("minimum").Set(float64(minCoresCount))
 | 
			
		||||
	cpuLimitsCores.WithLabelValues("maximum").Set(float64(maxCoresCount))
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// UpdateClusterMemoryCurrentBytes records the number of bytes of memory in the cluster, minus deleting nodes
 | 
			
		||||
func UpdateClusterMemoryCurrentBytes(memoryCount int64) {
 | 
			
		||||
	memoryCurrentBytes.Set(float64(memoryCount))
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// UpdateMemoryLimitsBytes records the minimum and maximum bytes of memory in the cluster
 | 
			
		||||
func UpdateMemoryLimitsBytes(minMemoryCount int64, maxMemoryCount int64) {
 | 
			
		||||
	memoryLimitsBytes.WithLabelValues("minimum").Set(float64(minMemoryCount))
 | 
			
		||||
	memoryLimitsBytes.WithLabelValues("maximum").Set(float64(maxMemoryCount))
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// RegisterError records any errors preventing Cluster Autoscaler from working.
 | 
			
		||||
// No more than one error should be recorded per loop.
 | 
			
		||||
func RegisterError(err errors.AutoscalerError) {
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -27,6 +27,11 @@ All the metrics are prefixed with `cluster_autoscaler_`.
 | 
			
		|||
| nodes_count | Gauge | `state`=<node-state> | Number of nodes in cluster. |
 | 
			
		||||
| unschedulable_pods_count | Gauge | | Number of unschedulable ("Pending") pods in the cluster. |
 | 
			
		||||
| node_groups_count | Gauge | `node_group_type`=<node-group-type> | Number of node groups managed by CA. |
 | 
			
		||||
| max_nodes_count | Gauge | | Maximum number of nodes in all node groups. |
 | 
			
		||||
| cluster_cpu_current_cores | Gauge | | Current number of cores in the cluster, minus deleting nodes. |
 | 
			
		||||
| cpu_limits_cores | Gauge | `direction`=<`minimum` or `maximum`> | Minimum and maximum number of cores in the cluster. |
 | 
			
		||||
| cluster_memory_current_bytes | Gauge | | Current number of bytes of memory in the cluster, minus deleting nodes. |
 | 
			
		||||
| memory_limits_bytes | Gauge | `direction`=<`minimum` or `maximum`> | Minimum and maximum number of bytes of memory in cluster. |
 | 
			
		||||
 | 
			
		||||
* `cluster_safe_to_autoscale` indicates whether cluster is healthy enough for autoscaling. CA stops all operations if significant number of nodes are unready (by default 33% as of CA 0.5.4).
 | 
			
		||||
* `nodes_count` records the total number of nodes, labeled by node state. Possible
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue