Merge pull request #3983 from elmiko/cluster-resource-consumption-metrics
Cluster resource consumption metrics
This commit is contained in:
commit
2beea02a29
|
|
@ -245,6 +245,11 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
|
|||
return nil
|
||||
}
|
||||
|
||||
// Update cluster resource usage metrics
|
||||
coresTotal, memoryTotal := calculateCoresMemoryTotal(allNodes, currentTime)
|
||||
metrics.UpdateClusterCPUCurrentCores(coresTotal)
|
||||
metrics.UpdateClusterMemoryCurrentBytes(memoryTotal)
|
||||
|
||||
daemonsets, err := a.ListerRegistry.DaemonSetLister().List(labels.Everything())
|
||||
if err != nil {
|
||||
klog.Errorf("Failed to get daemonset list: %v", err)
|
||||
|
|
@ -799,3 +804,21 @@ func getUpcomingNodeInfos(registry *clusterstate.ClusterStateRegistry, nodeInfos
|
|||
}
|
||||
return upcomingNodes
|
||||
}
|
||||
|
||||
func calculateCoresMemoryTotal(nodes []*apiv1.Node, timestamp time.Time) (int64, int64) {
|
||||
// this function is essentially similar to the calculateScaleDownCoresMemoryTotal
|
||||
// we want to check all nodes, aside from those deleting, to sum the cluster resource usage.
|
||||
var coresTotal, memoryTotal int64
|
||||
for _, node := range nodes {
|
||||
if isNodeBeingDeleted(node, timestamp) {
|
||||
// Nodes being deleted do not count towards total cluster resources
|
||||
continue
|
||||
}
|
||||
cores, memory := core_utils.GetNodeCoresAndMemory(node)
|
||||
|
||||
coresTotal += cores
|
||||
memoryTotal += memory
|
||||
}
|
||||
|
||||
return coresTotal, memoryTotal
|
||||
}
|
||||
|
|
|
|||
|
|
@ -332,6 +332,8 @@ func buildAutoscaler() (core.Autoscaler, error) {
|
|||
// These metrics should be published only once.
|
||||
metrics.UpdateNapEnabled(autoscalingOptions.NodeAutoprovisioningEnabled)
|
||||
metrics.UpdateMaxNodesCount(autoscalingOptions.MaxNodesTotal)
|
||||
metrics.UpdateCPULimitsCores(autoscalingOptions.MinCoresTotal, autoscalingOptions.MaxCoresTotal)
|
||||
metrics.UpdateMemoryLimitsBytes(autoscalingOptions.MinMemoryTotal, autoscalingOptions.MaxMemoryTotal)
|
||||
|
||||
// Create autoscaler.
|
||||
return core.NewAutoscaler(opts)
|
||||
|
|
|
|||
|
|
@ -138,6 +138,38 @@ var (
|
|||
},
|
||||
)
|
||||
|
||||
cpuCurrentCores = k8smetrics.NewGauge(
|
||||
&k8smetrics.GaugeOpts{
|
||||
Namespace: caNamespace,
|
||||
Name: "cluster_cpu_current_cores",
|
||||
Help: "Current number of cores in the cluster, minus deleting nodes.",
|
||||
},
|
||||
)
|
||||
|
||||
cpuLimitsCores = k8smetrics.NewGaugeVec(
|
||||
&k8smetrics.GaugeOpts{
|
||||
Namespace: caNamespace,
|
||||
Name: "cpu_limits_cores",
|
||||
Help: "Minimum and maximum number of cores in the cluster.",
|
||||
}, []string{"direction"},
|
||||
)
|
||||
|
||||
memoryCurrentBytes = k8smetrics.NewGauge(
|
||||
&k8smetrics.GaugeOpts{
|
||||
Namespace: caNamespace,
|
||||
Name: "cluster_memory_current_bytes",
|
||||
Help: "Current number of bytes of memory in the cluster, minus deleting nodes.",
|
||||
},
|
||||
)
|
||||
|
||||
memoryLimitsBytes = k8smetrics.NewGaugeVec(
|
||||
&k8smetrics.GaugeOpts{
|
||||
Namespace: caNamespace,
|
||||
Name: "memory_limits_bytes",
|
||||
Help: "Minimum and maximum number of bytes of memory in cluster.",
|
||||
}, []string{"direction"},
|
||||
)
|
||||
|
||||
/**** Metrics related to autoscaler execution ****/
|
||||
lastActivity = k8smetrics.NewGaugeVec(
|
||||
&k8smetrics.GaugeOpts{
|
||||
|
|
@ -288,6 +320,10 @@ func RegisterAll() {
|
|||
legacyregistry.MustRegister(nodeGroupsCount)
|
||||
legacyregistry.MustRegister(unschedulablePodsCount)
|
||||
legacyregistry.MustRegister(maxNodesCount)
|
||||
legacyregistry.MustRegister(cpuCurrentCores)
|
||||
legacyregistry.MustRegister(cpuLimitsCores)
|
||||
legacyregistry.MustRegister(memoryCurrentBytes)
|
||||
legacyregistry.MustRegister(memoryLimitsBytes)
|
||||
legacyregistry.MustRegister(lastActivity)
|
||||
legacyregistry.MustRegister(functionDuration)
|
||||
legacyregistry.MustRegister(functionDurationSummary)
|
||||
|
|
@ -364,6 +400,28 @@ func UpdateMaxNodesCount(nodesCount int) {
|
|||
maxNodesCount.Set(float64(nodesCount))
|
||||
}
|
||||
|
||||
// UpdateClusterCPUCurrentCores records the number of cores in the cluster, minus deleting nodes
|
||||
func UpdateClusterCPUCurrentCores(coresCount int64) {
|
||||
cpuCurrentCores.Set(float64(coresCount))
|
||||
}
|
||||
|
||||
// UpdateCPULimitsCores records the minimum and maximum number of cores in the cluster
|
||||
func UpdateCPULimitsCores(minCoresCount int64, maxCoresCount int64) {
|
||||
cpuLimitsCores.WithLabelValues("minimum").Set(float64(minCoresCount))
|
||||
cpuLimitsCores.WithLabelValues("maximum").Set(float64(maxCoresCount))
|
||||
}
|
||||
|
||||
// UpdateClusterMemoryCurrentBytes records the number of bytes of memory in the cluster, minus deleting nodes
|
||||
func UpdateClusterMemoryCurrentBytes(memoryCount int64) {
|
||||
memoryCurrentBytes.Set(float64(memoryCount))
|
||||
}
|
||||
|
||||
// UpdateMemoryLimitsBytes records the minimum and maximum bytes of memory in the cluster
|
||||
func UpdateMemoryLimitsBytes(minMemoryCount int64, maxMemoryCount int64) {
|
||||
memoryLimitsBytes.WithLabelValues("minimum").Set(float64(minMemoryCount))
|
||||
memoryLimitsBytes.WithLabelValues("maximum").Set(float64(maxMemoryCount))
|
||||
}
|
||||
|
||||
// RegisterError records any errors preventing Cluster Autoscaler from working.
|
||||
// No more than one error should be recorded per loop.
|
||||
func RegisterError(err errors.AutoscalerError) {
|
||||
|
|
|
|||
|
|
@ -27,6 +27,11 @@ All the metrics are prefixed with `cluster_autoscaler_`.
|
|||
| nodes_count | Gauge | `state`=<node-state> | Number of nodes in cluster. |
|
||||
| unschedulable_pods_count | Gauge | | Number of unschedulable ("Pending") pods in the cluster. |
|
||||
| node_groups_count | Gauge | `node_group_type`=<node-group-type> | Number of node groups managed by CA. |
|
||||
| max_nodes_count | Gauge | | Maximum number of nodes in all node groups. |
|
||||
| cluster_cpu_current_cores | Gauge | | Current number of cores in the cluster, minus deleting nodes. |
|
||||
| cpu_limits_cores | Gauge | `direction`=<`minimum` or `maximum`> | Minimum and maximum number of cores in the cluster. |
|
||||
| cluster_memory_current_bytes | Gauge | | Current number of bytes of memory in the cluster, minus deleting nodes. |
|
||||
| memory_limits_bytes | Gauge | `direction`=<`minimum` or `maximum`> | Minimum and maximum number of bytes of memory in cluster. |
|
||||
|
||||
* `cluster_safe_to_autoscale` indicates whether cluster is healthy enough for autoscaling. CA stops all operations if significant number of nodes are unready (by default 33% as of CA 0.5.4).
|
||||
* `nodes_count` records the total number of nodes, labeled by node state. Possible
|
||||
|
|
|
|||
Loading…
Reference in New Issue