Add GPU-related scaled_up & scaled_down metrics (#974)
* Add GPU-related scaled_up & scaled_down metrics
* Fix name to match SD naming convention
* Fix import after master rebase
* Change the logic to include GPU-being-installed nodes
parent: bbe99a27d8
commit: 5eb7021f82
@@ -701,9 +701,9 @@ func (sd *ScaleDown) TryToScaleDown(allNodes []*apiv1.Node, pods []*apiv1.Pod, p
 			return
 		}
 		if readinessMap[toRemove.Node.Name] {
-			metrics.RegisterScaleDown(1, metrics.Underutilized)
+			metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(toRemove.Node), metrics.Underutilized)
 		} else {
-			metrics.RegisterScaleDown(1, metrics.Unready)
+			metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(toRemove.Node), metrics.Unready)
 		}
 	}()
 
@@ -809,9 +809,9 @@ func (sd *ScaleDown) scheduleDeleteEmptyNodes(emptyNodes []*apiv1.Node, client k
 				sd.context.Recorder, sd.clusterStateRegistry)
 			if deleteErr == nil {
 				if readinessMap[nodeToDelete.Name] {
-					metrics.RegisterScaleDown(1, metrics.Empty)
+					metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(nodeToDelete), metrics.Empty)
 				} else {
-					metrics.RegisterScaleDown(1, metrics.Unready)
+					metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(nodeToDelete), metrics.Unready)
 				}
 			}
 			confirmation <- deleteErr
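
For illustration, removing a ready but underutilized GPU node now feeds both the aggregate and the GPU-specific series. A hypothetical call mirroring the first branch above ("nvidia-tesla-k80" is an example value; the series names below assume the package's cluster_autoscaler namespace and lowercase reason strings):

// One call, two series:
//   cluster_autoscaler_scaled_down_nodes_total{reason="underutilized"}
//   cluster_autoscaler_scaled_down_gpu_nodes_total{reason="underutilized",gpu_name="nvidia-tesla-k80"}
// A CPU-only node would pass gpuType == "" and touch only the first series.
metrics.RegisterScaleDown(1, "nvidia-tesla-k80", metrics.Underutilized)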
@@ -465,7 +465,7 @@ func ScaleUp(context *context.AutoscalingContext, processors *ca_processors.Auto
 		}
 		glog.V(1).Infof("Final scale-up plan: %v", scaleUpInfos)
 		for _, info := range scaleUpInfos {
-			typedErr := executeScaleUp(context, clusterStateRegistry, info)
+			typedErr := executeScaleUp(context, clusterStateRegistry, info, gpu.GetGpuTypeForMetrics(nodeInfo.Node()))
 			if typedErr != nil {
 				return nil, typedErr
 			}
@@ -532,7 +532,7 @@ groupsloop:
 	return result
 }
 
-func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *clusterstate.ClusterStateRegistry, info nodegroupset.ScaleUpInfo) errors.AutoscalerError {
+func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *clusterstate.ClusterStateRegistry, info nodegroupset.ScaleUpInfo, gpuType string) errors.AutoscalerError {
 	glog.V(0).Infof("Scale-up: setting group %s size to %d", info.Group.Id(), info.NewSize)
 	increase := info.NewSize - info.CurrentSize
 	if err := info.Group.IncreaseSize(increase); err != nil {
@@ -548,7 +548,7 @@ func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *c
 			Time:            time.Now(),
 			ExpectedAddTime: time.Now().Add(context.MaxNodeProvisionTime),
 		})
-	metrics.RegisterScaleUp(increase)
+	metrics.RegisterScaleUp(increase, gpuType)
 	context.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaledUpGroup",
 		"Scale-up: group %s size set to %d", info.Group.Id(), info.NewSize)
 	return nil
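
One design note on the call site above: the GPU type is read once from the node group's template node (nodeInfo.Node()), so every node added by a single executeScaleUp call is counted under the same gpu_name label. A sketch of the CPU-only case, reusing the variables from the surrounding code:

// Hypothetical CPU-only group: the template node carries neither the GKE
// accelerator label nor nvidia.com/gpu capacity, so the helper returns ""
// and RegisterScaleUp skips the GPU counter entirely.
gpuType := gpu.GetGpuTypeForMetrics(nodeInfo.Node()) // "" for CPU-only nodes
metrics.RegisterScaleUp(increase, gpuType)           // only scaled_up_nodes_total grows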
@@ -156,6 +156,14 @@ var (
 		},
 	)
 
+	gpuScaleUpCount = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: caNamespace,
+			Name:      "scaled_up_gpu_nodes_total",
+			Help:      "Number of GPU nodes added by CA, by GPU name.",
+		}, []string{"gpu_name"},
+	)
+
 	failedScaleUpCount = prometheus.NewCounterVec(
 		prometheus.CounterOpts{
 			Namespace: caNamespace,
@@ -172,6 +180,14 @@ var (
 		}, []string{"reason"},
 	)
 
+	gpuScaleDownCount = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: caNamespace,
+			Name:      "scaled_down_gpu_nodes_total",
+			Help:      "Number of GPU nodes removed by CA, by reason and GPU name.",
+		}, []string{"reason", "gpu_name"},
+	)
+
 	evictionsCount = prometheus.NewCounter(
 		prometheus.CounterOpts{
 			Namespace: caNamespace,
@@ -224,8 +240,10 @@ func RegisterAll() {
 	prometheus.MustRegister(functionDuration)
 	prometheus.MustRegister(errorsCount)
 	prometheus.MustRegister(scaleUpCount)
+	prometheus.MustRegister(gpuScaleUpCount)
 	prometheus.MustRegister(failedScaleUpCount)
 	prometheus.MustRegister(scaleDownCount)
+	prometheus.MustRegister(gpuScaleDownCount)
 	prometheus.MustRegister(evictionsCount)
 	prometheus.MustRegister(unneededNodesCount)
 	prometheus.MustRegister(napEnabled)
@@ -291,8 +309,11 @@ func RegisterError(err errors.AutoscalerError) {
 }
 
 // RegisterScaleUp records number of nodes added by scale up
-func RegisterScaleUp(nodesCount int) {
+func RegisterScaleUp(nodesCount int, gpuType string) {
 	scaleUpCount.Add(float64(nodesCount))
+	if gpuType != "" {
+		gpuScaleUpCount.WithLabelValues(gpuType).Add(float64(nodesCount))
+	}
 }
 
 // RegisterFailedScaleUp records a failed scale-up operation
@@ -301,8 +322,11 @@ func RegisterFailedScaleUp(reason FailedScaleUpReason) {
 }
 
 // RegisterScaleDown records number of nodes removed by scale down
-func RegisterScaleDown(nodesCount int, reason NodeScaleDownReason) {
+func RegisterScaleDown(nodesCount int, gpuType string, reason NodeScaleDownReason) {
 	scaleDownCount.WithLabelValues(string(reason)).Add(float64(nodesCount))
+	if gpuType != "" {
+		gpuScaleDownCount.WithLabelValues(string(reason), gpuType).Add(float64(nodesCount))
+	}
 }
 
 // RegisterEvictions records number of evicted pods
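
A minimal sketch of the new fan-out, written as a hypothetical test in the metrics package (not part of this commit; it assumes prometheus/client_golang's testutil package is available):

package metrics

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus/testutil"
)

// Hypothetical test: the aggregate counter moves on every scale-up, while the
// GPU counter moves only when gpuType is non-empty.
func TestRegisterScaleUpGpuFanOut(t *testing.T) {
	RegisterScaleUp(3, "nvidia-tesla-k80") // GPU nodes: both counters grow
	RegisterScaleUp(2, "")                 // CPU-only: GPU counter untouched

	if got := testutil.ToFloat64(scaleUpCount); got != 5 {
		t.Errorf("scaled_up_nodes_total = %v, want 5", got)
	}
	if got := testutil.ToFloat64(gpuScaleUpCount.WithLabelValues("nvidia-tesla-k80")); got != 3 {
		t.Errorf("scaled_up_gpu_nodes_total = %v, want 3", got)
	}
}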
@@ -68,6 +68,24 @@ func FilterOutNodesWithUnreadyGpus(allNodes, readyNodes []*apiv1.Node) ([]*apiv1
 	return newAllNodes, newReadyNodes
 }
 
+// GetGpuTypeForMetrics returns name of the GPU used on the node or empty string if there's no GPU
+// if the GPU type is unknown, "generic" is returned
+// NOTE: current implementation is GKE/GCE-specific
+func GetGpuTypeForMetrics(node *apiv1.Node) string {
+	// we use the GKE label if there is one
+	gpuType, found := node.Labels[GPULabel]
+	if found {
+		return gpuType
+	}
+
+	// no label, fallback to generic solution
+	capacity, found := node.Status.Capacity[ResourceNvidiaGPU]
+	if !found || capacity.IsZero() {
+		return ""
+	}
+	return "generic"
+}
+
 func getUnreadyNodeCopy(node *apiv1.Node) *apiv1.Node {
 	newNode := node.DeepCopy()
 	newReadyCondition := apiv1.NodeCondition{
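
And a usage sketch for the new helper, as a hypothetical test in the gpu package (the node fixtures are invented; GPULabel and ResourceNvidiaGPU are the package's existing constants, and the import paths assume the vendored k8s.io APIs of this era):

package gpu

import (
	"testing"

	apiv1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Hypothetical test covering the three outcomes: labeled GPU, unlabeled GPU
// capacity, and no GPU at all.
func TestGetGpuTypeForMetrics(t *testing.T) {
	labeled := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{
		Labels: map[string]string{GPULabel: "nvidia-tesla-p100"},
	}}
	unlabeled := &apiv1.Node{Status: apiv1.NodeStatus{
		Capacity: apiv1.ResourceList{ResourceNvidiaGPU: *resource.NewQuantity(2, resource.DecimalSI)},
	}}

	if got := GetGpuTypeForMetrics(labeled); got != "nvidia-tesla-p100" {
		t.Errorf("labeled node: got %q", got) // GKE label takes precedence
	}
	if got := GetGpuTypeForMetrics(unlabeled); got != "generic" {
		t.Errorf("unlabeled GPU node: got %q", got) // capacity only -> "generic"
	}
	if got := GetGpuTypeForMetrics(&apiv1.Node{}); got != "" {
		t.Errorf("non-GPU node: got %q", got) // empty string -> no GPU series
	}
}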