From 5eb7021f827ea9f2dadae79c74e1ef8b6ec49d9a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Karol=20Go=C5=82=C4=85b?=
Date: Fri, 22 Jun 2018 21:00:52 +0200
Subject: [PATCH] Add GPU-related scaled_up & scaled_down metrics (#974)

* Add GPU-related scaled_up & scaled_down metrics

* Fix name to match SD naming convention

* Fix import after master rebase

* Change the logic to include GPU-being-installed nodes
---
 cluster-autoscaler/core/scale_down.go |  8 ++++----
 cluster-autoscaler/core/scale_up.go   |  6 +++---
 cluster-autoscaler/metrics/metrics.go | 28 ++++++++++++++++++++++++--
 cluster-autoscaler/utils/gpu/gpu.go   | 18 ++++++++++++++++++
 4 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/cluster-autoscaler/core/scale_down.go b/cluster-autoscaler/core/scale_down.go
index 3443af29d3..deab280d2c 100644
--- a/cluster-autoscaler/core/scale_down.go
+++ b/cluster-autoscaler/core/scale_down.go
@@ -701,9 +701,9 @@ func (sd *ScaleDown) TryToScaleDown(allNodes []*apiv1.Node, pods []*apiv1.Pod, p
 			return
 		}
 		if readinessMap[toRemove.Node.Name] {
-			metrics.RegisterScaleDown(1, metrics.Underutilized)
+			metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(toRemove.Node), metrics.Underutilized)
 		} else {
-			metrics.RegisterScaleDown(1, metrics.Unready)
+			metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(toRemove.Node), metrics.Unready)
 		}
 	}()
 
@@ -809,9 +809,9 @@ func (sd *ScaleDown) scheduleDeleteEmptyNodes(emptyNodes []*apiv1.Node, client k
 			sd.context.Recorder, sd.clusterStateRegistry)
 		if deleteErr == nil {
 			if readinessMap[nodeToDelete.Name] {
-				metrics.RegisterScaleDown(1, metrics.Empty)
+				metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(nodeToDelete), metrics.Empty)
 			} else {
-				metrics.RegisterScaleDown(1, metrics.Unready)
+				metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(nodeToDelete), metrics.Unready)
 			}
 		}
 		confirmation <- deleteErr
diff --git a/cluster-autoscaler/core/scale_up.go b/cluster-autoscaler/core/scale_up.go
index 6951118ddc..537f5a4874 100644
--- a/cluster-autoscaler/core/scale_up.go
+++ b/cluster-autoscaler/core/scale_up.go
@@ -465,7 +465,7 @@ func ScaleUp(context *context.AutoscalingContext, processors *ca_processors.Auto
 	}
 	glog.V(1).Infof("Final scale-up plan: %v", scaleUpInfos)
 	for _, info := range scaleUpInfos {
-		typedErr := executeScaleUp(context, clusterStateRegistry, info)
+		typedErr := executeScaleUp(context, clusterStateRegistry, info, gpu.GetGpuTypeForMetrics(nodeInfo.Node()))
 		if typedErr != nil {
 			return nil, typedErr
 		}
@@ -532,7 +532,7 @@ groupsloop:
 	return result
 }
 
-func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *clusterstate.ClusterStateRegistry, info nodegroupset.ScaleUpInfo) errors.AutoscalerError {
+func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *clusterstate.ClusterStateRegistry, info nodegroupset.ScaleUpInfo, gpuType string) errors.AutoscalerError {
 	glog.V(0).Infof("Scale-up: setting group %s size to %d", info.Group.Id(), info.NewSize)
 	increase := info.NewSize - info.CurrentSize
 	if err := info.Group.IncreaseSize(increase); err != nil {
@@ -548,7 +548,7 @@ func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *c
 		Time:            time.Now(),
 		ExpectedAddTime: time.Now().Add(context.MaxNodeProvisionTime),
 	})
-	metrics.RegisterScaleUp(increase)
+	metrics.RegisterScaleUp(increase, gpuType)
 	context.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaledUpGroup",
 		"Scale-up: group %s size set to %d", info.Group.Id(), info.NewSize)
 	return nil
diff --git a/cluster-autoscaler/metrics/metrics.go b/cluster-autoscaler/metrics/metrics.go
index 83c525f838..a14ea0e90b 100644
--- a/cluster-autoscaler/metrics/metrics.go
+++ b/cluster-autoscaler/metrics/metrics.go
@@ -156,6 +156,14 @@ var (
 		},
 	)
 
+	gpuScaleUpCount = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: caNamespace,
+			Name:      "scaled_up_gpu_nodes_total",
+			Help:      "Number of GPU nodes added by CA, by GPU name.",
+		}, []string{"gpu_name"},
+	)
+
 	failedScaleUpCount = prometheus.NewCounterVec(
 		prometheus.CounterOpts{
 			Namespace: caNamespace,
@@ -172,6 +180,14 @@ var (
 		}, []string{"reason"},
 	)
 
+	gpuScaleDownCount = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: caNamespace,
+			Name:      "scaled_down_gpu_nodes_total",
+			Help:      "Number of GPU nodes removed by CA, by reason and GPU name.",
+		}, []string{"reason", "gpu_name"},
+	)
+
 	evictionsCount = prometheus.NewCounter(
 		prometheus.CounterOpts{
 			Namespace: caNamespace,
@@ -224,8 +240,10 @@ func RegisterAll() {
 	prometheus.MustRegister(functionDuration)
 	prometheus.MustRegister(errorsCount)
 	prometheus.MustRegister(scaleUpCount)
+	prometheus.MustRegister(gpuScaleUpCount)
 	prometheus.MustRegister(failedScaleUpCount)
 	prometheus.MustRegister(scaleDownCount)
+	prometheus.MustRegister(gpuScaleDownCount)
 	prometheus.MustRegister(evictionsCount)
 	prometheus.MustRegister(unneededNodesCount)
 	prometheus.MustRegister(napEnabled)
@@ -291,8 +309,11 @@ func RegisterError(err errors.AutoscalerError) {
 }
 
 // RegisterScaleUp records number of nodes added by scale up
-func RegisterScaleUp(nodesCount int) {
+func RegisterScaleUp(nodesCount int, gpuType string) {
 	scaleUpCount.Add(float64(nodesCount))
+	if gpuType != "" {
+		gpuScaleUpCount.WithLabelValues(gpuType).Add(float64(nodesCount))
+	}
 }
 
 // RegisterFailedScaleUp records a failed scale-up operation
@@ -301,8 +322,11 @@ func RegisterFailedScaleUp(reason FailedScaleUpReason) {
 }
 
 // RegisterScaleDown records number of nodes removed by scale down
-func RegisterScaleDown(nodesCount int, reason NodeScaleDownReason) {
+func RegisterScaleDown(nodesCount int, gpuType string, reason NodeScaleDownReason) {
 	scaleDownCount.WithLabelValues(string(reason)).Add(float64(nodesCount))
+	if gpuType != "" {
+		gpuScaleDownCount.WithLabelValues(string(reason), gpuType).Add(float64(nodesCount))
+	}
 }
 
 // RegisterEvictions records number of evicted pods
diff --git a/cluster-autoscaler/utils/gpu/gpu.go b/cluster-autoscaler/utils/gpu/gpu.go
index 77b547b30c..4f00d1a805 100644
--- a/cluster-autoscaler/utils/gpu/gpu.go
+++ b/cluster-autoscaler/utils/gpu/gpu.go
@@ -68,6 +68,24 @@ func FilterOutNodesWithUnreadyGpus(allNodes, readyNodes []*apiv1.Node) ([]*apiv1
 	return newAllNodes, newReadyNodes
 }
 
+// GetGpuTypeForMetrics returns name of the GPU used on the node or empty string if there's no GPU
+// if the GPU type is unknown, "generic" is returned
+// NOTE: current implementation is GKE/GCE-specific
+func GetGpuTypeForMetrics(node *apiv1.Node) string {
+	// we use the GKE label if there is one
+	gpuType, found := node.Labels[GPULabel]
+	if found {
+		return gpuType
+	}
+
+	// no label, fallback to generic solution
+	capacity, found := node.Status.Capacity[ResourceNvidiaGPU]
+	if !found || capacity.IsZero() {
+		return ""
+	}
+	return "generic"
+}
+
 func getUnreadyNodeCopy(node *apiv1.Node) *apiv1.Node {
 	newNode := node.DeepCopy()
 	newReadyCondition := apiv1.NodeCondition{
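For reference, a minimal sketch (not part of the patch) of how callers exercise the new signatures after this change. The helper name recordScaleEvents is hypothetical, the import paths assume the standard k8s.io/autoscaler/cluster-autoscaler module layout, and the node counts are purely illustrative:

	package example

	import (
		apiv1 "k8s.io/api/core/v1"

		"k8s.io/autoscaler/cluster-autoscaler/metrics"
		"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
	)

	func recordScaleEvents(node *apiv1.Node) {
		// "" for a node without GPUs, the GKE accelerator label value when the
		// label is present, or "generic" for an unlabeled node that still
		// reports GPU capacity.
		gpuType := gpu.GetGpuTypeForMetrics(node)

		// Increments the overall scale-up counter and, when gpuType is
		// non-empty, scaled_up_gpu_nodes_total{gpu_name=...} as well.
		metrics.RegisterScaleUp(3, gpuType)

		// Same pattern for scale-down, labeled by reason and gpu_name.
		metrics.RegisterScaleDown(1, gpuType, metrics.Underutilized)
	}

Assuming caNamespace resolves to the cluster_autoscaler prefix, the new series can then be queried with e.g. sum by (gpu_name) (rate(cluster_autoscaler_scaled_up_gpu_nodes_total[1h])).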