Add GPU-related scaled_up & scaled_down metrics (#974)
* Add GPU-related scaled_up & scaled_down metrics
* Fix name to match SD naming convention
* Fix import after master rebase
* Change the logic to include GPU-being-installed nodes
parent: bbe99a27d8
commit: 5eb7021f82
@@ -701,9 +701,9 @@ func (sd *ScaleDown) TryToScaleDown(allNodes []*apiv1.Node, pods []*apiv1.Pod, p
 			return
 		}
 		if readinessMap[toRemove.Node.Name] {
-			metrics.RegisterScaleDown(1, metrics.Underutilized)
+			metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(toRemove.Node), metrics.Underutilized)
 		} else {
-			metrics.RegisterScaleDown(1, metrics.Unready)
+			metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(toRemove.Node), metrics.Unready)
 		}
 	}()
 
@@ -809,9 +809,9 @@ func (sd *ScaleDown) scheduleDeleteEmptyNodes(emptyNodes []*apiv1.Node, client k
 				sd.context.Recorder, sd.clusterStateRegistry)
 			if deleteErr == nil {
 				if readinessMap[nodeToDelete.Name] {
-					metrics.RegisterScaleDown(1, metrics.Empty)
+					metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(nodeToDelete), metrics.Empty)
 				} else {
-					metrics.RegisterScaleDown(1, metrics.Unready)
+					metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(nodeToDelete), metrics.Unready)
 				}
 			}
 			confirmation <- deleteErr
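
For illustration, removing a ready but underutilized GPU node now feeds both the aggregate and the GPU-specific series. A hypothetical call mirroring the first branch above ("nvidia-tesla-k80" is an example value; the series names below assume the package's cluster_autoscaler namespace and lowercase reason strings):

// One call, two series:
//   cluster_autoscaler_scaled_down_nodes_total{reason="underutilized"}
//   cluster_autoscaler_scaled_down_gpu_nodes_total{reason="underutilized",gpu_name="nvidia-tesla-k80"}
// A CPU-only node would pass gpuType == "" and touch only the first series.
metrics.RegisterScaleDown(1, "nvidia-tesla-k80", metrics.Underutilized)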
@@ -465,7 +465,7 @@ func ScaleUp(context *context.AutoscalingContext, processors *ca_processors.Auto
 		}
 		glog.V(1).Infof("Final scale-up plan: %v", scaleUpInfos)
 		for _, info := range scaleUpInfos {
-			typedErr := executeScaleUp(context, clusterStateRegistry, info)
+			typedErr := executeScaleUp(context, clusterStateRegistry, info, gpu.GetGpuTypeForMetrics(nodeInfo.Node()))
 			if typedErr != nil {
 				return nil, typedErr
 			}
@@ -532,7 +532,7 @@ groupsloop:
 	return result
 }
 
-func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *clusterstate.ClusterStateRegistry, info nodegroupset.ScaleUpInfo) errors.AutoscalerError {
+func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *clusterstate.ClusterStateRegistry, info nodegroupset.ScaleUpInfo, gpuType string) errors.AutoscalerError {
 	glog.V(0).Infof("Scale-up: setting group %s size to %d", info.Group.Id(), info.NewSize)
 	increase := info.NewSize - info.CurrentSize
 	if err := info.Group.IncreaseSize(increase); err != nil {
@@ -548,7 +548,7 @@ func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *c
 			Time:            time.Now(),
 			ExpectedAddTime: time.Now().Add(context.MaxNodeProvisionTime),
 		})
-	metrics.RegisterScaleUp(increase)
+	metrics.RegisterScaleUp(increase, gpuType)
 	context.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaledUpGroup",
 		"Scale-up: group %s size set to %d", info.Group.Id(), info.NewSize)
 	return nil
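
One design note on the call site above: the GPU type is read once from the node group's template node (nodeInfo.Node()), so every node added by a single executeScaleUp call is counted under the same gpu_name label. A sketch of the CPU-only case, reusing the variables from the surrounding code:

// Hypothetical CPU-only group: the template node carries neither the GKE
// accelerator label nor nvidia.com/gpu capacity, so the helper returns ""
// and RegisterScaleUp skips the GPU counter entirely.
gpuType := gpu.GetGpuTypeForMetrics(nodeInfo.Node()) // "" for CPU-only nodes
metrics.RegisterScaleUp(increase, gpuType)           // only scaled_up_nodes_total grows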
@@ -156,6 +156,14 @@ var (
 		},
 	)
 
+	gpuScaleUpCount = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: caNamespace,
+			Name:      "scaled_up_gpu_nodes_total",
+			Help:      "Number of GPU nodes added by CA, by GPU name.",
+		}, []string{"gpu_name"},
+	)
+
 	failedScaleUpCount = prometheus.NewCounterVec(
 		prometheus.CounterOpts{
 			Namespace: caNamespace,
@@ -172,6 +180,14 @@ var (
 		}, []string{"reason"},
 	)
 
+	gpuScaleDownCount = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: caNamespace,
+			Name:      "scaled_down_gpu_nodes_total",
+			Help:      "Number of GPU nodes removed by CA, by reason and GPU name.",
+		}, []string{"reason", "gpu_name"},
+	)
+
 	evictionsCount = prometheus.NewCounter(
 		prometheus.CounterOpts{
 			Namespace: caNamespace,
@@ -224,8 +240,10 @@ func RegisterAll() {
 	prometheus.MustRegister(functionDuration)
 	prometheus.MustRegister(errorsCount)
 	prometheus.MustRegister(scaleUpCount)
+	prometheus.MustRegister(gpuScaleUpCount)
 	prometheus.MustRegister(failedScaleUpCount)
 	prometheus.MustRegister(scaleDownCount)
+	prometheus.MustRegister(gpuScaleDownCount)
 	prometheus.MustRegister(evictionsCount)
 	prometheus.MustRegister(unneededNodesCount)
 	prometheus.MustRegister(napEnabled)
@@ -291,8 +309,11 @@ func RegisterError(err errors.AutoscalerError) {
 }
 
 // RegisterScaleUp records number of nodes added by scale up
-func RegisterScaleUp(nodesCount int) {
+func RegisterScaleUp(nodesCount int, gpuType string) {
 	scaleUpCount.Add(float64(nodesCount))
+	if gpuType != "" {
+		gpuScaleUpCount.WithLabelValues(gpuType).Add(float64(nodesCount))
+	}
 }
 
 // RegisterFailedScaleUp records a failed scale-up operation
@@ -301,8 +322,11 @@ func RegisterFailedScaleUp(reason FailedScaleUpReason) {
 }
 
 // RegisterScaleDown records number of nodes removed by scale down
-func RegisterScaleDown(nodesCount int, reason NodeScaleDownReason) {
+func RegisterScaleDown(nodesCount int, gpuType string, reason NodeScaleDownReason) {
 	scaleDownCount.WithLabelValues(string(reason)).Add(float64(nodesCount))
+	if gpuType != "" {
+		gpuScaleDownCount.WithLabelValues(string(reason), gpuType).Add(float64(nodesCount))
+	}
 }
 
 // RegisterEvictions records number of evicted pods
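
A minimal sketch of the new fan-out, written as a hypothetical test in the metrics package (not part of this commit; it assumes prometheus/client_golang's testutil package is available):

package metrics

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus/testutil"
)

// Hypothetical test: the aggregate counter moves on every scale-up, while the
// GPU counter moves only when gpuType is non-empty.
func TestRegisterScaleUpGpuFanOut(t *testing.T) {
	RegisterScaleUp(3, "nvidia-tesla-k80") // GPU nodes: both counters grow
	RegisterScaleUp(2, "")                 // CPU-only: GPU counter untouched

	if got := testutil.ToFloat64(scaleUpCount); got != 5 {
		t.Errorf("scaled_up_nodes_total = %v, want 5", got)
	}
	if got := testutil.ToFloat64(gpuScaleUpCount.WithLabelValues("nvidia-tesla-k80")); got != 3 {
		t.Errorf("scaled_up_gpu_nodes_total = %v, want 3", got)
	}
}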
@@ -68,6 +68,24 @@ func FilterOutNodesWithUnreadyGpus(allNodes, readyNodes []*apiv1.Node) ([]*apiv1
 	return newAllNodes, newReadyNodes
 }
 
+// GetGpuTypeForMetrics returns name of the GPU used on the node or empty string if there's no GPU
+// if the GPU type is unknown, "generic" is returned
+// NOTE: current implementation is GKE/GCE-specific
+func GetGpuTypeForMetrics(node *apiv1.Node) string {
+	// we use the GKE label if there is one
+	gpuType, found := node.Labels[GPULabel]
+	if found {
+		return gpuType
+	}
+
+	// no label, fallback to generic solution
+	capacity, found := node.Status.Capacity[ResourceNvidiaGPU]
+	if !found || capacity.IsZero() {
+		return ""
+	}
+	return "generic"
+}
+
 func getUnreadyNodeCopy(node *apiv1.Node) *apiv1.Node {
 	newNode := node.DeepCopy()
 	newReadyCondition := apiv1.NodeCondition{
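
And a usage sketch for the new helper, as a hypothetical test in the gpu package (the node fixtures are invented; GPULabel and ResourceNvidiaGPU are the package's existing constants, and the import paths assume the vendored k8s.io APIs of this era):

package gpu

import (
	"testing"

	apiv1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Hypothetical test covering the three outcomes: labeled GPU, unlabeled GPU
// capacity, and no GPU at all.
func TestGetGpuTypeForMetrics(t *testing.T) {
	labeled := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{
		Labels: map[string]string{GPULabel: "nvidia-tesla-p100"},
	}}
	unlabeled := &apiv1.Node{Status: apiv1.NodeStatus{
		Capacity: apiv1.ResourceList{ResourceNvidiaGPU: *resource.NewQuantity(2, resource.DecimalSI)},
	}}

	if got := GetGpuTypeForMetrics(labeled); got != "nvidia-tesla-p100" {
		t.Errorf("labeled node: got %q", got) // GKE label takes precedence
	}
	if got := GetGpuTypeForMetrics(unlabeled); got != "generic" {
		t.Errorf("unlabeled GPU node: got %q", got) // capacity only -> "generic"
	}
	if got := GetGpuTypeForMetrics(&apiv1.Node{}); got != "" {
		t.Errorf("non-GPU node: got %q", got) // empty string -> no GPU series
	}
}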