Add GPU-related scaled_up & scaled_down metrics (#974)

* Add GPU-related scaled_up & scaled_down metrics

* Fix name to match SD naming convention

* Fix import after master rebase

* Change the logic to include GPU-being-installed nodes
This commit is contained in:
Karol Gołąb 2018-06-22 21:00:52 +02:00 committed by Marcin Wielgus
parent bbe99a27d8
commit 5eb7021f82
4 changed files with 51 additions and 9 deletions

View File

@ -701,9 +701,9 @@ func (sd *ScaleDown) TryToScaleDown(allNodes []*apiv1.Node, pods []*apiv1.Pod, p
return return
} }
if readinessMap[toRemove.Node.Name] { if readinessMap[toRemove.Node.Name] {
metrics.RegisterScaleDown(1, metrics.Underutilized) metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(toRemove.Node), metrics.Underutilized)
} else { } else {
metrics.RegisterScaleDown(1, metrics.Unready) metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(toRemove.Node), metrics.Unready)
} }
}() }()
@ -809,9 +809,9 @@ func (sd *ScaleDown) scheduleDeleteEmptyNodes(emptyNodes []*apiv1.Node, client k
sd.context.Recorder, sd.clusterStateRegistry) sd.context.Recorder, sd.clusterStateRegistry)
if deleteErr == nil { if deleteErr == nil {
if readinessMap[nodeToDelete.Name] { if readinessMap[nodeToDelete.Name] {
metrics.RegisterScaleDown(1, metrics.Empty) metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(nodeToDelete), metrics.Empty)
} else { } else {
metrics.RegisterScaleDown(1, metrics.Unready) metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(nodeToDelete), metrics.Unready)
} }
} }
confirmation <- deleteErr confirmation <- deleteErr

View File

@ -465,7 +465,7 @@ func ScaleUp(context *context.AutoscalingContext, processors *ca_processors.Auto
} }
glog.V(1).Infof("Final scale-up plan: %v", scaleUpInfos) glog.V(1).Infof("Final scale-up plan: %v", scaleUpInfos)
for _, info := range scaleUpInfos { for _, info := range scaleUpInfos {
typedErr := executeScaleUp(context, clusterStateRegistry, info) typedErr := executeScaleUp(context, clusterStateRegistry, info, gpu.GetGpuTypeForMetrics(nodeInfo.Node()))
if typedErr != nil { if typedErr != nil {
return nil, typedErr return nil, typedErr
} }
@ -532,7 +532,7 @@ groupsloop:
return result return result
} }
func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *clusterstate.ClusterStateRegistry, info nodegroupset.ScaleUpInfo) errors.AutoscalerError { func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *clusterstate.ClusterStateRegistry, info nodegroupset.ScaleUpInfo, gpuType string) errors.AutoscalerError {
glog.V(0).Infof("Scale-up: setting group %s size to %d", info.Group.Id(), info.NewSize) glog.V(0).Infof("Scale-up: setting group %s size to %d", info.Group.Id(), info.NewSize)
increase := info.NewSize - info.CurrentSize increase := info.NewSize - info.CurrentSize
if err := info.Group.IncreaseSize(increase); err != nil { if err := info.Group.IncreaseSize(increase); err != nil {
@ -548,7 +548,7 @@ func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *c
Time: time.Now(), Time: time.Now(),
ExpectedAddTime: time.Now().Add(context.MaxNodeProvisionTime), ExpectedAddTime: time.Now().Add(context.MaxNodeProvisionTime),
}) })
metrics.RegisterScaleUp(increase) metrics.RegisterScaleUp(increase, gpuType)
context.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaledUpGroup", context.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaledUpGroup",
"Scale-up: group %s size set to %d", info.Group.Id(), info.NewSize) "Scale-up: group %s size set to %d", info.Group.Id(), info.NewSize)
return nil return nil

View File

@ -156,6 +156,14 @@ var (
}, },
) )
// gpuScaleUpCount counts nodes with GPUs added by CA, partitioned by the
// GPU type name (label value "gpu_name"); nodes without GPUs are tracked
// only by the plain scaleUpCount counter.
gpuScaleUpCount = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: caNamespace,
Name: "scaled_up_gpu_nodes_total",
Help: "Number of GPU nodes added by CA, by GPU name.",
}, []string{"gpu_name"},
)
failedScaleUpCount = prometheus.NewCounterVec( failedScaleUpCount = prometheus.NewCounterVec(
prometheus.CounterOpts{ prometheus.CounterOpts{
Namespace: caNamespace, Namespace: caNamespace,
@ -172,6 +180,14 @@ var (
}, []string{"reason"}, }, []string{"reason"},
) )
// gpuScaleDownCount counts nodes with GPUs removed by CA, partitioned by
// removal reason and GPU type name; nodes without GPUs are tracked only by
// the plain scaleDownCount counter.
gpuScaleDownCount = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: caNamespace,
Name: "scaled_down_gpu_nodes_total",
Help: "Number of GPU nodes removed by CA, by reason and GPU name.",
}, []string{"reason", "gpu_name"},
)
evictionsCount = prometheus.NewCounter( evictionsCount = prometheus.NewCounter(
prometheus.CounterOpts{ prometheus.CounterOpts{
Namespace: caNamespace, Namespace: caNamespace,
@ -224,8 +240,10 @@ func RegisterAll() {
prometheus.MustRegister(functionDuration) prometheus.MustRegister(functionDuration)
prometheus.MustRegister(errorsCount) prometheus.MustRegister(errorsCount)
prometheus.MustRegister(scaleUpCount) prometheus.MustRegister(scaleUpCount)
prometheus.MustRegister(gpuScaleUpCount)
prometheus.MustRegister(failedScaleUpCount) prometheus.MustRegister(failedScaleUpCount)
prometheus.MustRegister(scaleDownCount) prometheus.MustRegister(scaleDownCount)
prometheus.MustRegister(gpuScaleDownCount)
prometheus.MustRegister(evictionsCount) prometheus.MustRegister(evictionsCount)
prometheus.MustRegister(unneededNodesCount) prometheus.MustRegister(unneededNodesCount)
prometheus.MustRegister(napEnabled) prometheus.MustRegister(napEnabled)
@ -291,8 +309,11 @@ func RegisterError(err errors.AutoscalerError) {
} }
// RegisterScaleUp records number of nodes added by scale up // RegisterScaleUp records number of nodes added by scale up
func RegisterScaleUp(nodesCount int) { func RegisterScaleUp(nodesCount int, gpuType string) {
scaleUpCount.Add(float64(nodesCount)) scaleUpCount.Add(float64(nodesCount))
if gpuType != "" {
gpuScaleUpCount.WithLabelValues(gpuType).Add(float64(nodesCount))
}
} }
// RegisterFailedScaleUp records a failed scale-up operation // RegisterFailedScaleUp records a failed scale-up operation
@ -301,8 +322,11 @@ func RegisterFailedScaleUp(reason FailedScaleUpReason) {
} }
// RegisterScaleDown records number of nodes removed by scale down // RegisterScaleDown records number of nodes removed by scale down
func RegisterScaleDown(nodesCount int, reason NodeScaleDownReason) { func RegisterScaleDown(nodesCount int, gpuType string, reason NodeScaleDownReason) {
scaleDownCount.WithLabelValues(string(reason)).Add(float64(nodesCount)) scaleDownCount.WithLabelValues(string(reason)).Add(float64(nodesCount))
if gpuType != "" {
gpuScaleDownCount.WithLabelValues(string(reason), gpuType).Add(float64(nodesCount))
}
} }
// RegisterEvictions records number of evicted pods // RegisterEvictions records number of evicted pods

View File

@ -68,6 +68,24 @@ func FilterOutNodesWithUnreadyGpus(allNodes, readyNodes []*apiv1.Node) ([]*apiv1
return newAllNodes, newReadyNodes return newAllNodes, newReadyNodes
} }
// GetGpuTypeForMetrics returns the name of the GPU used on the node, or an
// empty string if the node has no GPU. If a GPU is present but its exact
// type cannot be determined, "generic" is returned.
// NOTE: current implementation is GKE/GCE-specific.
func GetGpuTypeForMetrics(node *apiv1.Node) string {
	// Prefer the GKE accelerator label when it is set.
	if gpuType, found := node.Labels[GPULabel]; found {
		return gpuType
	}
	// No label present; fall back to inspecting reported GPU capacity.
	if capacity, found := node.Status.Capacity[ResourceNvidiaGPU]; found && !capacity.IsZero() {
		return "generic"
	}
	return ""
}
func getUnreadyNodeCopy(node *apiv1.Node) *apiv1.Node { func getUnreadyNodeCopy(node *apiv1.Node) *apiv1.Node {
newNode := node.DeepCopy() newNode := node.DeepCopy()
newReadyCondition := apiv1.NodeCondition{ newReadyCondition := apiv1.NodeCondition{