Add GPU-related scaled_up & scaled_down metrics (#974)

* Add GPU-related scaled_up & scaled_down metrics

* Fix name to match SD naming convention

* Fix import after master rebase

* Change the logic to include GPU-being-installed nodes
Karol Gołąb 2018-06-22 21:00:52 +02:00 committed by Marcin Wielgus
parent bbe99a27d8
commit 5eb7021f82
4 changed files with 51 additions and 9 deletions

cluster-autoscaler/core/scale_down.go

@@ -701,9 +701,9 @@ func (sd *ScaleDown) TryToScaleDown(allNodes []*apiv1.Node, pods []*apiv1.Pod, p
 			return
 		}
 		if readinessMap[toRemove.Node.Name] {
-			metrics.RegisterScaleDown(1, metrics.Underutilized)
+			metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(toRemove.Node), metrics.Underutilized)
 		} else {
-			metrics.RegisterScaleDown(1, metrics.Unready)
+			metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(toRemove.Node), metrics.Unready)
 		}
 	}()
@@ -809,9 +809,9 @@ func (sd *ScaleDown) scheduleDeleteEmptyNodes(emptyNodes []*apiv1.Node, client k
 				sd.context.Recorder, sd.clusterStateRegistry)
 			if deleteErr == nil {
 				if readinessMap[nodeToDelete.Name] {
-					metrics.RegisterScaleDown(1, metrics.Empty)
+					metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(nodeToDelete), metrics.Empty)
 				} else {
-					metrics.RegisterScaleDown(1, metrics.Unready)
+					metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(nodeToDelete), metrics.Unready)
 				}
 			}
 			confirmation <- deleteErr
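Both scale-down paths now pass the node's GPU type alongside the existing reason, so GPU removals are attributed per accelerator type. A minimal sketch of the new call shape, assuming this repository's package layout (the node literal and GPU name are illustrative only):

import (
	apiv1 "k8s.io/api/core/v1"
	"k8s.io/autoscaler/cluster-autoscaler/metrics"
	"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
)

// Sketch: recording the removal of one underutilized GPU node.
func recordGpuScaleDown() {
	node := &apiv1.Node{}
	node.Labels = map[string]string{gpu.GPULabel: "nvidia-tesla-k80"} // illustrative GPU name
	metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(node), metrics.Underutilized)
}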

cluster-autoscaler/core/scale_up.go

@@ -465,7 +465,7 @@ func ScaleUp(context *context.AutoscalingContext, processors *ca_processors.Auto
 	}
 	glog.V(1).Infof("Final scale-up plan: %v", scaleUpInfos)
 	for _, info := range scaleUpInfos {
-		typedErr := executeScaleUp(context, clusterStateRegistry, info)
+		typedErr := executeScaleUp(context, clusterStateRegistry, info, gpu.GetGpuTypeForMetrics(nodeInfo.Node()))
 		if typedErr != nil {
 			return nil, typedErr
 		}
@@ -532,7 +532,7 @@ groupsloop:
 	return result
 }

-func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *clusterstate.ClusterStateRegistry, info nodegroupset.ScaleUpInfo) errors.AutoscalerError {
+func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *clusterstate.ClusterStateRegistry, info nodegroupset.ScaleUpInfo, gpuType string) errors.AutoscalerError {
 	glog.V(0).Infof("Scale-up: setting group %s size to %d", info.Group.Id(), info.NewSize)
 	increase := info.NewSize - info.CurrentSize
 	if err := info.Group.IncreaseSize(increase); err != nil {
@@ -548,7 +548,7 @@ func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *c
 		Time:            time.Now(),
 		ExpectedAddTime: time.Now().Add(context.MaxNodeProvisionTime),
 	})
-	metrics.RegisterScaleUp(increase)
+	metrics.RegisterScaleUp(increase, gpuType)
 	context.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaledUpGroup",
 		"Scale-up: group %s size set to %d", info.Group.Id(), info.NewSize)
 	return nil
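Because the nodes being added do not exist yet, the GPU type for scale-up is read off the group's template node (nodeInfo.Node()) and threaded through executeScaleUp as a plain string; this assumes every node in a group carries the same accelerator. The effect on the counters, sketched with an illustrative GPU name:

// Bumps scaled_up_nodes_total by 3 and, since the type is non-empty,
// scaled_up_gpu_nodes_total{gpu_name="nvidia-tesla-p100"} by 3 as well.
metrics.RegisterScaleUp(3, "nvidia-tesla-p100")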

cluster-autoscaler/metrics/metrics.go

@@ -156,6 +156,14 @@ var (
 		},
 	)

+	gpuScaleUpCount = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: caNamespace,
+			Name:      "scaled_up_gpu_nodes_total",
+			Help:      "Number of GPU nodes added by CA, by GPU name.",
+		}, []string{"gpu_name"},
+	)
+
 	failedScaleUpCount = prometheus.NewCounterVec(
 		prometheus.CounterOpts{
 			Namespace: caNamespace,
@@ -172,6 +180,14 @@ var (
 		}, []string{"reason"},
 	)

+	gpuScaleDownCount = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: caNamespace,
+			Name:      "scaled_down_gpu_nodes_total",
+			Help:      "Number of GPU nodes removed by CA, by reason and GPU name.",
+		}, []string{"reason", "gpu_name"},
+	)
+
 	evictionsCount = prometheus.NewCounter(
 		prometheus.CounterOpts{
 			Namespace: caNamespace,
@@ -224,8 +240,10 @@ func RegisterAll() {
 	prometheus.MustRegister(functionDuration)
 	prometheus.MustRegister(errorsCount)
 	prometheus.MustRegister(scaleUpCount)
+	prometheus.MustRegister(gpuScaleUpCount)
 	prometheus.MustRegister(failedScaleUpCount)
 	prometheus.MustRegister(scaleDownCount)
+	prometheus.MustRegister(gpuScaleDownCount)
 	prometheus.MustRegister(evictionsCount)
 	prometheus.MustRegister(unneededNodesCount)
 	prometheus.MustRegister(napEnabled)
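Once registered, the two new families appear in the standard Prometheus text exposition next to the existing totals. Assuming the caNamespace prefix used throughout this file is cluster_autoscaler, and taking "underutilized" as an example reason value, scraped output would look roughly like this (sample values only):

# HELP cluster_autoscaler_scaled_up_gpu_nodes_total Number of GPU nodes added by CA, by GPU name.
# TYPE cluster_autoscaler_scaled_up_gpu_nodes_total counter
cluster_autoscaler_scaled_up_gpu_nodes_total{gpu_name="nvidia-tesla-k80"} 3
# HELP cluster_autoscaler_scaled_down_gpu_nodes_total Number of GPU nodes removed by CA, by reason and GPU name.
# TYPE cluster_autoscaler_scaled_down_gpu_nodes_total counter
cluster_autoscaler_scaled_down_gpu_nodes_total{gpu_name="nvidia-tesla-k80",reason="underutilized"} 1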
@@ -291,8 +309,11 @@ func RegisterError(err errors.AutoscalerError) {
 }

 // RegisterScaleUp records number of nodes added by scale up
-func RegisterScaleUp(nodesCount int) {
+func RegisterScaleUp(nodesCount int, gpuType string) {
 	scaleUpCount.Add(float64(nodesCount))
+	if gpuType != "" {
+		gpuScaleUpCount.WithLabelValues(gpuType).Add(float64(nodesCount))
+	}
 }

 // RegisterFailedScaleUp records a failed scale-up operation
@@ -301,8 +322,11 @@ func RegisterFailedScaleUp(reason FailedScaleUpReason) {
 }

 // RegisterScaleDown records number of nodes removed by scale down
-func RegisterScaleDown(nodesCount int, reason NodeScaleDownReason) {
+func RegisterScaleDown(nodesCount int, gpuType string, reason NodeScaleDownReason) {
 	scaleDownCount.WithLabelValues(string(reason)).Add(float64(nodesCount))
+	if gpuType != "" {
+		gpuScaleDownCount.WithLabelValues(string(reason), gpuType).Add(float64(nodesCount))
+	}
 }

 // RegisterEvictions records number of evicted pods
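Note the gating in both helpers: the gpu_name-labelled counters are touched only when gpuType is non-empty, so non-GPU nodes never create series with an empty label. Illustrative calls against this package (GPU names are made up):

metrics.RegisterScaleUp(2, "")                         // non-GPU: only scaled_up_nodes_total grows
metrics.RegisterScaleUp(2, "nvidia-tesla-k80")         // GPU: both scale-up counters grow
metrics.RegisterScaleDown(1, "generic", metrics.Empty) // unlabelled GPU node deleted while empty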

cluster-autoscaler/utils/gpu/gpu.go

@@ -68,6 +68,24 @@ func FilterOutNodesWithUnreadyGpus(allNodes, readyNodes []*apiv1.Node) ([]*apiv1
 	return newAllNodes, newReadyNodes
 }

+// GetGpuTypeForMetrics returns name of the GPU used on the node or empty string if there's no GPU
+// if the GPU type is unknown, "generic" is returned
+// NOTE: current implementation is GKE/GCE-specific
+func GetGpuTypeForMetrics(node *apiv1.Node) string {
+	// we use the GKE label if there is one
+	gpuType, found := node.Labels[GPULabel]
+	if found {
+		return gpuType
+	}
+
+	// no label, fallback to generic solution
+	capacity, found := node.Status.Capacity[ResourceNvidiaGPU]
+	if !found || capacity.IsZero() {
+		return ""
+	}
+	return "generic"
+}
+
 func getUnreadyNodeCopy(node *apiv1.Node) *apiv1.Node {
 	newNode := node.DeepCopy()
 	newReadyCondition := apiv1.NodeCondition{
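GetGpuTypeForMetrics has three outcomes: the label value whenever the GKE accelerator label is present (which also covers nodes whose GPU is still being installed, per the last commit message above), an empty string when the node reports no NVIDIA GPU capacity, and "generic" for unlabelled nodes that do report capacity. A hedged test sketch exercising all three paths; GPULabel and ResourceNvidiaGPU are this package's constants, and the quantity helper comes from k8s.io/apimachinery:

package gpu

import (
	"testing"

	apiv1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

func TestGetGpuTypeForMetrics(t *testing.T) {
	// Labelled node: the label value wins, even before any capacity is reported.
	labelled := &apiv1.Node{}
	labelled.Labels = map[string]string{GPULabel: "nvidia-tesla-k80"}
	if got := GetGpuTypeForMetrics(labelled); got != "nvidia-tesla-k80" {
		t.Errorf("want nvidia-tesla-k80, got %q", got)
	}

	// Unlabelled node with NVIDIA GPU capacity: reported as "generic".
	generic := &apiv1.Node{}
	generic.Status.Capacity = apiv1.ResourceList{
		ResourceNvidiaGPU: *resource.NewQuantity(1, resource.DecimalSI),
	}
	if got := GetGpuTypeForMetrics(generic); got != "generic" {
		t.Errorf("want generic, got %q", got)
	}

	// Node with no GPU at all: empty string, so no GPU metric is emitted.
	if got := GetGpuTypeForMetrics(&apiv1.Node{}); got != "" {
		t.Errorf("want empty string, got %q", got)
	}
}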