Add GPU-related scaled_up & scaled_down metrics (#974)
* Add GPU-related scaled_up & scaled_down metrics
* Fix name to match SD naming convention
* Fix import after master rebase
* Change the logic to include GPU-being-installed nodes
This commit is contained in:
parent
bbe99a27d8
commit
5eb7021f82
|
|
@ -701,9 +701,9 @@ func (sd *ScaleDown) TryToScaleDown(allNodes []*apiv1.Node, pods []*apiv1.Pod, p
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if readinessMap[toRemove.Node.Name] {
|
if readinessMap[toRemove.Node.Name] {
|
||||||
metrics.RegisterScaleDown(1, metrics.Underutilized)
|
metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(toRemove.Node), metrics.Underutilized)
|
||||||
} else {
|
} else {
|
||||||
metrics.RegisterScaleDown(1, metrics.Unready)
|
metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(toRemove.Node), metrics.Unready)
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
|
@ -809,9 +809,9 @@ func (sd *ScaleDown) scheduleDeleteEmptyNodes(emptyNodes []*apiv1.Node, client k
|
||||||
sd.context.Recorder, sd.clusterStateRegistry)
|
sd.context.Recorder, sd.clusterStateRegistry)
|
||||||
if deleteErr == nil {
|
if deleteErr == nil {
|
||||||
if readinessMap[nodeToDelete.Name] {
|
if readinessMap[nodeToDelete.Name] {
|
||||||
metrics.RegisterScaleDown(1, metrics.Empty)
|
metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(nodeToDelete), metrics.Empty)
|
||||||
} else {
|
} else {
|
||||||
metrics.RegisterScaleDown(1, metrics.Unready)
|
metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(nodeToDelete), metrics.Unready)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
confirmation <- deleteErr
|
confirmation <- deleteErr
|
||||||
|
|
|
||||||
|
|
@ -465,7 +465,7 @@ func ScaleUp(context *context.AutoscalingContext, processors *ca_processors.Auto
|
||||||
}
|
}
|
||||||
glog.V(1).Infof("Final scale-up plan: %v", scaleUpInfos)
|
glog.V(1).Infof("Final scale-up plan: %v", scaleUpInfos)
|
||||||
for _, info := range scaleUpInfos {
|
for _, info := range scaleUpInfos {
|
||||||
typedErr := executeScaleUp(context, clusterStateRegistry, info)
|
typedErr := executeScaleUp(context, clusterStateRegistry, info, gpu.GetGpuTypeForMetrics(nodeInfo.Node()))
|
||||||
if typedErr != nil {
|
if typedErr != nil {
|
||||||
return nil, typedErr
|
return nil, typedErr
|
||||||
}
|
}
|
||||||
|
|
@ -532,7 +532,7 @@ groupsloop:
|
||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *clusterstate.ClusterStateRegistry, info nodegroupset.ScaleUpInfo) errors.AutoscalerError {
|
func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *clusterstate.ClusterStateRegistry, info nodegroupset.ScaleUpInfo, gpuType string) errors.AutoscalerError {
|
||||||
glog.V(0).Infof("Scale-up: setting group %s size to %d", info.Group.Id(), info.NewSize)
|
glog.V(0).Infof("Scale-up: setting group %s size to %d", info.Group.Id(), info.NewSize)
|
||||||
increase := info.NewSize - info.CurrentSize
|
increase := info.NewSize - info.CurrentSize
|
||||||
if err := info.Group.IncreaseSize(increase); err != nil {
|
if err := info.Group.IncreaseSize(increase); err != nil {
|
||||||
|
|
@ -548,7 +548,7 @@ func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *c
|
||||||
Time: time.Now(),
|
Time: time.Now(),
|
||||||
ExpectedAddTime: time.Now().Add(context.MaxNodeProvisionTime),
|
ExpectedAddTime: time.Now().Add(context.MaxNodeProvisionTime),
|
||||||
})
|
})
|
||||||
metrics.RegisterScaleUp(increase)
|
metrics.RegisterScaleUp(increase, gpuType)
|
||||||
context.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaledUpGroup",
|
context.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaledUpGroup",
|
||||||
"Scale-up: group %s size set to %d", info.Group.Id(), info.NewSize)
|
"Scale-up: group %s size set to %d", info.Group.Id(), info.NewSize)
|
||||||
return nil
|
return nil
|
||||||
|
|
|
||||||
|
|
@ -156,6 +156,14 @@ var (
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
gpuScaleUpCount = prometheus.NewCounterVec(
|
||||||
|
prometheus.CounterOpts{
|
||||||
|
Namespace: caNamespace,
|
||||||
|
Name: "scaled_up_gpu_nodes_total",
|
||||||
|
Help: "Number of GPU nodes added by CA, by GPU name.",
|
||||||
|
}, []string{"gpu_name"},
|
||||||
|
)
|
||||||
|
|
||||||
failedScaleUpCount = prometheus.NewCounterVec(
|
failedScaleUpCount = prometheus.NewCounterVec(
|
||||||
prometheus.CounterOpts{
|
prometheus.CounterOpts{
|
||||||
Namespace: caNamespace,
|
Namespace: caNamespace,
|
||||||
|
|
@ -172,6 +180,14 @@ var (
|
||||||
}, []string{"reason"},
|
}, []string{"reason"},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
gpuScaleDownCount = prometheus.NewCounterVec(
|
||||||
|
prometheus.CounterOpts{
|
||||||
|
Namespace: caNamespace,
|
||||||
|
Name: "scaled_down_gpu_nodes_total",
|
||||||
|
Help: "Number of GPU nodes removed by CA, by reason and GPU name.",
|
||||||
|
}, []string{"reason", "gpu_name"},
|
||||||
|
)
|
||||||
|
|
||||||
evictionsCount = prometheus.NewCounter(
|
evictionsCount = prometheus.NewCounter(
|
||||||
prometheus.CounterOpts{
|
prometheus.CounterOpts{
|
||||||
Namespace: caNamespace,
|
Namespace: caNamespace,
|
||||||
|
|
@ -224,8 +240,10 @@ func RegisterAll() {
|
||||||
prometheus.MustRegister(functionDuration)
|
prometheus.MustRegister(functionDuration)
|
||||||
prometheus.MustRegister(errorsCount)
|
prometheus.MustRegister(errorsCount)
|
||||||
prometheus.MustRegister(scaleUpCount)
|
prometheus.MustRegister(scaleUpCount)
|
||||||
|
prometheus.MustRegister(gpuScaleUpCount)
|
||||||
prometheus.MustRegister(failedScaleUpCount)
|
prometheus.MustRegister(failedScaleUpCount)
|
||||||
prometheus.MustRegister(scaleDownCount)
|
prometheus.MustRegister(scaleDownCount)
|
||||||
|
prometheus.MustRegister(gpuScaleDownCount)
|
||||||
prometheus.MustRegister(evictionsCount)
|
prometheus.MustRegister(evictionsCount)
|
||||||
prometheus.MustRegister(unneededNodesCount)
|
prometheus.MustRegister(unneededNodesCount)
|
||||||
prometheus.MustRegister(napEnabled)
|
prometheus.MustRegister(napEnabled)
|
||||||
|
|
@ -291,8 +309,11 @@ func RegisterError(err errors.AutoscalerError) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// RegisterScaleUp records number of nodes added by scale up
|
// RegisterScaleUp records number of nodes added by scale up
|
||||||
func RegisterScaleUp(nodesCount int) {
|
func RegisterScaleUp(nodesCount int, gpuType string) {
|
||||||
scaleUpCount.Add(float64(nodesCount))
|
scaleUpCount.Add(float64(nodesCount))
|
||||||
|
if gpuType != "" {
|
||||||
|
gpuScaleUpCount.WithLabelValues(gpuType).Add(float64(nodesCount))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// RegisterFailedScaleUp records a failed scale-up operation
|
// RegisterFailedScaleUp records a failed scale-up operation
|
||||||
|
|
@ -301,8 +322,11 @@ func RegisterFailedScaleUp(reason FailedScaleUpReason) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// RegisterScaleDown records number of nodes removed by scale down
|
// RegisterScaleDown records number of nodes removed by scale down
|
||||||
func RegisterScaleDown(nodesCount int, reason NodeScaleDownReason) {
|
func RegisterScaleDown(nodesCount int, gpuType string, reason NodeScaleDownReason) {
|
||||||
scaleDownCount.WithLabelValues(string(reason)).Add(float64(nodesCount))
|
scaleDownCount.WithLabelValues(string(reason)).Add(float64(nodesCount))
|
||||||
|
if gpuType != "" {
|
||||||
|
gpuScaleDownCount.WithLabelValues(string(reason), gpuType).Add(float64(nodesCount))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// RegisterEvictions records number of evicted pods
|
// RegisterEvictions records number of evicted pods
|
||||||
|
|
|
||||||
|
|
@ -68,6 +68,24 @@ func FilterOutNodesWithUnreadyGpus(allNodes, readyNodes []*apiv1.Node) ([]*apiv1
|
||||||
return newAllNodes, newReadyNodes
|
return newAllNodes, newReadyNodes
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// GetGpuTypeForMetrics returns the name of the GPU used on the node, or an empty string if the node has no GPU.
|
||||||
|
// If the node reports non-zero GPU capacity but carries no GPU label, "generic" is returned.
|
||||||
|
// NOTE: the current implementation is GKE/GCE-specific.
|
||||||
|
func GetGpuTypeForMetrics(node *apiv1.Node) string {
|
||||||
|
// Use the GKE GPU label (GPULabel) if the node has one; its value is returned as-is.
|
||||||
|
gpuType, found := node.Labels[GPULabel]
|
||||||
|
if found {
|
||||||
|
return gpuType
|
||||||
|
}
|
||||||
|
|
||||||
|
// No label: fall back to the generic check of the node's ResourceNvidiaGPU capacity.
|
||||||
|
capacity, found := node.Status.Capacity[ResourceNvidiaGPU]
|
||||||
|
if !found || capacity.IsZero() {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return "generic"
|
||||||
|
}
|
||||||
|
|
||||||
func getUnreadyNodeCopy(node *apiv1.Node) *apiv1.Node {
|
func getUnreadyNodeCopy(node *apiv1.Node) *apiv1.Node {
|
||||||
newNode := node.DeepCopy()
|
newNode := node.DeepCopy()
|
||||||
newReadyCondition := apiv1.NodeCondition{
|
newReadyCondition := apiv1.NodeCondition{
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue