Consider GPU utilization in scaling down
parent 40cf6e43c0
commit 83ae66cebc
@@ -34,9 +34,12 @@ type GpuLimits struct {
 type AutoscalingOptions struct {
     // MaxEmptyBulkDelete is a number of empty nodes that can be removed at the same time.
     MaxEmptyBulkDelete int
-    // ScaleDownUtilizationThreshold sets threshold for nodes to be considered for scale down.
+    // ScaleDownUtilizationThreshold sets threshold for nodes to be considered for scale down if cpu or memory utilization is over threshold.
     // Well-utilized nodes are not touched.
     ScaleDownUtilizationThreshold float64
+    // ScaleDownGpuUtilizationThreshold sets threshold for gpu nodes to be considered for scale down if gpu utilization is over threshold.
+    // Well-utilized nodes are not touched.
+    ScaleDownGpuUtilizationThreshold float64
     // ScaleDownUnneededTime sets the duration CA expects a node to be unneeded/eligible for removal
     // before scaling down the node.
     ScaleDownUnneededTime time.Duration
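Illustrative only (not part of the diff): with the new field, the cpu/memory threshold and the GPU threshold can be tuned independently. The values below are arbitrary examples; the field names come from the struct above.

    options := config.AutoscalingOptions{
        ScaleDownUtilizationThreshold:    0.5,              // applies to non-GPU nodes (cpu/memory)
        ScaleDownGpuUtilizationThreshold: 0.3,              // applies only to GPU-labelled nodes
        ScaleDownUnneededTime:            10 * time.Minute, // how long a node must stay unneeded
    }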
@@ -402,16 +402,16 @@ func (sd *ScaleDown) UpdateUnneededNodes(
             klog.Errorf("Node info for %s not found", node.Name)
             continue
         }

-        utilInfo, err := simulator.CalculateUtilization(node, nodeInfo, sd.context.IgnoreDaemonSetsUtilization, sd.context.IgnoreMirrorPodsUtilization)
+        utilInfo, err := simulator.CalculateUtilization(node, nodeInfo, sd.context.IgnoreDaemonSetsUtilization, sd.context.IgnoreMirrorPodsUtilization, sd.context.CloudProvider.GPULabel())
         if err != nil {
             klog.Warningf("Failed to calculate utilization for %s: %v", node.Name, err)
         }
-        klog.V(4).Infof("Node %s - utilization %f", node.Name, utilInfo.Utilization)
+        klog.V(4).Infof("Node %s - %s utilization %f", node.Name, utilInfo.ResourceName, utilInfo.Utilization)
         utilizationMap[node.Name] = utilInfo

-        if utilInfo.Utilization >= sd.context.ScaleDownUtilizationThreshold {
-            klog.V(4).Infof("Node %s is not suitable for removal - utilization too big (%f)", node.Name, utilInfo.Utilization)
+        if !sd.isNodeBelowUtilzationThreshold(node, utilInfo) {
+            klog.V(4).Infof("Node %s is not suitable for removal - %s utilization too big (%f)", node.Name, utilInfo.ResourceName, utilInfo.Utilization)
             continue
         }
         currentlyUnneededNodes = append(currentlyUnneededNodes, node)
@@ -506,6 +506,20 @@ func (sd *ScaleDown) UpdateUnneededNodes(
     return nil
 }

+// isNodeBelowUtilzationThreshold determines if a given node's utilization is below the threshold.
+func (sd *ScaleDown) isNodeBelowUtilzationThreshold(node *apiv1.Node, utilInfo simulator.UtilizationInfo) bool {
+    if gpu.NodeHasGpu(sd.context.CloudProvider.GPULabel(), node) {
+        if utilInfo.Utilization >= sd.context.ScaleDownGpuUtilizationThreshold {
+            return false
+        }
+    } else {
+        if utilInfo.Utilization >= sd.context.ScaleDownUtilizationThreshold {
+            return false
+        }
+    }
+    return true
+}
+
 // updateUnremovableNodes updates unremovableNodes map according to current
 // state of the cluster. Removes from the map nodes that are no longer in the
 // nodes list.
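A rough standalone sketch of the selection above (the function and parameter names below are mine, not from the commit): a GPU-labelled node is compared only against the GPU threshold, every other node against the cpu/memory threshold.

    // Sketch only: mirrors isNodeBelowUtilzationThreshold for a single utilization value.
    func belowScaleDownThreshold(isGpuNode bool, utilization, cpuMemThreshold, gpuThreshold float64) bool {
        if isGpuNode {
            return utilization < gpuThreshold
        }
        return utilization < cpuMemThreshold
    }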
@@ -172,6 +172,66 @@ func TestFindUnneededNodes(t *testing.T) {
     assert.Equal(t, 0, len(sd.unremovableNodes))
 }

+func TestFindUnneededGPUNodes(t *testing.T) {
+    // shared owner reference
+    ownerRef := GenerateOwnerReferences("rs", "ReplicaSet", "extensions/v1beta1", "")
+
+    p1 := BuildTestPod("p1", 100, 0)
+    p1.Spec.NodeName = "n1"
+    p1.OwnerReferences = ownerRef
+    RequestGpuForPod(p1, 1)
+
+    p2 := BuildTestPod("p2", 400, 0)
+    p2.Spec.NodeName = "n2"
+    p2.OwnerReferences = ownerRef
+    RequestGpuForPod(p2, 1)
+
+    p3 := BuildTestPod("p3", 300, 0)
+    p3.Spec.NodeName = "n3"
+    p3.OwnerReferences = ownerRef
+    p3.ObjectMeta.Annotations["cluster-autoscaler.kubernetes.io/safe-to-evict"] = "false"
+    RequestGpuForPod(p3, 1)
+
+    // Node with low cpu utilization and high gpu utilization
+    n1 := BuildTestNode("n1", 1000, 10)
+    AddGpusToNode(n1, 2)
+    // Node with high cpu utilization and low gpu utilization
+    n2 := BuildTestNode("n2", 1000, 10)
+    AddGpusToNode(n2, 4)
+    // Node with low gpu utilization and pods on node can not be interrupted
+    n3 := BuildTestNode("n3", 1000, 10)
+    AddGpusToNode(n3, 8)
+
+    SetNodeReadyState(n1, true, time.Time{})
+    SetNodeReadyState(n2, true, time.Time{})
+    SetNodeReadyState(n3, true, time.Time{})
+
+    provider := testprovider.NewTestCloudProvider(nil, nil)
+    provider.AddNodeGroup("ng1", 1, 10, 2)
+    provider.AddNode("ng1", n1)
+    provider.AddNode("ng1", n2)
+    provider.AddNode("ng1", n3)
+
+    options := config.AutoscalingOptions{
+        ScaleDownUtilizationThreshold:    0.35,
+        ScaleDownGpuUtilizationThreshold: 0.3,
+        UnremovableNodeRecheckTimeout:    5 * time.Minute,
+    }
+    context := NewScaleTestAutoscalingContext(options, &fake.Clientset{}, nil, provider)
+
+    clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, newBackoff())
+    sd := NewScaleDown(&context, clusterStateRegistry)
+    sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3}, []*apiv1.Node{n1, n2, n3},
+        []*apiv1.Pod{p1, p2, p3}, time.Now(), nil)
+
+    assert.Equal(t, 1, len(sd.unneededNodes))
+    _, found := sd.unneededNodes["n2"]
+    assert.True(t, found)
+
+    assert.Contains(t, sd.podLocationHints, p2.Namespace+"/"+p2.Name)
+    assert.Equal(t, 3, len(sd.nodeUtilizationMap))
+}
+
 func TestPodsWithPrioritiesFindUnneededNodes(t *testing.T) {
     // shared owner reference
     ownerRef := GenerateOwnerReferences("rs", "ReplicaSet", "extensions/v1beta1", "")
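In TestFindUnneededGPUNodes above, GPU utilization decides the outcome: p1 requests 1 of n1's 2 GPUs (0.5, at or above the 0.3 GPU threshold), p2 requests 1 of n2's 4 GPUs (0.25, below it), and p3 requests 1 of n3's 8 GPUs but is annotated as not safe to evict. As a result only n2 is reported as unneeded, even though its cpu utilization (400/1000 = 0.4) exceeds the 0.35 cpu/memory threshold.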
@@ -812,14 +872,15 @@ func assertSubset(t *testing.T, a []string, b []string) {
 }

 var defaultScaleDownOptions = config.AutoscalingOptions{
-    ScaleDownUtilizationThreshold: 0.5,
-    ScaleDownUnneededTime:         time.Minute,
-    MaxGracefulTerminationSec:     60,
-    MaxEmptyBulkDelete:            10,
-    MinCoresTotal:                 0,
-    MinMemoryTotal:                0,
-    MaxCoresTotal:                 config.DefaultMaxClusterCores,
-    MaxMemoryTotal:                config.DefaultMaxClusterMemory * units.GiB,
+    ScaleDownUtilizationThreshold:    0.5,
+    ScaleDownGpuUtilizationThreshold: 0.5,
+    ScaleDownUnneededTime:            time.Minute,
+    MaxGracefulTerminationSec:        60,
+    MaxEmptyBulkDelete:               10,
+    MinCoresTotal:                    0,
+    MinMemoryTotal:                   0,
+    MaxCoresTotal:                    config.DefaultMaxClusterCores,
+    MaxMemoryTotal:                   config.DefaultMaxClusterMemory * units.GiB,
 }

 func TestScaleDownEmptyMultipleNodeGroups(t *testing.T) {
@@ -93,7 +93,10 @@ var (
     scaleDownUnreadyTime = flag.Duration("scale-down-unready-time", 20*time.Minute,
         "How long an unready node should be unneeded before it is eligible for scale down")
     scaleDownUtilizationThreshold = flag.Float64("scale-down-utilization-threshold", 0.5,
-        "Node utilization level, defined as sum of requested resources divided by capacity, below which a node can be considered for scale down")
+        "Sum of cpu or memory of all pods running on the node divided by node's corresponding allocatable resource, below which a node can be considered for scale down")
+    scaleDownGpuUtilizationThreshold = flag.Float64("scale-down-gpu-utilization-threshold", 0.5,
+        "Sum of gpu requests of all pods running on the node divided by node's allocatable resource, below which a node can be considered for scale down."+
+            "Utilization calculation only cares about gpu resource for accelerator node. cpu and memory utilization will be ignored.")
     scaleDownNonEmptyCandidatesCount = flag.Int("scale-down-non-empty-candidates-count", 30,
         "Maximum number of non empty nodes considered in one iteration as candidates for scale down with drain."+
             "Lower value means better CA responsiveness but possible slower scale down latency."+
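Both threshold flags default to 0.5, so an operator can leave them identical or split them, e.g. something like --scale-down-utilization-threshold=0.5 together with --scale-down-gpu-utilization-threshold=0.3 so that accelerator nodes are only scaled down once their GPUs are mostly idle (these example values are illustrative, not taken from this commit).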
@@ -210,6 +213,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
         ScaleDownUnneededTime:            *scaleDownUnneededTime,
         ScaleDownUnreadyTime:             *scaleDownUnreadyTime,
         ScaleDownUtilizationThreshold:    *scaleDownUtilizationThreshold,
+        ScaleDownGpuUtilizationThreshold: *scaleDownGpuUtilizationThreshold,
         ScaleDownNonEmptyCandidatesCount: *scaleDownNonEmptyCandidatesCount,
         ScaleDownCandidatesPoolRatio:     *scaleDownCandidatesPoolRatio,
         ScaleDownCandidatesPoolMinCount:  *scaleDownCandidatesPoolMinCount,
@@ -19,13 +19,13 @@ package simulator
 import (
     "flag"
     "fmt"
     "math"
     "math/rand"
     "time"

     "k8s.io/autoscaler/cluster-autoscaler/utils/drain"
     "k8s.io/autoscaler/cluster-autoscaler/utils/errors"
     "k8s.io/autoscaler/cluster-autoscaler/utils/glogx"
+    "k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
     kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
     scheduler_util "k8s.io/autoscaler/cluster-autoscaler/utils/scheduler"
     "k8s.io/autoscaler/cluster-autoscaler/utils/tpu"
@@ -62,7 +62,10 @@ type NodeToBeRemoved struct {
 type UtilizationInfo struct {
     CpuUtil float64
     MemUtil float64
-    // Max(CpuUtil, MemUtil).
+    GpuUtil float64
+    // Resource name of the highest-utilization resource.
+    ResourceName apiv1.ResourceName
+    // Max(CpuUtil, MemUtil) or GpuUtil.
     Utilization float64
 }
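For a non-GPU node the struct ends up holding something like the following (illustrative values only, not from the commit): pods requesting half of the node's cpu and a quarter of its memory make cpu the dominant resource.

    info := UtilizationInfo{
        CpuUtil:      0.5,
        MemUtil:      0.25,
        ResourceName: apiv1.ResourceCPU, // the more utilized of cpu/memory
        Utilization:  0.5,               // max(CpuUtil, MemUtil); GPU nodes carry GpuUtil here instead
    }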
@@ -149,10 +152,22 @@ func FindEmptyNodesToRemove(candidates []*apiv1.Node, pods []*apiv1.Pod) []*apiv
     return result
 }

-// CalculateUtilization calculates utilization of a node, defined as maximum of (cpu, memory) utilization.
-// Per resource utilization is the sum of requests for it divided by allocatable. It also returns the individual
-// cpu and memory utilization.
-func CalculateUtilization(node *apiv1.Node, nodeInfo *schedulernodeinfo.NodeInfo, skipDaemonSetPods, skipMirrorPods bool) (utilInfo UtilizationInfo, err error) {
+// CalculateUtilization calculates utilization of a node, defined as maximum of (cpu, memory) or gpu utilization
+// based on whether the node has a GPU. Per resource utilization is the sum of requests for it divided by allocatable.
+// It also returns the individual cpu, memory and gpu utilization.
+func CalculateUtilization(node *apiv1.Node, nodeInfo *schedulernodeinfo.NodeInfo, skipDaemonSetPods, skipMirrorPods bool, gpuLabel string) (utilInfo UtilizationInfo, err error) {
+    if gpu.NodeHasGpu(gpuLabel, node) {
+        gpuUtil, err := calculateUtilizationOfResource(node, nodeInfo, gpu.ResourceNvidiaGPU, skipDaemonSetPods, skipMirrorPods)
+        if err != nil {
+            klog.V(3).Infof("node %s has unready GPU", node.Name)
+            // Return 0 if GPU is unready. This will guarantee we can still scale down a node with unready GPU.
+            return UtilizationInfo{GpuUtil: 0, ResourceName: gpu.ResourceNvidiaGPU, Utilization: 0}, nil
+        }
+
+        // Skips cpu and memory utilization calculation for node with GPU.
+        return UtilizationInfo{GpuUtil: gpuUtil, ResourceName: gpu.ResourceNvidiaGPU, Utilization: gpuUtil}, nil
+    }
+
     cpu, err := calculateUtilizationOfResource(node, nodeInfo, apiv1.ResourceCPU, skipDaemonSetPods, skipMirrorPods)
     if err != nil {
         return UtilizationInfo{}, err
@@ -161,7 +176,18 @@ func CalculateUtilization(node *apiv1.Node, nodeInfo *schedulernodeinfo.NodeInfo
     if err != nil {
         return UtilizationInfo{}, err
     }
-    return UtilizationInfo{CpuUtil: cpu, MemUtil: mem, Utilization: math.Max(cpu, mem)}, nil
+
+    utilization := UtilizationInfo{CpuUtil: cpu, MemUtil: mem}
+
+    if cpu > mem {
+        utilization.ResourceName = apiv1.ResourceCPU
+        utilization.Utilization = cpu
+    } else {
+        utilization.ResourceName = apiv1.ResourceMemory
+        utilization.Utilization = mem
+    }
+
+    return utilization, nil
 }

 func calculateUtilizationOfResource(node *apiv1.Node, nodeInfo *schedulernodeinfo.NodeInfo, resourceName apiv1.ResourceName, skipDaemonSetPods, skipMirrorPods bool) (float64, error) {
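A rough usage sketch of the GPU branch above (the gpuLabel value is provider-specific and assumed here): on a node carrying the GPU label only GPU requests count, and an unready GPU reports utilization 0 so the node stays eligible for scale down.

    // Sketch only; signature as introduced in this diff.
    utilInfo, err := CalculateUtilization(node, nodeInfo, false, false, gpuLabel)
    if err == nil && utilInfo.ResourceName == gpu.ResourceNvidiaGPU {
        // cpu/memory were skipped; utilInfo.Utilization equals utilInfo.GpuUtil
    }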
@@ -31,6 +31,7 @@ import (
 )

 func TestUtilization(t *testing.T) {
+    gpuLabel := GetGPULabel()
     pod := BuildTestPod("p1", 100, 200000)
     pod2 := BuildTestPod("p2", -1, -1)

@@ -38,25 +39,25 @@ func TestUtilization(t *testing.T) {
     node := BuildTestNode("node1", 2000, 2000000)
     SetNodeReadyState(node, true, time.Time{})

-    utilInfo, err := CalculateUtilization(node, nodeInfo, false, false)
+    utilInfo, err := CalculateUtilization(node, nodeInfo, false, false, gpuLabel)
     assert.NoError(t, err)
     assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)

     node2 := BuildTestNode("node1", 2000, -1)

-    _, err = CalculateUtilization(node2, nodeInfo, false, false)
+    _, err = CalculateUtilization(node2, nodeInfo, false, false, gpuLabel)
     assert.Error(t, err)

     daemonSetPod3 := BuildTestPod("p3", 100, 200000)
     daemonSetPod3.OwnerReferences = GenerateOwnerReferences("ds", "DaemonSet", "apps/v1", "")

     nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod, pod2, daemonSetPod3)
-    utilInfo, err = CalculateUtilization(node, nodeInfo, true, false)
+    utilInfo, err = CalculateUtilization(node, nodeInfo, true, false, gpuLabel)
     assert.NoError(t, err)
     assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)

     nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod2, daemonSetPod3)
-    utilInfo, err = CalculateUtilization(node, nodeInfo, false, false)
+    utilInfo, err = CalculateUtilization(node, nodeInfo, false, false, gpuLabel)
     assert.NoError(t, err)
     assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)

@@ -66,14 +67,31 @@ func TestUtilization(t *testing.T) {
     }

     nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod, pod2, mirrorPod4)
-    utilInfo, err = CalculateUtilization(node, nodeInfo, false, true)
+    utilInfo, err = CalculateUtilization(node, nodeInfo, false, true, gpuLabel)
     assert.NoError(t, err)
     assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)

     nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod2, mirrorPod4)
-    utilInfo, err = CalculateUtilization(node, nodeInfo, false, false)
+    utilInfo, err = CalculateUtilization(node, nodeInfo, false, false, gpuLabel)
     assert.NoError(t, err)
     assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)

+    gpuNode := BuildTestNode("gpu_node", 2000, 2000000)
+    AddGpusToNode(gpuNode, 1)
+    gpuPod := BuildTestPod("gpu_pod", 100, 200000)
+    RequestGpuForPod(gpuPod, 1)
+    nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod, gpuPod)
+    utilInfo, err = CalculateUtilization(gpuNode, nodeInfo, false, false, gpuLabel)
+    assert.NoError(t, err)
+    assert.InEpsilon(t, 1/1, utilInfo.Utilization, 0.01)
+
+    // Node with Unready GPU
+    gpuNode = BuildTestNode("gpu_node", 2000, 2000000)
+    AddGpuLabelToNode(gpuNode)
+    nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod)
+    utilInfo, err = CalculateUtilization(gpuNode, nodeInfo, false, false, gpuLabel)
+    assert.NoError(t, err)
+    assert.Zero(t, utilInfo.Utilization)
 }

 func TestFindPlaceAllOk(t *testing.T) {
@@ -39,9 +39,10 @@ import (
 func BuildTestPod(name string, cpu int64, mem int64) *apiv1.Pod {
     pod := &apiv1.Pod{
         ObjectMeta: metav1.ObjectMeta{
-            Namespace: "default",
-            Name:      name,
-            SelfLink:  fmt.Sprintf("/api/v1/namespaces/default/pods/%s", name),
+            Namespace:   "default",
+            Name:        name,
+            SelfLink:    fmt.Sprintf("/api/v1/namespaces/default/pods/%s", name),
+            Annotations: map[string]string{},
         },
         Spec: apiv1.PodSpec{
             Containers: []apiv1.Container{
@@ -128,9 +129,19 @@ func AddGpusToNode(node *apiv1.Node, gpusCount int64) {
     })
     node.Status.Capacity[resourceNvidiaGPU] = *resource.NewQuantity(gpusCount, resource.DecimalSI)
     node.Status.Allocatable[resourceNvidiaGPU] = *resource.NewQuantity(gpusCount, resource.DecimalSI)
+    AddGpuLabelToNode(node)
 }

+// AddGpuLabelToNode adds the GPU label to the given node. This is used to mock the intermediate state in which the GPU on the node is not yet ready.
+func AddGpuLabelToNode(node *apiv1.Node) {
+    node.Labels[gpuLabel] = defaultGPUType
+}
+
+// GetGPULabel returns the GPU label on the node. This is only used in unit tests.
+func GetGPULabel() string {
+    return gpuLabel
+}
+
 // SetNodeReadyState sets node ready state to either ConditionTrue or ConditionFalse.
 func SetNodeReadyState(node *apiv1.Node, ready bool, lastTransition time.Time) {
     if ready {
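A brief sketch of how these helpers are intended to compose in tests (node names and sizes below are made up):

    readyGpuNode := BuildTestNode("gpu-node-1", 2000, 2000000)
    AddGpusToNode(readyGpuNode, 2)    // sets GPU capacity/allocatable and adds the GPU label

    unreadyGpuNode := BuildTestNode("gpu-node-2", 2000, 2000000)
    AddGpuLabelToNode(unreadyGpuNode) // label only: mocks a node whose GPU is not ready yet

    label := GetGPULabel()            // label key to pass to CalculateUtilization in tests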