Consider GPU utilization in scaling down

Jiaxin Shan 2019-04-04 01:01:46 -07:00
parent 40cf6e43c0
commit 83ae66cebc
7 changed files with 167 additions and 30 deletions


@@ -34,9 +34,12 @@ type GpuLimits struct {
 type AutoscalingOptions struct {
     // MaxEmptyBulkDelete is a number of empty nodes that can be removed at the same time.
     MaxEmptyBulkDelete int
-    // ScaleDownUtilizationThreshold sets threshold for nodes to be considered for scale down.
+    // ScaleDownUtilizationThreshold sets threshold for nodes to be considered for scale down if cpu or memory utilization is over threshold.
     // Well-utilized nodes are not touched.
     ScaleDownUtilizationThreshold float64
+    // ScaleDownGpuUtilizationThreshold sets threshold for gpu nodes to be considered for scale down if gpu utilization is over threshold.
+    // Well-utilized nodes are not touched.
+    ScaleDownGpuUtilizationThreshold float64
     // ScaleDownUnneededTime sets the duration CA expects a node to be unneeded/eligible for removal
     // before scaling down the node.
     ScaleDownUnneededTime time.Duration
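For illustration only (not part of the commit): the two thresholds are meant to be set side by side, and GPU nodes are judged solely by the new field. A minimal sketch with arbitrary values; config here is the cluster-autoscaler config package modified above:

    opts := config.AutoscalingOptions{
        ScaleDownUtilizationThreshold:    0.5, // non-GPU nodes: max(cpu, memory) request utilization
        ScaleDownGpuUtilizationThreshold: 0.3, // GPU nodes: only gpu request utilization is compared
    }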


@@ -402,16 +402,16 @@ func (sd *ScaleDown) UpdateUnneededNodes(
             klog.Errorf("Node info for %s not found", node.Name)
             continue
         }
-        utilInfo, err := simulator.CalculateUtilization(node, nodeInfo, sd.context.IgnoreDaemonSetsUtilization, sd.context.IgnoreMirrorPodsUtilization)
+        utilInfo, err := simulator.CalculateUtilization(node, nodeInfo, sd.context.IgnoreDaemonSetsUtilization, sd.context.IgnoreMirrorPodsUtilization, sd.context.CloudProvider.GPULabel())
         if err != nil {
             klog.Warningf("Failed to calculate utilization for %s: %v", node.Name, err)
         }

-        klog.V(4).Infof("Node %s - utilization %f", node.Name, utilInfo.Utilization)
+        klog.V(4).Infof("Node %s - %s utilization %f", node.Name, utilInfo.ResourceName, utilInfo.Utilization)
         utilizationMap[node.Name] = utilInfo

-        if utilInfo.Utilization >= sd.context.ScaleDownUtilizationThreshold {
-            klog.V(4).Infof("Node %s is not suitable for removal - utilization too big (%f)", node.Name, utilInfo.Utilization)
+        if !sd.isNodeBelowUtilzationThreshold(node, utilInfo) {
+            klog.V(4).Infof("Node %s is not suitable for removal - %s utilization too big (%f)", node.Name, utilInfo.ResourceName, utilInfo.Utilization)
             continue
         }
         currentlyUnneededNodes = append(currentlyUnneededNodes, node)
@@ -506,6 +506,20 @@ func (sd *ScaleDown) UpdateUnneededNodes(
     return nil
 }

+// isNodeBelowUtilzationThreshold determines if a given node's utilization is below the threshold.
+func (sd *ScaleDown) isNodeBelowUtilzationThreshold(node *apiv1.Node, utilInfo simulator.UtilizationInfo) bool {
+    if gpu.NodeHasGpu(sd.context.CloudProvider.GPULabel(), node) {
+        if utilInfo.Utilization >= sd.context.ScaleDownGpuUtilizationThreshold {
+            return false
+        }
+    } else {
+        if utilInfo.Utilization >= sd.context.ScaleDownUtilizationThreshold {
+            return false
+        }
+    }
+    return true
+}
+
 // updateUnremovableNodes updates unremovableNodes map according to current
 // state of the cluster. Removes from the map nodes that are no longer in the
 // nodes list.
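To spell out the new isNodeBelowUtilzationThreshold check above: a node labeled as a GPU node is judged only on its GPU utilization, every other node keeps the existing cpu/memory rule. A hypothetical helper, not in the diff, restating the same selection:

    // Illustrative only: the threshold that isNodeBelowUtilzationThreshold effectively applies.
    func effectiveThreshold(isGpuNode bool, cpuMemThreshold, gpuThreshold float64) float64 {
        if isGpuNode {
            return gpuThreshold
        }
        return cpuMemThreshold
    }

A node remains a scale-down candidate only while utilInfo.Utilization is strictly below that value.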


@@ -172,6 +172,66 @@ func TestFindUnneededNodes(t *testing.T) {
     assert.Equal(t, 0, len(sd.unremovableNodes))
 }

+func TestFindUnneededGPUNodes(t *testing.T) {
+    // shared owner reference
+    ownerRef := GenerateOwnerReferences("rs", "ReplicaSet", "extensions/v1beta1", "")
+
+    p1 := BuildTestPod("p1", 100, 0)
+    p1.Spec.NodeName = "n1"
+    p1.OwnerReferences = ownerRef
+    RequestGpuForPod(p1, 1)
+
+    p2 := BuildTestPod("p2", 400, 0)
+    p2.Spec.NodeName = "n2"
+    p2.OwnerReferences = ownerRef
+    RequestGpuForPod(p2, 1)
+
+    p3 := BuildTestPod("p3", 300, 0)
+    p3.Spec.NodeName = "n3"
+    p3.OwnerReferences = ownerRef
+    p3.ObjectMeta.Annotations["cluster-autoscaler.kubernetes.io/safe-to-evict"] = "false"
+    RequestGpuForPod(p3, 1)
+
+    // Node with low cpu utilization and high gpu utilization
+    n1 := BuildTestNode("n1", 1000, 10)
+    AddGpusToNode(n1, 2)
+    // Node with high cpu utilization and low gpu utilization
+    n2 := BuildTestNode("n2", 1000, 10)
+    AddGpusToNode(n2, 4)
+    // Node with low gpu utilization and pods on node can not be interrupted
+    n3 := BuildTestNode("n3", 1000, 10)
+    AddGpusToNode(n3, 8)
+
+    SetNodeReadyState(n1, true, time.Time{})
+    SetNodeReadyState(n2, true, time.Time{})
+    SetNodeReadyState(n3, true, time.Time{})
+
+    provider := testprovider.NewTestCloudProvider(nil, nil)
+    provider.AddNodeGroup("ng1", 1, 10, 2)
+    provider.AddNode("ng1", n1)
+    provider.AddNode("ng1", n2)
+    provider.AddNode("ng1", n3)
+
+    options := config.AutoscalingOptions{
+        ScaleDownUtilizationThreshold:    0.35,
+        ScaleDownGpuUtilizationThreshold: 0.3,
+        UnremovableNodeRecheckTimeout:    5 * time.Minute,
+    }
+    context := NewScaleTestAutoscalingContext(options, &fake.Clientset{}, nil, provider)
+
+    clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, newBackoff())
+    sd := NewScaleDown(&context, clusterStateRegistry)
+
+    sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3}, []*apiv1.Node{n1, n2, n3},
+        []*apiv1.Pod{p1, p2, p3}, time.Now(), nil)
+
+    assert.Equal(t, 1, len(sd.unneededNodes))
+    _, found := sd.unneededNodes["n2"]
+    assert.True(t, found)
+    assert.Contains(t, sd.podLocationHints, p2.Namespace+"/"+p2.Name)
+    assert.Equal(t, 3, len(sd.nodeUtilizationMap))
+}
+
 func TestPodsWithPrioritiesFindUnneededNodes(t *testing.T) {
     // shared owner reference
     ownerRef := GenerateOwnerReferences("rs", "ReplicaSet", "extensions/v1beta1", "")
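Working the expected result out from the fixtures in TestFindUnneededGPUNodes above (each pod requests 1 GPU; the thresholds are 0.35 for cpu/memory and 0.3 for GPU):

    n1: gpu 1/2 = 0.50  >= 0.3 -> kept, even though its cpu utilization (100/1000 = 0.10) is low
    n2: gpu 1/4 = 0.25  <  0.3 -> unneeded, even though cpu 400/1000 = 0.40 exceeds the 0.35 cpu threshold
    n3: gpu 1/8 = 0.125 <  0.3, but p3 is annotated safe-to-evict=false, so the node cannot be drained

Hence exactly one unneeded node, n2, which is what the assertions check.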
@@ -812,14 +872,15 @@ func assertSubset(t *testing.T, a []string, b []string) {
 }

 var defaultScaleDownOptions = config.AutoscalingOptions{
-    ScaleDownUtilizationThreshold: 0.5,
-    ScaleDownUnneededTime:         time.Minute,
-    MaxGracefulTerminationSec:     60,
-    MaxEmptyBulkDelete:            10,
-    MinCoresTotal:                 0,
-    MinMemoryTotal:                0,
-    MaxCoresTotal:                 config.DefaultMaxClusterCores,
-    MaxMemoryTotal:                config.DefaultMaxClusterMemory * units.GiB,
+    ScaleDownUtilizationThreshold:    0.5,
+    ScaleDownGpuUtilizationThreshold: 0.5,
+    ScaleDownUnneededTime:            time.Minute,
+    MaxGracefulTerminationSec:        60,
+    MaxEmptyBulkDelete:               10,
+    MinCoresTotal:                    0,
+    MinMemoryTotal:                   0,
+    MaxCoresTotal:                    config.DefaultMaxClusterCores,
+    MaxMemoryTotal:                   config.DefaultMaxClusterMemory * units.GiB,
 }

 func TestScaleDownEmptyMultipleNodeGroups(t *testing.T) {


@@ -93,7 +93,10 @@ var (
     scaleDownUnreadyTime = flag.Duration("scale-down-unready-time", 20*time.Minute,
         "How long an unready node should be unneeded before it is eligible for scale down")
     scaleDownUtilizationThreshold = flag.Float64("scale-down-utilization-threshold", 0.5,
-        "Node utilization level, defined as sum of requested resources divided by capacity, below which a node can be considered for scale down")
+        "Sum of cpu or memory of all pods running on the node divided by node's corresponding allocatable resource, below which a node can be considered for scale down")
+    scaleDownGpuUtilizationThreshold = flag.Float64("scale-down-gpu-utilization-threshold", 0.5,
+        "Sum of gpu requests of all pods running on the node divided by node's allocatable resource, below which a node can be considered for scale down."+
+            "Utilization calculation only cares about gpu resource for accelerator node. cpu and memory utilization will be ignored.")
     scaleDownNonEmptyCandidatesCount = flag.Int("scale-down-non-empty-candidates-count", 30,
         "Maximum number of non empty nodes considered in one iteration as candidates for scale down with drain."+
             "Lower value means better CA responsiveness but possible slower scale down latency."+
@@ -210,6 +213,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
         ScaleDownUnneededTime:            *scaleDownUnneededTime,
         ScaleDownUnreadyTime:             *scaleDownUnreadyTime,
         ScaleDownUtilizationThreshold:    *scaleDownUtilizationThreshold,
+        ScaleDownGpuUtilizationThreshold: *scaleDownGpuUtilizationThreshold,
         ScaleDownNonEmptyCandidatesCount: *scaleDownNonEmptyCandidatesCount,
         ScaleDownCandidatesPoolRatio:     *scaleDownCandidatesPoolRatio,
         ScaleDownCandidatesPoolMinCount:  *scaleDownCandidatesPoolMinCount,
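Both flags above default to 0.5, so behavior only changes when the new flag is set or when a node is an accelerator node, in which case the measured resource changes. A hypothetical example with the defaults:

    accelerator node with 8 allocatable GPUs, pods requesting 3 GPUs and 70% of its cpu:
        before this commit: utilization = max(cpu, mem) = 0.70 >= 0.5 -> never a scale-down candidate
        after this commit:  utilization = gpu = 3/8 = 0.375 < 0.5     -> eligible for scale down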


@@ -19,13 +19,13 @@ package simulator
 import (
     "flag"
     "fmt"
-    "math"
     "math/rand"
     "time"

     "k8s.io/autoscaler/cluster-autoscaler/utils/drain"
     "k8s.io/autoscaler/cluster-autoscaler/utils/errors"
     "k8s.io/autoscaler/cluster-autoscaler/utils/glogx"
+    "k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
     kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
     scheduler_util "k8s.io/autoscaler/cluster-autoscaler/utils/scheduler"
     "k8s.io/autoscaler/cluster-autoscaler/utils/tpu"
@@ -62,7 +62,10 @@ type NodeToBeRemoved struct {
 type UtilizationInfo struct {
     CpuUtil float64
     MemUtil float64
-    // Max(CpuUtil, MemUtil).
+    GpuUtil float64
+    // Resource name of the resource with the highest utilization.
+    ResourceName apiv1.ResourceName
+    // Max(CpuUtil, MemUtil) or GpuUtil.
     Utilization float64
 }
@@ -149,10 +152,22 @@ func FindEmptyNodesToRemove(candidates []*apiv1.Node, pods []*apiv1.Pod) []*apiv1.Node {
     return result
 }

-// CalculateUtilization calculates utilization of a node, defined as maximum of (cpu, memory) utilization.
-// Per resource utilization is the sum of requests for it divided by allocatable. It also returns the individual
-// cpu and memory utilization.
-func CalculateUtilization(node *apiv1.Node, nodeInfo *schedulernodeinfo.NodeInfo, skipDaemonSetPods, skipMirrorPods bool) (utilInfo UtilizationInfo, err error) {
+// CalculateUtilization calculates utilization of a node, defined as maximum of (cpu, memory) or gpu utilization
+// based on if the node has GPU or not. Per resource utilization is the sum of requests for it divided by allocatable.
+// It also returns the individual cpu, memory and gpu utilization.
+func CalculateUtilization(node *apiv1.Node, nodeInfo *schedulernodeinfo.NodeInfo, skipDaemonSetPods, skipMirrorPods bool, gpuLabel string) (utilInfo UtilizationInfo, err error) {
+    if gpu.NodeHasGpu(gpuLabel, node) {
+        gpuUtil, err := calculateUtilizationOfResource(node, nodeInfo, gpu.ResourceNvidiaGPU, skipDaemonSetPods, skipMirrorPods)
+        if err != nil {
+            klog.V(3).Infof("node %s has unready GPU", node.Name)
+            // Return 0 if GPU is unready. This will guarantee we can still scale down a node with unready GPU.
+            return UtilizationInfo{GpuUtil: 0, ResourceName: gpu.ResourceNvidiaGPU, Utilization: 0}, nil
+        }
+        // Skips cpu and memory utilization calculation for node with GPU.
+        return UtilizationInfo{GpuUtil: gpuUtil, ResourceName: gpu.ResourceNvidiaGPU, Utilization: gpuUtil}, nil
+    }
+
     cpu, err := calculateUtilizationOfResource(node, nodeInfo, apiv1.ResourceCPU, skipDaemonSetPods, skipMirrorPods)
     if err != nil {
         return UtilizationInfo{}, err
@@ -161,7 +176,18 @@ func CalculateUtilization(node *apiv1.Node, nodeInfo *schedulernodeinfo.NodeInfo
     if err != nil {
         return UtilizationInfo{}, err
     }
-    return UtilizationInfo{CpuUtil: cpu, MemUtil: mem, Utilization: math.Max(cpu, mem)}, nil
+
+    utilization := UtilizationInfo{CpuUtil: cpu, MemUtil: mem}
+
+    if cpu > mem {
+        utilization.ResourceName = apiv1.ResourceCPU
+        utilization.Utilization = cpu
+    } else {
+        utilization.ResourceName = apiv1.ResourceMemory
+        utilization.Utilization = mem
+    }
+
+    return utilization, nil
 }

 func calculateUtilizationOfResource(node *apiv1.Node, nodeInfo *schedulernodeinfo.NodeInfo, resourceName apiv1.ResourceName, skipDaemonSetPods, skipMirrorPods bool) (float64, error) {
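The numbers asserted in the tests below follow directly from this definition of CalculateUtilization. Working through the three cases in TestUtilization:

    non-GPU node (2000m cpu, 2000000 memory) with two pods each requesting 100m cpu / 200000 memory:
        CpuUtil = 200/2000 = 0.1, MemUtil = 400000/2000000 = 0.2 -> Utilization = 0.2 (memory)

    node with 1 allocatable nvidia.com/gpu and one pod requesting 1 GPU:
        GpuUtil = 1/1 = 1.0 -> Utilization = 1.0 (cpu and memory are never computed)

    node carrying the GPU label but with no allocatable GPUs yet (device plugin not ready):
        the per-resource calculation errors out, so Utilization = 0 and the node stays eligible for scale down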


@@ -31,6 +31,7 @@ import (
 )

 func TestUtilization(t *testing.T) {
+    gpuLabel := GetGPULabel()
     pod := BuildTestPod("p1", 100, 200000)
     pod2 := BuildTestPod("p2", -1, -1)
@@ -38,25 +39,25 @@ func TestUtilization(t *testing.T) {
     node := BuildTestNode("node1", 2000, 2000000)
     SetNodeReadyState(node, true, time.Time{})

-    utilInfo, err := CalculateUtilization(node, nodeInfo, false, false)
+    utilInfo, err := CalculateUtilization(node, nodeInfo, false, false, gpuLabel)
     assert.NoError(t, err)
     assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)

     node2 := BuildTestNode("node1", 2000, -1)

-    _, err = CalculateUtilization(node2, nodeInfo, false, false)
+    _, err = CalculateUtilization(node2, nodeInfo, false, false, gpuLabel)
     assert.Error(t, err)

     daemonSetPod3 := BuildTestPod("p3", 100, 200000)
     daemonSetPod3.OwnerReferences = GenerateOwnerReferences("ds", "DaemonSet", "apps/v1", "")

     nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod, pod2, daemonSetPod3)
-    utilInfo, err = CalculateUtilization(node, nodeInfo, true, false)
+    utilInfo, err = CalculateUtilization(node, nodeInfo, true, false, gpuLabel)
     assert.NoError(t, err)
     assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)

     nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod2, daemonSetPod3)
-    utilInfo, err = CalculateUtilization(node, nodeInfo, false, false)
+    utilInfo, err = CalculateUtilization(node, nodeInfo, false, false, gpuLabel)
     assert.NoError(t, err)
     assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)
@@ -66,14 +67,31 @@ func TestUtilization(t *testing.T) {
     }

     nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod, pod2, mirrorPod4)
-    utilInfo, err = CalculateUtilization(node, nodeInfo, false, true)
+    utilInfo, err = CalculateUtilization(node, nodeInfo, false, true, gpuLabel)
     assert.NoError(t, err)
     assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)

     nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod2, mirrorPod4)
-    utilInfo, err = CalculateUtilization(node, nodeInfo, false, false)
+    utilInfo, err = CalculateUtilization(node, nodeInfo, false, false, gpuLabel)
     assert.NoError(t, err)
     assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)

+    gpuNode := BuildTestNode("gpu_node", 2000, 2000000)
+    AddGpusToNode(gpuNode, 1)
+    gpuPod := BuildTestPod("gpu_pod", 100, 200000)
+    RequestGpuForPod(gpuPod, 1)
+    nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod, gpuPod)
+    utilInfo, err = CalculateUtilization(gpuNode, nodeInfo, false, false, gpuLabel)
+    assert.NoError(t, err)
+    assert.InEpsilon(t, 1/1, utilInfo.Utilization, 0.01)
+
+    // Node with Unready GPU
+    gpuNode = BuildTestNode("gpu_node", 2000, 2000000)
+    AddGpuLabelToNode(gpuNode)
+    nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod)
+    utilInfo, err = CalculateUtilization(gpuNode, nodeInfo, false, false, gpuLabel)
+    assert.NoError(t, err)
+    assert.Zero(t, utilInfo.Utilization)
 }

 func TestFindPlaceAllOk(t *testing.T) {


@@ -39,9 +39,10 @@ import (
 func BuildTestPod(name string, cpu int64, mem int64) *apiv1.Pod {
     pod := &apiv1.Pod{
         ObjectMeta: metav1.ObjectMeta{
-            Namespace: "default",
-            Name:      name,
-            SelfLink:  fmt.Sprintf("/api/v1/namespaces/default/pods/%s", name),
+            Namespace:   "default",
+            Name:        name,
+            SelfLink:    fmt.Sprintf("/api/v1/namespaces/default/pods/%s", name),
+            Annotations: map[string]string{},
         },
         Spec: apiv1.PodSpec{
             Containers: []apiv1.Container{
@@ -128,9 +129,19 @@ func AddGpusToNode(node *apiv1.Node, gpusCount int64) {
     })
     node.Status.Capacity[resourceNvidiaGPU] = *resource.NewQuantity(gpusCount, resource.DecimalSI)
     node.Status.Allocatable[resourceNvidiaGPU] = *resource.NewQuantity(gpusCount, resource.DecimalSI)
+    AddGpuLabelToNode(node)
+}
+
+// AddGpuLabelToNode adds the GPU label to the given node. It is used to mock the intermediate state in which a node's GPU is not ready yet.
+func AddGpuLabelToNode(node *apiv1.Node) {
     node.Labels[gpuLabel] = defaultGPUType
 }
+
+// GetGPULabel returns the GPU label used by these test helpers. It is only used in unit tests.
+func GetGPULabel() string {
+    return gpuLabel
+}

 // SetNodeReadyState sets node ready state to either ConditionTrue or ConditionFalse.
 func SetNodeReadyState(node *apiv1.Node, ready bool, lastTransition time.Time) {
     if ready {
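Taken together, the two helpers let tests set up both GPU states. A brief sketch of the intended usage; node names and sizes here are arbitrary:

    readyGpuNode := BuildTestNode("gpu-ready", 2000, 2000000)
    AddGpusToNode(readyGpuNode, 2)      // capacity/allocatable nvidia.com/gpu = 2, plus the GPU label

    unreadyGpuNode := BuildTestNode("gpu-unready", 2000, 2000000)
    AddGpuLabelToNode(unreadyGpuNode)   // label only: the device plugin has not advertised GPUs yet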