Consider GPU utilization in scaling down

Jiaxin Shan 2019-04-04 01:01:46 -07:00
parent 40cf6e43c0
commit 83ae66cebc
7 changed files with 167 additions and 30 deletions


@@ -34,9 +34,12 @@ type GpuLimits struct {
 type AutoscalingOptions struct {
     // MaxEmptyBulkDelete is a number of empty nodes that can be removed at the same time.
     MaxEmptyBulkDelete int
-    // ScaleDownUtilizationThreshold sets threshold for nodes to be considered for scale down.
+    // ScaleDownUtilizationThreshold sets threshold for nodes to be considered for scale down if cpu or memory utilization is over threshold.
     // Well-utilized nodes are not touched.
     ScaleDownUtilizationThreshold float64
+    // ScaleDownGpuUtilizationThreshold sets threshold for gpu nodes to be considered for scale down if gpu utilization is over threshold.
+    // Well-utilized nodes are not touched.
+    ScaleDownGpuUtilizationThreshold float64
     // ScaleDownUnneededTime sets the duration CA expects a node to be unneeded/eligible for removal
     // before scaling down the node.
     ScaleDownUnneededTime time.Duration
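For illustration only (not part of the commit): the two thresholds are meant to be set side by side, and GPU nodes are judged solely by the new field. A minimal sketch with arbitrary values; config here is the cluster-autoscaler config package modified above:

    opts := config.AutoscalingOptions{
        ScaleDownUtilizationThreshold:    0.5, // non-GPU nodes: max(cpu, memory) request utilization
        ScaleDownGpuUtilizationThreshold: 0.3, // GPU nodes: only gpu request utilization is compared
    }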


@@ -402,16 +402,16 @@ func (sd *ScaleDown) UpdateUnneededNodes(
             klog.Errorf("Node info for %s not found", node.Name)
             continue
         }
-        utilInfo, err := simulator.CalculateUtilization(node, nodeInfo, sd.context.IgnoreDaemonSetsUtilization, sd.context.IgnoreMirrorPodsUtilization)
+        utilInfo, err := simulator.CalculateUtilization(node, nodeInfo, sd.context.IgnoreDaemonSetsUtilization, sd.context.IgnoreMirrorPodsUtilization, sd.context.CloudProvider.GPULabel())
         if err != nil {
             klog.Warningf("Failed to calculate utilization for %s: %v", node.Name, err)
         }

-        klog.V(4).Infof("Node %s - utilization %f", node.Name, utilInfo.Utilization)
+        klog.V(4).Infof("Node %s - %s utilization %f", node.Name, utilInfo.ResourceName, utilInfo.Utilization)
         utilizationMap[node.Name] = utilInfo

-        if utilInfo.Utilization >= sd.context.ScaleDownUtilizationThreshold {
-            klog.V(4).Infof("Node %s is not suitable for removal - utilization too big (%f)", node.Name, utilInfo.Utilization)
+        if !sd.isNodeBelowUtilzationThreshold(node, utilInfo) {
+            klog.V(4).Infof("Node %s is not suitable for removal - %s utilization too big (%f)", node.Name, utilInfo.ResourceName, utilInfo.Utilization)
             continue
         }
         currentlyUnneededNodes = append(currentlyUnneededNodes, node)
@@ -506,6 +506,20 @@ func (sd *ScaleDown) UpdateUnneededNodes(
     return nil
 }

+// isNodeBelowUtilzationThreshold determines if a given node's utilization is below the threshold.
+func (sd *ScaleDown) isNodeBelowUtilzationThreshold(node *apiv1.Node, utilInfo simulator.UtilizationInfo) bool {
+    if gpu.NodeHasGpu(sd.context.CloudProvider.GPULabel(), node) {
+        if utilInfo.Utilization >= sd.context.ScaleDownGpuUtilizationThreshold {
+            return false
+        }
+    } else {
+        if utilInfo.Utilization >= sd.context.ScaleDownUtilizationThreshold {
+            return false
+        }
+    }
+    return true
+}
+
 // updateUnremovableNodes updates unremovableNodes map according to current
 // state of the cluster. Removes from the map nodes that are no longer in the
 // nodes list.
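To spell out the new isNodeBelowUtilzationThreshold check above: a node labeled as a GPU node is judged only on its GPU utilization, every other node keeps the existing cpu/memory rule. A hypothetical helper, not in the diff, restating the same selection:

    // Illustrative only: the threshold that isNodeBelowUtilzationThreshold effectively applies.
    func effectiveThreshold(isGpuNode bool, cpuMemThreshold, gpuThreshold float64) float64 {
        if isGpuNode {
            return gpuThreshold
        }
        return cpuMemThreshold
    }

A node remains a scale-down candidate only while utilInfo.Utilization is strictly below that value.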


@@ -172,6 +172,66 @@ func TestFindUnneededNodes(t *testing.T) {
     assert.Equal(t, 0, len(sd.unremovableNodes))
 }

+func TestFindUnneededGPUNodes(t *testing.T) {
+    // shared owner reference
+    ownerRef := GenerateOwnerReferences("rs", "ReplicaSet", "extensions/v1beta1", "")
+
+    p1 := BuildTestPod("p1", 100, 0)
+    p1.Spec.NodeName = "n1"
+    p1.OwnerReferences = ownerRef
+    RequestGpuForPod(p1, 1)
+
+    p2 := BuildTestPod("p2", 400, 0)
+    p2.Spec.NodeName = "n2"
+    p2.OwnerReferences = ownerRef
+    RequestGpuForPod(p2, 1)
+
+    p3 := BuildTestPod("p3", 300, 0)
+    p3.Spec.NodeName = "n3"
+    p3.OwnerReferences = ownerRef
+    p3.ObjectMeta.Annotations["cluster-autoscaler.kubernetes.io/safe-to-evict"] = "false"
+    RequestGpuForPod(p3, 1)
+
+    // Node with low cpu utilization and high gpu utilization
+    n1 := BuildTestNode("n1", 1000, 10)
+    AddGpusToNode(n1, 2)
+    // Node with high cpu utilization and low gpu utilization
+    n2 := BuildTestNode("n2", 1000, 10)
+    AddGpusToNode(n2, 4)
+    // Node with low gpu utilization and pods on node can not be interrupted
+    n3 := BuildTestNode("n3", 1000, 10)
+    AddGpusToNode(n3, 8)
+
+    SetNodeReadyState(n1, true, time.Time{})
+    SetNodeReadyState(n2, true, time.Time{})
+    SetNodeReadyState(n3, true, time.Time{})
+
+    provider := testprovider.NewTestCloudProvider(nil, nil)
+    provider.AddNodeGroup("ng1", 1, 10, 2)
+    provider.AddNode("ng1", n1)
+    provider.AddNode("ng1", n2)
+    provider.AddNode("ng1", n3)
+
+    options := config.AutoscalingOptions{
+        ScaleDownUtilizationThreshold:    0.35,
+        ScaleDownGpuUtilizationThreshold: 0.3,
+        UnremovableNodeRecheckTimeout:    5 * time.Minute,
+    }
+    context := NewScaleTestAutoscalingContext(options, &fake.Clientset{}, nil, provider)
+
+    clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, newBackoff())
+    sd := NewScaleDown(&context, clusterStateRegistry)
+
+    sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3}, []*apiv1.Node{n1, n2, n3},
+        []*apiv1.Pod{p1, p2, p3}, time.Now(), nil)
+
+    assert.Equal(t, 1, len(sd.unneededNodes))
+    _, found := sd.unneededNodes["n2"]
+    assert.True(t, found)
+    assert.Contains(t, sd.podLocationHints, p2.Namespace+"/"+p2.Name)
+    assert.Equal(t, 3, len(sd.nodeUtilizationMap))
+}
+
 func TestPodsWithPrioritiesFindUnneededNodes(t *testing.T) {
     // shared owner reference
     ownerRef := GenerateOwnerReferences("rs", "ReplicaSet", "extensions/v1beta1", "")
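Working the expected result out from the fixtures in TestFindUnneededGPUNodes above (each pod requests 1 GPU; the thresholds are 0.35 for cpu/memory and 0.3 for GPU):

    n1: gpu 1/2 = 0.50  >= 0.3 -> kept, even though its cpu utilization (100/1000 = 0.10) is low
    n2: gpu 1/4 = 0.25  <  0.3 -> unneeded, even though cpu 400/1000 = 0.40 exceeds the 0.35 cpu threshold
    n3: gpu 1/8 = 0.125 <  0.3, but p3 is annotated safe-to-evict=false, so the node cannot be drained

Hence exactly one unneeded node, n2, which is what the assertions check.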
@@ -812,14 +872,15 @@ func assertSubset(t *testing.T, a []string, b []string) {
 }

 var defaultScaleDownOptions = config.AutoscalingOptions{
-    ScaleDownUtilizationThreshold: 0.5,
-    ScaleDownUnneededTime:         time.Minute,
-    MaxGracefulTerminationSec:     60,
-    MaxEmptyBulkDelete:            10,
-    MinCoresTotal:                 0,
-    MinMemoryTotal:                0,
-    MaxCoresTotal:                 config.DefaultMaxClusterCores,
-    MaxMemoryTotal:                config.DefaultMaxClusterMemory * units.GiB,
+    ScaleDownUtilizationThreshold:    0.5,
+    ScaleDownGpuUtilizationThreshold: 0.5,
+    ScaleDownUnneededTime:            time.Minute,
+    MaxGracefulTerminationSec:        60,
+    MaxEmptyBulkDelete:               10,
+    MinCoresTotal:                    0,
+    MinMemoryTotal:                   0,
+    MaxCoresTotal:                    config.DefaultMaxClusterCores,
+    MaxMemoryTotal:                   config.DefaultMaxClusterMemory * units.GiB,
 }

 func TestScaleDownEmptyMultipleNodeGroups(t *testing.T) {


@@ -93,7 +93,10 @@ var (
     scaleDownUnreadyTime = flag.Duration("scale-down-unready-time", 20*time.Minute,
         "How long an unready node should be unneeded before it is eligible for scale down")
     scaleDownUtilizationThreshold = flag.Float64("scale-down-utilization-threshold", 0.5,
-        "Node utilization level, defined as sum of requested resources divided by capacity, below which a node can be considered for scale down")
+        "Sum of cpu or memory of all pods running on the node divided by node's corresponding allocatable resource, below which a node can be considered for scale down")
+    scaleDownGpuUtilizationThreshold = flag.Float64("scale-down-gpu-utilization-threshold", 0.5,
+        "Sum of gpu requests of all pods running on the node divided by node's allocatable resource, below which a node can be considered for scale down."+
+            "Utilization calculation only cares about gpu resource for accelerator node. cpu and memory utilization will be ignored.")
     scaleDownNonEmptyCandidatesCount = flag.Int("scale-down-non-empty-candidates-count", 30,
         "Maximum number of non empty nodes considered in one iteration as candidates for scale down with drain."+
             "Lower value means better CA responsiveness but possible slower scale down latency."+
@@ -210,6 +213,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
         ScaleDownUnneededTime:            *scaleDownUnneededTime,
         ScaleDownUnreadyTime:             *scaleDownUnreadyTime,
         ScaleDownUtilizationThreshold:    *scaleDownUtilizationThreshold,
+        ScaleDownGpuUtilizationThreshold: *scaleDownGpuUtilizationThreshold,
         ScaleDownNonEmptyCandidatesCount: *scaleDownNonEmptyCandidatesCount,
         ScaleDownCandidatesPoolRatio:     *scaleDownCandidatesPoolRatio,
         ScaleDownCandidatesPoolMinCount:  *scaleDownCandidatesPoolMinCount,
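Both flags above default to 0.5, so behavior only changes when the new flag is set or when a node is an accelerator node, in which case the measured resource changes. A hypothetical example with the defaults:

    accelerator node with 8 allocatable GPUs, pods requesting 3 GPUs and 70% of its cpu:
        before this commit: utilization = max(cpu, mem) = 0.70 >= 0.5 -> never a scale-down candidate
        after this commit:  utilization = gpu = 3/8 = 0.375 < 0.5     -> eligible for scale down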


@@ -19,13 +19,13 @@ package simulator
 import (
     "flag"
     "fmt"
-    "math"
     "math/rand"
     "time"

     "k8s.io/autoscaler/cluster-autoscaler/utils/drain"
     "k8s.io/autoscaler/cluster-autoscaler/utils/errors"
     "k8s.io/autoscaler/cluster-autoscaler/utils/glogx"
+    "k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
     kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
     scheduler_util "k8s.io/autoscaler/cluster-autoscaler/utils/scheduler"
     "k8s.io/autoscaler/cluster-autoscaler/utils/tpu"
@@ -62,7 +62,10 @@ type NodeToBeRemoved struct {
 type UtilizationInfo struct {
     CpuUtil float64
     MemUtil float64
-    // Max(CpuUtil, MemUtil).
+    GpuUtil float64
+    // Resource name of the resource with the highest utilization.
+    ResourceName apiv1.ResourceName
+    // Max(CpuUtil, MemUtil) or GpuUtil.
     Utilization float64
 }
@@ -149,10 +152,22 @@ func FindEmptyNodesToRemove(candidates []*apiv1.Node, pods []*apiv1.Pod) []*apiv1.Node {
     return result
 }

-// CalculateUtilization calculates utilization of a node, defined as maximum of (cpu, memory) utilization.
-// Per resource utilization is the sum of requests for it divided by allocatable. It also returns the individual
-// cpu and memory utilization.
-func CalculateUtilization(node *apiv1.Node, nodeInfo *schedulernodeinfo.NodeInfo, skipDaemonSetPods, skipMirrorPods bool) (utilInfo UtilizationInfo, err error) {
+// CalculateUtilization calculates utilization of a node, defined as maximum of (cpu, memory) or gpu utilization
+// based on if the node has GPU or not. Per resource utilization is the sum of requests for it divided by allocatable.
+// It also returns the individual cpu, memory and gpu utilization.
+func CalculateUtilization(node *apiv1.Node, nodeInfo *schedulernodeinfo.NodeInfo, skipDaemonSetPods, skipMirrorPods bool, gpuLabel string) (utilInfo UtilizationInfo, err error) {
+    if gpu.NodeHasGpu(gpuLabel, node) {
+        gpuUtil, err := calculateUtilizationOfResource(node, nodeInfo, gpu.ResourceNvidiaGPU, skipDaemonSetPods, skipMirrorPods)
+        if err != nil {
+            klog.V(3).Infof("node %s has unready GPU", node.Name)
+            // Return 0 if GPU is unready. This will guarantee we can still scale down a node with unready GPU.
+            return UtilizationInfo{GpuUtil: 0, ResourceName: gpu.ResourceNvidiaGPU, Utilization: 0}, nil
+        }
+        // Skips cpu and memory utilization calculation for node with GPU.
+        return UtilizationInfo{GpuUtil: gpuUtil, ResourceName: gpu.ResourceNvidiaGPU, Utilization: gpuUtil}, nil
+    }
+
     cpu, err := calculateUtilizationOfResource(node, nodeInfo, apiv1.ResourceCPU, skipDaemonSetPods, skipMirrorPods)
     if err != nil {
         return UtilizationInfo{}, err
@@ -161,7 +176,18 @@ func CalculateUtilization(node *apiv1.Node, nodeInfo *schedulernodeinfo.NodeInfo
     if err != nil {
         return UtilizationInfo{}, err
     }
-    return UtilizationInfo{CpuUtil: cpu, MemUtil: mem, Utilization: math.Max(cpu, mem)}, nil
+
+    utilization := UtilizationInfo{CpuUtil: cpu, MemUtil: mem}
+
+    if cpu > mem {
+        utilization.ResourceName = apiv1.ResourceCPU
+        utilization.Utilization = cpu
+    } else {
+        utilization.ResourceName = apiv1.ResourceMemory
+        utilization.Utilization = mem
+    }
+
+    return utilization, nil
 }

 func calculateUtilizationOfResource(node *apiv1.Node, nodeInfo *schedulernodeinfo.NodeInfo, resourceName apiv1.ResourceName, skipDaemonSetPods, skipMirrorPods bool) (float64, error) {
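The numbers asserted in the tests below follow directly from this definition of CalculateUtilization. Working through the three cases in TestUtilization:

    non-GPU node (2000m cpu, 2000000 memory) with two pods each requesting 100m cpu / 200000 memory:
        CpuUtil = 200/2000 = 0.1, MemUtil = 400000/2000000 = 0.2 -> Utilization = 0.2 (memory)

    node with 1 allocatable nvidia.com/gpu and one pod requesting 1 GPU:
        GpuUtil = 1/1 = 1.0 -> Utilization = 1.0 (cpu and memory are never computed)

    node carrying the GPU label but with no allocatable GPUs yet (device plugin not ready):
        the per-resource calculation errors out, so Utilization = 0 and the node stays eligible for scale down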


@@ -31,6 +31,7 @@ import (
 )

 func TestUtilization(t *testing.T) {
+    gpuLabel := GetGPULabel()
     pod := BuildTestPod("p1", 100, 200000)
     pod2 := BuildTestPod("p2", -1, -1)
@@ -38,25 +39,25 @@ func TestUtilization(t *testing.T) {
     node := BuildTestNode("node1", 2000, 2000000)
     SetNodeReadyState(node, true, time.Time{})

-    utilInfo, err := CalculateUtilization(node, nodeInfo, false, false)
+    utilInfo, err := CalculateUtilization(node, nodeInfo, false, false, gpuLabel)
     assert.NoError(t, err)
     assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)

     node2 := BuildTestNode("node1", 2000, -1)

-    _, err = CalculateUtilization(node2, nodeInfo, false, false)
+    _, err = CalculateUtilization(node2, nodeInfo, false, false, gpuLabel)
     assert.Error(t, err)

     daemonSetPod3 := BuildTestPod("p3", 100, 200000)
     daemonSetPod3.OwnerReferences = GenerateOwnerReferences("ds", "DaemonSet", "apps/v1", "")

     nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod, pod2, daemonSetPod3)
-    utilInfo, err = CalculateUtilization(node, nodeInfo, true, false)
+    utilInfo, err = CalculateUtilization(node, nodeInfo, true, false, gpuLabel)
     assert.NoError(t, err)
     assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)

     nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod2, daemonSetPod3)
-    utilInfo, err = CalculateUtilization(node, nodeInfo, false, false)
+    utilInfo, err = CalculateUtilization(node, nodeInfo, false, false, gpuLabel)
     assert.NoError(t, err)
     assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)
@@ -66,14 +67,31 @@ func TestUtilization(t *testing.T) {
     }

     nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod, pod2, mirrorPod4)
-    utilInfo, err = CalculateUtilization(node, nodeInfo, false, true)
+    utilInfo, err = CalculateUtilization(node, nodeInfo, false, true, gpuLabel)
     assert.NoError(t, err)
     assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)

     nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod2, mirrorPod4)
-    utilInfo, err = CalculateUtilization(node, nodeInfo, false, false)
+    utilInfo, err = CalculateUtilization(node, nodeInfo, false, false, gpuLabel)
     assert.NoError(t, err)
     assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)

+    gpuNode := BuildTestNode("gpu_node", 2000, 2000000)
+    AddGpusToNode(gpuNode, 1)
+    gpuPod := BuildTestPod("gpu_pod", 100, 200000)
+    RequestGpuForPod(gpuPod, 1)
+    nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod, gpuPod)
+    utilInfo, err = CalculateUtilization(gpuNode, nodeInfo, false, false, gpuLabel)
+    assert.NoError(t, err)
+    assert.InEpsilon(t, 1/1, utilInfo.Utilization, 0.01)
+
+    // Node with Unready GPU
+    gpuNode = BuildTestNode("gpu_node", 2000, 2000000)
+    AddGpuLabelToNode(gpuNode)
+    nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod)
+    utilInfo, err = CalculateUtilization(gpuNode, nodeInfo, false, false, gpuLabel)
+    assert.NoError(t, err)
+    assert.Zero(t, utilInfo.Utilization)
 }

 func TestFindPlaceAllOk(t *testing.T) {


@@ -39,9 +39,10 @@ import (
 func BuildTestPod(name string, cpu int64, mem int64) *apiv1.Pod {
     pod := &apiv1.Pod{
         ObjectMeta: metav1.ObjectMeta{
-            Namespace: "default",
-            Name:      name,
-            SelfLink:  fmt.Sprintf("/api/v1/namespaces/default/pods/%s", name),
+            Namespace:   "default",
+            Name:        name,
+            SelfLink:    fmt.Sprintf("/api/v1/namespaces/default/pods/%s", name),
+            Annotations: map[string]string{},
         },
         Spec: apiv1.PodSpec{
             Containers: []apiv1.Container{
@@ -128,9 +129,19 @@ func AddGpusToNode(node *apiv1.Node, gpusCount int64) {
     })
     node.Status.Capacity[resourceNvidiaGPU] = *resource.NewQuantity(gpusCount, resource.DecimalSI)
     node.Status.Allocatable[resourceNvidiaGPU] = *resource.NewQuantity(gpusCount, resource.DecimalSI)
+    AddGpuLabelToNode(node)
+}
+
+// AddGpuLabelToNode adds the GPU label to the given node. It is used to mock the intermediate state in which a node's GPU is not ready yet.
+func AddGpuLabelToNode(node *apiv1.Node) {
     node.Labels[gpuLabel] = defaultGPUType
 }
+
+// GetGPULabel returns the GPU label used by these test helpers. It is only used in unit tests.
+func GetGPULabel() string {
+    return gpuLabel
+}

 // SetNodeReadyState sets node ready state to either ConditionTrue or ConditionFalse.
 func SetNodeReadyState(node *apiv1.Node, ready bool, lastTransition time.Time) {
     if ready {
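Taken together, the two helpers let tests set up both GPU states. A brief sketch of the intended usage; node names and sizes here are arbitrary:

    readyGpuNode := BuildTestNode("gpu-ready", 2000, 2000000)
    AddGpusToNode(readyGpuNode, 2)      // capacity/allocatable nvidia.com/gpu = 2, plus the GPU label

    unreadyGpuNode := BuildTestNode("gpu-unready", 2000, 2000000)
    AddGpuLabelToNode(unreadyGpuNode)   // label only: the device plugin has not advertised GPUs yet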