Consider GPU utilization in scaling down

Jiaxin Shan 2019-04-04 01:01:46 -07:00
parent 40cf6e43c0
commit 83ae66cebc
7 changed files with 167 additions and 30 deletions

View File

@ -34,9 +34,12 @@ type GpuLimits struct {
type AutoscalingOptions struct {
// MaxEmptyBulkDelete is a number of empty nodes that can be removed at the same time.
MaxEmptyBulkDelete int
// ScaleDownUtilizationThreshold sets threshold for nodes to be considered for scale down.
// ScaleDownUtilizationThreshold sets the threshold for a node's cpu or memory utilization below which the node can be considered for scale down.
// Well-utilized nodes are not touched.
ScaleDownUtilizationThreshold float64
// ScaleDownGpuUtilizationThreshold sets the threshold for a GPU node's gpu utilization below which the node can be considered for scale down.
// Well-utilized nodes are not touched.
ScaleDownGpuUtilizationThreshold float64
// ScaleDownUnneededTime sets the duration CA expects a node to be unneeded/eligible for removal
// before scaling down the node.
ScaleDownUnneededTime time.Duration
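
A minimal sketch of how the new option pairs with the existing one when building AutoscalingOptions (field names are taken from this diff; the import path and the 0.3 value are assumptions for illustration):

package main

import (
	"fmt"

	"k8s.io/autoscaler/cluster-autoscaler/config"
)

func main() {
	opts := config.AutoscalingOptions{
		// Regular nodes become scale-down candidates below 50% cpu/memory utilization.
		ScaleDownUtilizationThreshold: 0.5,
		// GPU nodes become candidates only below 30% gpu utilization.
		ScaleDownGpuUtilizationThreshold: 0.3,
	}
	fmt.Printf("%+v\n", opts)
}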

View File

@ -402,16 +402,16 @@ func (sd *ScaleDown) UpdateUnneededNodes(
klog.Errorf("Node info for %s not found", node.Name)
continue
}
utilInfo, err := simulator.CalculateUtilization(node, nodeInfo, sd.context.IgnoreDaemonSetsUtilization, sd.context.IgnoreMirrorPodsUtilization)
utilInfo, err := simulator.CalculateUtilization(node, nodeInfo, sd.context.IgnoreDaemonSetsUtilization, sd.context.IgnoreMirrorPodsUtilization, sd.context.CloudProvider.GPULabel())
if err != nil {
klog.Warningf("Failed to calculate utilization for %s: %v", node.Name, err)
}
klog.V(4).Infof("Node %s - utilization %f", node.Name, utilInfo.Utilization)
klog.V(4).Infof("Node %s - %s utilization %f", node.Name, utilInfo.ResourceName, utilInfo.Utilization)
utilizationMap[node.Name] = utilInfo
if utilInfo.Utilization >= sd.context.ScaleDownUtilizationThreshold {
klog.V(4).Infof("Node %s is not suitable for removal - utilization too big (%f)", node.Name, utilInfo.Utilization)
if !sd.isNodeBelowUtilzationThreshold(node, utilInfo) {
klog.V(4).Infof("Node %s is not suitable for removal - %s utilization too big (%f)", node.Name, utilInfo.ResourceName, utilInfo.Utilization)
continue
}
currentlyUnneededNodes = append(currentlyUnneededNodes, node)
@ -506,6 +506,20 @@ func (sd *ScaleDown) UpdateUnneededNodes(
return nil
}
// isNodeBelowUtilzationThreshold determines if a given node's utilization is below the scale-down threshold.
func (sd *ScaleDown) isNodeBelowUtilzationThreshold(node *apiv1.Node, utilInfo simulator.UtilizationInfo) bool {
if gpu.NodeHasGpu(sd.context.CloudProvider.GPULabel(), node) {
if utilInfo.Utilization >= sd.context.ScaleDownGpuUtilizationThreshold {
return false
}
} else {
if utilInfo.Utilization >= sd.context.ScaleDownUtilizationThreshold {
return false
}
}
return true
}
// updateUnremovableNodes updates unremovableNodes map according to current
// state of the cluster. Removes from the map nodes that are no longer in the
// nodes list.
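
The new isNodeBelowUtilzationThreshold helper selects which threshold applies to a node. A minimal, self-contained sketch of the same decision rule with illustrative numbers (plain Go, not the autoscaler's actual types):

package main

import "fmt"

// belowThreshold mirrors the decision rule: a GPU node is judged against the
// gpu threshold, any other node against the cpu/memory threshold.
func belowThreshold(hasGPU bool, utilization, cpuMemThreshold, gpuThreshold float64) bool {
	if hasGPU {
		return utilization < gpuThreshold
	}
	return utilization < cpuMemThreshold
}

func main() {
	// GPU node at 0.5 gpu utilization, gpu threshold 0.3: kept.
	fmt.Println(belowThreshold(true, 0.5, 0.5, 0.3)) // false
	// GPU node at 0.25 gpu utilization: scale-down candidate.
	fmt.Println(belowThreshold(true, 0.25, 0.5, 0.3)) // true
	// Non-GPU node at 0.4 cpu utilization, threshold 0.5: candidate.
	fmt.Println(belowThreshold(false, 0.4, 0.5, 0.3)) // true
}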

View File

@ -172,6 +172,66 @@ func TestFindUnneededNodes(t *testing.T) {
assert.Equal(t, 0, len(sd.unremovableNodes))
}
func TestFindUnneededGPUNodes(t *testing.T) {
// shared owner reference
ownerRef := GenerateOwnerReferences("rs", "ReplicaSet", "extensions/v1beta1", "")
p1 := BuildTestPod("p1", 100, 0)
p1.Spec.NodeName = "n1"
p1.OwnerReferences = ownerRef
RequestGpuForPod(p1, 1)
p2 := BuildTestPod("p2", 400, 0)
p2.Spec.NodeName = "n2"
p2.OwnerReferences = ownerRef
RequestGpuForPod(p2, 1)
p3 := BuildTestPod("p3", 300, 0)
p3.Spec.NodeName = "n3"
p3.OwnerReferences = ownerRef
p3.ObjectMeta.Annotations["cluster-autoscaler.kubernetes.io/safe-to-evict"] = "false"
RequestGpuForPod(p3, 1)
// Node with low cpu utilization and high gpu utilization
n1 := BuildTestNode("n1", 1000, 10)
AddGpusToNode(n1, 2)
// Node with high cpu utilization and low gpu utilization
n2 := BuildTestNode("n2", 1000, 10)
AddGpusToNode(n2, 4)
// Node with low gpu utilization whose pods cannot be interrupted
n3 := BuildTestNode("n3", 1000, 10)
AddGpusToNode(n3, 8)
SetNodeReadyState(n1, true, time.Time{})
SetNodeReadyState(n2, true, time.Time{})
SetNodeReadyState(n3, true, time.Time{})
provider := testprovider.NewTestCloudProvider(nil, nil)
provider.AddNodeGroup("ng1", 1, 10, 2)
provider.AddNode("ng1", n1)
provider.AddNode("ng1", n2)
provider.AddNode("ng1", n3)
options := config.AutoscalingOptions{
ScaleDownUtilizationThreshold: 0.35,
ScaleDownGpuUtilizationThreshold: 0.3,
UnremovableNodeRecheckTimeout: 5 * time.Minute,
}
context := NewScaleTestAutoscalingContext(options, &fake.Clientset{}, nil, provider)
clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, newBackoff())
sd := NewScaleDown(&context, clusterStateRegistry)
sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3}, []*apiv1.Node{n1, n2, n3},
[]*apiv1.Pod{p1, p2, p3}, time.Now(), nil)
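// n1: gpu 1/2 = 0.5 >= 0.3 threshold, so it is kept (its 0.1 cpu utilization is ignored).
// n2: gpu 1/4 = 0.25 < 0.3, so it is unneeded (its 0.4 cpu utilization is ignored).
// n3: gpu 1/8 is below the threshold, but p3 is marked safe-to-evict=false, so n3 is not removable.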
assert.Equal(t, 1, len(sd.unneededNodes))
_, found := sd.unneededNodes["n2"]
assert.True(t, found)
assert.Contains(t, sd.podLocationHints, p2.Namespace+"/"+p2.Name)
assert.Equal(t, 3, len(sd.nodeUtilizationMap))
}
func TestPodsWithPrioritiesFindUnneededNodes(t *testing.T) {
// shared owner reference
ownerRef := GenerateOwnerReferences("rs", "ReplicaSet", "extensions/v1beta1", "")
@ -812,14 +872,15 @@ func assertSubset(t *testing.T, a []string, b []string) {
}
var defaultScaleDownOptions = config.AutoscalingOptions{
ScaleDownUtilizationThreshold: 0.5,
ScaleDownUnneededTime: time.Minute,
MaxGracefulTerminationSec: 60,
MaxEmptyBulkDelete: 10,
MinCoresTotal: 0,
MinMemoryTotal: 0,
MaxCoresTotal: config.DefaultMaxClusterCores,
MaxMemoryTotal: config.DefaultMaxClusterMemory * units.GiB,
ScaleDownUtilizationThreshold: 0.5,
ScaleDownGpuUtilizationThreshold: 0.5,
ScaleDownUnneededTime: time.Minute,
MaxGracefulTerminationSec: 60,
MaxEmptyBulkDelete: 10,
MinCoresTotal: 0,
MinMemoryTotal: 0,
MaxCoresTotal: config.DefaultMaxClusterCores,
MaxMemoryTotal: config.DefaultMaxClusterMemory * units.GiB,
}
func TestScaleDownEmptyMultipleNodeGroups(t *testing.T) {

View File

@ -93,7 +93,10 @@ var (
scaleDownUnreadyTime = flag.Duration("scale-down-unready-time", 20*time.Minute,
"How long an unready node should be unneeded before it is eligible for scale down")
scaleDownUtilizationThreshold = flag.Float64("scale-down-utilization-threshold", 0.5,
"Node utilization level, defined as sum of requested resources divided by capacity, below which a node can be considered for scale down")
"Sum of cpu or memory of all pods running on the node divided by node's corresponding allocatable resource, below which a node can be considered for scale down")
scaleDownGpuUtilizationThreshold = flag.Float64("scale-down-gpu-utilization-threshold", 0.5,
"Sum of gpu requests of all pods running on the node divided by node's allocatable resource, below which a node can be considered for scale down."+
"Utilization calculation only cares about gpu resource for accelerator node. cpu and memory utilization will be ignored.")
scaleDownNonEmptyCandidatesCount = flag.Int("scale-down-non-empty-candidates-count", 30,
"Maximum number of non empty nodes considered in one iteration as candidates for scale down with drain."+
"Lower value means better CA responsiveness but possible slower scale down latency."+
@ -210,6 +213,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
ScaleDownUnneededTime: *scaleDownUnneededTime,
ScaleDownUnreadyTime: *scaleDownUnreadyTime,
ScaleDownUtilizationThreshold: *scaleDownUtilizationThreshold,
ScaleDownGpuUtilizationThreshold: *scaleDownGpuUtilizationThreshold,
ScaleDownNonEmptyCandidatesCount: *scaleDownNonEmptyCandidatesCount,
ScaleDownCandidatesPoolRatio: *scaleDownCandidatesPoolRatio,
ScaleDownCandidatesPoolMinCount: *scaleDownCandidatesPoolMinCount,
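
With both flags wired into createAutoscalingOptions, an illustrative invocation might look like the following (the binary name and the 0.3 value are assumptions, not taken from this diff):

cluster-autoscaler \
  --scale-down-utilization-threshold=0.5 \
  --scale-down-gpu-utilization-threshold=0.3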

View File

@ -19,13 +19,13 @@ package simulator
import (
"flag"
"fmt"
"math"
"math/rand"
"time"
"k8s.io/autoscaler/cluster-autoscaler/utils/drain"
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
"k8s.io/autoscaler/cluster-autoscaler/utils/glogx"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
scheduler_util "k8s.io/autoscaler/cluster-autoscaler/utils/scheduler"
"k8s.io/autoscaler/cluster-autoscaler/utils/tpu"
@ -62,7 +62,10 @@ type NodeToBeRemoved struct {
type UtilizationInfo struct {
CpuUtil float64
MemUtil float64
// Max(CpuUtil, MemUtil).
GpuUtil float64
// Name of the resource the Utilization value refers to.
ResourceName apiv1.ResourceName
// Max(CpuUtil, MemUtil), or GpuUtil for nodes with GPUs.
Utilization float64
}
@ -149,10 +152,22 @@ func FindEmptyNodesToRemove(candidates []*apiv1.Node, pods []*apiv1.Pod) []*apiv
return result
}
// CalculateUtilization calculates utilization of a node, defined as maximum of (cpu, memory) utilization.
// Per resource utilization is the sum of requests for it divided by allocatable. It also returns the individual
// cpu and memory utilization.
func CalculateUtilization(node *apiv1.Node, nodeInfo *schedulernodeinfo.NodeInfo, skipDaemonSetPods, skipMirrorPods bool) (utilInfo UtilizationInfo, err error) {
// CalculateUtilization calculates utilization of a node, defined as the maximum of (cpu, memory) utilization
// for regular nodes, or as gpu utilization for nodes with GPUs. Per-resource utilization is the sum of requests
// for that resource divided by allocatable. It also returns the individual cpu, memory and gpu utilization.
func CalculateUtilization(node *apiv1.Node, nodeInfo *schedulernodeinfo.NodeInfo, skipDaemonSetPods, skipMirrorPods bool, gpuLabel string) (utilInfo UtilizationInfo, err error) {
if gpu.NodeHasGpu(gpuLabel, node) {
gpuUtil, err := calculateUtilizationOfResource(node, nodeInfo, gpu.ResourceNvidiaGPU, skipDaemonSetPods, skipMirrorPods)
if err != nil {
klog.V(3).Infof("node %s has unready GPU", node.Name)
// Return 0 if the GPU is unready. This guarantees we can still scale down a node with an unready GPU.
return UtilizationInfo{GpuUtil: 0, ResourceName: gpu.ResourceNvidiaGPU, Utilization: 0}, nil
}
// Skip cpu and memory utilization calculation for nodes with GPUs.
return UtilizationInfo{GpuUtil: gpuUtil, ResourceName: gpu.ResourceNvidiaGPU, Utilization: gpuUtil}, nil
}
cpu, err := calculateUtilizationOfResource(node, nodeInfo, apiv1.ResourceCPU, skipDaemonSetPods, skipMirrorPods)
if err != nil {
return UtilizationInfo{}, err
@ -161,7 +176,18 @@ func CalculateUtilization(node *apiv1.Node, nodeInfo *schedulernodeinfo.NodeInfo
if err != nil {
return UtilizationInfo{}, err
}
return UtilizationInfo{CpuUtil: cpu, MemUtil: mem, Utilization: math.Max(cpu, mem)}, nil
utilization := UtilizationInfo{CpuUtil: cpu, MemUtil: mem}
if cpu > mem {
utilization.ResourceName = apiv1.ResourceCPU
utilization.Utilization = cpu
} else {
utilization.ResourceName = apiv1.ResourceMemory
utilization.Utilization = mem
}
return utilization, nil
}
func calculateUtilizationOfResource(node *apiv1.Node, nodeInfo *schedulernodeinfo.NodeInfo, resourceName apiv1.ResourceName, skipDaemonSetPods, skipMirrorPods bool) (float64, error) {
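
A minimal, self-contained sketch of the arithmetic CalculateUtilization now performs, with illustrative numbers (per-resource utilization is the sum of requests divided by allocatable; on a GPU node only the gpu figure is reported):

package main

import "fmt"

// utilizationOf returns sum(requests)/allocatable for a single resource.
func utilizationOf(requests []int64, allocatable int64) float64 {
	var sum int64
	for _, r := range requests {
		sum += r
	}
	return float64(sum) / float64(allocatable)
}

func main() {
	// GPU node: 2 allocatable GPUs, one pod requesting 1 GPU -> 0.5.
	// For such a node cpu and memory are skipped entirely.
	fmt.Println(utilizationOf([]int64{1}, 2))

	// Non-GPU node: report the larger of cpu and memory utilization.
	cpu := utilizationOf([]int64{100, 400}, 1000) // 0.5
	mem := utilizationOf([]int64{200, 100}, 2000) // 0.15
	if cpu > mem {
		fmt.Println("cpu", cpu)
	} else {
		fmt.Println("memory", mem)
	}
}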

View File

@ -31,6 +31,7 @@ import (
)
func TestUtilization(t *testing.T) {
gpuLabel := GetGPULabel()
pod := BuildTestPod("p1", 100, 200000)
pod2 := BuildTestPod("p2", -1, -1)
@ -38,25 +39,25 @@ func TestUtilization(t *testing.T) {
node := BuildTestNode("node1", 2000, 2000000)
SetNodeReadyState(node, true, time.Time{})
utilInfo, err := CalculateUtilization(node, nodeInfo, false, false)
utilInfo, err := CalculateUtilization(node, nodeInfo, false, false, gpuLabel)
assert.NoError(t, err)
assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)
node2 := BuildTestNode("node1", 2000, -1)
_, err = CalculateUtilization(node2, nodeInfo, false, false)
_, err = CalculateUtilization(node2, nodeInfo, false, false, gpuLabel)
assert.Error(t, err)
daemonSetPod3 := BuildTestPod("p3", 100, 200000)
daemonSetPod3.OwnerReferences = GenerateOwnerReferences("ds", "DaemonSet", "apps/v1", "")
nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod, pod2, daemonSetPod3)
utilInfo, err = CalculateUtilization(node, nodeInfo, true, false)
utilInfo, err = CalculateUtilization(node, nodeInfo, true, false, gpuLabel)
assert.NoError(t, err)
assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)
nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod2, daemonSetPod3)
utilInfo, err = CalculateUtilization(node, nodeInfo, false, false)
utilInfo, err = CalculateUtilization(node, nodeInfo, false, false, gpuLabel)
assert.NoError(t, err)
assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)
@ -66,14 +67,31 @@ func TestUtilization(t *testing.T) {
}
nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod, pod2, mirrorPod4)
utilInfo, err = CalculateUtilization(node, nodeInfo, false, true)
utilInfo, err = CalculateUtilization(node, nodeInfo, false, true, gpuLabel)
assert.NoError(t, err)
assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)
nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod2, mirrorPod4)
utilInfo, err = CalculateUtilization(node, nodeInfo, false, false)
utilInfo, err = CalculateUtilization(node, nodeInfo, false, false, gpuLabel)
assert.NoError(t, err)
assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)
gpuNode := BuildTestNode("gpu_node", 2000, 2000000)
AddGpusToNode(gpuNode, 1)
gpuPod := BuildTestPod("gpu_pod", 100, 200000)
RequestGpuForPod(gpuPod, 1)
nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod, gpuPod)
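// gpuPod requests the node's single GPU, so gpu utilization is 1.0; the cpu requests (300m of 2000m) are ignored for this GPU node.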
utilInfo, err = CalculateUtilization(gpuNode, nodeInfo, false, false, gpuLabel)
assert.NoError(t, err)
assert.InEpsilon(t, 1/1, utilInfo.Utilization, 0.01)
// Node with Unready GPU
gpuNode = BuildTestNode("gpu_node", 2000, 2000000)
AddGpuLabelToNode(gpuNode)
nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod)
utilInfo, err = CalculateUtilization(gpuNode, nodeInfo, false, false, gpuLabel)
assert.NoError(t, err)
assert.Zero(t, utilInfo.Utilization)
}
func TestFindPlaceAllOk(t *testing.T) {

View File

@ -39,9 +39,10 @@ import (
func BuildTestPod(name string, cpu int64, mem int64) *apiv1.Pod {
pod := &apiv1.Pod{
ObjectMeta: metav1.ObjectMeta{
Namespace: "default",
Name: name,
SelfLink: fmt.Sprintf("/api/v1/namespaces/default/pods/%s", name),
Namespace: "default",
Name: name,
SelfLink: fmt.Sprintf("/api/v1/namespaces/default/pods/%s", name),
Annotations: map[string]string{},
},
Spec: apiv1.PodSpec{
Containers: []apiv1.Container{
@ -128,9 +129,19 @@ func AddGpusToNode(node *apiv1.Node, gpusCount int64) {
})
node.Status.Capacity[resourceNvidiaGPU] = *resource.NewQuantity(gpusCount, resource.DecimalSI)
node.Status.Allocatable[resourceNvidiaGPU] = *resource.NewQuantity(gpusCount, resource.DecimalSI)
AddGpuLabelToNode(node)
}
// AddGpuLabelToNode adds the GPU label to the given node. This is used to mock the intermediate state in which a node's GPU is not yet ready.
func AddGpuLabelToNode(node *apiv1.Node) {
node.Labels[gpuLabel] = defaultGPUType
}
// GetGPULabel returns the GPU label. This is only used in unit tests.
func GetGPULabel() string {
return gpuLabel
}
// SetNodeReadyState sets node ready state to either ConditionTrue or ConditionFalse.
func SetNodeReadyState(node *apiv1.Node, ready bool, lastTransition time.Time) {
if ready {