Consider GPU utilization in scaling down
parent 40cf6e43c0
commit 83ae66cebc
@@ -34,9 +34,12 @@ type GpuLimits struct {
 type AutoscalingOptions struct {
     // MaxEmptyBulkDelete is a number of empty nodes that can be removed at the same time.
     MaxEmptyBulkDelete int
-    // ScaleDownUtilizationThreshold sets threshold for nodes to be considered for scale down.
+    // ScaleDownUtilizationThreshold sets threshold for nodes to be considered for scale down if cpu or memory utilization is over threshold.
     // Well-utilized nodes are not touched.
     ScaleDownUtilizationThreshold float64
+    // ScaleDownGpuUtilizationThreshold sets threshold for gpu nodes to be considered for scale down if gpu utilization is over threshold.
+    // Well-utilized nodes are not touched.
+    ScaleDownGpuUtilizationThreshold float64
     // ScaleDownUnneededTime sets the duration CA expects a node to be unneeded/eligible for removal
     // before scaling down the node.
     ScaleDownUnneededTime time.Duration
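Illustrative only (not part of the diff): with the new field, the cpu/memory threshold and the GPU threshold can be tuned independently. The values below are arbitrary examples; the field names come from the struct above.

    options := config.AutoscalingOptions{
        ScaleDownUtilizationThreshold:    0.5,              // applies to non-GPU nodes (cpu/memory)
        ScaleDownGpuUtilizationThreshold: 0.3,              // applies only to GPU-labelled nodes
        ScaleDownUnneededTime:            10 * time.Minute, // how long a node must stay unneeded
    }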
@@ -402,16 +402,16 @@ func (sd *ScaleDown) UpdateUnneededNodes(
             klog.Errorf("Node info for %s not found", node.Name)
             continue
         }

-        utilInfo, err := simulator.CalculateUtilization(node, nodeInfo, sd.context.IgnoreDaemonSetsUtilization, sd.context.IgnoreMirrorPodsUtilization)
+        utilInfo, err := simulator.CalculateUtilization(node, nodeInfo, sd.context.IgnoreDaemonSetsUtilization, sd.context.IgnoreMirrorPodsUtilization, sd.context.CloudProvider.GPULabel())
         if err != nil {
             klog.Warningf("Failed to calculate utilization for %s: %v", node.Name, err)
         }
-        klog.V(4).Infof("Node %s - utilization %f", node.Name, utilInfo.Utilization)
+        klog.V(4).Infof("Node %s - %s utilization %f", node.Name, utilInfo.ResourceName, utilInfo.Utilization)
         utilizationMap[node.Name] = utilInfo

-        if utilInfo.Utilization >= sd.context.ScaleDownUtilizationThreshold {
-            klog.V(4).Infof("Node %s is not suitable for removal - utilization too big (%f)", node.Name, utilInfo.Utilization)
+        if !sd.isNodeBelowUtilzationThreshold(node, utilInfo) {
+            klog.V(4).Infof("Node %s is not suitable for removal - %s utilization too big (%f)", node.Name, utilInfo.ResourceName, utilInfo.Utilization)
             continue
         }
         currentlyUnneededNodes = append(currentlyUnneededNodes, node)
@@ -506,6 +506,20 @@ func (sd *ScaleDown) UpdateUnneededNodes(
     return nil
 }

+// isNodeBelowUtilzationThreshold determines if a given node's utilization is below the threshold.
+func (sd *ScaleDown) isNodeBelowUtilzationThreshold(node *apiv1.Node, utilInfo simulator.UtilizationInfo) bool {
+    if gpu.NodeHasGpu(sd.context.CloudProvider.GPULabel(), node) {
+        if utilInfo.Utilization >= sd.context.ScaleDownGpuUtilizationThreshold {
+            return false
+        }
+    } else {
+        if utilInfo.Utilization >= sd.context.ScaleDownUtilizationThreshold {
+            return false
+        }
+    }
+    return true
+}
+
 // updateUnremovableNodes updates unremovableNodes map according to current
 // state of the cluster. Removes from the map nodes that are no longer in the
 // nodes list.
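A rough standalone sketch of the selection above (the function and parameter names below are mine, not from the commit): a GPU-labelled node is compared only against the GPU threshold, every other node against the cpu/memory threshold.

    // Sketch only: mirrors isNodeBelowUtilzationThreshold for a single utilization value.
    func belowScaleDownThreshold(isGpuNode bool, utilization, cpuMemThreshold, gpuThreshold float64) bool {
        if isGpuNode {
            return utilization < gpuThreshold
        }
        return utilization < cpuMemThreshold
    }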
@@ -172,6 +172,66 @@ func TestFindUnneededNodes(t *testing.T) {
     assert.Equal(t, 0, len(sd.unremovableNodes))
 }

+func TestFindUnneededGPUNodes(t *testing.T) {
+    // shared owner reference
+    ownerRef := GenerateOwnerReferences("rs", "ReplicaSet", "extensions/v1beta1", "")
+
+    p1 := BuildTestPod("p1", 100, 0)
+    p1.Spec.NodeName = "n1"
+    p1.OwnerReferences = ownerRef
+    RequestGpuForPod(p1, 1)
+
+    p2 := BuildTestPod("p2", 400, 0)
+    p2.Spec.NodeName = "n2"
+    p2.OwnerReferences = ownerRef
+    RequestGpuForPod(p2, 1)
+
+    p3 := BuildTestPod("p3", 300, 0)
+    p3.Spec.NodeName = "n3"
+    p3.OwnerReferences = ownerRef
+    p3.ObjectMeta.Annotations["cluster-autoscaler.kubernetes.io/safe-to-evict"] = "false"
+    RequestGpuForPod(p3, 1)
+
+    // Node with low cpu utilization and high gpu utilization
+    n1 := BuildTestNode("n1", 1000, 10)
+    AddGpusToNode(n1, 2)
+    // Node with high cpu utilization and low gpu utilization
+    n2 := BuildTestNode("n2", 1000, 10)
+    AddGpusToNode(n2, 4)
+    // Node with low gpu utilization and pods on node can not be interrupted
+    n3 := BuildTestNode("n3", 1000, 10)
+    AddGpusToNode(n3, 8)
+
+    SetNodeReadyState(n1, true, time.Time{})
+    SetNodeReadyState(n2, true, time.Time{})
+    SetNodeReadyState(n3, true, time.Time{})
+
+    provider := testprovider.NewTestCloudProvider(nil, nil)
+    provider.AddNodeGroup("ng1", 1, 10, 2)
+    provider.AddNode("ng1", n1)
+    provider.AddNode("ng1", n2)
+    provider.AddNode("ng1", n3)
+
+    options := config.AutoscalingOptions{
+        ScaleDownUtilizationThreshold:    0.35,
+        ScaleDownGpuUtilizationThreshold: 0.3,
+        UnremovableNodeRecheckTimeout:    5 * time.Minute,
+    }
+    context := NewScaleTestAutoscalingContext(options, &fake.Clientset{}, nil, provider)
+
+    clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, newBackoff())
+    sd := NewScaleDown(&context, clusterStateRegistry)
+    sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3}, []*apiv1.Node{n1, n2, n3},
+        []*apiv1.Pod{p1, p2, p3}, time.Now(), nil)
+
+    assert.Equal(t, 1, len(sd.unneededNodes))
+    _, found := sd.unneededNodes["n2"]
+    assert.True(t, found)
+
+    assert.Contains(t, sd.podLocationHints, p2.Namespace+"/"+p2.Name)
+    assert.Equal(t, 3, len(sd.nodeUtilizationMap))
+}
+
 func TestPodsWithPrioritiesFindUnneededNodes(t *testing.T) {
     // shared owner reference
     ownerRef := GenerateOwnerReferences("rs", "ReplicaSet", "extensions/v1beta1", "")
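In TestFindUnneededGPUNodes above, GPU utilization decides the outcome: p1 requests 1 of n1's 2 GPUs (0.5, at or above the 0.3 GPU threshold), p2 requests 1 of n2's 4 GPUs (0.25, below it), and p3 requests 1 of n3's 8 GPUs but is annotated as not safe to evict. As a result only n2 is reported as unneeded, even though its cpu utilization (400/1000 = 0.4) exceeds the 0.35 cpu/memory threshold.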
@@ -812,14 +872,15 @@ func assertSubset(t *testing.T, a []string, b []string) {
 }

 var defaultScaleDownOptions = config.AutoscalingOptions{
-    ScaleDownUtilizationThreshold: 0.5,
-    ScaleDownUnneededTime:         time.Minute,
-    MaxGracefulTerminationSec:     60,
-    MaxEmptyBulkDelete:            10,
-    MinCoresTotal:                 0,
-    MinMemoryTotal:                0,
-    MaxCoresTotal:                 config.DefaultMaxClusterCores,
-    MaxMemoryTotal:                config.DefaultMaxClusterMemory * units.GiB,
+    ScaleDownUtilizationThreshold:    0.5,
+    ScaleDownGpuUtilizationThreshold: 0.5,
+    ScaleDownUnneededTime:            time.Minute,
+    MaxGracefulTerminationSec:        60,
+    MaxEmptyBulkDelete:               10,
+    MinCoresTotal:                    0,
+    MinMemoryTotal:                   0,
+    MaxCoresTotal:                    config.DefaultMaxClusterCores,
+    MaxMemoryTotal:                   config.DefaultMaxClusterMemory * units.GiB,
 }

 func TestScaleDownEmptyMultipleNodeGroups(t *testing.T) {
@@ -93,7 +93,10 @@ var (
     scaleDownUnreadyTime = flag.Duration("scale-down-unready-time", 20*time.Minute,
         "How long an unready node should be unneeded before it is eligible for scale down")
     scaleDownUtilizationThreshold = flag.Float64("scale-down-utilization-threshold", 0.5,
-        "Node utilization level, defined as sum of requested resources divided by capacity, below which a node can be considered for scale down")
+        "Sum of cpu or memory of all pods running on the node divided by node's corresponding allocatable resource, below which a node can be considered for scale down")
+    scaleDownGpuUtilizationThreshold = flag.Float64("scale-down-gpu-utilization-threshold", 0.5,
+        "Sum of gpu requests of all pods running on the node divided by node's allocatable resource, below which a node can be considered for scale down."+
+            "Utilization calculation only cares about gpu resource for accelerator node. cpu and memory utilization will be ignored.")
     scaleDownNonEmptyCandidatesCount = flag.Int("scale-down-non-empty-candidates-count", 30,
         "Maximum number of non empty nodes considered in one iteration as candidates for scale down with drain."+
             "Lower value means better CA responsiveness but possible slower scale down latency."+
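Both threshold flags default to 0.5, so an operator can leave them identical or split them, e.g. something like --scale-down-utilization-threshold=0.5 together with --scale-down-gpu-utilization-threshold=0.3 so that accelerator nodes are only scaled down once their GPUs are mostly idle (these example values are illustrative, not taken from this commit).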
@@ -210,6 +213,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
         ScaleDownUnneededTime:            *scaleDownUnneededTime,
         ScaleDownUnreadyTime:             *scaleDownUnreadyTime,
         ScaleDownUtilizationThreshold:    *scaleDownUtilizationThreshold,
+        ScaleDownGpuUtilizationThreshold: *scaleDownGpuUtilizationThreshold,
         ScaleDownNonEmptyCandidatesCount: *scaleDownNonEmptyCandidatesCount,
         ScaleDownCandidatesPoolRatio:     *scaleDownCandidatesPoolRatio,
         ScaleDownCandidatesPoolMinCount:  *scaleDownCandidatesPoolMinCount,
@@ -19,13 +19,13 @@ package simulator
 import (
     "flag"
     "fmt"
     "math"
     "math/rand"
     "time"

     "k8s.io/autoscaler/cluster-autoscaler/utils/drain"
     "k8s.io/autoscaler/cluster-autoscaler/utils/errors"
     "k8s.io/autoscaler/cluster-autoscaler/utils/glogx"
+    "k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
     kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
     scheduler_util "k8s.io/autoscaler/cluster-autoscaler/utils/scheduler"
     "k8s.io/autoscaler/cluster-autoscaler/utils/tpu"
@@ -62,7 +62,10 @@ type NodeToBeRemoved struct {
 type UtilizationInfo struct {
     CpuUtil float64
     MemUtil float64
-    // Max(CpuUtil, MemUtil).
+    GpuUtil float64
+    // Resource name of the highest-utilization resource.
+    ResourceName apiv1.ResourceName
+    // Max(CpuUtil, MemUtil) or GpuUtil.
     Utilization float64
 }
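For a non-GPU node the struct ends up holding something like the following (illustrative values only, not from the commit): pods requesting half of the node's cpu and a quarter of its memory make cpu the dominant resource.

    info := UtilizationInfo{
        CpuUtil:      0.5,
        MemUtil:      0.25,
        ResourceName: apiv1.ResourceCPU, // the more utilized of cpu/memory
        Utilization:  0.5,               // max(CpuUtil, MemUtil); GPU nodes carry GpuUtil here instead
    }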
@@ -149,10 +152,22 @@ func FindEmptyNodesToRemove(candidates []*apiv1.Node, pods []*apiv1.Pod) []*apiv
     return result
 }

-// CalculateUtilization calculates utilization of a node, defined as maximum of (cpu, memory) utilization.
-// Per resource utilization is the sum of requests for it divided by allocatable. It also returns the individual
-// cpu and memory utilization.
-func CalculateUtilization(node *apiv1.Node, nodeInfo *schedulernodeinfo.NodeInfo, skipDaemonSetPods, skipMirrorPods bool) (utilInfo UtilizationInfo, err error) {
+// CalculateUtilization calculates utilization of a node, defined as maximum of (cpu, memory) or gpu utilization
+// based on whether the node has a GPU. Per resource utilization is the sum of requests for it divided by allocatable.
+// It also returns the individual cpu, memory and gpu utilization.
+func CalculateUtilization(node *apiv1.Node, nodeInfo *schedulernodeinfo.NodeInfo, skipDaemonSetPods, skipMirrorPods bool, gpuLabel string) (utilInfo UtilizationInfo, err error) {
+    if gpu.NodeHasGpu(gpuLabel, node) {
+        gpuUtil, err := calculateUtilizationOfResource(node, nodeInfo, gpu.ResourceNvidiaGPU, skipDaemonSetPods, skipMirrorPods)
+        if err != nil {
+            klog.V(3).Infof("node %s has unready GPU", node.Name)
+            // Return 0 if GPU is unready. This will guarantee we can still scale down a node with unready GPU.
+            return UtilizationInfo{GpuUtil: 0, ResourceName: gpu.ResourceNvidiaGPU, Utilization: 0}, nil
+        }
+
+        // Skips cpu and memory utilization calculation for node with GPU.
+        return UtilizationInfo{GpuUtil: gpuUtil, ResourceName: gpu.ResourceNvidiaGPU, Utilization: gpuUtil}, nil
+    }
+
     cpu, err := calculateUtilizationOfResource(node, nodeInfo, apiv1.ResourceCPU, skipDaemonSetPods, skipMirrorPods)
     if err != nil {
         return UtilizationInfo{}, err
@@ -161,7 +176,18 @@ func CalculateUtilization(node *apiv1.Node, nodeInfo *schedulernodeinfo.NodeInfo
     if err != nil {
         return UtilizationInfo{}, err
     }
-    return UtilizationInfo{CpuUtil: cpu, MemUtil: mem, Utilization: math.Max(cpu, mem)}, nil
+
+    utilization := UtilizationInfo{CpuUtil: cpu, MemUtil: mem}
+
+    if cpu > mem {
+        utilization.ResourceName = apiv1.ResourceCPU
+        utilization.Utilization = cpu
+    } else {
+        utilization.ResourceName = apiv1.ResourceMemory
+        utilization.Utilization = mem
+    }
+
+    return utilization, nil
 }

 func calculateUtilizationOfResource(node *apiv1.Node, nodeInfo *schedulernodeinfo.NodeInfo, resourceName apiv1.ResourceName, skipDaemonSetPods, skipMirrorPods bool) (float64, error) {
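A rough usage sketch of the GPU branch above (the gpuLabel value is provider-specific and assumed here): on a node carrying the GPU label only GPU requests count, and an unready GPU reports utilization 0 so the node stays eligible for scale down.

    // Sketch only; signature as introduced in this diff.
    utilInfo, err := CalculateUtilization(node, nodeInfo, false, false, gpuLabel)
    if err == nil && utilInfo.ResourceName == gpu.ResourceNvidiaGPU {
        // cpu/memory were skipped; utilInfo.Utilization equals utilInfo.GpuUtil
    }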
@@ -31,6 +31,7 @@ import (
 )

 func TestUtilization(t *testing.T) {
+    gpuLabel := GetGPULabel()
     pod := BuildTestPod("p1", 100, 200000)
     pod2 := BuildTestPod("p2", -1, -1)

@@ -38,25 +39,25 @@ func TestUtilization(t *testing.T) {
     node := BuildTestNode("node1", 2000, 2000000)
     SetNodeReadyState(node, true, time.Time{})

-    utilInfo, err := CalculateUtilization(node, nodeInfo, false, false)
+    utilInfo, err := CalculateUtilization(node, nodeInfo, false, false, gpuLabel)
     assert.NoError(t, err)
     assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)

     node2 := BuildTestNode("node1", 2000, -1)

-    _, err = CalculateUtilization(node2, nodeInfo, false, false)
+    _, err = CalculateUtilization(node2, nodeInfo, false, false, gpuLabel)
     assert.Error(t, err)

     daemonSetPod3 := BuildTestPod("p3", 100, 200000)
     daemonSetPod3.OwnerReferences = GenerateOwnerReferences("ds", "DaemonSet", "apps/v1", "")

     nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod, pod2, daemonSetPod3)
-    utilInfo, err = CalculateUtilization(node, nodeInfo, true, false)
+    utilInfo, err = CalculateUtilization(node, nodeInfo, true, false, gpuLabel)
     assert.NoError(t, err)
     assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)

     nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod2, daemonSetPod3)
-    utilInfo, err = CalculateUtilization(node, nodeInfo, false, false)
+    utilInfo, err = CalculateUtilization(node, nodeInfo, false, false, gpuLabel)
     assert.NoError(t, err)
     assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)

@@ -66,14 +67,31 @@ func TestUtilization(t *testing.T) {
     }

     nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod, pod2, mirrorPod4)
-    utilInfo, err = CalculateUtilization(node, nodeInfo, false, true)
+    utilInfo, err = CalculateUtilization(node, nodeInfo, false, true, gpuLabel)
     assert.NoError(t, err)
     assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)

     nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod2, mirrorPod4)
-    utilInfo, err = CalculateUtilization(node, nodeInfo, false, false)
+    utilInfo, err = CalculateUtilization(node, nodeInfo, false, false, gpuLabel)
     assert.NoError(t, err)
     assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)

+    gpuNode := BuildTestNode("gpu_node", 2000, 2000000)
+    AddGpusToNode(gpuNode, 1)
+    gpuPod := BuildTestPod("gpu_pod", 100, 200000)
+    RequestGpuForPod(gpuPod, 1)
+    nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod, gpuPod)
+    utilInfo, err = CalculateUtilization(gpuNode, nodeInfo, false, false, gpuLabel)
+    assert.NoError(t, err)
+    assert.InEpsilon(t, 1/1, utilInfo.Utilization, 0.01)
+
+    // Node with Unready GPU
+    gpuNode = BuildTestNode("gpu_node", 2000, 2000000)
+    AddGpuLabelToNode(gpuNode)
+    nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod)
+    utilInfo, err = CalculateUtilization(gpuNode, nodeInfo, false, false, gpuLabel)
+    assert.NoError(t, err)
+    assert.Zero(t, utilInfo.Utilization)
 }

 func TestFindPlaceAllOk(t *testing.T) {
@@ -39,9 +39,10 @@ import (
 func BuildTestPod(name string, cpu int64, mem int64) *apiv1.Pod {
     pod := &apiv1.Pod{
         ObjectMeta: metav1.ObjectMeta{
-            Namespace: "default",
-            Name:      name,
-            SelfLink:  fmt.Sprintf("/api/v1/namespaces/default/pods/%s", name),
+            Namespace:   "default",
+            Name:        name,
+            SelfLink:    fmt.Sprintf("/api/v1/namespaces/default/pods/%s", name),
+            Annotations: map[string]string{},
         },
         Spec: apiv1.PodSpec{
             Containers: []apiv1.Container{
@@ -128,9 +129,19 @@ func AddGpusToNode(node *apiv1.Node, gpusCount int64) {
     })
     node.Status.Capacity[resourceNvidiaGPU] = *resource.NewQuantity(gpusCount, resource.DecimalSI)
     node.Status.Allocatable[resourceNvidiaGPU] = *resource.NewQuantity(gpusCount, resource.DecimalSI)
+    AddGpuLabelToNode(node)
 }

+// AddGpuLabelToNode adds the GPU label to the given node. This is used to mock the intermediate state in which the GPU on the node is not yet ready.
+func AddGpuLabelToNode(node *apiv1.Node) {
+    node.Labels[gpuLabel] = defaultGPUType
+}
+
+// GetGPULabel returns the GPU label on the node. This is only used in unit tests.
+func GetGPULabel() string {
+    return gpuLabel
+}
+
 // SetNodeReadyState sets node ready state to either ConditionTrue or ConditionFalse.
 func SetNodeReadyState(node *apiv1.Node, ready bool, lastTransition time.Time) {
     if ready {
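A brief sketch of how these helpers are intended to compose in tests (node names and sizes below are made up):

    readyGpuNode := BuildTestNode("gpu-node-1", 2000, 2000000)
    AddGpusToNode(readyGpuNode, 2)    // sets GPU capacity/allocatable and adds the GPU label

    unreadyGpuNode := BuildTestNode("gpu-node-2", 2000, 2000000)
    AddGpuLabelToNode(unreadyGpuNode) // label only: mocks a node whose GPU is not ready yet

    label := GetGPULabel()            // label key to pass to CalculateUtilization in tests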