Consider GPU utilization in scaling down

commit 83ae66cebc
parent 40cf6e43c0
@@ -34,9 +34,12 @@ type GpuLimits struct {
 type AutoscalingOptions struct {
 	// MaxEmptyBulkDelete is a number of empty nodes that can be removed at the same time.
 	MaxEmptyBulkDelete int
-	// ScaleDownUtilizationThreshold sets threshold for nodes to be considered for scale down.
+	// ScaleDownUtilizationThreshold sets threshold for nodes to be considered for scale down if cpu or memory utilization is over threshold.
 	// Well-utilized nodes are not touched.
 	ScaleDownUtilizationThreshold float64
+	// ScaleDownGpuUtilizationThreshold sets threshold for gpu nodes to be considered for scale down if gpu utilization is over threshold.
+	// Well-utilized nodes are not touched.
+	ScaleDownGpuUtilizationThreshold float64
 	// ScaleDownUnneededTime sets the duration CA expects a node to be unneeded/eligible for removal
 	// before scaling down the node.
 	ScaleDownUnneededTime time.Duration
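For orientation, a minimal sketch of how the two thresholds sit next to each other when options are built. The field names come from the struct above; the import path and the literal values are assumptions for illustration, not the autoscaler's defaults.

package main

import (
	"fmt"

	"k8s.io/autoscaler/cluster-autoscaler/config"
)

func main() {
	// Illustrative values only: cpu/memory nodes are judged against one
	// threshold, GPU nodes against a separate, typically stricter one.
	opts := config.AutoscalingOptions{
		ScaleDownUtilizationThreshold:    0.5, // max(cpu, mem) must drop below 0.5
		ScaleDownGpuUtilizationThreshold: 0.3, // gpu utilization must drop below 0.3
	}
	fmt.Printf("cpu/mem threshold %.2f, gpu threshold %.2f\n",
		opts.ScaleDownUtilizationThreshold, opts.ScaleDownGpuUtilizationThreshold)
}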
@@ -402,16 +402,16 @@ func (sd *ScaleDown) UpdateUnneededNodes(
 			klog.Errorf("Node info for %s not found", node.Name)
 			continue
 		}
-		utilInfo, err := simulator.CalculateUtilization(node, nodeInfo, sd.context.IgnoreDaemonSetsUtilization, sd.context.IgnoreMirrorPodsUtilization)
+		utilInfo, err := simulator.CalculateUtilization(node, nodeInfo, sd.context.IgnoreDaemonSetsUtilization, sd.context.IgnoreMirrorPodsUtilization, sd.context.CloudProvider.GPULabel())
 		if err != nil {
 			klog.Warningf("Failed to calculate utilization for %s: %v", node.Name, err)
 		}
-		klog.V(4).Infof("Node %s - utilization %f", node.Name, utilInfo.Utilization)
+		klog.V(4).Infof("Node %s - %s utilization %f", node.Name, utilInfo.ResourceName, utilInfo.Utilization)
 		utilizationMap[node.Name] = utilInfo

-		if utilInfo.Utilization >= sd.context.ScaleDownUtilizationThreshold {
-			klog.V(4).Infof("Node %s is not suitable for removal - utilization too big (%f)", node.Name, utilInfo.Utilization)
+		if !sd.isNodeBelowUtilzationThreshold(node, utilInfo) {
+			klog.V(4).Infof("Node %s is not suitable for removal - %s utilization too big (%f)", node.Name, utilInfo.ResourceName, utilInfo.Utilization)
 			continue
 		}
 		currentlyUnneededNodes = append(currentlyUnneededNodes, node)
@@ -506,6 +506,20 @@ func (sd *ScaleDown) UpdateUnneededNodes(
 	return nil
 }

+// isNodeBelowUtilzationThreshold determines if a given node's utilization is below the threshold.
+func (sd *ScaleDown) isNodeBelowUtilzationThreshold(node *apiv1.Node, utilInfo simulator.UtilizationInfo) bool {
+	if gpu.NodeHasGpu(sd.context.CloudProvider.GPULabel(), node) {
+		if utilInfo.Utilization >= sd.context.ScaleDownGpuUtilizationThreshold {
+			return false
+		}
+	} else {
+		if utilInfo.Utilization >= sd.context.ScaleDownUtilizationThreshold {
+			return false
+		}
+	}
+	return true
+}
+
 // updateUnremovableNodes updates unremovableNodes map according to current
 // state of the cluster. Removes from the map nodes that are no longer in the
 // nodes list.
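A minimal, self-contained sketch of the same decision (not the CA code itself, just the comparison it performs): GPU nodes are compared only against the GPU threshold, everything else against the cpu/memory threshold, and "below" means strictly less than the relevant value.

package main

import "fmt"

// belowScaleDownThreshold mirrors isNodeBelowUtilzationThreshold above in
// simplified form: a node is a scale-down candidate only if its utilization
// is strictly below the threshold that applies to it.
func belowScaleDownThreshold(isGpuNode bool, utilization, cpuMemThreshold, gpuThreshold float64) bool {
	if isGpuNode {
		return utilization < gpuThreshold
	}
	return utilization < cpuMemThreshold
}

func main() {
	// A GPU node at 40% GPU utilization is kept when the GPU threshold is 0.3,
	// even though 0.4 would be below a 0.5 cpu/memory threshold.
	fmt.Println(belowScaleDownThreshold(true, 0.4, 0.5, 0.3))  // false: not a candidate
	fmt.Println(belowScaleDownThreshold(false, 0.4, 0.5, 0.3)) // true: candidate for removal
}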
@@ -172,6 +172,66 @@ func TestFindUnneededNodes(t *testing.T) {
 	assert.Equal(t, 0, len(sd.unremovableNodes))
 }

+func TestFindUnneededGPUNodes(t *testing.T) {
+	// shared owner reference
+	ownerRef := GenerateOwnerReferences("rs", "ReplicaSet", "extensions/v1beta1", "")
+
+	p1 := BuildTestPod("p1", 100, 0)
+	p1.Spec.NodeName = "n1"
+	p1.OwnerReferences = ownerRef
+	RequestGpuForPod(p1, 1)
+
+	p2 := BuildTestPod("p2", 400, 0)
+	p2.Spec.NodeName = "n2"
+	p2.OwnerReferences = ownerRef
+	RequestGpuForPod(p2, 1)
+
+	p3 := BuildTestPod("p3", 300, 0)
+	p3.Spec.NodeName = "n3"
+	p3.OwnerReferences = ownerRef
+	p3.ObjectMeta.Annotations["cluster-autoscaler.kubernetes.io/safe-to-evict"] = "false"
+	RequestGpuForPod(p3, 1)
+
+	// Node with low cpu utilization and high gpu utilization
+	n1 := BuildTestNode("n1", 1000, 10)
+	AddGpusToNode(n1, 2)
+	// Node with high cpu utilization and low gpu utilization
+	n2 := BuildTestNode("n2", 1000, 10)
+	AddGpusToNode(n2, 4)
+	// Node with low gpu utilization and pods on node can not be interrupted
+	n3 := BuildTestNode("n3", 1000, 10)
+	AddGpusToNode(n3, 8)
+
+	SetNodeReadyState(n1, true, time.Time{})
+	SetNodeReadyState(n2, true, time.Time{})
+	SetNodeReadyState(n3, true, time.Time{})
+
+	provider := testprovider.NewTestCloudProvider(nil, nil)
+	provider.AddNodeGroup("ng1", 1, 10, 2)
+	provider.AddNode("ng1", n1)
+	provider.AddNode("ng1", n2)
+	provider.AddNode("ng1", n3)
+
+	options := config.AutoscalingOptions{
+		ScaleDownUtilizationThreshold:    0.35,
+		ScaleDownGpuUtilizationThreshold: 0.3,
+		UnremovableNodeRecheckTimeout:    5 * time.Minute,
+	}
+	context := NewScaleTestAutoscalingContext(options, &fake.Clientset{}, nil, provider)
+
+	clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, newBackoff())
+	sd := NewScaleDown(&context, clusterStateRegistry)
+	sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3}, []*apiv1.Node{n1, n2, n3},
+		[]*apiv1.Pod{p1, p2, p3}, time.Now(), nil)
+
+	assert.Equal(t, 1, len(sd.unneededNodes))
+	_, found := sd.unneededNodes["n2"]
+	assert.True(t, found)
+
+	assert.Contains(t, sd.podLocationHints, p2.Namespace+"/"+p2.Name)
+	assert.Equal(t, 3, len(sd.nodeUtilizationMap))
+}
+
 func TestPodsWithPrioritiesFindUnneededNodes(t *testing.T) {
 	// shared owner reference
 	ownerRef := GenerateOwnerReferences("rs", "ReplicaSet", "extensions/v1beta1", "")
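Reading the expected outcome: each pod requests one GPU, so GPU utilization is roughly 1/2 for n1, 1/4 for n2 and 1/8 for n3. Against the 0.3 GPU threshold, n1 counts as well utilized and stays, while n2 and n3 are under-utilized; n3 is still excluded because its pod carries the safe-to-evict=false annotation, which leaves n2 as the single entry the unneededNodes assertions check for.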
@@ -812,14 +872,15 @@ func assertSubset(t *testing.T, a []string, b []string) {
 }

 var defaultScaleDownOptions = config.AutoscalingOptions{
 	ScaleDownUtilizationThreshold:    0.5,
+	ScaleDownGpuUtilizationThreshold: 0.5,
 	ScaleDownUnneededTime:            time.Minute,
 	MaxGracefulTerminationSec:        60,
 	MaxEmptyBulkDelete:               10,
 	MinCoresTotal:                    0,
 	MinMemoryTotal:                   0,
 	MaxCoresTotal:                    config.DefaultMaxClusterCores,
 	MaxMemoryTotal:                   config.DefaultMaxClusterMemory * units.GiB,
 }

 func TestScaleDownEmptyMultipleNodeGroups(t *testing.T) {
@@ -93,7 +93,10 @@ var (
 	scaleDownUnreadyTime = flag.Duration("scale-down-unready-time", 20*time.Minute,
 		"How long an unready node should be unneeded before it is eligible for scale down")
 	scaleDownUtilizationThreshold = flag.Float64("scale-down-utilization-threshold", 0.5,
-		"Node utilization level, defined as sum of requested resources divided by capacity, below which a node can be considered for scale down")
+		"Sum of cpu or memory of all pods running on the node divided by node's corresponding allocatable resource, below which a node can be considered for scale down")
+	scaleDownGpuUtilizationThreshold = flag.Float64("scale-down-gpu-utilization-threshold", 0.5,
+		"Sum of gpu requests of all pods running on the node divided by node's allocatable resource, below which a node can be considered for scale down."+
+			"Utilization calculation only cares about gpu resource for accelerator node. cpu and memory utilization will be ignored.")
 	scaleDownNonEmptyCandidatesCount = flag.Int("scale-down-non-empty-candidates-count", 30,
 		"Maximum number of non empty nodes considered in one iteration as candidates for scale down with drain."+
 			"Lower value means better CA responsiveness but possible slower scale down latency."+
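As a standalone sketch of the flag wiring: the flag name and default come from the diff above, while the FlagSet and the sample command line are assumptions for illustration only.

package main

import (
	"flag"
	"fmt"
)

func main() {
	fs := flag.NewFlagSet("cluster-autoscaler", flag.ExitOnError)
	gpuThreshold := fs.Float64("scale-down-gpu-utilization-threshold", 0.5,
		"GPU utilization below which a GPU node can be considered for scale down")
	// Parsing a hypothetical command line; operators would pass this to the binary.
	_ = fs.Parse([]string{"--scale-down-gpu-utilization-threshold=0.3"})
	fmt.Println(*gpuThreshold) // 0.3
}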
@@ -210,6 +213,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
 		ScaleDownUnneededTime:            *scaleDownUnneededTime,
 		ScaleDownUnreadyTime:             *scaleDownUnreadyTime,
 		ScaleDownUtilizationThreshold:    *scaleDownUtilizationThreshold,
+		ScaleDownGpuUtilizationThreshold: *scaleDownGpuUtilizationThreshold,
 		ScaleDownNonEmptyCandidatesCount: *scaleDownNonEmptyCandidatesCount,
 		ScaleDownCandidatesPoolRatio:     *scaleDownCandidatesPoolRatio,
 		ScaleDownCandidatesPoolMinCount:  *scaleDownCandidatesPoolMinCount,
@@ -19,13 +19,13 @@ package simulator
 import (
 	"flag"
 	"fmt"
-	"math"
 	"math/rand"
 	"time"

 	"k8s.io/autoscaler/cluster-autoscaler/utils/drain"
 	"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
 	"k8s.io/autoscaler/cluster-autoscaler/utils/glogx"
+	"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
 	kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
 	scheduler_util "k8s.io/autoscaler/cluster-autoscaler/utils/scheduler"
 	"k8s.io/autoscaler/cluster-autoscaler/utils/tpu"
@@ -62,7 +62,10 @@ type NodeToBeRemoved struct {
 type UtilizationInfo struct {
 	CpuUtil float64
 	MemUtil float64
-	// Max(CpuUtil, MemUtil).
+	GpuUtil float64
+	// Resource name of highest utilization resource
+	ResourceName apiv1.ResourceName
+	// Max(CpuUtil, MemUtil) or GpuUtil.
 	Utilization float64
 }
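To show how the new fields are intended to be read, here is a hedged sketch that fills in a UtilizationInfo for both kinds of node. The struct fields are as declared above and the import paths match those used elsewhere in this diff; the numeric values are invented.

package main

import (
	"fmt"

	apiv1 "k8s.io/api/core/v1"
	"k8s.io/autoscaler/cluster-autoscaler/simulator"
	"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
)

func main() {
	// For a GPU node only GpuUtil is populated and mirrored into Utilization.
	gpuNode := simulator.UtilizationInfo{
		GpuUtil:      0.25,
		ResourceName: gpu.ResourceNvidiaGPU,
		Utilization:  0.25,
	}
	// For a cpu/memory node, Utilization is max(CpuUtil, MemUtil) and
	// ResourceName records which of the two it was.
	cpuNode := simulator.UtilizationInfo{
		CpuUtil:      0.2,
		MemUtil:      0.3,
		ResourceName: apiv1.ResourceMemory,
		Utilization:  0.3,
	}
	fmt.Println(gpuNode, cpuNode)
}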
@@ -149,10 +152,22 @@ func FindEmptyNodesToRemove(candidates []*apiv1.Node, pods []*apiv1.Pod) []*apiv
 	return result
 }

-// CalculateUtilization calculates utilization of a node, defined as maximum of (cpu, memory) utilization.
-// Per resource utilization is the sum of requests for it divided by allocatable. It also returns the individual
-// cpu and memory utilization.
-func CalculateUtilization(node *apiv1.Node, nodeInfo *schedulernodeinfo.NodeInfo, skipDaemonSetPods, skipMirrorPods bool) (utilInfo UtilizationInfo, err error) {
+// CalculateUtilization calculates utilization of a node, defined as maximum of (cpu, memory) or gpu utilization
+// based on whether the node has GPU or not. Per resource utilization is the sum of requests for it divided by allocatable.
+// It also returns the individual cpu, memory and gpu utilization.
+func CalculateUtilization(node *apiv1.Node, nodeInfo *schedulernodeinfo.NodeInfo, skipDaemonSetPods, skipMirrorPods bool, gpuLabel string) (utilInfo UtilizationInfo, err error) {
+	if gpu.NodeHasGpu(gpuLabel, node) {
+		gpuUtil, err := calculateUtilizationOfResource(node, nodeInfo, gpu.ResourceNvidiaGPU, skipDaemonSetPods, skipMirrorPods)
+		if err != nil {
+			klog.V(3).Infof("node %s has unready GPU", node.Name)
+			// Return 0 if GPU is unready. This will guarantee we can still scale down a node with unready GPU.
+			return UtilizationInfo{GpuUtil: 0, ResourceName: gpu.ResourceNvidiaGPU, Utilization: 0}, nil
+		}
+
+		// Skips cpu and memory utilization calculation for node with GPU.
+		return UtilizationInfo{GpuUtil: gpuUtil, ResourceName: gpu.ResourceNvidiaGPU, Utilization: gpuUtil}, nil
+	}
+
 	cpu, err := calculateUtilizationOfResource(node, nodeInfo, apiv1.ResourceCPU, skipDaemonSetPods, skipMirrorPods)
 	if err != nil {
 		return UtilizationInfo{}, err
@@ -161,7 +176,18 @@ func CalculateUtilization(node *apiv1.Node, nodeInfo *schedulernodeinfo.NodeInfo
 	if err != nil {
 		return UtilizationInfo{}, err
 	}
-	return UtilizationInfo{CpuUtil: cpu, MemUtil: mem, Utilization: math.Max(cpu, mem)}, nil
+
+	utilization := UtilizationInfo{CpuUtil: cpu, MemUtil: mem}
+
+	if cpu > mem {
+		utilization.ResourceName = apiv1.ResourceCPU
+		utilization.Utilization = cpu
+	} else {
+		utilization.ResourceName = apiv1.ResourceMemory
+		utilization.Utilization = mem
+	}
+
+	return utilization, nil
 }

 func calculateUtilizationOfResource(node *apiv1.Node, nodeInfo *schedulernodeinfo.NodeInfo, resourceName apiv1.ResourceName, skipDaemonSetPods, skipMirrorPods bool) (float64, error) {
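The non-GPU branch now picks the dominant resource explicitly instead of calling math.Max, so it can also record which resource won. A small self-contained sketch of that selection, with invented request/allocatable numbers:

package main

import "fmt"

func main() {
	// Per-resource utilization is requested divided by allocatable.
	cpuUtil := 400.0 / 2000.0 // 0.20 (millicores)
	memUtil := 600.0 / 2000.0 // 0.30 (arbitrary memory units)

	// Mirror of the branch above: ties go to memory.
	resourceName, utilization := "memory", memUtil
	if cpuUtil > memUtil {
		resourceName, utilization = "cpu", cpuUtil
	}
	fmt.Printf("%s utilization %.2f\n", resourceName, utilization) // memory utilization 0.30
}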
@@ -31,6 +31,7 @@ import (
 )

 func TestUtilization(t *testing.T) {
+	gpuLabel := GetGPULabel()
 	pod := BuildTestPod("p1", 100, 200000)
 	pod2 := BuildTestPod("p2", -1, -1)
@@ -38,25 +39,25 @@ func TestUtilization(t *testing.T) {
 	node := BuildTestNode("node1", 2000, 2000000)
 	SetNodeReadyState(node, true, time.Time{})

-	utilInfo, err := CalculateUtilization(node, nodeInfo, false, false)
+	utilInfo, err := CalculateUtilization(node, nodeInfo, false, false, gpuLabel)
 	assert.NoError(t, err)
 	assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)

 	node2 := BuildTestNode("node1", 2000, -1)

-	_, err = CalculateUtilization(node2, nodeInfo, false, false)
+	_, err = CalculateUtilization(node2, nodeInfo, false, false, gpuLabel)
 	assert.Error(t, err)

 	daemonSetPod3 := BuildTestPod("p3", 100, 200000)
 	daemonSetPod3.OwnerReferences = GenerateOwnerReferences("ds", "DaemonSet", "apps/v1", "")

 	nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod, pod2, daemonSetPod3)
-	utilInfo, err = CalculateUtilization(node, nodeInfo, true, false)
+	utilInfo, err = CalculateUtilization(node, nodeInfo, true, false, gpuLabel)
 	assert.NoError(t, err)
 	assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)

 	nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod2, daemonSetPod3)
-	utilInfo, err = CalculateUtilization(node, nodeInfo, false, false)
+	utilInfo, err = CalculateUtilization(node, nodeInfo, false, false, gpuLabel)
 	assert.NoError(t, err)
 	assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)
@@ -66,14 +67,31 @@ func TestUtilization(t *testing.T) {
 	}

 	nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod, pod2, mirrorPod4)
-	utilInfo, err = CalculateUtilization(node, nodeInfo, false, true)
+	utilInfo, err = CalculateUtilization(node, nodeInfo, false, true, gpuLabel)
 	assert.NoError(t, err)
 	assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)

 	nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod2, mirrorPod4)
-	utilInfo, err = CalculateUtilization(node, nodeInfo, false, false)
+	utilInfo, err = CalculateUtilization(node, nodeInfo, false, false, gpuLabel)
 	assert.NoError(t, err)
 	assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)
+
+	gpuNode := BuildTestNode("gpu_node", 2000, 2000000)
+	AddGpusToNode(gpuNode, 1)
+	gpuPod := BuildTestPod("gpu_pod", 100, 200000)
+	RequestGpuForPod(gpuPod, 1)
+	nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod, gpuPod)
+	utilInfo, err = CalculateUtilization(gpuNode, nodeInfo, false, false, gpuLabel)
+	assert.NoError(t, err)
+	assert.InEpsilon(t, 1/1, utilInfo.Utilization, 0.01)
+
+	// Node with Unready GPU
+	gpuNode = BuildTestNode("gpu_node", 2000, 2000000)
+	AddGpuLabelToNode(gpuNode)
+	nodeInfo = schedulernodeinfo.NewNodeInfo(pod, pod)
+	utilInfo, err = CalculateUtilization(gpuNode, nodeInfo, false, false, gpuLabel)
+	assert.NoError(t, err)
+	assert.Zero(t, utilInfo.Utilization)
 }

 func TestFindPlaceAllOk(t *testing.T) {
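The last block exercises the unready-GPU path: the second gpuNode carries the GPU label (via AddGpuLabelToNode) but exposes no allocatable nvidia.com/gpu resource yet, so the error branch in CalculateUtilization reports utilization 0 instead of failing; the node therefore remains eligible for scale down, which is what assert.Zero checks.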
@@ -39,9 +39,10 @@ import (
 func BuildTestPod(name string, cpu int64, mem int64) *apiv1.Pod {
 	pod := &apiv1.Pod{
 		ObjectMeta: metav1.ObjectMeta{
 			Namespace: "default",
 			Name:      name,
 			SelfLink:  fmt.Sprintf("/api/v1/namespaces/default/pods/%s", name),
+			Annotations: map[string]string{},
 		},
 		Spec: apiv1.PodSpec{
 			Containers: []apiv1.Container{
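Initializing Annotations here matters for the new GPU test above: assigning to a nil map panics in Go, so BuildTestPod has to hand out a non-nil map before a test can set the cluster-autoscaler.kubernetes.io/safe-to-evict annotation on p3.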
@@ -128,9 +129,19 @@ func AddGpusToNode(node *apiv1.Node, gpusCount int64) {
 	})
 	node.Status.Capacity[resourceNvidiaGPU] = *resource.NewQuantity(gpusCount, resource.DecimalSI)
 	node.Status.Allocatable[resourceNvidiaGPU] = *resource.NewQuantity(gpusCount, resource.DecimalSI)
+	AddGpuLabelToNode(node)
+}
+
+// AddGpuLabelToNode adds GPULabel to the given node. This is used to mock the intermediate state in which the GPU on a node is not yet ready.
+func AddGpuLabelToNode(node *apiv1.Node) {
 	node.Labels[gpuLabel] = defaultGPUType
 }

+// GetGPULabel returns the GPULabel on the node. This is only used in unit tests.
+func GetGPULabel() string {
+	return gpuLabel
+}
+
 // SetNodeReadyState sets node ready state to either ConditionTrue or ConditionFalse.
 func SetNodeReadyState(node *apiv1.Node, ready bool, lastTransition time.Time) {
 	if ready {