Delay scale-up including GPU request

Nodes with GPUs are expensive, and it's likely that a batch of pods
using them will be created together. In this case we can wait a bit
for all of the pods to be created and make a more efficient
scale-up decision.
Maciej Pytel 2018-03-02 15:55:04 +01:00
parent d876d74912
commit abbc45da2e
4 changed files with 50 additions and 1 deletion
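
The gist of the change: scale-up is held back while all pending pods are still
very fresh (the existing 2-second buffer), and additionally while any pending pod
requesting a GPU is younger than a longer 30-second buffer. Below is a minimal,
self-contained sketch of that timing decision; the holdScaleUp helper and the ages
used here are illustrative only, not part of the commit.

package main

import (
	"fmt"
	"time"
)

const (
	podTimeBuffer    = 2 * time.Second  // existing buffer for all unschedulable pods
	gpuPodTimeBuffer = 30 * time.Second // longer buffer when GPU-requesting pods are pending
)

// holdScaleUp mirrors the logic of allPodsAreNew in the diff below: wait if every
// pending pod is younger than podTimeBuffer, or if a GPU-requesting pod is
// younger than gpuPodTimeBuffer.
func holdScaleUp(oldestPodAge, oldestGpuPodAge time.Duration, gpuPodPending bool) bool {
	if oldestPodAge < podTimeBuffer {
		return true
	}
	return gpuPodPending && oldestGpuPodAge < gpuPodTimeBuffer
}

func main() {
	// CPU-only pod created 10s ago: the 2s buffer has passed, scale up immediately.
	fmt.Println(holdScaleUp(10*time.Second, 0, false)) // false
	// GPU pod created 10s ago: still within the 30s GPU buffer, keep waiting.
	fmt.Println(holdScaleUp(10*time.Second, 10*time.Second, true)) // true
}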


@@ -37,6 +37,10 @@ import (
const (
// How old the oldest unschedulable pod should be before starting scale up.
unschedulablePodTimeBuffer = 2 * time.Second
// How old the oldest unschedulable pod with GPU should be before starting scale up.
// The idea is that nodes with GPU are very expensive and we're ready to sacrifice
// a bit more latency to wait for more pods and make a more informed scale-up decision.
unschedulablePodWithGpuTimeBuffer = 30 * time.Second
// How long should Cluster Autoscaler wait for nodes to become ready after start.
nodesNotReadyAfterStartTimeout = 10 * time.Minute
)
@@ -274,7 +278,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
glog.V(1).Info("No unschedulable pods")
} else if a.MaxNodesTotal > 0 && len(readyNodes) >= a.MaxNodesTotal {
glog.V(1).Info("Max total nodes in cluster reached")
} else if getOldestCreateTime(unschedulablePodsToHelp).Add(unschedulablePodTimeBuffer).After(currentTime) {
} else if allPodsAreNew(unschedulablePodsToHelp, currentTime) {
// The assumption here is that these pods have been created very recently and probably there
// are more pods to come. In theory we could check the newest pod's creation time, but then if pods
// were created slowly, at the pace of one every 2 seconds, no scale-up would be triggered for a long time.
@@ -384,3 +388,11 @@ func (a *StaticAutoscaler) ExitCleanUp() {
}
utils.DeleteStatusConfigMap(a.AutoscalingContext.ClientSet, a.AutoscalingContext.ConfigNamespace)
}
func allPodsAreNew(pods []*apiv1.Pod, currentTime time.Time) bool {
if getOldestCreateTime(pods).Add(unschedulablePodTimeBuffer).After(currentTime) {
return true
}
found, oldest := getOldestCreateTimeWithGpu(pods)
return found && oldest.Add(unschedulablePodWithGpuTimeBuffer).After(currentTime)
}
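
A hypothetical unit test for allPodsAreNew, modeled on the repo's existing test
helpers. The test name, pod ages, file placement in package core, and the exact
import paths (including the gpu.ResourceNvidiaGPU reference) are assumptions for
illustration, not part of this commit.

package core

import (
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	apiv1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
	"k8s.io/autoscaler/cluster-autoscaler/utils/test"
)

func TestAllPodsAreNew(t *testing.T) {
	now := time.Now()
	cpuPod := test.BuildTestPod("cpuPod", 500, 1000)
	cpuPod.CreationTimestamp = metav1.NewTime(now.Add(-10 * time.Second))
	gpuPod := test.BuildTestPod("gpuPod", 500, 1000)
	gpuPod.CreationTimestamp = metav1.NewTime(now.Add(-10 * time.Second))
	gpuPod.Spec.Containers[0].Resources.Requests[gpu.ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)

	// A 10s-old CPU-only pod is past the 2s buffer, so the pods no longer count as new.
	assert.False(t, allPodsAreNew([]*apiv1.Pod{cpuPod}, now))
	// A 10s-old GPU pod is still within the 30s GPU buffer, so scale-up keeps waiting.
	assert.True(t, allPodsAreNew([]*apiv1.Pod{gpuPod, cpuPod}, now))
}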


@@ -32,6 +32,7 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/utils/deletetaint"
"k8s.io/autoscaler/cluster-autoscaler/utils/drain"
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
scheduler_util "k8s.io/autoscaler/cluster-autoscaler/utils/scheduler"
@@ -517,6 +518,20 @@ func getOldestCreateTime(pods []*apiv1.Pod) time.Time {
return oldest
}
func getOldestCreateTimeWithGpu(pods []*apiv1.Pod) (bool, time.Time) {
oldest := time.Now()
gpuFound := false
for _, pod := range pods {
if gpu.PodRequestsGpu(pod) {
gpuFound = true
if oldest.After(pod.CreationTimestamp.Time) {
oldest = pod.CreationTimestamp.Time
}
}
}
return gpuFound, oldest
}
// UpdateEmptyClusterStateMetrics updates metrics related to empty cluster's state.
// TODO(aleksandra-malinowska): use long unregistered value from ClusterStateRegistry.
func UpdateEmptyClusterStateMetrics() {


@@ -101,6 +101,19 @@ func NodeHasGpu(node *apiv1.Node) bool {
return hasGpuLabel || (hasGpuAllocatable && !gpuAllocatable.IsZero())
}
// PodRequestsGpu returns true if a given pod has a GPU request.
func PodRequestsGpu(pod *apiv1.Pod) bool {
for _, container := range pod.Spec.Containers {
if container.Resources.Requests != nil {
_, gpuFound := container.Resources.Requests[ResourceNvidiaGPU]
if gpuFound {
return true
}
}
}
return false
}
// GpuRequestInfo contains an information about a set of pods requesting a GPU.
type GpuRequestInfo struct {
// MaxRequest is maximum GPU request among pods


@@ -194,6 +194,15 @@ func TestNodeHasGpu(t *testing.T) {
assert.False(t, NodeHasGpu(nodeNoGpu))
}
func TestPodRequestsGpu(t *testing.T) {
podNoGpu := test.BuildTestPod("podNoGpu", 0, 1000)
podWithGpu := test.BuildTestPod("pod1AnyGpu", 0, 1000)
podWithGpu.Spec.Containers[0].Resources.Requests[ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
assert.False(t, PodRequestsGpu(podNoGpu))
assert.True(t, PodRequestsGpu(podWithGpu))
}
func TestGetGpuRequests(t *testing.T) {
podNoGpu := test.BuildTestPod("podNoGpu", 0, 1000)
podNoGpu.Spec.NodeSelector = map[string]string{}