Delay scale-up including GPU request

Nodes with GPUs are expensive, and it's likely that a batch of pods
using them will be created together. In this case we can wait a bit
for all of the pods to be created and make a more efficient
scale-up decision.
Maciej Pytel 2018-03-02 15:55:04 +01:00
parent d876d74912
commit abbc45da2e
4 changed files with 50 additions and 1 deletion
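
The gist of the change: scale-up is held back while all pending pods are still
very fresh (the existing 2-second buffer), and additionally while any pending pod
requesting a GPU is younger than a longer 30-second buffer. Below is a minimal,
self-contained sketch of that timing decision; the holdScaleUp helper and the ages
used here are illustrative only, not part of the commit.

package main

import (
	"fmt"
	"time"
)

const (
	podTimeBuffer    = 2 * time.Second  // existing buffer for all unschedulable pods
	gpuPodTimeBuffer = 30 * time.Second // longer buffer when GPU-requesting pods are pending
)

// holdScaleUp mirrors the logic of allPodsAreNew in the diff below: wait if every
// pending pod is younger than podTimeBuffer, or if a GPU-requesting pod is
// younger than gpuPodTimeBuffer.
func holdScaleUp(oldestPodAge, oldestGpuPodAge time.Duration, gpuPodPending bool) bool {
	if oldestPodAge < podTimeBuffer {
		return true
	}
	return gpuPodPending && oldestGpuPodAge < gpuPodTimeBuffer
}

func main() {
	// CPU-only pod created 10s ago: the 2s buffer has passed, scale up immediately.
	fmt.Println(holdScaleUp(10*time.Second, 0, false)) // false
	// GPU pod created 10s ago: still within the 30s GPU buffer, keep waiting.
	fmt.Println(holdScaleUp(10*time.Second, 10*time.Second, true)) // true
}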


@@ -37,6 +37,10 @@ import (
const (
// How old the oldest unschedulable pod should be before starting scale up.
unschedulablePodTimeBuffer = 2 * time.Second
// How old the oldest unschedulable pod with GPU should be before starting scale up.
// The idea is that nodes with GPU are very expensive and we're ready to sacrifice
// a bit more latency to wait for more pods and make a more informed scale-up decision.
unschedulablePodWithGpuTimeBuffer = 30 * time.Second
// How long should Cluster Autoscaler wait for nodes to become ready after start.
nodesNotReadyAfterStartTimeout = 10 * time.Minute
)
@@ -274,7 +278,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
glog.V(1).Info("No unschedulable pods")
} else if a.MaxNodesTotal > 0 && len(readyNodes) >= a.MaxNodesTotal {
glog.V(1).Info("Max total nodes in cluster reached")
} else if getOldestCreateTime(unschedulablePodsToHelp).Add(unschedulablePodTimeBuffer).After(currentTime) {
} else if allPodsAreNew(unschedulablePodsToHelp, currentTime) {
// The assumption here is that these pods have been created very recently and probably there
// are more pods to come. In theory we could check the newest pod's creation time, but then if pods
// were created slowly, at the pace of one every 2 seconds, no scale-up would be triggered for a long time.
@@ -384,3 +388,11 @@ func (a *StaticAutoscaler) ExitCleanUp() {
}
utils.DeleteStatusConfigMap(a.AutoscalingContext.ClientSet, a.AutoscalingContext.ConfigNamespace)
}
func allPodsAreNew(pods []*apiv1.Pod, currentTime time.Time) bool {
if getOldestCreateTime(pods).Add(unschedulablePodTimeBuffer).After(currentTime) {
return true
}
found, oldest := getOldestCreateTimeWithGpu(pods)
return found && oldest.Add(unschedulablePodWithGpuTimeBuffer).After(currentTime)
}
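
A hypothetical unit test for allPodsAreNew, modeled on the repo's existing test
helpers. The test name, pod ages, file placement in package core, and the exact
import paths (including the gpu.ResourceNvidiaGPU reference) are assumptions for
illustration, not part of this commit.

package core

import (
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	apiv1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
	"k8s.io/autoscaler/cluster-autoscaler/utils/test"
)

func TestAllPodsAreNew(t *testing.T) {
	now := time.Now()
	cpuPod := test.BuildTestPod("cpuPod", 500, 1000)
	cpuPod.CreationTimestamp = metav1.NewTime(now.Add(-10 * time.Second))
	gpuPod := test.BuildTestPod("gpuPod", 500, 1000)
	gpuPod.CreationTimestamp = metav1.NewTime(now.Add(-10 * time.Second))
	gpuPod.Spec.Containers[0].Resources.Requests[gpu.ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)

	// A 10s-old CPU-only pod is past the 2s buffer, so the pods no longer count as new.
	assert.False(t, allPodsAreNew([]*apiv1.Pod{cpuPod}, now))
	// A 10s-old GPU pod is still within the 30s GPU buffer, so scale-up keeps waiting.
	assert.True(t, allPodsAreNew([]*apiv1.Pod{gpuPod, cpuPod}, now))
}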


@@ -32,6 +32,7 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/utils/deletetaint"
"k8s.io/autoscaler/cluster-autoscaler/utils/drain"
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
scheduler_util "k8s.io/autoscaler/cluster-autoscaler/utils/scheduler"
@@ -517,6 +518,20 @@ func getOldestCreateTime(pods []*apiv1.Pod) time.Time {
return oldest
}
func getOldestCreateTimeWithGpu(pods []*apiv1.Pod) (bool, time.Time) {
oldest := time.Now()
gpuFound := false
for _, pod := range pods {
if gpu.PodRequestsGpu(pod) {
gpuFound = true
if oldest.After(pod.CreationTimestamp.Time) {
oldest = pod.CreationTimestamp.Time
}
}
}
return gpuFound, oldest
}
// UpdateEmptyClusterStateMetrics updates metrics related to empty cluster's state.
// TODO(aleksandra-malinowska): use long unregistered value from ClusterStateRegistry.
func UpdateEmptyClusterStateMetrics() {


@@ -101,6 +101,19 @@ func NodeHasGpu(node *apiv1.Node) bool {
return hasGpuLabel || (hasGpuAllocatable && !gpuAllocatable.IsZero())
}
// PodRequestsGpu returns true if a given pod has a GPU request.
func PodRequestsGpu(pod *apiv1.Pod) bool {
for _, container := range pod.Spec.Containers {
if container.Resources.Requests != nil {
_, gpuFound := container.Resources.Requests[ResourceNvidiaGPU]
if gpuFound {
return true
}
}
}
return false
}
// GpuRequestInfo contains an information about a set of pods requesting a GPU.
type GpuRequestInfo struct {
// MaxRequest is maximum GPU request among pods


@@ -194,6 +194,15 @@ func TestNodeHasGpu(t *testing.T) {
assert.False(t, NodeHasGpu(nodeNoGpu))
}
func TestPodRequestsGpu(t *testing.T) {
podNoGpu := test.BuildTestPod("podNoGpu", 0, 1000)
podWithGpu := test.BuildTestPod("pod1AnyGpu", 0, 1000)
podWithGpu.Spec.Containers[0].Resources.Requests[ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
assert.False(t, PodRequestsGpu(podNoGpu))
assert.True(t, PodRequestsGpu(podWithGpu))
}
func TestGetGpuRequests(t *testing.T) {
podNoGpu := test.BuildTestPod("podNoGpu", 0, 1000)
podNoGpu.Spec.NodeSelector = map[string]string{}