Skip iteration if pending pods are too new

Marcin Wielgus 2017-12-20 21:19:10 +01:00
parent 8225983307
commit 15b10c8f67
3 changed files with 38 additions and 1 deletion

View File

@@ -32,6 +32,11 @@ import (
"github.com/golang/glog"
)
const (
// How old the oldest unschedulable pod should be before starting scale up.
unschedulablePodTimeBuffer = 2 * time.Second
)
// StaticAutoscaler is an autoscaler which has all the core functionality of a CA but without the reconfiguration feature
type StaticAutoscaler struct {
// AutoscalingContext consists of validated settings and options for this autoscaler
@@ -240,10 +245,19 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
glog.V(4).Info("No schedulable pods")
}
// If all pending pods are new, we may want to skip a real scale down (just as if the pods had been handled).
allPendingPodsToHelpAreNew := false
if len(unschedulablePodsToHelp) == 0 {
glog.V(1).Info("No unschedulable pods")
} else if a.MaxNodesTotal > 0 && len(readyNodes) >= a.MaxNodesTotal {
glog.V(1).Info("Max total nodes in cluster reached")
} else if getOldestCreateTime(unschedulablePodsToHelp).Add(unschedulablePodTimeBuffer).After(currentTime) {
// The assumption here is that these pods have been created very recently and there are probably
// more pods to come. In theory we could check the newest pod time, but then if pods were created
// slowly, at a pace of one every 2 seconds, no scale up would be triggered for a long time.
allPendingPodsToHelpAreNew = true
glog.V(1).Info("Unschedulable pods are very new, waiting one iteration for more")
} else {
daemonsets, err := a.ListerRegistry.DaemonSetLister().List()
if err != nil {
@@ -301,7 +315,8 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
a.lastScaleDownFailTime.Add(a.ScaleDownDelayAfterFailure).After(currentTime) ||
a.lastScaleDownDeleteTime.Add(a.ScaleDownDelayAfterDelete).After(currentTime) ||
schedulablePodsPresent ||
scaleDown.nodeDeleteStatus.IsDeleteInProgress()
scaleDown.nodeDeleteStatus.IsDeleteInProgress() ||
allPendingPodsToHelpAreNew
glog.V(4).Infof("Scale down status: unneededOnly=%v lastScaleUpTime=%s "+
"lastScaleDownDeleteTime=%v lastScaleDownFailTime=%s schedulablePodsPresent=%v isDeleteInProgress=%v",

View File

@@ -491,3 +491,13 @@ func UpdateClusterStateMetrics(csr *clusterstate.ClusterStateRegistry) {
readiness := csr.GetClusterReadiness()
metrics.UpdateNodesCount(readiness.Ready, readiness.Unready+readiness.LongNotStarted, readiness.NotStarted)
}
// getOldestCreateTime returns the oldest creation timestamp among the given pods;
// for an empty list it returns time.Now().
func getOldestCreateTime(pods []*apiv1.Pod) time.Time {
oldest := time.Now()
for _, pod := range pods {
if oldest.After(pod.CreationTimestamp.Time) {
oldest = pod.CreationTimestamp.Time
}
}
return oldest
}

View File

@@ -527,3 +527,15 @@ func TestGetNodeCoresAndMemory(t *testing.T) {
_, _, err = getNodeCoresAndMemory(node)
assert.Error(t, err)
}
func TestGetOldestPod(t *testing.T) {
p1 := BuildTestPod("p1", 500, 1000)
p1.CreationTimestamp = metav1.NewTime(time.Now().Add(-1 * time.Minute))
p2 := BuildTestPod("p2", 500, 1000)
p2.CreationTimestamp = metav1.NewTime(time.Now().Add(+1 * time.Minute))
p3 := BuildTestPod("p3", 500, 1000)
p3.CreationTimestamp = metav1.NewTime(time.Now())
assert.Equal(t, p1.CreationTimestamp.Time, getOldestCreateTime([]*apiv1.Pod{p1, p2, p3}))
assert.Equal(t, p1.CreationTimestamp.Time, getOldestCreateTime([]*apiv1.Pod{p3, p2, p1}))
}