Add podScaleUpDelay annotation support
parent c38cc74604
commit 11d150e920

@@ -33,6 +33,7 @@ this document:
* [How can I scale a node group to 0?](#how-can-i-scale-a-node-group-to-0)
* [How can I prevent Cluster Autoscaler from scaling down a particular node?](#how-can-i-prevent-cluster-autoscaler-from-scaling-down-a-particular-node)
* [How can I prevent Cluster Autoscaler from scaling down non-empty nodes?](#how-can-i-prevent-cluster-autoscaler-from-scaling-down-non-empty-nodes)
* [How can I modify Cluster Autoscaler reaction time?](#how-can-i-modify-cluster-autoscaler-reaction-time)
* [How can I configure overprovisioning with Cluster Autoscaler?](#how-can-i-configure-overprovisioning-with-cluster-autoscaler)
* [How can I enable/disable eviction for a specific DaemonSet](#how-can-i-enabledisable-eviction-for-a-specific-daemonset)
* [How can I enable Cluster Autoscaler to scale up when Node's max volume count is exceeded (CSI migration enabled)?](#how-can-i-enable-cluster-autoscaler-to-scale-up-when-nodes-max-volume-count-is-exceeded-csi-migration-enabled)

@@ -331,6 +332,23 @@ CA might scale down non-empty nodes with utilization below a threshold

To prevent this behavior, set the utilization threshold to `0`.

### How can I modify Cluster Autoscaler reaction time?

There are multiple flags which can be used to configure scale up and scale down delays.

In some environments, you may wish to give the k8s scheduler a bit more time to schedule a pod than the CA's scan-interval.
One way to do this is by setting `--new-pod-scale-up-delay`, which causes the CA to ignore unschedulable pods until they are
a certain "age", regardless of the scan-interval. This setting can be overridden per pod through the
`cluster-autoscaler.kubernetes.io/pod-scale-up-delay` annotation. If k8s has not scheduled them by the end of that delay,
then they may be considered by the CA for a possible scale-up.

```
"cluster-autoscaler.kubernetes.io/pod-scale-up-delay": "600s"
```
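
For illustration, a minimal pod manifest that opts into a longer per-pod delay might look like the sketch below (the pod name, container, and image are placeholders):

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: batch-worker        # placeholder name
  annotations:
    # CA ignores this pod for scale-up until it is at least 600s old.
    cluster-autoscaler.kubernetes.io/pod-scale-up-delay: "600s"
spec:
  containers:
  - name: worker            # placeholder container
    image: registry.k8s.io/pause:3.9
```

For pods created by a Deployment or similar controller, the annotation belongs on the pod template (`spec.template.metadata.annotations`) so that it is propagated to the pods themselves. Note that the per-pod annotation only takes effect when it is larger than `--new-pod-scale-up-delay`; a smaller value is ignored and an error is logged.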

Scaling down of unneeded nodes can be configured by setting `--scale-down-unneeded-time`. Increasing the value will make nodes stay
up longer, waiting for pods to be scheduled, while decreasing the value will make nodes be deleted sooner.
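
As a rough sketch (the image tag and flag values below are arbitrary examples, not recommendations, and a real deployment also needs RBAC, a service account, and cloud-provider flags), these delays are typically set as command-line arguments on the cluster-autoscaler container:

```yaml
# Minimal, illustrative cluster-autoscaler Deployment; values are examples only.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: cluster-autoscaler
  namespace: kube-system
spec:
  replicas: 1
  selector:
    matchLabels:
      app: cluster-autoscaler
  template:
    metadata:
      labels:
        app: cluster-autoscaler
    spec:
      containers:
      - name: cluster-autoscaler
        image: registry.k8s.io/autoscaling/cluster-autoscaler:v1.26.1  # example tag
        command:
        - ./cluster-autoscaler
        - --scan-interval=10s              # how often CA re-evaluates the cluster
        - --new-pod-scale-up-delay=30s     # ignore pods younger than 30s when scaling up
        - --scale-down-unneeded-time=20m   # wait 20m before removing an unneeded node
```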

### How can I configure overprovisioning with Cluster Autoscaler?

The solution below works since version 1.1 (to be shipped with Kubernetes 1.9).

@@ -620,10 +638,7 @@ then this node group may be excluded from future scale-ups.
### How fast is Cluster Autoscaler?

By default, scale-up is considered up to 10 seconds after a pod is marked as unschedulable, and scale-down 10 minutes after a node becomes unneeded.
-There are multiple flags which can be used to configure these thresholds. For example, in some environments, you may wish to give the k8s scheduler
-a bit more time to schedule a pod than the CA's scan-interval. One way to do this is by setting `--new-pod-scale-up-delay`, which causes the CA to
-ignore unschedulable pods until they are a certain "age", regardless of the scan-interval. If k8s has not scheduled them by the end of that delay,
-then they may be considered by the CA for a possible scale-up.
Read [this section](#how-can-i-modify-cluster-autoscaler-reaction-time) to see how you can modify this behaviour.

Assuming default settings, [SLOs described here apply](#what-are-the-service-level-objectives-for-cluster-autoscaler).

@@ -62,6 +62,9 @@ const (

	// NodeUpcomingAnnotation is an annotation CA adds to nodes which are upcoming.
	NodeUpcomingAnnotation = "cluster-autoscaler.k8s.io/upcoming-node"

	// podScaleUpDelayAnnotationKey is an annotation specifying how long a pod can wait to be scaled up.
	podScaleUpDelayAnnotationKey = "cluster-autoscaler.kubernetes.io/pod-scale-up-delay"
)

// StaticAutoscaler is an autoscaler which has all the core functionality of a CA but without the reconfiguration feature

@@ -719,17 +722,32 @@ func (a *StaticAutoscaler) nodeGroupsById() map[string]cloudprovider.NodeGroup {
	return nodeGroups
}

-// don't consider pods newer than newPodScaleUpDelay seconds old as unschedulable
// Don't consider pods newer than newPodScaleUpDelay (or the per-pod
// podScaleUpDelay annotation, if set) as unschedulable.
func (a *StaticAutoscaler) filterOutYoungPods(allUnschedulablePods []*apiv1.Pod, currentTime time.Time) []*apiv1.Pod {
	var oldUnschedulablePods []*apiv1.Pod
	newPodScaleUpDelay := a.AutoscalingOptions.NewPodScaleUpDelay
	for _, pod := range allUnschedulablePods {
		podAge := currentTime.Sub(pod.CreationTimestamp.Time)
-		if podAge > newPodScaleUpDelay {
		podScaleUpDelay := newPodScaleUpDelay

		if podScaleUpDelayAnnotationStr, ok := pod.Annotations[podScaleUpDelayAnnotationKey]; ok {
			podScaleUpDelayAnnotation, err := time.ParseDuration(podScaleUpDelayAnnotationStr)
			if err != nil {
				klog.Errorf("Failed to parse pod %q annotation %s: %v", pod.Name, podScaleUpDelayAnnotationKey, err)
			} else {
				if podScaleUpDelayAnnotation < podScaleUpDelay {
					klog.Errorf("Failed to set pod scale up delay for %q through annotation %s: %v is less than %v", pod.Name, podScaleUpDelayAnnotationKey, podScaleUpDelayAnnotation, newPodScaleUpDelay)
				} else {
					podScaleUpDelay = podScaleUpDelayAnnotation
				}
			}
		}

		if podAge > podScaleUpDelay {
			oldUnschedulablePods = append(oldUnschedulablePods, pod)
		} else {
			klog.V(3).Infof("Pod %s is %.3f seconds old, too new to consider unschedulable", pod.Name, podAge.Seconds())
		}
	}
	return oldUnschedulablePods

@@ -17,7 +17,10 @@ limitations under the License.
package core

import (
	"bytes"
	"flag"
	"fmt"
	"os"
	"reflect"
	"strings"
	"testing"

@@ -46,6 +49,7 @@ import (
	appsv1 "k8s.io/api/apps/v1"
	apiv1 "k8s.io/api/core/v1"
	policyv1 "k8s.io/api/policy/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/client-go/kubernetes/fake"
	v1appslister "k8s.io/client-go/listers/apps/v1"

@@ -952,7 +956,6 @@ func TestStaticAutoscalerRunOnceWithFilteringOnUpcomingNodesEnabledNoScaleUp(t *
}

func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
	// setup
	provider := &mockprovider.CloudProvider{}

@@ -1377,6 +1380,103 @@ func TestSubtractNodes(t *testing.T) {
	}
}

func TestFilterOutYoungPods(t *testing.T) {
	now := time.Now()
	klog.InitFlags(nil)
	flag.CommandLine.Parse([]string{"--logtostderr=false"})

	p1 := BuildTestPod("p1", 500, 1000)
	p1.CreationTimestamp = metav1.NewTime(now.Add(-1 * time.Minute))
	p2 := BuildTestPod("p2", 500, 1000)
	p2.CreationTimestamp = metav1.NewTime(now.Add(-1 * time.Minute))
	p2.Annotations = map[string]string{
		podScaleUpDelayAnnotationKey: "5m",
	}
	p3 := BuildTestPod("p3", 500, 1000)
	p3.CreationTimestamp = metav1.NewTime(now.Add(-1 * time.Minute))
	p3.Annotations = map[string]string{
		podScaleUpDelayAnnotationKey: "2m",
	}
	p4 := BuildTestPod("p4", 500, 1000)
	p4.CreationTimestamp = metav1.NewTime(now.Add(-1 * time.Minute))
	p4.Annotations = map[string]string{
		podScaleUpDelayAnnotationKey: "error",
	}

	tests := []struct {
		name               string
		newPodScaleUpDelay time.Duration
		runTime            time.Time
		pods               []*apiv1.Pod
		expectedPods       []*apiv1.Pod
		expectedError      string
	}{
		{
			name:               "annotation delayed pod checking now",
			newPodScaleUpDelay: 0,
			runTime:            now,
			pods:               []*apiv1.Pod{p1, p2},
			expectedPods:       []*apiv1.Pod{p1},
		},
		{
			name:               "annotation delayed pod checking after delay",
			newPodScaleUpDelay: 0,
			runTime:            now.Add(5 * time.Minute),
			pods:               []*apiv1.Pod{p1, p2},
			expectedPods:       []*apiv1.Pod{p1, p2},
		},
		{
			name:               "globally delayed pods",
			newPodScaleUpDelay: 5 * time.Minute,
			runTime:            now,
			pods:               []*apiv1.Pod{p1, p2},
			expectedPods:       []*apiv1.Pod(nil),
		},
		{
			name:               "annotation delay smaller than global",
			newPodScaleUpDelay: 5 * time.Minute,
			runTime:            now.Add(2 * time.Minute),
			pods:               []*apiv1.Pod{p1, p3},
			expectedPods:       []*apiv1.Pod(nil),
			expectedError:      "Failed to set pod scale up delay for",
		},
		{
			name:               "annotation delay with error",
			newPodScaleUpDelay: 0,
			runTime:            now,
			pods:               []*apiv1.Pod{p1, p4},
			expectedPods:       []*apiv1.Pod{p1, p4},
			expectedError:      "Failed to parse pod",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			context := context.AutoscalingContext{
				AutoscalingOptions: config.AutoscalingOptions{
					NewPodScaleUpDelay: tt.newPodScaleUpDelay,
				},
			}
			autoscaler := &StaticAutoscaler{
				AutoscalingContext: &context,
			}

			var buf bytes.Buffer
			klog.SetOutput(&buf)
			defer func() {
				klog.SetOutput(os.Stderr)
			}()

			actual := autoscaler.filterOutYoungPods(tt.pods, tt.runTime)

			assert.Equal(t, tt.expectedPods, actual)
			if tt.expectedError != "" {
				assert.Contains(t, buf.String(), tt.expectedError)
			}
		})
	}
}

func nodeNames(ns []*apiv1.Node) []string {
	names := make([]string, len(ns))
	for i, node := range ns {

@@ -176,7 +176,7 @@ var (
	unremovableNodeRecheckTimeout = flag.Duration("unremovable-node-recheck-timeout", 5*time.Minute, "The timeout before we check again a node that couldn't be removed before")
	expendablePodsPriorityCutoff = flag.Int("expendable-pods-priority-cutoff", -10, "Pods with priority below cutoff will be expendable. They can be killed without any consideration during scale down and they don't cause scale up. Pods with null priority (PodPriority disabled) are non expendable.")
	regional = flag.Bool("regional", false, "Cluster is regional.")
-	newPodScaleUpDelay = flag.Duration("new-pod-scale-up-delay", 0*time.Second, "Pods less than this old will not be considered for scale-up.")
	newPodScaleUpDelay = flag.Duration("new-pod-scale-up-delay", 0*time.Second, "Pods less than this old will not be considered for scale-up. Can be increased for individual pods through the annotation 'cluster-autoscaler.kubernetes.io/pod-scale-up-delay'.")

	ignoreTaintsFlag = multiStringFlag("ignore-taint", "Specifies a taint to ignore in node templates when considering to scale a node group")
	balancingIgnoreLabelsFlag = multiStringFlag("balancing-ignore-label", "Specifies a label to ignore in addition to the basic and cloud-provider set of labels when comparing if two node groups are similar")