From 2cd532ebfef42ad76b20588ea496b13dfd31a500 Mon Sep 17 00:00:00 2001
From: Marcin Wielgus
Date: Tue, 20 Jun 2017 16:37:35 +0200
Subject: [PATCH] Don't calculate utilization and run scale down simulations
 for unmanaged nodes

---
 cluster-autoscaler/core/scale_down.go        |  9 ++++++---
 cluster-autoscaler/core/scale_down_test.go   | 19 +++++++++++++------
 cluster-autoscaler/core/static_autoscaler.go |  4 +++-
 cluster-autoscaler/core/utils.go             | 18 ++++++++++++++++++
 4 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/cluster-autoscaler/core/scale_down.go b/cluster-autoscaler/core/scale_down.go
index 18c8bb9a92..d3b5b105e4 100644
--- a/cluster-autoscaler/core/scale_down.go
+++ b/cluster-autoscaler/core/scale_down.go
@@ -109,9 +109,11 @@ func (sd *ScaleDown) CleanUpUnneededNodes() {
 
 // UpdateUnneededNodes calculates which nodes are not needed, i.e. all pods can be scheduled somewhere else,
 // and updates unneededNodes map accordingly. It also computes information where pods can be rescheduled and
-// node utilization level. Timestamp is the current timestamp.
+// node utilization level. Timestamp is the current timestamp. The computations are made only for the nodes
+// managed by CA.
 func (sd *ScaleDown) UpdateUnneededNodes(
 	nodes []*apiv1.Node,
+	managedNodes []*apiv1.Node,
 	pods []*apiv1.Pod,
 	timestamp time.Time,
 	pdbs []*policyv1.PodDisruptionBudget) errors.AutoscalerError {
@@ -120,8 +122,9 @@ func (sd *ScaleDown) UpdateUnneededNodes(
 	nodeNameToNodeInfo := schedulercache.CreateNodeNameToInfoMap(pods, nodes)
 	utilizationMap := make(map[string]float64)
 
-	// Phase1 - look at the nodes utilization.
-	for _, node := range nodes {
+	// Phase1 - look at the nodes utilization. Calculate the utilization
+	// only for the managed nodes.
+	for _, node := range managedNodes {
 		// Skip nodes marked to be deleted, if they were marked recently.
 		// Old-time marked nodes are again eligible for deletion - something went wrong with them
diff --git a/cluster-autoscaler/core/scale_down_test.go b/cluster-autoscaler/core/scale_down_test.go
index 80858c95bb..69cf819738 100644
--- a/cluster-autoscaler/core/scale_down_test.go
+++ b/cluster-autoscaler/core/scale_down_test.go
@@ -86,7 +86,7 @@ func TestFindUnneededNodes(t *testing.T) {
 		LogRecorder: fakeLogRecorder,
 	}
 	sd := NewScaleDown(&context)
-	sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Pod{p1, p2, p3, p4}, time.Now(), nil)
+	sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Pod{p1, p2, p3, p4}, time.Now(), nil)
 
 	assert.Equal(t, 1, len(sd.unneededNodes))
 	addTime, found := sd.unneededNodes["n2"]
@@ -95,13 +95,16 @@ func TestFindUnneededNodes(t *testing.T) {
 	assert.Equal(t, 4, len(sd.nodeUtilizationMap))
 
 	sd.unneededNodes["n1"] = time.Now()
-	sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Pod{p1, p2, p3, p4}, time.Now(), nil)
+	sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Pod{p1, p2, p3, p4}, time.Now(), nil)
 
 	assert.Equal(t, 1, len(sd.unneededNodes))
 	addTime2, found := sd.unneededNodes["n2"]
 	assert.True(t, found)
 	assert.Equal(t, addTime, addTime2)
 	assert.Equal(t, 4, len(sd.nodeUtilizationMap))
+
+	sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Node{n1, n3, n4}, []*apiv1.Pod{p1, p2, p3, p4}, time.Now(), nil)
+	assert.Equal(t, 0, len(sd.unneededNodes))
 }
 
 func TestDrainNode(t *testing.T) {
@@ -288,7 +291,8 @@ func TestScaleDown(t *testing.T) {
 		LogRecorder: fakeLogRecorder,
 	}
 	scaleDown := NewScaleDown(context)
-	scaleDown.UpdateUnneededNodes([]*apiv1.Node{n1, n2}, []*apiv1.Pod{p1, p2}, time.Now().Add(-5*time.Minute), nil)
+	scaleDown.UpdateUnneededNodes([]*apiv1.Node{n1, n2},
+		[]*apiv1.Node{n1, n2}, []*apiv1.Pod{p1, p2}, time.Now().Add(-5*time.Minute), nil)
 	result, err := scaleDown.TryToScaleDown([]*apiv1.Node{n1, n2}, []*apiv1.Pod{p1, p2}, nil)
 	assert.NoError(t, err)
 	assert.Equal(t, ScaleDownNodeDeleted, result)
@@ -349,7 +353,8 @@ func TestNoScaleDownUnready(t *testing.T) {
 
 	// N1 is unready so it requires a bigger unneeded time.
 	scaleDown := NewScaleDown(context)
-	scaleDown.UpdateUnneededNodes([]*apiv1.Node{n1, n2}, []*apiv1.Pod{p2}, time.Now().Add(-5*time.Minute), nil)
+	scaleDown.UpdateUnneededNodes([]*apiv1.Node{n1, n2},
+		[]*apiv1.Node{n1, n2}, []*apiv1.Pod{p2}, time.Now().Add(-5*time.Minute), nil)
 	result, err := scaleDown.TryToScaleDown([]*apiv1.Node{n1, n2}, []*apiv1.Pod{p2}, nil)
 	assert.NoError(t, err)
 	assert.Equal(t, ScaleDownNoUnneeded, result)
@@ -368,7 +373,8 @@
 	// N1 has been unready for 2 hours, ok to delete.
 	context.CloudProvider = provider
 	scaleDown = NewScaleDown(context)
-	scaleDown.UpdateUnneededNodes([]*apiv1.Node{n1, n2}, []*apiv1.Pod{p2}, time.Now().Add(-2*time.Hour), nil)
+	scaleDown.UpdateUnneededNodes([]*apiv1.Node{n1, n2}, []*apiv1.Node{n1, n2},
+		[]*apiv1.Pod{p2}, time.Now().Add(-2*time.Hour), nil)
 	result, err = scaleDown.TryToScaleDown([]*apiv1.Node{n1, n2}, []*apiv1.Pod{p2}, nil)
 	assert.NoError(t, err)
 	assert.Equal(t, ScaleDownNodeDeleted, result)
@@ -451,7 +457,8 @@ func TestScaleDownNoMove(t *testing.T) {
 		LogRecorder: fakeLogRecorder,
 	}
 	scaleDown := NewScaleDown(context)
-	scaleDown.UpdateUnneededNodes([]*apiv1.Node{n1, n2}, []*apiv1.Pod{p1, p2}, time.Now().Add(5*time.Minute), nil)
+	scaleDown.UpdateUnneededNodes([]*apiv1.Node{n1, n2}, []*apiv1.Node{n1, n2},
+		[]*apiv1.Pod{p1, p2}, time.Now().Add(5*time.Minute), nil)
 	result, err := scaleDown.TryToScaleDown([]*apiv1.Node{n1, n2}, []*apiv1.Pod{p1, p2}, nil)
 	assert.NoError(t, err)
 	assert.Equal(t, ScaleDownNoUnneeded, result)
diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go
index 48c9cc7de3..2b5ecc85f0 100644
--- a/cluster-autoscaler/core/static_autoscaler.go
+++ b/cluster-autoscaler/core/static_autoscaler.go
@@ -267,7 +267,9 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
 	glog.V(4).Infof("Calculating unneeded nodes")
 
 	scaleDown.CleanUp(time.Now())
-	typedErr := scaleDown.UpdateUnneededNodes(allNodes, allScheduled, time.Now(), pdbs)
+	managedNodes := getManagedNodes(autoscalingContext, allNodes)
+
+	typedErr := scaleDown.UpdateUnneededNodes(allNodes, managedNodes, allScheduled, time.Now(), pdbs)
 	if typedErr != nil {
 		glog.Errorf("Failed to scale down: %v", typedErr)
 		return typedErr
diff --git a/cluster-autoscaler/core/utils.go b/cluster-autoscaler/core/utils.go
index c805de997a..23aacb5b29 100644
--- a/cluster-autoscaler/core/utils.go
+++ b/cluster-autoscaler/core/utils.go
@@ -329,3 +329,21 @@ func fixNodeGroupSize(context *AutoscalingContext, currentTime time.Time) (bool,
 	}
 	return fixed, nil
 }
+
+// getManagedNodes returns the nodes managed by the cluster autoscaler.
+func getManagedNodes(context *AutoscalingContext, nodes []*apiv1.Node) []*apiv1.Node {
+	result := make([]*apiv1.Node, 0, len(nodes))
+	for _, node := range nodes {
+		nodeGroup, err := context.CloudProvider.NodeGroupForNode(node)
+		if err != nil {
+			glog.Warningf("Error while checking node group for %s: %v", node.Name, err)
+			continue
+		}
+		if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() {
+			glog.V(4).Infof("Skipping %s - no node group config", node.Name)
+			continue
+		}
+		result = append(result, node)
+	}
+	return result
+}
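
Note (not part of the commit): a minimal, self-contained sketch of the filtering idea behind getManagedNodes, using simplified stand-in types rather than the real cluster-autoscaler/cloudprovider interfaces. The names nodeGroup, cloudProvider, fakeProvider, and filterManagedNodes, and the use of plain node names instead of *apiv1.Node, are illustrative assumptions, not code from this patch.

    package main

    import "fmt"

    // Simplified stand-ins for the autoscaler's cloudprovider types; the real
    // interfaces operate on *apiv1.Node rather than plain node names.
    type nodeGroup struct{ name string }

    type cloudProvider interface {
    	// NodeGroupForNode returns nil when the node does not belong to any
    	// node group, i.e. it is not managed by the autoscaler.
    	NodeGroupForNode(nodeName string) (*nodeGroup, error)
    }

    type fakeProvider struct{ groups map[string]*nodeGroup }

    func (p *fakeProvider) NodeGroupForNode(nodeName string) (*nodeGroup, error) {
    	return p.groups[nodeName], nil
    }

    // filterManagedNodes mirrors the idea behind getManagedNodes in utils.go:
    // keep only the nodes for which the cloud provider reports a node group,
    // so utilization checks and scale-down simulations skip unmanaged nodes.
    func filterManagedNodes(provider cloudProvider, nodes []string) []string {
    	managed := make([]string, 0, len(nodes))
    	for _, node := range nodes {
    		group, err := provider.NodeGroupForNode(node)
    		if err != nil || group == nil {
    			// Errored or unmanaged nodes are left out of the result.
    			continue
    		}
    		managed = append(managed, node)
    	}
    	return managed
    }

    func main() {
    	provider := &fakeProvider{groups: map[string]*nodeGroup{
    		"n1": {name: "ng1"},
    		"n2": {name: "ng1"},
    	}}
    	// "n3" has no node group, so it is excluded from the managed set.
    	fmt.Println(filterManagedNodes(provider, []string{"n1", "n2", "n3"}))
    }

The real getManagedNodes additionally checks reflect.ValueOf(nodeGroup).IsNil(), because NodeGroupForNode returns an interface value that can wrap a typed nil; the sketch above omits that detail for brevity.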