From 2cd532ebfef42ad76b20588ea496b13dfd31a500 Mon Sep 17 00:00:00 2001
From: Marcin Wielgus
Date: Tue, 20 Jun 2017 16:37:35 +0200
Subject: [PATCH] Don't calculate utilization and run scale down simulations
 for unmanaged nodes

---
 cluster-autoscaler/core/scale_down.go        |  9 ++++++---
 cluster-autoscaler/core/scale_down_test.go   | 19 +++++++++++++------
 cluster-autoscaler/core/static_autoscaler.go |  4 +++-
 cluster-autoscaler/core/utils.go             | 18 ++++++++++++++++++
 4 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/cluster-autoscaler/core/scale_down.go b/cluster-autoscaler/core/scale_down.go
index 18c8bb9a92..d3b5b105e4 100644
--- a/cluster-autoscaler/core/scale_down.go
+++ b/cluster-autoscaler/core/scale_down.go
@@ -109,9 +109,11 @@ func (sd *ScaleDown) CleanUpUnneededNodes() {
 
 // UpdateUnneededNodes calculates which nodes are not needed, i.e. all pods can be scheduled somewhere else,
 // and updates unneededNodes map accordingly. It also computes information where pods can be rescheduled and
-// node utilization level. Timestamp is the current timestamp.
+// node utilization level. Timestamp is the current timestamp. The computations are made only for the nodes
+// managed by CA.
 func (sd *ScaleDown) UpdateUnneededNodes(
 	nodes []*apiv1.Node,
+	managedNodes []*apiv1.Node,
 	pods []*apiv1.Pod,
 	timestamp time.Time,
 	pdbs []*policyv1.PodDisruptionBudget) errors.AutoscalerError {
@@ -120,8 +122,9 @@ func (sd *ScaleDown) UpdateUnneededNodes(
 	nodeNameToNodeInfo := schedulercache.CreateNodeNameToInfoMap(pods, nodes)
 	utilizationMap := make(map[string]float64)
 
-	// Phase1 - look at the nodes utilization.
-	for _, node := range nodes {
+	// Phase1 - look at the nodes utilization. Calculate the utilization
+	// only for the managed nodes.
+	for _, node := range managedNodes {
 		// Skip nodes marked to be deleted, if they were marked recently.
 		// Old-time marked nodes are again eligible for deletion - something went wrong with them
diff --git a/cluster-autoscaler/core/scale_down_test.go b/cluster-autoscaler/core/scale_down_test.go
index 80858c95bb..69cf819738 100644
--- a/cluster-autoscaler/core/scale_down_test.go
+++ b/cluster-autoscaler/core/scale_down_test.go
@@ -86,7 +86,7 @@ func TestFindUnneededNodes(t *testing.T) {
 		LogRecorder: fakeLogRecorder,
 	}
 	sd := NewScaleDown(&context)
-	sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Pod{p1, p2, p3, p4}, time.Now(), nil)
+	sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Pod{p1, p2, p3, p4}, time.Now(), nil)
 
 	assert.Equal(t, 1, len(sd.unneededNodes))
 	addTime, found := sd.unneededNodes["n2"]
@@ -95,13 +95,16 @@ func TestFindUnneededNodes(t *testing.T) {
 	assert.Equal(t, 4, len(sd.nodeUtilizationMap))
 
 	sd.unneededNodes["n1"] = time.Now()
-	sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Pod{p1, p2, p3, p4}, time.Now(), nil)
+	sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Pod{p1, p2, p3, p4}, time.Now(), nil)
 
 	assert.Equal(t, 1, len(sd.unneededNodes))
 	addTime2, found := sd.unneededNodes["n2"]
 	assert.True(t, found)
 	assert.Equal(t, addTime, addTime2)
 	assert.Equal(t, 4, len(sd.nodeUtilizationMap))
+
+	sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Node{n1, n3, n4}, []*apiv1.Pod{p1, p2, p3, p4}, time.Now(), nil)
+	assert.Equal(t, 0, len(sd.unneededNodes))
 }
 
 func TestDrainNode(t *testing.T) {
@@ -288,7 +291,8 @@ func TestScaleDown(t *testing.T) {
 		LogRecorder: fakeLogRecorder,
 	}
 	scaleDown := NewScaleDown(context)
-	scaleDown.UpdateUnneededNodes([]*apiv1.Node{n1, n2}, []*apiv1.Pod{p1, p2}, time.Now().Add(-5*time.Minute), nil)
+	scaleDown.UpdateUnneededNodes([]*apiv1.Node{n1, n2},
+		[]*apiv1.Node{n1, n2}, []*apiv1.Pod{p1, p2}, time.Now().Add(-5*time.Minute), nil)
 	result, err := scaleDown.TryToScaleDown([]*apiv1.Node{n1, n2}, []*apiv1.Pod{p1, p2}, nil)
 	assert.NoError(t, err)
 	assert.Equal(t, ScaleDownNodeDeleted, result)
@@ -349,7 +353,8 @@ func TestNoScaleDownUnready(t *testing.T) {
 
 	// N1 is unready so it requires a bigger unneeded time.
 	scaleDown := NewScaleDown(context)
-	scaleDown.UpdateUnneededNodes([]*apiv1.Node{n1, n2}, []*apiv1.Pod{p2}, time.Now().Add(-5*time.Minute), nil)
+	scaleDown.UpdateUnneededNodes([]*apiv1.Node{n1, n2},
+		[]*apiv1.Node{n1, n2}, []*apiv1.Pod{p2}, time.Now().Add(-5*time.Minute), nil)
 	result, err := scaleDown.TryToScaleDown([]*apiv1.Node{n1, n2}, []*apiv1.Pod{p2}, nil)
 	assert.NoError(t, err)
 	assert.Equal(t, ScaleDownNoUnneeded, result)
@@ -368,7 +373,8 @@
 	// N1 has been unready for 2 hours, ok to delete.
 	context.CloudProvider = provider
 	scaleDown = NewScaleDown(context)
-	scaleDown.UpdateUnneededNodes([]*apiv1.Node{n1, n2}, []*apiv1.Pod{p2}, time.Now().Add(-2*time.Hour), nil)
+	scaleDown.UpdateUnneededNodes([]*apiv1.Node{n1, n2}, []*apiv1.Node{n1, n2},
+		[]*apiv1.Pod{p2}, time.Now().Add(-2*time.Hour), nil)
 	result, err = scaleDown.TryToScaleDown([]*apiv1.Node{n1, n2}, []*apiv1.Pod{p2}, nil)
 	assert.NoError(t, err)
 	assert.Equal(t, ScaleDownNodeDeleted, result)
@@ -451,7 +457,8 @@ func TestScaleDownNoMove(t *testing.T) {
 		LogRecorder: fakeLogRecorder,
 	}
 	scaleDown := NewScaleDown(context)
-	scaleDown.UpdateUnneededNodes([]*apiv1.Node{n1, n2}, []*apiv1.Pod{p1, p2}, time.Now().Add(5*time.Minute), nil)
+	scaleDown.UpdateUnneededNodes([]*apiv1.Node{n1, n2}, []*apiv1.Node{n1, n2},
+		[]*apiv1.Pod{p1, p2}, time.Now().Add(5*time.Minute), nil)
 	result, err := scaleDown.TryToScaleDown([]*apiv1.Node{n1, n2}, []*apiv1.Pod{p1, p2}, nil)
 	assert.NoError(t, err)
 	assert.Equal(t, ScaleDownNoUnneeded, result)
diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go
index 48c9cc7de3..2b5ecc85f0 100644
--- a/cluster-autoscaler/core/static_autoscaler.go
+++ b/cluster-autoscaler/core/static_autoscaler.go
@@ -267,7 +267,9 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
 	glog.V(4).Infof("Calculating unneeded nodes")
 
 	scaleDown.CleanUp(time.Now())
-	typedErr := scaleDown.UpdateUnneededNodes(allNodes, allScheduled, time.Now(), pdbs)
+	managedNodes := getManagedNodes(autoscalingContext, allNodes)
+
+	typedErr := scaleDown.UpdateUnneededNodes(allNodes, managedNodes, allScheduled, time.Now(), pdbs)
 	if typedErr != nil {
 		glog.Errorf("Failed to scale down: %v", typedErr)
 		return typedErr
diff --git a/cluster-autoscaler/core/utils.go b/cluster-autoscaler/core/utils.go
index c805de997a..23aacb5b29 100644
--- a/cluster-autoscaler/core/utils.go
+++ b/cluster-autoscaler/core/utils.go
@@ -329,3 +329,21 @@ func fixNodeGroupSize(context *AutoscalingContext, currentTime time.Time) (bool,
 	}
 	return fixed, nil
 }
+
+// getManagedNodes returns the nodes managed by the cluster autoscaler.
+func getManagedNodes(context *AutoscalingContext, nodes []*apiv1.Node) []*apiv1.Node {
+	result := make([]*apiv1.Node, 0, len(nodes))
+	for _, node := range nodes {
+		nodeGroup, err := context.CloudProvider.NodeGroupForNode(node)
+		if err != nil {
+			glog.Warningf("Error while checking node group for %s: %v", node.Name, err)
+			continue
+		}
+		if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() {
+			glog.V(4).Infof("Skipping %s - no node group config", node.Name)
+			continue
+		}
+		result = append(result, node)
+	}
+	return result
+}
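
Note (not part of the commit): a minimal, self-contained sketch of the filtering idea behind getManagedNodes, using simplified stand-in types rather than the real cluster-autoscaler/cloudprovider interfaces. The names nodeGroup, cloudProvider, fakeProvider, and filterManagedNodes, and the use of plain node names instead of *apiv1.Node, are illustrative assumptions, not code from this patch.

    package main

    import "fmt"

    // Simplified stand-ins for the autoscaler's cloudprovider types; the real
    // interfaces operate on *apiv1.Node rather than plain node names.
    type nodeGroup struct{ name string }

    type cloudProvider interface {
    	// NodeGroupForNode returns nil when the node does not belong to any
    	// node group, i.e. it is not managed by the autoscaler.
    	NodeGroupForNode(nodeName string) (*nodeGroup, error)
    }

    type fakeProvider struct{ groups map[string]*nodeGroup }

    func (p *fakeProvider) NodeGroupForNode(nodeName string) (*nodeGroup, error) {
    	return p.groups[nodeName], nil
    }

    // filterManagedNodes mirrors the idea behind getManagedNodes in utils.go:
    // keep only the nodes for which the cloud provider reports a node group,
    // so utilization checks and scale-down simulations skip unmanaged nodes.
    func filterManagedNodes(provider cloudProvider, nodes []string) []string {
    	managed := make([]string, 0, len(nodes))
    	for _, node := range nodes {
    		group, err := provider.NodeGroupForNode(node)
    		if err != nil || group == nil {
    			// Errored or unmanaged nodes are left out of the result.
    			continue
    		}
    		managed = append(managed, node)
    	}
    	return managed
    }

    func main() {
    	provider := &fakeProvider{groups: map[string]*nodeGroup{
    		"n1": {name: "ng1"},
    		"n2": {name: "ng1"},
    	}}
    	// "n3" has no node group, so it is excluded from the managed set.
    	fmt.Println(filterManagedNodes(provider, []string{"n1", "n2", "n3"}))
    }

The real getManagedNodes additionally checks reflect.ValueOf(nodeGroup).IsNil(), because NodeGroupForNode returns an interface value that can wrap a typed nil; the sketch above omits that detail for brevity.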