Don't calculate utilization and run scale down simulations for unmanaged nodes

Marcin Wielgus 2017-06-20 16:37:35 +02:00
parent 7b3a75c294
commit 2cd532ebfe
4 changed files with 40 additions and 10 deletions

View File

@@ -109,9 +109,11 @@ func (sd *ScaleDown) CleanUpUnneededNodes() {
 // UpdateUnneededNodes calculates which nodes are not needed, i.e. all pods can be scheduled somewhere else,
 // and updates unneededNodes map accordingly. It also computes information where pods can be rescheduled and
-// node utilization level. Timestamp is the current timestamp.
+// node utilization level. Timestamp is the current timestamp. The computations are made only for the nodes
+// managed by CA.
 func (sd *ScaleDown) UpdateUnneededNodes(
     nodes []*apiv1.Node,
+    managedNodes []*apiv1.Node,
     pods []*apiv1.Pod,
     timestamp time.Time,
     pdbs []*policyv1.PodDisruptionBudget) errors.AutoscalerError {
@@ -120,8 +122,9 @@ func (sd *ScaleDown) UpdateUnneededNodes(
     nodeNameToNodeInfo := schedulercache.CreateNodeNameToInfoMap(pods, nodes)
     utilizationMap := make(map[string]float64)
-    // Phase1 - look at the nodes utilization.
-    for _, node := range nodes {
+    // Phase1 - look at the nodes utilization. Calculate the utilization
+    // only for the managed nodes.
+    for _, node := range managedNodes {
         // Skip nodes marked to be deleted, if they were marked recently.
         // Old-time marked nodes are again eligible for deletion - something went wrong with them
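
The new doc comment and the Phase 1 loop above restrict the utilization computation to the nodes managed by CA, while the full node list still feeds the scheduler node-info map. Below is a minimal, self-contained sketch of that managed-only filtering; the toy node struct and the numbers are illustrative stand-ins for *apiv1.Node and the simulator's utilization helpers, not code from this commit.

package main

import "fmt"

// Toy stand-in for a node; the real code works on *apiv1.Node and derives
// utilization from pod resource requests via the simulator package.
type node struct {
	name           string
	requestedCPU   int64 // millicores requested by pods on the node
	allocatableCPU int64 // millicores allocatable on the node
}

func main() {
	all := []node{
		{"master", 200, 1000},   // belongs to no node group -> unmanaged
		{"worker-1", 300, 1000}, // managed, lightly used
		{"worker-2", 900, 1000}, // managed, heavily used
	}
	managed := all[1:] // pretend only the workers belong to a node group

	// Phase 1 after this commit: utilization is computed and recorded only
	// for the managed nodes, so "master" never enters the map and is never
	// considered in the scale-down simulation.
	utilization := make(map[string]float64)
	for _, n := range managed {
		utilization[n.name] = float64(n.requestedCPU) / float64(n.allocatableCPU)
	}
	fmt.Println(utilization) // map[worker-1:0.3 worker-2:0.9]
}

The real loop additionally skips nodes that were recently marked for deletion, as the context lines above show, before recording utilization.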

View File

@@ -86,7 +86,7 @@ func TestFindUnneededNodes(t *testing.T) {
         LogRecorder: fakeLogRecorder,
     }
     sd := NewScaleDown(&context)
-    sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Pod{p1, p2, p3, p4}, time.Now(), nil)
+    sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Pod{p1, p2, p3, p4}, time.Now(), nil)
     assert.Equal(t, 1, len(sd.unneededNodes))
     addTime, found := sd.unneededNodes["n2"]
@@ -95,13 +95,16 @@ func TestFindUnneededNodes(t *testing.T) {
     assert.Equal(t, 4, len(sd.nodeUtilizationMap))
     sd.unneededNodes["n1"] = time.Now()
-    sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Pod{p1, p2, p3, p4}, time.Now(), nil)
+    sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Pod{p1, p2, p3, p4}, time.Now(), nil)
     assert.Equal(t, 1, len(sd.unneededNodes))
     addTime2, found := sd.unneededNodes["n2"]
     assert.True(t, found)
     assert.Equal(t, addTime, addTime2)
     assert.Equal(t, 4, len(sd.nodeUtilizationMap))
+    sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Node{n1, n3, n4}, []*apiv1.Pod{p1, p2, p3, p4}, time.Now(), nil)
+    assert.Equal(t, 0, len(sd.unneededNodes))
 }
 func TestDrainNode(t *testing.T) {
@@ -288,7 +291,8 @@ func TestScaleDown(t *testing.T) {
         LogRecorder: fakeLogRecorder,
     }
     scaleDown := NewScaleDown(context)
-    scaleDown.UpdateUnneededNodes([]*apiv1.Node{n1, n2}, []*apiv1.Pod{p1, p2}, time.Now().Add(-5*time.Minute), nil)
+    scaleDown.UpdateUnneededNodes([]*apiv1.Node{n1, n2},
+        []*apiv1.Node{n1, n2}, []*apiv1.Pod{p1, p2}, time.Now().Add(-5*time.Minute), nil)
     result, err := scaleDown.TryToScaleDown([]*apiv1.Node{n1, n2}, []*apiv1.Pod{p1, p2}, nil)
     assert.NoError(t, err)
     assert.Equal(t, ScaleDownNodeDeleted, result)
@@ -349,7 +353,8 @@ func TestNoScaleDownUnready(t *testing.T) {
     // N1 is unready so it requires a bigger unneeded time.
     scaleDown := NewScaleDown(context)
-    scaleDown.UpdateUnneededNodes([]*apiv1.Node{n1, n2}, []*apiv1.Pod{p2}, time.Now().Add(-5*time.Minute), nil)
+    scaleDown.UpdateUnneededNodes([]*apiv1.Node{n1, n2},
+        []*apiv1.Node{n1, n2}, []*apiv1.Pod{p2}, time.Now().Add(-5*time.Minute), nil)
     result, err := scaleDown.TryToScaleDown([]*apiv1.Node{n1, n2}, []*apiv1.Pod{p2}, nil)
     assert.NoError(t, err)
     assert.Equal(t, ScaleDownNoUnneeded, result)
@@ -368,7 +373,8 @@ func TestNoScaleDownUnready(t *testing.T) {
     // N1 has been unready for 2 hours, ok to delete.
     context.CloudProvider = provider
     scaleDown = NewScaleDown(context)
-    scaleDown.UpdateUnneededNodes([]*apiv1.Node{n1, n2}, []*apiv1.Pod{p2}, time.Now().Add(-2*time.Hour), nil)
+    scaleDown.UpdateUnneededNodes([]*apiv1.Node{n1, n2}, []*apiv1.Node{n1, n2},
+        []*apiv1.Pod{p2}, time.Now().Add(-2*time.Hour), nil)
     result, err = scaleDown.TryToScaleDown([]*apiv1.Node{n1, n2}, []*apiv1.Pod{p2}, nil)
     assert.NoError(t, err)
     assert.Equal(t, ScaleDownNodeDeleted, result)
@@ -451,7 +457,8 @@ func TestScaleDownNoMove(t *testing.T) {
         LogRecorder: fakeLogRecorder,
     }
     scaleDown := NewScaleDown(context)
-    scaleDown.UpdateUnneededNodes([]*apiv1.Node{n1, n2}, []*apiv1.Pod{p1, p2}, time.Now().Add(5*time.Minute), nil)
+    scaleDown.UpdateUnneededNodes([]*apiv1.Node{n1, n2}, []*apiv1.Node{n1, n2},
+        []*apiv1.Pod{p1, p2}, time.Now().Add(5*time.Minute), nil)
     result, err := scaleDown.TryToScaleDown([]*apiv1.Node{n1, n2}, []*apiv1.Pod{p1, p2}, nil)
     assert.NoError(t, err)
     assert.Equal(t, ScaleDownNoUnneeded, result)

View File

@@ -267,7 +267,9 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
         glog.V(4).Infof("Calculating unneeded nodes")
         scaleDown.CleanUp(time.Now())
-        typedErr := scaleDown.UpdateUnneededNodes(allNodes, allScheduled, time.Now(), pdbs)
+        managedNodes := getManagedNodes(autoscalingContext, allNodes)
+        typedErr := scaleDown.UpdateUnneededNodes(allNodes, managedNodes, allScheduled, time.Now(), pdbs)
         if typedErr != nil {
             glog.Errorf("Failed to scale down: %v", typedErr)
             return typedErr

View File

@@ -329,3 +329,21 @@ func fixNodeGroupSize(context *AutoscalingContext, currentTime time.Time) (bool,
     }
     return fixed, nil
 }
+
+// getManagedNodes returns the nodes managed by the cluster autoscaler.
+func getManagedNodes(context *AutoscalingContext, nodes []*apiv1.Node) []*apiv1.Node {
+    result := make([]*apiv1.Node, 0, len(nodes))
+    for _, node := range nodes {
+        nodeGroup, err := context.CloudProvider.NodeGroupForNode(node)
+        if err != nil {
+            glog.Warningf("Error while checking node group for %s: %v", node.Name, err)
+            continue
+        }
+        if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() {
+            glog.V(4).Infof("Skipping %s - no node group config", node.Name)
+            continue
+        }
+        result = append(result, node)
+    }
+    return result
+}
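
One detail in getManagedNodes deserves a note: the guard checks both nodeGroup == nil and reflect.ValueOf(nodeGroup).IsNil(), because a cloud provider implementation can return a typed nil pointer wrapped in the NodeGroup interface, and such a value does not compare equal to nil. The short standalone program below demonstrates that Go behavior; the NodeGroup interface and fakeNodeGroup type are illustrative stand-ins, not the autoscaler's real cloudprovider types.

package main

import (
	"fmt"
	"reflect"
)

// Illustrative stand-ins; only the nil-handling behavior matters here.
type NodeGroup interface{ Id() string }

type fakeNodeGroup struct{ id string }

func (g *fakeNodeGroup) Id() string { return g.id }

func main() {
	var typedNil *fakeNodeGroup // a nil pointer of a concrete type
	var ng NodeGroup = typedNil // the interface now holds (type, nil) and is itself non-nil

	fmt.Println(ng == nil)                   // false: the interface carries a concrete type
	fmt.Println(reflect.ValueOf(ng).IsNil()) // true: the pointer stored inside is nil
}

Checking nodeGroup == nil first also matters: if the interface itself were nil, reflect.ValueOf(nodeGroup) would yield an invalid Value and IsNil would panic.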