Skip node in scale-down if it was recently found unremovable

Maciej Pytel 2017-08-25 15:53:09 +02:00
parent 21c575f0ac
commit fa53e52ed9
4 changed files with 62 additions and 11 deletions
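The mechanism in brief: when the scale-down simulation decides a node cannot be removed, the node's name is recorded together with a re-check deadline (timestamp.Add(UnremovableNodeRecheckTimeout)), and subsequent loop iterations skip it until that deadline passes, saving the cost of re-running the drain simulation on every loop. A minimal, self-contained sketch of the pattern; the names are illustrative, not the autoscaler's actual code:

package main

import (
	"fmt"
	"time"
)

// unremovableRecheckTimeout mirrors UnremovableNodeRecheckTimeout below.
const unremovableRecheckTimeout = 5 * time.Minute

// backoff maps a node name to the earliest time it may be re-checked.
type backoff map[string]time.Time

// markUnremovable records a failed removal simulation performed at now.
func (b backoff) markUnremovable(node string, now time.Time) {
	b[node] = now.Add(unremovableRecheckTimeout)
}

// shouldSkip reports whether node is still inside its back-off window.
// Expired entries are dropped so the node becomes a candidate again.
func (b backoff) shouldSkip(node string, now time.Time) bool {
	deadline, found := b[node]
	if !found {
		return false
	}
	if deadline.After(now) {
		return true
	}
	delete(b, node)
	return false
}

func main() {
	b := backoff{}
	now := time.Now()
	b.markUnremovable("n1", now)
	fmt.Println(b.shouldSkip("n1", now.Add(time.Minute)))                           // true: still backing off
	fmt.Println(b.shouldSkip("n1", now.Add(unremovableRecheckTimeout+time.Second))) // false: eligible again
}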

cluster-autoscaler/core/scale_down.go

@@ -69,6 +69,8 @@ const (
// PodEvictionHeadroom is the extra time we wait to catch situations when the pod is ignoring SIGTERM and
// is killed with SIGKILL after MaxGracefulTerminationTime
PodEvictionHeadroom = 20 * time.Second
+// UnremovableNodeRecheckTimeout is the timeout before we re-check a node that previously couldn't be removed
+UnremovableNodeRecheckTimeout = 5 * time.Minute
)
// ScaleDown is responsible for maintaining the state needed to perform unneeded node removals.
@@ -76,6 +78,7 @@ type ScaleDown struct {
context *AutoscalingContext
unneededNodes map[string]time.Time
unneededNodesList []*apiv1.Node
+unremovableNodes map[string]time.Time
podLocationHints map[string]string
nodeUtilizationMap map[string]float64
usageTracker *simulator.UsageTracker
@@ -86,6 +89,7 @@ func NewScaleDown(context *AutoscalingContext) *ScaleDown {
return &ScaleDown{
context: context,
unneededNodes: make(map[string]time.Time),
+unremovableNodes: make(map[string]time.Time),
podLocationHints: make(map[string]string),
nodeUtilizationMap: make(map[string]float64),
usageTracker: simulator.NewUsageTracker(),
@@ -124,9 +128,25 @@ func (sd *ScaleDown) UpdateUnneededNodes(
nodeNameToNodeInfo := schedulercache.CreateNodeNameToInfoMap(pods, nodes)
utilizationMap := make(map[string]float64)
+// Filter out nodes that were recently checked
+nodesToCheck := make([]*apiv1.Node, 0)
+for _, node := range managedNodes {
+	if unremovableTimestamp, found := sd.unremovableNodes[node.Name]; found {
+		if unremovableTimestamp.After(timestamp) {
+			continue
+		}
+		delete(sd.unremovableNodes, node.Name)
+	}
+	nodesToCheck = append(nodesToCheck, node)
+}
+skipped := len(managedNodes) - len(nodesToCheck)
+if skipped > 0 {
+	glog.V(1).Infof("Scale-down calculation: ignoring %v nodes that were unremovable in the last %v", skipped, UnremovableNodeRecheckTimeout)
+}
// Phase1 - look at the nodes utilization. Calculate the utilization
// only for the managed nodes.
-for _, node := range managedNodes {
+for _, node := range nodesToCheck {
// Skip nodes marked to be deleted, if they were marked recently.
// Nodes marked long ago are again eligible for deletion - something went wrong with them
@@ -164,7 +184,7 @@ func (sd *ScaleDown) UpdateUnneededNodes(
}
// Phase2 - check which nodes can be probably removed using fast drain.
-nodesToRemove, newHints, simulatorErr := simulator.FindNodesToRemove(currentlyUnneededNodes, nodes, pods,
+nodesToRemove, unremovable, newHints, simulatorErr := simulator.FindNodesToRemove(currentlyUnneededNodes, nodes, pods,
nil, sd.context.PredicateChecker,
len(currentlyUnneededNodes), true, sd.podLocationHints, sd.usageTracker, timestamp, pdbs)
if simulatorErr != nil {
@@ -191,6 +211,15 @@ func (sd *ScaleDown) UpdateUnneededNodes(
}
}
+// Add nodes to the unremovable map
+if len(unremovable) > 0 {
+	unremovableTimeout := timestamp.Add(UnremovableNodeRecheckTimeout)
+	for _, node := range unremovable {
+		sd.unremovableNodes[node.Name] = unremovableTimeout
+	}
+	glog.V(1).Infof("%v nodes found unremovable in simulation, will re-check them at %v", len(unremovable), unremovableTimeout)
+}
sd.unneededNodesList = unneadedNodeList
sd.unneededNodes = result
sd.podLocationHints = newHints
@@ -281,7 +310,7 @@ func (sd *ScaleDown) TryToScaleDown(nodes []*apiv1.Node, pods []*apiv1.Pod, pdbs
findNodesToRemoveStart := time.Now()
// We look for only 1 node so new hints may be incomplete.
-nodesToRemove, _, err := simulator.FindNodesToRemove(candidates, nodes, pods, sd.context.ClientSet,
+nodesToRemove, _, _, err := simulator.FindNodesToRemove(candidates, nodes, pods, sd.context.ClientSet,
sd.context.PredicateChecker, 1, false,
sd.podLocationHints, sd.usageTracker, time.Now(), pdbs)
findNodesToRemoveDuration = time.Now().Sub(findNodesToRemoveStart)
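Note that TryToScaleDown discards the new unremovable result with a blank identifier, presumably because UpdateUnneededNodes, which runs on every autoscaler loop, already records the re-check deadlines. Note also that sd.unremovableNodes stores timestamp.Add(UnremovableNodeRecheckTimeout), i.e. the deadline itself; that is what the unremovableTimestamp.After(timestamp) comparison earlier in this diff checks against.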

cluster-autoscaler/core/scale_down_test.go

@@ -104,8 +104,10 @@ func TestFindUnneededNodes(t *testing.T) {
assert.Contains(t, sd.podLocationHints, p2.Namespace+"/"+p2.Name)
assert.Equal(t, 4, len(sd.nodeUtilizationMap))
+sd.unremovableNodes = make(map[string]time.Time)
sd.unneededNodes["n1"] = time.Now()
sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Pod{p1, p2, p3, p4}, time.Now(), nil)
+sd.unremovableNodes = make(map[string]time.Time)
assert.Equal(t, 1, len(sd.unneededNodes))
addTime2, found := sd.unneededNodes["n2"]
@@ -113,8 +115,17 @@ func TestFindUnneededNodes(t *testing.T) {
assert.Equal(t, addTime, addTime2)
assert.Equal(t, 4, len(sd.nodeUtilizationMap))
+sd.unremovableNodes = make(map[string]time.Time)
sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Node{n1, n3, n4}, []*apiv1.Pod{p1, p2, p3, p4}, time.Now(), nil)
assert.Equal(t, 0, len(sd.unneededNodes))
+// Node n1 is unneeded, but should be skipped because it has just recently been found to be unremovable.
+sd.UpdateUnneededNodes([]*apiv1.Node{n1}, []*apiv1.Node{n1}, []*apiv1.Pod{}, time.Now(), nil)
+assert.Equal(t, 0, len(sd.unneededNodes))
+// But it should be checked again once the timeout has passed.
+sd.UpdateUnneededNodes([]*apiv1.Node{n1}, []*apiv1.Node{n1}, []*apiv1.Pod{}, time.Now().Add(UnremovableNodeRecheckTimeout+time.Second), nil)
+assert.Equal(t, 1, len(sd.unneededNodes))
}
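A detail worth copying: the test advances the clock by passing an explicit timestamp into UpdateUnneededNodes instead of sleeping, which keeps the back-off fully deterministic. The same technique in isolation, as a hypothetical standalone test (the helper and package name are illustrative, not part of this commit):

package backoff

import (
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
)

const unremovableRecheckTimeout = 5 * time.Minute

// isBackedOff mirrors the unremovableTimestamp.After(timestamp) check that
// UpdateUnneededNodes performs against the stored re-check deadline.
func isBackedOff(deadline, now time.Time) bool {
	return deadline.After(now)
}

func TestBackoffWindow(t *testing.T) {
	base := time.Now()
	deadline := base.Add(unremovableRecheckTimeout)
	// Inside the window the node is skipped...
	assert.True(t, isBackedOff(deadline, base.Add(time.Minute)))
	// ...one second past the timeout it is evaluated again.
	assert.False(t, isBackedOff(deadline, base.Add(unremovableRecheckTimeout+time.Second)))
}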
func TestDrainNode(t *testing.T) {

cluster-autoscaler/simulator/cluster.go

@@ -59,10 +59,11 @@ func FindNodesToRemove(candidates []*apiv1.Node, allNodes []*apiv1.Node, pods []
fastCheck bool, oldHints map[string]string, usageTracker *UsageTracker,
timestamp time.Time,
podDisruptionBudgets []*policyv1.PodDisruptionBudget,
-) (nodesToRemove []NodeToBeRemoved, podReschedulingHints map[string]string, finalError errors.AutoscalerError) {
+) (nodesToRemove []NodeToBeRemoved, unremovableNodes []*apiv1.Node, podReschedulingHints map[string]string, finalError errors.AutoscalerError) {
nodeNameToNodeInfo := schedulercache.CreateNodeNameToInfoMap(pods, allNodes)
result := make([]NodeToBeRemoved, 0)
+unremovable := make([]*apiv1.Node, 0)
evaluationType := "Detailed evaluation"
if fastCheck {
@@ -87,10 +88,12 @@ candidateloop:
}
if err != nil {
glog.V(2).Infof("%s: node %s cannot be removed: %v", evaluationType, node.Name, err)
+unremovable = append(unremovable, node)
continue candidateloop
}
} else {
glog.V(2).Infof("%s: nodeInfo for %s not found", evaluationType, node.Name)
+unremovable = append(unremovable, node)
continue candidateloop
}
findProblems := findPlaceFor(node.Name, podsToRemove, allNodes, nodeNameToNodeInfo, predicateChecker, oldHints, newHints,
@@ -107,9 +110,10 @@
}
} else {
glog.V(2).Infof("%s: node %s is not suitable for removal: %v", evaluationType, node.Name, findProblems)
+unremovable = append(unremovable, node)
}
}
-return result, newHints, nil
+return result, unremovable, newHints, nil
}
// FindEmptyNodesToRemove finds empty nodes that can be removed.
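With the extra return value, FindNodesToRemove now reports which candidates it refused as well as which it accepted, and the caller decides what to do with the refusals. A self-contained sketch of the calling pattern, using stand-in types rather than the real simulator API (which takes many more arguments):

package main

import (
	"fmt"
	"time"
)

// Stand-ins for the simulator types; illustrative only.
type node struct{ name string }
type nodeToBeRemoved struct{ node *node }

// findNodesToRemove mimics the new four-value shape: removable nodes,
// unremovable nodes, rescheduling hints, error.
func findNodesToRemove(candidates []*node) ([]nodeToBeRemoved, []*node, map[string]string, error) {
	toRemove := []nodeToBeRemoved{}
	unremovable := []*node{}
	for _, n := range candidates {
		if n.name == "empty" { // pretend only the empty node is drainable
			toRemove = append(toRemove, nodeToBeRemoved{node: n})
		} else {
			unremovable = append(unremovable, n)
		}
	}
	return toRemove, unremovable, map[string]string{}, nil
}

func main() {
	toRemove, unremovable, _, err := findNodesToRemove([]*node{{name: "empty"}, {name: "busy"}})
	if err != nil {
		panic(err)
	}
	// Feed the refused candidates into a back-off map, as
	// UpdateUnneededNodes does with sd.unremovableNodes above.
	recheckAfter := make(map[string]time.Time)
	deadline := time.Now().Add(5 * time.Minute)
	for _, n := range unremovable {
		recheckAfter[n.name] = deadline
	}
	fmt.Println(len(toRemove), len(recheckAfter)) // 1 1
}

Returning the refusals explicitly, rather than having the simulator write into scale-down state, keeps the simulator package free of scale-down bookkeeping.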

cluster-autoscaler/simulator/cluster_test.go

@@ -231,46 +231,53 @@ func TestFindNodesToRemove(t *testing.T) {
// just an empty node, should be removed
candidates = []*apiv1.Node{emptyNode}
allNodes = []*apiv1.Node{emptyNode}
-toRemove, _, err := FindNodesToRemove(
+toRemove, unremovable, _, err := FindNodesToRemove(
candidates, allNodes, pods, nil, predicateChecker, len(allNodes),
true, map[string]string{}, tracker, time.Now(), []*policyv1.PodDisruptionBudget{})
assert.NoError(t, err)
assert.Equal(t, toRemove, []NodeToBeRemoved{emptyNodeToRemove})
+assert.Equal(t, len(unremovable), 0)
// just a drainable node, but nowhere for pods to go to
candidates = []*apiv1.Node{drainableNode}
allNodes = []*apiv1.Node{drainableNode}
-toRemove, _, err = FindNodesToRemove(
+toRemove, unremovable, _, err = FindNodesToRemove(
candidates, allNodes, pods, nil, predicateChecker, len(allNodes),
true, map[string]string{}, tracker, time.Now(), []*policyv1.PodDisruptionBudget{})
assert.NoError(t, err)
assert.Equal(t, toRemove, []NodeToBeRemoved{})
+assert.Equal(t, len(unremovable), 1)
+assert.Equal(t, unremovable[0].Name, "n2")
// drainable node, and a mostly empty node that can take its pods
candidates = []*apiv1.Node{drainableNode, nonDrainableNode}
allNodes = []*apiv1.Node{drainableNode, nonDrainableNode}
-toRemove, _, err = FindNodesToRemove(
+toRemove, unremovable, _, err = FindNodesToRemove(
candidates, allNodes, pods, nil, predicateChecker, len(allNodes),
true, map[string]string{}, tracker, time.Now(), []*policyv1.PodDisruptionBudget{})
assert.NoError(t, err)
assert.Equal(t, toRemove, []NodeToBeRemoved{drainableNodeToRemove})
+assert.Equal(t, len(unremovable), 1)
+assert.Equal(t, unremovable[0].Name, "n3")
// drainable node, and a full node that cannot fit anymore pods
candidates = []*apiv1.Node{drainableNode}
allNodes = []*apiv1.Node{drainableNode, fullNode}
-toRemove, _, err = FindNodesToRemove(
+toRemove, unremovable, _, err = FindNodesToRemove(
candidates, allNodes, pods, nil, predicateChecker, len(allNodes),
true, map[string]string{}, tracker, time.Now(), []*policyv1.PodDisruptionBudget{})
assert.NoError(t, err)
assert.Equal(t, toRemove, []NodeToBeRemoved{})
+assert.Equal(t, len(unremovable), 1)
+assert.Equal(t, unremovable[0].Name, "n2")
// 4 nodes, 1 empty, 1 drainable
candidates = []*apiv1.Node{emptyNode, drainableNode}
allNodes = []*apiv1.Node{emptyNode, drainableNode, fullNode, nonDrainableNode}
-toRemove, _, err = FindNodesToRemove(
+toRemove, unremovable, _, err = FindNodesToRemove(
candidates, allNodes, pods, nil, predicateChecker, len(allNodes),
true, map[string]string{}, tracker, time.Now(), []*policyv1.PodDisruptionBudget{})
assert.NoError(t, err)
assert.Equal(t, toRemove, []NodeToBeRemoved{emptyNodeToRemove, drainableNodeToRemove})
+assert.Equal(t, len(unremovable), 0)
}