Skip node in scale-down if it was recently found unremovable
commit fa53e52ed9
parent 21c575f0ac
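In short: the scale-down loop now remembers nodes that the removal simulation rejected. FindNodesToRemove returns the unremovable nodes alongside the removal candidates, UpdateUnneededNodes stores them in the new unremovableNodes map stamped with timestamp.Add(UnremovableNodeRecheckTimeout), and subsequent iterations skip those nodes until the 5-minute timeout elapses. Below is a minimal, self-contained sketch of that skip-and-recheck pattern; the recheckCache type and its method names are illustrative inventions for this note, not part of the cluster-autoscaler API.

// Minimal sketch of the skip-and-recheck pattern this commit introduces.
// recheckCache and its methods are hypothetical names, not autoscaler API.
package main

import (
	"fmt"
	"time"
)

const recheckTimeout = 5 * time.Minute // mirrors UnremovableNodeRecheckTimeout

// recheckCache maps a node name to the time until which the node should be
// skipped because it was recently found unremovable.
type recheckCache map[string]time.Time

// filter returns the nodes that are due for a fresh removability check,
// dropping expired entries from the cache as it goes.
func (c recheckCache) filter(nodes []string, now time.Time) []string {
	out := make([]string, 0, len(nodes))
	for _, name := range nodes {
		if until, found := c[name]; found {
			if until.After(now) {
				continue // still within the recheck timeout: skip this node
			}
			delete(c, name) // timeout elapsed: node is eligible again
		}
		out = append(out, name)
	}
	return out
}

// markUnremovable records nodes that the removal simulation rejected.
func (c recheckCache) markUnremovable(nodes []string, now time.Time) {
	until := now.Add(recheckTimeout)
	for _, name := range nodes {
		c[name] = until
	}
}

func main() {
	cache := recheckCache{}
	now := time.Now()

	cache.markUnremovable([]string{"n1"}, now)
	fmt.Println(cache.filter([]string{"n1", "n2"}, now))                                 // [n2]: n1 skipped
	fmt.Println(cache.filter([]string{"n1", "n2"}, now.Add(recheckTimeout+time.Second))) // [n1 n2]: rechecked
}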
--- a/cluster-autoscaler/core/scale_down.go
+++ b/cluster-autoscaler/core/scale_down.go
@@ -69,6 +69,8 @@ const (
 	// PodEvictionHeadroom is the extra time we wait to catch situations when the pod is ignoring SIGTERM and
 	// is killed with SIGKILL after MaxGracefulTerminationTime
 	PodEvictionHeadroom = 20 * time.Second
+	// UnremovableNodeRecheckTimeout is the timeout before we check again a node that couldn't be removed before
+	UnremovableNodeRecheckTimeout = 5 * time.Minute
 )
 
 // ScaleDown is responsible for maintaining the state needed to perform unneeded node removals.
@@ -76,6 +78,7 @@ type ScaleDown struct {
 	context            *AutoscalingContext
 	unneededNodes      map[string]time.Time
 	unneededNodesList  []*apiv1.Node
+	unremovableNodes   map[string]time.Time
 	podLocationHints   map[string]string
 	nodeUtilizationMap map[string]float64
 	usageTracker       *simulator.UsageTracker
@@ -86,6 +89,7 @@ func NewScaleDown(context *AutoscalingContext) *ScaleDown {
 	return &ScaleDown{
 		context:            context,
 		unneededNodes:      make(map[string]time.Time),
+		unremovableNodes:   make(map[string]time.Time),
 		podLocationHints:   make(map[string]string),
 		nodeUtilizationMap: make(map[string]float64),
 		usageTracker:       simulator.NewUsageTracker(),
@@ -124,9 +128,25 @@ func (sd *ScaleDown) UpdateUnneededNodes(
 	nodeNameToNodeInfo := schedulercache.CreateNodeNameToInfoMap(pods, nodes)
 	utilizationMap := make(map[string]float64)
 
+	// Filter out nodes that were recently checked
+	nodesToCheck := make([]*apiv1.Node, 0)
+	for _, node := range managedNodes {
+		if unremovableTimestamp, found := sd.unremovableNodes[node.Name]; found {
+			if unremovableTimestamp.After(timestamp) {
+				continue
+			}
+			delete(sd.unremovableNodes, node.Name)
+		}
+		nodesToCheck = append(nodesToCheck, node)
+	}
+	skipped := len(managedNodes) - len(nodesToCheck)
+	if skipped > 0 {
+		glog.V(1).Infof("Scale-down calculation: ignoring %v nodes that were unremovable in the last %v", skipped, UnremovableNodeRecheckTimeout)
+	}
+
 	// Phase1 - look at the nodes utilization. Calculate the utilization
 	// only for the managed nodes.
-	for _, node := range managedNodes {
+	for _, node := range nodesToCheck {
 
 		// Skip nodes marked to be deleted, if they were marked recently.
 		// Old-time marked nodes are again eligible for deletion - something went wrong with them
@@ -164,7 +184,7 @@ func (sd *ScaleDown) UpdateUnneededNodes(
 	}
 
 	// Phase2 - check which nodes can probably be removed using fast drain.
-	nodesToRemove, newHints, simulatorErr := simulator.FindNodesToRemove(currentlyUnneededNodes, nodes, pods,
+	nodesToRemove, unremovable, newHints, simulatorErr := simulator.FindNodesToRemove(currentlyUnneededNodes, nodes, pods,
 		nil, sd.context.PredicateChecker,
 		len(currentlyUnneededNodes), true, sd.podLocationHints, sd.usageTracker, timestamp, pdbs)
 	if simulatorErr != nil {
@@ -191,6 +211,15 @@ func (sd *ScaleDown) UpdateUnneededNodes(
 		}
 	}
 
+	// Add nodes to the unremovable map
+	if len(unremovable) > 0 {
+		unremovableTimeout := timestamp.Add(UnremovableNodeRecheckTimeout)
+		for _, node := range unremovable {
+			sd.unremovableNodes[node.Name] = unremovableTimeout
+		}
+		glog.V(1).Infof("%v nodes found unremovable in simulation, will re-check them at %v", len(unremovable), unremovableTimeout)
+	}
+
 	sd.unneededNodesList = unneadedNodeList
 	sd.unneededNodes = result
 	sd.podLocationHints = newHints
@@ -281,7 +310,7 @@ func (sd *ScaleDown) TryToScaleDown(nodes []*apiv1.Node, pods []*apiv1.Pod, pdbs
 
 	findNodesToRemoveStart := time.Now()
 	// We look for only 1 node so new hints may be incomplete.
-	nodesToRemove, _, err := simulator.FindNodesToRemove(candidates, nodes, pods, sd.context.ClientSet,
+	nodesToRemove, _, _, err := simulator.FindNodesToRemove(candidates, nodes, pods, sd.context.ClientSet,
		sd.context.PredicateChecker, 1, false,
 		sd.podLocationHints, sd.usageTracker, time.Now(), pdbs)
 	findNodesToRemoveDuration = time.Now().Sub(findNodesToRemoveStart)
--- a/cluster-autoscaler/core/scale_down_test.go
+++ b/cluster-autoscaler/core/scale_down_test.go
@@ -104,8 +104,10 @@ func TestFindUnneededNodes(t *testing.T) {
 	assert.Contains(t, sd.podLocationHints, p2.Namespace+"/"+p2.Name)
 	assert.Equal(t, 4, len(sd.nodeUtilizationMap))
 
+	sd.unremovableNodes = make(map[string]time.Time)
 	sd.unneededNodes["n1"] = time.Now()
 	sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Pod{p1, p2, p3, p4}, time.Now(), nil)
+	sd.unremovableNodes = make(map[string]time.Time)
 
 	assert.Equal(t, 1, len(sd.unneededNodes))
 	addTime2, found := sd.unneededNodes["n2"]
@@ -113,8 +115,17 @@ func TestFindUnneededNodes(t *testing.T) {
 	assert.Equal(t, addTime, addTime2)
 	assert.Equal(t, 4, len(sd.nodeUtilizationMap))
 
+	sd.unremovableNodes = make(map[string]time.Time)
 	sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Node{n1, n3, n4}, []*apiv1.Pod{p1, p2, p3, p4}, time.Now(), nil)
 	assert.Equal(t, 0, len(sd.unneededNodes))
+
+	// Node n1 is unneeded, but should be skipped because it has just recently been found to be unremovable
+	sd.UpdateUnneededNodes([]*apiv1.Node{n1}, []*apiv1.Node{n1}, []*apiv1.Pod{}, time.Now(), nil)
+	assert.Equal(t, 0, len(sd.unneededNodes))
+
+	// But it should be checked after timeout
+	sd.UpdateUnneededNodes([]*apiv1.Node{n1}, []*apiv1.Node{n1}, []*apiv1.Pod{}, time.Now().Add(UnremovableNodeRecheckTimeout+time.Second), nil)
+	assert.Equal(t, 1, len(sd.unneededNodes))
 }
 
 func TestDrainNode(t *testing.T) {
--- a/cluster-autoscaler/simulator/cluster.go
+++ b/cluster-autoscaler/simulator/cluster.go
@@ -59,10 +59,11 @@ func FindNodesToRemove(candidates []*apiv1.Node, allNodes []*apiv1.Node, pods []
 	fastCheck bool, oldHints map[string]string, usageTracker *UsageTracker,
 	timestamp time.Time,
 	podDisruptionBudgets []*policyv1.PodDisruptionBudget,
-) (nodesToRemove []NodeToBeRemoved, podReschedulingHints map[string]string, finalError errors.AutoscalerError) {
+) (nodesToRemove []NodeToBeRemoved, unremovableNodes []*apiv1.Node, podReschedulingHints map[string]string, finalError errors.AutoscalerError) {
 
 	nodeNameToNodeInfo := schedulercache.CreateNodeNameToInfoMap(pods, allNodes)
 	result := make([]NodeToBeRemoved, 0)
+	unremovable := make([]*apiv1.Node, 0)
 
 	evaluationType := "Detailed evaluation"
 	if fastCheck {
@@ -87,10 +88,12 @@ candidateloop:
 			}
 			if err != nil {
 				glog.V(2).Infof("%s: node %s cannot be removed: %v", evaluationType, node.Name, err)
+				unremovable = append(unremovable, node)
 				continue candidateloop
 			}
 		} else {
 			glog.V(2).Infof("%s: nodeInfo for %s not found", evaluationType, node.Name)
+			unremovable = append(unremovable, node)
 			continue candidateloop
 		}
 		findProblems := findPlaceFor(node.Name, podsToRemove, allNodes, nodeNameToNodeInfo, predicateChecker, oldHints, newHints,
@@ -107,9 +110,10 @@ candidateloop:
 			}
 		} else {
 			glog.V(2).Infof("%s: node %s is not suitable for removal: %v", evaluationType, node.Name, findProblems)
+			unremovable = append(unremovable, node)
 		}
 	}
-	return result, newHints, nil
+	return result, unremovable, newHints, nil
 }
 
 // FindEmptyNodesToRemove finds empty nodes that can be removed.
--- a/cluster-autoscaler/simulator/cluster_test.go
+++ b/cluster-autoscaler/simulator/cluster_test.go
@@ -231,46 +231,53 @@ func TestFindNodesToRemove(t *testing.T) {
 	// just an empty node, should be removed
 	candidates = []*apiv1.Node{emptyNode}
 	allNodes = []*apiv1.Node{emptyNode}
-	toRemove, _, err := FindNodesToRemove(
+	toRemove, unremovable, _, err := FindNodesToRemove(
 		candidates, allNodes, pods, nil, predicateChecker, len(allNodes),
 		true, map[string]string{}, tracker, time.Now(), []*policyv1.PodDisruptionBudget{})
 	assert.NoError(t, err)
 	assert.Equal(t, toRemove, []NodeToBeRemoved{emptyNodeToRemove})
+	assert.Equal(t, len(unremovable), 0)
 
 	// just a drainable node, but nowhere for pods to go to
 	candidates = []*apiv1.Node{drainableNode}
 	allNodes = []*apiv1.Node{drainableNode}
-	toRemove, _, err = FindNodesToRemove(
+	toRemove, unremovable, _, err = FindNodesToRemove(
 		candidates, allNodes, pods, nil, predicateChecker, len(allNodes),
 		true, map[string]string{}, tracker, time.Now(), []*policyv1.PodDisruptionBudget{})
 	assert.NoError(t, err)
 	assert.Equal(t, toRemove, []NodeToBeRemoved{})
+	assert.Equal(t, len(unremovable), 1)
+	assert.Equal(t, unremovable[0].Name, "n2")
 
 	// drainable node, and a mostly empty node that can take its pods
 	candidates = []*apiv1.Node{drainableNode, nonDrainableNode}
 	allNodes = []*apiv1.Node{drainableNode, nonDrainableNode}
-	toRemove, _, err = FindNodesToRemove(
+	toRemove, unremovable, _, err = FindNodesToRemove(
 		candidates, allNodes, pods, nil, predicateChecker, len(allNodes),
 		true, map[string]string{}, tracker, time.Now(), []*policyv1.PodDisruptionBudget{})
 	assert.NoError(t, err)
 	assert.Equal(t, toRemove, []NodeToBeRemoved{drainableNodeToRemove})
+	assert.Equal(t, len(unremovable), 1)
+	assert.Equal(t, unremovable[0].Name, "n3")
 
 	// drainable node, and a full node that cannot fit anymore pods
 	candidates = []*apiv1.Node{drainableNode}
 	allNodes = []*apiv1.Node{drainableNode, fullNode}
-	toRemove, _, err = FindNodesToRemove(
+	toRemove, unremovable, _, err = FindNodesToRemove(
 		candidates, allNodes, pods, nil, predicateChecker, len(allNodes),
 		true, map[string]string{}, tracker, time.Now(), []*policyv1.PodDisruptionBudget{})
 	assert.NoError(t, err)
 	assert.Equal(t, toRemove, []NodeToBeRemoved{})
+	assert.Equal(t, len(unremovable), 1)
+	assert.Equal(t, unremovable[0].Name, "n2")
 
 	// 4 nodes, 1 empty, 1 drainable
 	candidates = []*apiv1.Node{emptyNode, drainableNode}
 	allNodes = []*apiv1.Node{emptyNode, drainableNode, fullNode, nonDrainableNode}
-	toRemove, _, err = FindNodesToRemove(
+	toRemove, unremovable, _, err = FindNodesToRemove(
 		candidates, allNodes, pods, nil, predicateChecker, len(allNodes),
 		true, map[string]string{}, tracker, time.Now(), []*policyv1.PodDisruptionBudget{})
 	assert.NoError(t, err)
 	assert.Equal(t, toRemove, []NodeToBeRemoved{emptyNodeToRemove, drainableNodeToRemove})
 
+	assert.Equal(t, len(unremovable), 0)
 }