Skip node in scale-down if it was recently found unremovable
This commit is contained in:
parent
21c575f0ac
commit
fa53e52ed9
|
|
@ -69,6 +69,8 @@ const (
|
||||||
// PodEvictionHeadroom is the extra time we wait to catch situations when the pod is ignoring SIGTERM and
|
// PodEvictionHeadroom is the extra time we wait to catch situations when the pod is ignoring SIGTERM and
|
||||||
// is killed with SIGKILL after MaxGracefulTerminationTime
|
// is killed with SIGKILL after MaxGracefulTerminationTime
|
||||||
PodEvictionHeadroom = 20 * time.Second
|
PodEvictionHeadroom = 20 * time.Second
|
||||||
|
// UnremovableNodeRecheckTimeout is the timeout before we check again a node that couldn't be removed before
|
||||||
|
UnremovableNodeRecheckTimeout = 5 * time.Minute
|
||||||
)
|
)
|
||||||
|
|
||||||
// ScaleDown is responsible for maintaining the state needed to perform unneded node removals.
|
// ScaleDown is responsible for maintaining the state needed to perform unneded node removals.
|
||||||
|
|
@ -76,6 +78,7 @@ type ScaleDown struct {
|
||||||
context *AutoscalingContext
|
context *AutoscalingContext
|
||||||
unneededNodes map[string]time.Time
|
unneededNodes map[string]time.Time
|
||||||
unneededNodesList []*apiv1.Node
|
unneededNodesList []*apiv1.Node
|
||||||
|
unremovableNodes map[string]time.Time
|
||||||
podLocationHints map[string]string
|
podLocationHints map[string]string
|
||||||
nodeUtilizationMap map[string]float64
|
nodeUtilizationMap map[string]float64
|
||||||
usageTracker *simulator.UsageTracker
|
usageTracker *simulator.UsageTracker
|
||||||
|
|
@ -86,6 +89,7 @@ func NewScaleDown(context *AutoscalingContext) *ScaleDown {
|
||||||
return &ScaleDown{
|
return &ScaleDown{
|
||||||
context: context,
|
context: context,
|
||||||
unneededNodes: make(map[string]time.Time),
|
unneededNodes: make(map[string]time.Time),
|
||||||
|
unremovableNodes: make(map[string]time.Time),
|
||||||
podLocationHints: make(map[string]string),
|
podLocationHints: make(map[string]string),
|
||||||
nodeUtilizationMap: make(map[string]float64),
|
nodeUtilizationMap: make(map[string]float64),
|
||||||
usageTracker: simulator.NewUsageTracker(),
|
usageTracker: simulator.NewUsageTracker(),
|
||||||
|
|
@ -124,9 +128,25 @@ func (sd *ScaleDown) UpdateUnneededNodes(
|
||||||
nodeNameToNodeInfo := schedulercache.CreateNodeNameToInfoMap(pods, nodes)
|
nodeNameToNodeInfo := schedulercache.CreateNodeNameToInfoMap(pods, nodes)
|
||||||
utilizationMap := make(map[string]float64)
|
utilizationMap := make(map[string]float64)
|
||||||
|
|
||||||
|
// Filter out nodes that were recently checked
|
||||||
|
nodesToCheck := make([]*apiv1.Node, 0)
|
||||||
|
for _, node := range managedNodes {
|
||||||
|
if unremovableTimestamp, found := sd.unremovableNodes[node.Name]; found {
|
||||||
|
if unremovableTimestamp.After(timestamp) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
delete(sd.unremovableNodes, node.Name)
|
||||||
|
}
|
||||||
|
nodesToCheck = append(nodesToCheck, node)
|
||||||
|
}
|
||||||
|
skipped := len(managedNodes) - len(nodesToCheck)
|
||||||
|
if skipped > 0 {
|
||||||
|
glog.V(1).Infof("Scale-down calculation: ignoring %v nodes, that were unremovable in the last %v", skipped, UnremovableNodeRecheckTimeout)
|
||||||
|
}
|
||||||
|
|
||||||
// Phase1 - look at the nodes utilization. Calculate the utilization
|
// Phase1 - look at the nodes utilization. Calculate the utilization
|
||||||
// only for the managed nodes.
|
// only for the managed nodes.
|
||||||
for _, node := range managedNodes {
|
for _, node := range nodesToCheck {
|
||||||
|
|
||||||
// Skip nodes marked to be deleted, if they were marked recently.
|
// Skip nodes marked to be deleted, if they were marked recently.
|
||||||
// Old-time marked nodes are again eligible for deletion - something went wrong with them
|
// Old-time marked nodes are again eligible for deletion - something went wrong with them
|
||||||
|
|
@ -164,7 +184,7 @@ func (sd *ScaleDown) UpdateUnneededNodes(
|
||||||
}
|
}
|
||||||
|
|
||||||
// Phase2 - check which nodes can be probably removed using fast drain.
|
// Phase2 - check which nodes can be probably removed using fast drain.
|
||||||
nodesToRemove, newHints, simulatorErr := simulator.FindNodesToRemove(currentlyUnneededNodes, nodes, pods,
|
nodesToRemove, unremovable, newHints, simulatorErr := simulator.FindNodesToRemove(currentlyUnneededNodes, nodes, pods,
|
||||||
nil, sd.context.PredicateChecker,
|
nil, sd.context.PredicateChecker,
|
||||||
len(currentlyUnneededNodes), true, sd.podLocationHints, sd.usageTracker, timestamp, pdbs)
|
len(currentlyUnneededNodes), true, sd.podLocationHints, sd.usageTracker, timestamp, pdbs)
|
||||||
if simulatorErr != nil {
|
if simulatorErr != nil {
|
||||||
|
|
@ -191,6 +211,15 @@ func (sd *ScaleDown) UpdateUnneededNodes(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Add noded to unremovable map
|
||||||
|
if len(unremovable) > 0 {
|
||||||
|
unremovableTimeout := timestamp.Add(UnremovableNodeRecheckTimeout)
|
||||||
|
for _, node := range unremovable {
|
||||||
|
sd.unremovableNodes[node.Name] = unremovableTimeout
|
||||||
|
}
|
||||||
|
glog.V(1).Infof("%v nodes found unremovable in simulation, will re-check them at %v", len(unremovable), unremovableTimeout)
|
||||||
|
}
|
||||||
|
|
||||||
sd.unneededNodesList = unneadedNodeList
|
sd.unneededNodesList = unneadedNodeList
|
||||||
sd.unneededNodes = result
|
sd.unneededNodes = result
|
||||||
sd.podLocationHints = newHints
|
sd.podLocationHints = newHints
|
||||||
|
|
@ -281,7 +310,7 @@ func (sd *ScaleDown) TryToScaleDown(nodes []*apiv1.Node, pods []*apiv1.Pod, pdbs
|
||||||
|
|
||||||
findNodesToRemoveStart := time.Now()
|
findNodesToRemoveStart := time.Now()
|
||||||
// We look for only 1 node so new hints may be incomplete.
|
// We look for only 1 node so new hints may be incomplete.
|
||||||
nodesToRemove, _, err := simulator.FindNodesToRemove(candidates, nodes, pods, sd.context.ClientSet,
|
nodesToRemove, _, _, err := simulator.FindNodesToRemove(candidates, nodes, pods, sd.context.ClientSet,
|
||||||
sd.context.PredicateChecker, 1, false,
|
sd.context.PredicateChecker, 1, false,
|
||||||
sd.podLocationHints, sd.usageTracker, time.Now(), pdbs)
|
sd.podLocationHints, sd.usageTracker, time.Now(), pdbs)
|
||||||
findNodesToRemoveDuration = time.Now().Sub(findNodesToRemoveStart)
|
findNodesToRemoveDuration = time.Now().Sub(findNodesToRemoveStart)
|
||||||
|
|
|
||||||
|
|
@ -104,8 +104,10 @@ func TestFindUnneededNodes(t *testing.T) {
|
||||||
assert.Contains(t, sd.podLocationHints, p2.Namespace+"/"+p2.Name)
|
assert.Contains(t, sd.podLocationHints, p2.Namespace+"/"+p2.Name)
|
||||||
assert.Equal(t, 4, len(sd.nodeUtilizationMap))
|
assert.Equal(t, 4, len(sd.nodeUtilizationMap))
|
||||||
|
|
||||||
|
sd.unremovableNodes = make(map[string]time.Time)
|
||||||
sd.unneededNodes["n1"] = time.Now()
|
sd.unneededNodes["n1"] = time.Now()
|
||||||
sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Pod{p1, p2, p3, p4}, time.Now(), nil)
|
sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Pod{p1, p2, p3, p4}, time.Now(), nil)
|
||||||
|
sd.unremovableNodes = make(map[string]time.Time)
|
||||||
|
|
||||||
assert.Equal(t, 1, len(sd.unneededNodes))
|
assert.Equal(t, 1, len(sd.unneededNodes))
|
||||||
addTime2, found := sd.unneededNodes["n2"]
|
addTime2, found := sd.unneededNodes["n2"]
|
||||||
|
|
@ -113,8 +115,17 @@ func TestFindUnneededNodes(t *testing.T) {
|
||||||
assert.Equal(t, addTime, addTime2)
|
assert.Equal(t, addTime, addTime2)
|
||||||
assert.Equal(t, 4, len(sd.nodeUtilizationMap))
|
assert.Equal(t, 4, len(sd.nodeUtilizationMap))
|
||||||
|
|
||||||
|
sd.unremovableNodes = make(map[string]time.Time)
|
||||||
sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Node{n1, n3, n4}, []*apiv1.Pod{p1, p2, p3, p4}, time.Now(), nil)
|
sd.UpdateUnneededNodes([]*apiv1.Node{n1, n2, n3, n4}, []*apiv1.Node{n1, n3, n4}, []*apiv1.Pod{p1, p2, p3, p4}, time.Now(), nil)
|
||||||
assert.Equal(t, 0, len(sd.unneededNodes))
|
assert.Equal(t, 0, len(sd.unneededNodes))
|
||||||
|
|
||||||
|
// Node n1 is unneeded, but should be skipped because it has just recently been found to be unremovable
|
||||||
|
sd.UpdateUnneededNodes([]*apiv1.Node{n1}, []*apiv1.Node{n1}, []*apiv1.Pod{}, time.Now(), nil)
|
||||||
|
assert.Equal(t, 0, len(sd.unneededNodes))
|
||||||
|
|
||||||
|
// But it should be checked after timeout
|
||||||
|
sd.UpdateUnneededNodes([]*apiv1.Node{n1}, []*apiv1.Node{n1}, []*apiv1.Pod{}, time.Now().Add(UnremovableNodeRecheckTimeout+time.Second), nil)
|
||||||
|
assert.Equal(t, 1, len(sd.unneededNodes))
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestDrainNode(t *testing.T) {
|
func TestDrainNode(t *testing.T) {
|
||||||
|
|
|
||||||
|
|
@ -59,10 +59,11 @@ func FindNodesToRemove(candidates []*apiv1.Node, allNodes []*apiv1.Node, pods []
|
||||||
fastCheck bool, oldHints map[string]string, usageTracker *UsageTracker,
|
fastCheck bool, oldHints map[string]string, usageTracker *UsageTracker,
|
||||||
timestamp time.Time,
|
timestamp time.Time,
|
||||||
podDisruptionBudgets []*policyv1.PodDisruptionBudget,
|
podDisruptionBudgets []*policyv1.PodDisruptionBudget,
|
||||||
) (nodesToRemove []NodeToBeRemoved, podReschedulingHints map[string]string, finalError errors.AutoscalerError) {
|
) (nodesToRemove []NodeToBeRemoved, unremovableNodes []*apiv1.Node, podReschedulingHints map[string]string, finalError errors.AutoscalerError) {
|
||||||
|
|
||||||
nodeNameToNodeInfo := schedulercache.CreateNodeNameToInfoMap(pods, allNodes)
|
nodeNameToNodeInfo := schedulercache.CreateNodeNameToInfoMap(pods, allNodes)
|
||||||
result := make([]NodeToBeRemoved, 0)
|
result := make([]NodeToBeRemoved, 0)
|
||||||
|
unremovable := make([]*apiv1.Node, 0)
|
||||||
|
|
||||||
evaluationType := "Detailed evaluation"
|
evaluationType := "Detailed evaluation"
|
||||||
if fastCheck {
|
if fastCheck {
|
||||||
|
|
@ -87,10 +88,12 @@ candidateloop:
|
||||||
}
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
glog.V(2).Infof("%s: node %s cannot be removed: %v", evaluationType, node.Name, err)
|
glog.V(2).Infof("%s: node %s cannot be removed: %v", evaluationType, node.Name, err)
|
||||||
|
unremovable = append(unremovable, node)
|
||||||
continue candidateloop
|
continue candidateloop
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
glog.V(2).Infof("%s: nodeInfo for %s not found", evaluationType, node.Name)
|
glog.V(2).Infof("%s: nodeInfo for %s not found", evaluationType, node.Name)
|
||||||
|
unremovable = append(unremovable, node)
|
||||||
continue candidateloop
|
continue candidateloop
|
||||||
}
|
}
|
||||||
findProblems := findPlaceFor(node.Name, podsToRemove, allNodes, nodeNameToNodeInfo, predicateChecker, oldHints, newHints,
|
findProblems := findPlaceFor(node.Name, podsToRemove, allNodes, nodeNameToNodeInfo, predicateChecker, oldHints, newHints,
|
||||||
|
|
@ -107,9 +110,10 @@ candidateloop:
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
glog.V(2).Infof("%s: node %s is not suitable for removal: %v", evaluationType, node.Name, findProblems)
|
glog.V(2).Infof("%s: node %s is not suitable for removal: %v", evaluationType, node.Name, findProblems)
|
||||||
|
unremovable = append(unremovable, node)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return result, newHints, nil
|
return result, unremovable, newHints, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// FindEmptyNodesToRemove finds empty nodes that can be removed.
|
// FindEmptyNodesToRemove finds empty nodes that can be removed.
|
||||||
|
|
|
||||||
|
|
@ -231,46 +231,53 @@ func TestFindNodesToRemove(t *testing.T) {
|
||||||
// just an empty node, should be removed
|
// just an empty node, should be removed
|
||||||
candidates = []*apiv1.Node{emptyNode}
|
candidates = []*apiv1.Node{emptyNode}
|
||||||
allNodes = []*apiv1.Node{emptyNode}
|
allNodes = []*apiv1.Node{emptyNode}
|
||||||
toRemove, _, err := FindNodesToRemove(
|
toRemove, unremovable, _, err := FindNodesToRemove(
|
||||||
candidates, allNodes, pods, nil, predicateChecker, len(allNodes),
|
candidates, allNodes, pods, nil, predicateChecker, len(allNodes),
|
||||||
true, map[string]string{}, tracker, time.Now(), []*policyv1.PodDisruptionBudget{})
|
true, map[string]string{}, tracker, time.Now(), []*policyv1.PodDisruptionBudget{})
|
||||||
assert.NoError(t, err)
|
assert.NoError(t, err)
|
||||||
assert.Equal(t, toRemove, []NodeToBeRemoved{emptyNodeToRemove})
|
assert.Equal(t, toRemove, []NodeToBeRemoved{emptyNodeToRemove})
|
||||||
|
assert.Equal(t, len(unremovable), 0)
|
||||||
|
|
||||||
// just a drainable node, but nowhere for pods to go to
|
// just a drainable node, but nowhere for pods to go to
|
||||||
candidates = []*apiv1.Node{drainableNode}
|
candidates = []*apiv1.Node{drainableNode}
|
||||||
allNodes = []*apiv1.Node{drainableNode}
|
allNodes = []*apiv1.Node{drainableNode}
|
||||||
toRemove, _, err = FindNodesToRemove(
|
toRemove, unremovable, _, err = FindNodesToRemove(
|
||||||
candidates, allNodes, pods, nil, predicateChecker, len(allNodes),
|
candidates, allNodes, pods, nil, predicateChecker, len(allNodes),
|
||||||
true, map[string]string{}, tracker, time.Now(), []*policyv1.PodDisruptionBudget{})
|
true, map[string]string{}, tracker, time.Now(), []*policyv1.PodDisruptionBudget{})
|
||||||
assert.NoError(t, err)
|
assert.NoError(t, err)
|
||||||
assert.Equal(t, toRemove, []NodeToBeRemoved{})
|
assert.Equal(t, toRemove, []NodeToBeRemoved{})
|
||||||
|
assert.Equal(t, len(unremovable), 1)
|
||||||
|
assert.Equal(t, unremovable[0].Name, "n2")
|
||||||
|
|
||||||
// drainable node, and a mostly empty node that can take its pods
|
// drainable node, and a mostly empty node that can take its pods
|
||||||
candidates = []*apiv1.Node{drainableNode, nonDrainableNode}
|
candidates = []*apiv1.Node{drainableNode, nonDrainableNode}
|
||||||
allNodes = []*apiv1.Node{drainableNode, nonDrainableNode}
|
allNodes = []*apiv1.Node{drainableNode, nonDrainableNode}
|
||||||
toRemove, _, err = FindNodesToRemove(
|
toRemove, unremovable, _, err = FindNodesToRemove(
|
||||||
candidates, allNodes, pods, nil, predicateChecker, len(allNodes),
|
candidates, allNodes, pods, nil, predicateChecker, len(allNodes),
|
||||||
true, map[string]string{}, tracker, time.Now(), []*policyv1.PodDisruptionBudget{})
|
true, map[string]string{}, tracker, time.Now(), []*policyv1.PodDisruptionBudget{})
|
||||||
assert.NoError(t, err)
|
assert.NoError(t, err)
|
||||||
assert.Equal(t, toRemove, []NodeToBeRemoved{drainableNodeToRemove})
|
assert.Equal(t, toRemove, []NodeToBeRemoved{drainableNodeToRemove})
|
||||||
|
assert.Equal(t, len(unremovable), 1)
|
||||||
|
assert.Equal(t, unremovable[0].Name, "n3")
|
||||||
|
|
||||||
// drainable node, and a full node that cannot fit anymore pods
|
// drainable node, and a full node that cannot fit anymore pods
|
||||||
candidates = []*apiv1.Node{drainableNode}
|
candidates = []*apiv1.Node{drainableNode}
|
||||||
allNodes = []*apiv1.Node{drainableNode, fullNode}
|
allNodes = []*apiv1.Node{drainableNode, fullNode}
|
||||||
toRemove, _, err = FindNodesToRemove(
|
toRemove, unremovable, _, err = FindNodesToRemove(
|
||||||
candidates, allNodes, pods, nil, predicateChecker, len(allNodes),
|
candidates, allNodes, pods, nil, predicateChecker, len(allNodes),
|
||||||
true, map[string]string{}, tracker, time.Now(), []*policyv1.PodDisruptionBudget{})
|
true, map[string]string{}, tracker, time.Now(), []*policyv1.PodDisruptionBudget{})
|
||||||
assert.NoError(t, err)
|
assert.NoError(t, err)
|
||||||
assert.Equal(t, toRemove, []NodeToBeRemoved{})
|
assert.Equal(t, toRemove, []NodeToBeRemoved{})
|
||||||
|
assert.Equal(t, len(unremovable), 1)
|
||||||
|
assert.Equal(t, unremovable[0].Name, "n2")
|
||||||
|
|
||||||
// 4 nodes, 1 empty, 1 drainable
|
// 4 nodes, 1 empty, 1 drainable
|
||||||
candidates = []*apiv1.Node{emptyNode, drainableNode}
|
candidates = []*apiv1.Node{emptyNode, drainableNode}
|
||||||
allNodes = []*apiv1.Node{emptyNode, drainableNode, fullNode, nonDrainableNode}
|
allNodes = []*apiv1.Node{emptyNode, drainableNode, fullNode, nonDrainableNode}
|
||||||
toRemove, _, err = FindNodesToRemove(
|
toRemove, unremovable, _, err = FindNodesToRemove(
|
||||||
candidates, allNodes, pods, nil, predicateChecker, len(allNodes),
|
candidates, allNodes, pods, nil, predicateChecker, len(allNodes),
|
||||||
true, map[string]string{}, tracker, time.Now(), []*policyv1.PodDisruptionBudget{})
|
true, map[string]string{}, tracker, time.Now(), []*policyv1.PodDisruptionBudget{})
|
||||||
assert.NoError(t, err)
|
assert.NoError(t, err)
|
||||||
assert.Equal(t, toRemove, []NodeToBeRemoved{emptyNodeToRemove, drainableNodeToRemove})
|
assert.Equal(t, toRemove, []NodeToBeRemoved{emptyNodeToRemove, drainableNodeToRemove})
|
||||||
|
assert.Equal(t, len(unremovable), 0)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue