Include taints by condition when determining if a node is unready/still starting

Conditions and their corresponding taints can sometimes skew, which
can cause unnecessary scale-up. CA thinks nodes are ready because it
looks only at the conditions, but scheduler predicates fail because they
consider the taints as well. CA adds nodes, even though the existing
nodes are still starting. This commit brings CA behavior in line
with scheduler predicates behavior, eliminating the unnecessary
scale-up.
This commit is contained in:
Jakub Tużnik 2020-10-29 17:45:05 +01:00
parent 65a0e30cb6
commit 6a528b45de
3 changed files with 71 additions and 25 deletions

View File

@ -860,22 +860,34 @@ func buildScaleDownStatusClusterwide(candidates map[string][]string, lastProbed
return condition
}
func hasTaint(node *apiv1.Node, taintKey string) bool {
for _, taint := range node.Spec.Taints {
if taint.Key == taintKey {
return true
}
}
return false
}
func isNodeStillStarting(node *apiv1.Node) bool {
for _, condition := range node.Status.Conditions {
if condition.Type == apiv1.NodeReady &&
condition.Status != apiv1.ConditionTrue &&
condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
return true
if condition.Type == apiv1.NodeReady {
notReady := condition.Status != apiv1.ConditionTrue || hasTaint(node, apiv1.TaintNodeNotReady)
if notReady && condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
return true
}
}
if condition.Type == apiv1.NodeDiskPressure &&
condition.Status == apiv1.ConditionTrue &&
condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
return true
if condition.Type == apiv1.NodeDiskPressure {
notReady := condition.Status == apiv1.ConditionTrue || hasTaint(node, apiv1.TaintNodeDiskPressure)
if notReady && condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
return true
}
}
if condition.Type == apiv1.NodeNetworkUnavailable &&
condition.Status == apiv1.ConditionTrue &&
condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
return true
if condition.Type == apiv1.NodeNetworkUnavailable {
notReady := condition.Status == apiv1.ConditionTrue || hasTaint(node, apiv1.TaintNodeNetworkUnavailable)
if notReady && condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
return true
}
}
}
return false

View File

@ -773,27 +773,46 @@ func TestIsNodeStillStarting(t *testing.T) {
desc string
condition apiv1.NodeConditionType
status apiv1.ConditionStatus
taintKey string
expectedResult bool
}{
{"unready", apiv1.NodeReady, apiv1.ConditionFalse, true},
{"readiness unknown", apiv1.NodeReady, apiv1.ConditionUnknown, true},
{"out of disk", apiv1.NodeDiskPressure, apiv1.ConditionTrue, true},
{"network unavailable", apiv1.NodeNetworkUnavailable, apiv1.ConditionTrue, true},
{"started", apiv1.NodeReady, apiv1.ConditionTrue, false},
{"unready", apiv1.NodeReady, apiv1.ConditionFalse, "", true},
{"readiness unknown", apiv1.NodeReady, apiv1.ConditionUnknown, "", true},
{"out of disk", apiv1.NodeDiskPressure, apiv1.ConditionTrue, "", true},
{"network unavailable", apiv1.NodeNetworkUnavailable, apiv1.ConditionTrue, "", true},
{"started", apiv1.NodeReady, apiv1.ConditionTrue, "", false},
{"unready and unready taint", apiv1.NodeReady, apiv1.ConditionFalse, apiv1.TaintNodeNotReady, true},
{"readiness unknown and unready taint", apiv1.NodeReady, apiv1.ConditionUnknown, apiv1.TaintNodeNotReady, true},
{"disk pressure and disk pressure taint", apiv1.NodeDiskPressure, apiv1.ConditionTrue, apiv1.TaintNodeDiskPressure, true},
{"network unavailable and network unavailable taint", apiv1.NodeNetworkUnavailable, apiv1.ConditionTrue, apiv1.TaintNodeNetworkUnavailable, true},
{"ready but unready taint", apiv1.NodeReady, apiv1.ConditionTrue, apiv1.TaintNodeNotReady, true},
{"no disk pressure but disk pressure taint", apiv1.NodeDiskPressure, apiv1.ConditionFalse, apiv1.TaintNodeDiskPressure, true},
{"network available but network unavailable taint", apiv1.NodeNetworkUnavailable, apiv1.ConditionFalse, apiv1.TaintNodeNetworkUnavailable, true},
}
now := time.Now()
for _, tc := range testCases {
t.Run("recent "+tc.desc, func(t *testing.T) {
createTestNode := func(timeSinceCreation time.Duration) *apiv1.Node {
node := BuildTestNode("n1", 1000, 1000)
node.CreationTimestamp.Time = now
SetNodeCondition(node, tc.condition, tc.status, now.Add(1*time.Minute))
node.CreationTimestamp.Time = time.Time{}
testedTime := node.CreationTimestamp.Time.Add(timeSinceCreation)
SetNodeCondition(node, tc.condition, tc.status, testedTime)
if tc.taintKey != "" {
node.Spec.Taints = []apiv1.Taint{{
Key: tc.taintKey,
Effect: apiv1.TaintEffectNoSchedule,
TimeAdded: &metav1.Time{Time: testedTime},
}}
}
return node
}
t.Run("recent "+tc.desc, func(t *testing.T) {
node := createTestNode(1 * time.Minute)
assert.Equal(t, tc.expectedResult, isNodeStillStarting(node))
})
t.Run("long "+tc.desc, func(t *testing.T) {
node := BuildTestNode("n1", 1000, 1000)
node.CreationTimestamp.Time = now
SetNodeCondition(node, tc.condition, tc.status, now.Add(30*time.Minute))
node := createTestNode(30 * time.Minute)
// No matter what are the node's conditions, stop considering it not started after long enough.
assert.False(t, isNodeStillStarting(node))
})

View File

@ -67,6 +67,21 @@ func GetReadinessState(node *apiv1.Node) (isNodeReady bool, lastTransitionTime t
}
}
}
notReadyTaints := map[string]bool{
apiv1.TaintNodeNotReady: true,
apiv1.TaintNodeDiskPressure: true,
apiv1.TaintNodeNetworkUnavailable: true,
}
for _, taint := range node.Spec.Taints {
if notReadyTaints[taint.Key] {
canNodeBeReady = false
if taint.TimeAdded != nil && lastTransitionTime.Before(taint.TimeAdded.Time) {
lastTransitionTime = taint.TimeAdded.Time
}
}
}
if !readyFound {
return false, time.Time{}, fmt.Errorf("readiness information not found")
}