Include taints by condition when determining if a node is unready/still starting
Conditions and their corresponding taints can sometimes skew, which can cause unnecessary scale-up. CA thinks nodes are ready because it looks only at the conditions, but scheduler predicates fail because they consider the taints as well. CA adds nodes, even though the existing nodes are still starting. This commit brings CA behavior in line with scheduler predicates behavior, eliminating the unnecessary scale-up.
This commit is contained in:
parent
65a0e30cb6
commit
6a528b45de
|
|
@ -860,22 +860,34 @@ func buildScaleDownStatusClusterwide(candidates map[string][]string, lastProbed
|
|||
return condition
|
||||
}
|
||||
|
||||
func hasTaint(node *apiv1.Node, taintKey string) bool {
|
||||
for _, taint := range node.Spec.Taints {
|
||||
if taint.Key == taintKey {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func isNodeStillStarting(node *apiv1.Node) bool {
|
||||
for _, condition := range node.Status.Conditions {
|
||||
if condition.Type == apiv1.NodeReady &&
|
||||
condition.Status != apiv1.ConditionTrue &&
|
||||
condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
|
||||
return true
|
||||
if condition.Type == apiv1.NodeReady {
|
||||
notReady := condition.Status != apiv1.ConditionTrue || hasTaint(node, apiv1.TaintNodeNotReady)
|
||||
if notReady && condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
|
||||
return true
|
||||
}
|
||||
}
|
||||
if condition.Type == apiv1.NodeDiskPressure &&
|
||||
condition.Status == apiv1.ConditionTrue &&
|
||||
condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
|
||||
return true
|
||||
if condition.Type == apiv1.NodeDiskPressure {
|
||||
notReady := condition.Status == apiv1.ConditionTrue || hasTaint(node, apiv1.TaintNodeDiskPressure)
|
||||
if notReady && condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
|
||||
return true
|
||||
}
|
||||
}
|
||||
if condition.Type == apiv1.NodeNetworkUnavailable &&
|
||||
condition.Status == apiv1.ConditionTrue &&
|
||||
condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
|
||||
return true
|
||||
if condition.Type == apiv1.NodeNetworkUnavailable {
|
||||
notReady := condition.Status == apiv1.ConditionTrue || hasTaint(node, apiv1.TaintNodeNetworkUnavailable)
|
||||
if notReady && condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
return false
|
||||
|
|
|
|||
|
|
@ -773,27 +773,46 @@ func TestIsNodeStillStarting(t *testing.T) {
|
|||
desc string
|
||||
condition apiv1.NodeConditionType
|
||||
status apiv1.ConditionStatus
|
||||
taintKey string
|
||||
expectedResult bool
|
||||
}{
|
||||
{"unready", apiv1.NodeReady, apiv1.ConditionFalse, true},
|
||||
{"readiness unknown", apiv1.NodeReady, apiv1.ConditionUnknown, true},
|
||||
{"out of disk", apiv1.NodeDiskPressure, apiv1.ConditionTrue, true},
|
||||
{"network unavailable", apiv1.NodeNetworkUnavailable, apiv1.ConditionTrue, true},
|
||||
{"started", apiv1.NodeReady, apiv1.ConditionTrue, false},
|
||||
{"unready", apiv1.NodeReady, apiv1.ConditionFalse, "", true},
|
||||
{"readiness unknown", apiv1.NodeReady, apiv1.ConditionUnknown, "", true},
|
||||
{"out of disk", apiv1.NodeDiskPressure, apiv1.ConditionTrue, "", true},
|
||||
{"network unavailable", apiv1.NodeNetworkUnavailable, apiv1.ConditionTrue, "", true},
|
||||
{"started", apiv1.NodeReady, apiv1.ConditionTrue, "", false},
|
||||
{"unready and unready taint", apiv1.NodeReady, apiv1.ConditionFalse, apiv1.TaintNodeNotReady, true},
|
||||
{"readiness unknown and unready taint", apiv1.NodeReady, apiv1.ConditionUnknown, apiv1.TaintNodeNotReady, true},
|
||||
{"disk pressure and disk pressure taint", apiv1.NodeDiskPressure, apiv1.ConditionTrue, apiv1.TaintNodeDiskPressure, true},
|
||||
{"network unavailable and network unavailable taint", apiv1.NodeNetworkUnavailable, apiv1.ConditionTrue, apiv1.TaintNodeNetworkUnavailable, true},
|
||||
{"ready but unready taint", apiv1.NodeReady, apiv1.ConditionTrue, apiv1.TaintNodeNotReady, true},
|
||||
{"no disk pressure but disk pressure taint", apiv1.NodeDiskPressure, apiv1.ConditionFalse, apiv1.TaintNodeDiskPressure, true},
|
||||
{"network available but network unavailable taint", apiv1.NodeNetworkUnavailable, apiv1.ConditionFalse, apiv1.TaintNodeNetworkUnavailable, true},
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
for _, tc := range testCases {
|
||||
t.Run("recent "+tc.desc, func(t *testing.T) {
|
||||
createTestNode := func(timeSinceCreation time.Duration) *apiv1.Node {
|
||||
node := BuildTestNode("n1", 1000, 1000)
|
||||
node.CreationTimestamp.Time = now
|
||||
SetNodeCondition(node, tc.condition, tc.status, now.Add(1*time.Minute))
|
||||
node.CreationTimestamp.Time = time.Time{}
|
||||
testedTime := node.CreationTimestamp.Time.Add(timeSinceCreation)
|
||||
|
||||
SetNodeCondition(node, tc.condition, tc.status, testedTime)
|
||||
|
||||
if tc.taintKey != "" {
|
||||
node.Spec.Taints = []apiv1.Taint{{
|
||||
Key: tc.taintKey,
|
||||
Effect: apiv1.TaintEffectNoSchedule,
|
||||
TimeAdded: &metav1.Time{Time: testedTime},
|
||||
}}
|
||||
}
|
||||
|
||||
return node
|
||||
}
|
||||
t.Run("recent "+tc.desc, func(t *testing.T) {
|
||||
node := createTestNode(1 * time.Minute)
|
||||
assert.Equal(t, tc.expectedResult, isNodeStillStarting(node))
|
||||
})
|
||||
t.Run("long "+tc.desc, func(t *testing.T) {
|
||||
node := BuildTestNode("n1", 1000, 1000)
|
||||
node.CreationTimestamp.Time = now
|
||||
SetNodeCondition(node, tc.condition, tc.status, now.Add(30*time.Minute))
|
||||
node := createTestNode(30 * time.Minute)
|
||||
// No matter what are the node's conditions, stop considering it not started after long enough.
|
||||
assert.False(t, isNodeStillStarting(node))
|
||||
})
|
||||
|
|
|
|||
|
|
@ -67,6 +67,21 @@ func GetReadinessState(node *apiv1.Node) (isNodeReady bool, lastTransitionTime t
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
notReadyTaints := map[string]bool{
|
||||
apiv1.TaintNodeNotReady: true,
|
||||
apiv1.TaintNodeDiskPressure: true,
|
||||
apiv1.TaintNodeNetworkUnavailable: true,
|
||||
}
|
||||
for _, taint := range node.Spec.Taints {
|
||||
if notReadyTaints[taint.Key] {
|
||||
canNodeBeReady = false
|
||||
if taint.TimeAdded != nil && lastTransitionTime.Before(taint.TimeAdded.Time) {
|
||||
lastTransitionTime = taint.TimeAdded.Time
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !readyFound {
|
||||
return false, time.Time{}, fmt.Errorf("readiness information not found")
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue