Include taints by condition when determining if a node is unready/still starting
Conditions and their corresponding taints can sometimes skew, which can cause unnecessary scale-up. CA thinks nodes are ready because it looks only at the conditions, but scheduler predicates fail because they consider the taints as well. CA adds nodes, even though the existing nodes are still starting. This commit brings CA behavior in line with scheduler predicates behavior, eliminating the unnecessary scale-up.
This commit is contained in:
parent
65a0e30cb6
commit
6a528b45de
|
|
@ -860,24 +860,36 @@ func buildScaleDownStatusClusterwide(candidates map[string][]string, lastProbed
|
||||||
return condition
|
return condition
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func hasTaint(node *apiv1.Node, taintKey string) bool {
|
||||||
|
for _, taint := range node.Spec.Taints {
|
||||||
|
if taint.Key == taintKey {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
func isNodeStillStarting(node *apiv1.Node) bool {
|
func isNodeStillStarting(node *apiv1.Node) bool {
|
||||||
for _, condition := range node.Status.Conditions {
|
for _, condition := range node.Status.Conditions {
|
||||||
if condition.Type == apiv1.NodeReady &&
|
if condition.Type == apiv1.NodeReady {
|
||||||
condition.Status != apiv1.ConditionTrue &&
|
notReady := condition.Status != apiv1.ConditionTrue || hasTaint(node, apiv1.TaintNodeNotReady)
|
||||||
condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
|
if notReady && condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
if condition.Type == apiv1.NodeDiskPressure &&
|
}
|
||||||
condition.Status == apiv1.ConditionTrue &&
|
if condition.Type == apiv1.NodeDiskPressure {
|
||||||
condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
|
notReady := condition.Status == apiv1.ConditionTrue || hasTaint(node, apiv1.TaintNodeDiskPressure)
|
||||||
|
if notReady && condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
if condition.Type == apiv1.NodeNetworkUnavailable &&
|
}
|
||||||
condition.Status == apiv1.ConditionTrue &&
|
if condition.Type == apiv1.NodeNetworkUnavailable {
|
||||||
condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
|
notReady := condition.Status == apiv1.ConditionTrue || hasTaint(node, apiv1.TaintNodeNetworkUnavailable)
|
||||||
|
if notReady && condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -773,27 +773,46 @@ func TestIsNodeStillStarting(t *testing.T) {
|
||||||
desc string
|
desc string
|
||||||
condition apiv1.NodeConditionType
|
condition apiv1.NodeConditionType
|
||||||
status apiv1.ConditionStatus
|
status apiv1.ConditionStatus
|
||||||
|
taintKey string
|
||||||
expectedResult bool
|
expectedResult bool
|
||||||
}{
|
}{
|
||||||
{"unready", apiv1.NodeReady, apiv1.ConditionFalse, true},
|
{"unready", apiv1.NodeReady, apiv1.ConditionFalse, "", true},
|
||||||
{"readiness unknown", apiv1.NodeReady, apiv1.ConditionUnknown, true},
|
{"readiness unknown", apiv1.NodeReady, apiv1.ConditionUnknown, "", true},
|
||||||
{"out of disk", apiv1.NodeDiskPressure, apiv1.ConditionTrue, true},
|
{"out of disk", apiv1.NodeDiskPressure, apiv1.ConditionTrue, "", true},
|
||||||
{"network unavailable", apiv1.NodeNetworkUnavailable, apiv1.ConditionTrue, true},
|
{"network unavailable", apiv1.NodeNetworkUnavailable, apiv1.ConditionTrue, "", true},
|
||||||
{"started", apiv1.NodeReady, apiv1.ConditionTrue, false},
|
{"started", apiv1.NodeReady, apiv1.ConditionTrue, "", false},
|
||||||
|
{"unready and unready taint", apiv1.NodeReady, apiv1.ConditionFalse, apiv1.TaintNodeNotReady, true},
|
||||||
|
{"readiness unknown and unready taint", apiv1.NodeReady, apiv1.ConditionUnknown, apiv1.TaintNodeNotReady, true},
|
||||||
|
{"disk pressure and disk pressure taint", apiv1.NodeDiskPressure, apiv1.ConditionTrue, apiv1.TaintNodeDiskPressure, true},
|
||||||
|
{"network unavailable and network unavailable taint", apiv1.NodeNetworkUnavailable, apiv1.ConditionTrue, apiv1.TaintNodeNetworkUnavailable, true},
|
||||||
|
{"ready but unready taint", apiv1.NodeReady, apiv1.ConditionTrue, apiv1.TaintNodeNotReady, true},
|
||||||
|
{"no disk pressure but disk pressure taint", apiv1.NodeDiskPressure, apiv1.ConditionFalse, apiv1.TaintNodeDiskPressure, true},
|
||||||
|
{"network available but network unavailable taint", apiv1.NodeNetworkUnavailable, apiv1.ConditionFalse, apiv1.TaintNodeNetworkUnavailable, true},
|
||||||
|
}
|
||||||
|
for _, tc := range testCases {
|
||||||
|
createTestNode := func(timeSinceCreation time.Duration) *apiv1.Node {
|
||||||
|
node := BuildTestNode("n1", 1000, 1000)
|
||||||
|
node.CreationTimestamp.Time = time.Time{}
|
||||||
|
testedTime := node.CreationTimestamp.Time.Add(timeSinceCreation)
|
||||||
|
|
||||||
|
SetNodeCondition(node, tc.condition, tc.status, testedTime)
|
||||||
|
|
||||||
|
if tc.taintKey != "" {
|
||||||
|
node.Spec.Taints = []apiv1.Taint{{
|
||||||
|
Key: tc.taintKey,
|
||||||
|
Effect: apiv1.TaintEffectNoSchedule,
|
||||||
|
TimeAdded: &metav1.Time{Time: testedTime},
|
||||||
|
}}
|
||||||
}
|
}
|
||||||
|
|
||||||
now := time.Now()
|
return node
|
||||||
for _, tc := range testCases {
|
}
|
||||||
t.Run("recent "+tc.desc, func(t *testing.T) {
|
t.Run("recent "+tc.desc, func(t *testing.T) {
|
||||||
node := BuildTestNode("n1", 1000, 1000)
|
node := createTestNode(1 * time.Minute)
|
||||||
node.CreationTimestamp.Time = now
|
|
||||||
SetNodeCondition(node, tc.condition, tc.status, now.Add(1*time.Minute))
|
|
||||||
assert.Equal(t, tc.expectedResult, isNodeStillStarting(node))
|
assert.Equal(t, tc.expectedResult, isNodeStillStarting(node))
|
||||||
})
|
})
|
||||||
t.Run("long "+tc.desc, func(t *testing.T) {
|
t.Run("long "+tc.desc, func(t *testing.T) {
|
||||||
node := BuildTestNode("n1", 1000, 1000)
|
node := createTestNode(30 * time.Minute)
|
||||||
node.CreationTimestamp.Time = now
|
|
||||||
SetNodeCondition(node, tc.condition, tc.status, now.Add(30*time.Minute))
|
|
||||||
// No matter what are the node's conditions, stop considering it not started after long enough.
|
// No matter what are the node's conditions, stop considering it not started after long enough.
|
||||||
assert.False(t, isNodeStillStarting(node))
|
assert.False(t, isNodeStillStarting(node))
|
||||||
})
|
})
|
||||||
|
|
|
||||||
|
|
@ -67,6 +67,21 @@ func GetReadinessState(node *apiv1.Node) (isNodeReady bool, lastTransitionTime t
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
notReadyTaints := map[string]bool{
|
||||||
|
apiv1.TaintNodeNotReady: true,
|
||||||
|
apiv1.TaintNodeDiskPressure: true,
|
||||||
|
apiv1.TaintNodeNetworkUnavailable: true,
|
||||||
|
}
|
||||||
|
for _, taint := range node.Spec.Taints {
|
||||||
|
if notReadyTaints[taint.Key] {
|
||||||
|
canNodeBeReady = false
|
||||||
|
if taint.TimeAdded != nil && lastTransitionTime.Before(taint.TimeAdded.Time) {
|
||||||
|
lastTransitionTime = taint.TimeAdded.Time
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if !readyFound {
|
if !readyFound {
|
||||||
return false, time.Time{}, fmt.Errorf("readiness information not found")
|
return false, time.Time{}, fmt.Errorf("readiness information not found")
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue