Fix bug where a node that becomes ready after 2 mins can be

treated as unready. Deprecated LongNotStarted

 In cases where node n1 would:
 1) Be created at t=0min
 2) Ready condition is true at t=2.5min
 3) Not ready taint is removed at t=3min
 the ready node is counted as unready

 Tested cases after fix:
 1) Case described above
 2) Nodes not starting even after 15mins still
 treated as unready
 3) Nodes created long ago that suddenly become unready are
 counted as unready.
This commit is contained in:
Vivek Bagade 2021-03-11 16:07:00 +01:00
parent bb3b24c1ba
commit 8c592f0c04
5 changed files with 174 additions and 103 deletions

View File

@ -43,10 +43,6 @@ const (
// MaxNodeStartupTime is the maximum time from the moment the node is registered to the time the node is ready.
MaxNodeStartupTime = 15 * time.Minute
// MaxStatusSettingDelayAfterCreation is the maximum time for node to set its initial status after the
// node is registered.
MaxStatusSettingDelayAfterCreation = 2 * time.Minute
// MaxNodeGroupBackoffDuration is the maximum backoff duration for a NodeGroup after new nodes failed to start.
MaxNodeGroupBackoffDuration = 30 * time.Minute
@ -399,7 +395,7 @@ func (csr *ClusterStateRegistry) IsNodeGroupHealthy(nodeGroupName string) bool {
if unjustifiedUnready > csr.config.OkTotalUnreadyCount &&
float64(unjustifiedUnready) > csr.config.MaxTotalUnreadyPercentage/100.0*
float64(readiness.Ready+readiness.Unready+readiness.NotStarted+readiness.LongNotStarted) {
float64(readiness.Ready+readiness.Unready+readiness.NotStarted) {
return false
}
@ -452,7 +448,7 @@ func (csr *ClusterStateRegistry) getProvisionedAndTargetSizesForNodeGroup(nodeGr
}
return 0, target, true
}
provisioned = readiness.Registered - readiness.NotStarted - readiness.LongNotStarted
provisioned = readiness.Registered - readiness.NotStarted
return provisioned, target, true
}
@ -531,8 +527,6 @@ type Readiness struct {
// Number of nodes that are being currently deleted. They exist in K8S but
// are not included in NodeGroup.TargetSize().
Deleted int
// Number of nodes that failed to start within a reasonable limit.
LongNotStarted int
// Number of nodes that are not yet fully started.
NotStarted int
// Number of all registered nodes in the group (ready/unready/deleted/etc).
@ -554,12 +548,10 @@ func (csr *ClusterStateRegistry) updateReadinessStats(currentTime time.Time) {
current.Registered++
if deletetaint.HasToBeDeletedTaint(node) {
current.Deleted++
} else if stillStarting := isNodeStillStarting(node); stillStarting && node.CreationTimestamp.Time.Add(MaxNodeStartupTime).Before(currentTime) {
current.LongNotStarted++
} else if stillStarting {
current.NotStarted++
} else if ready {
current.Ready++
} else if node.CreationTimestamp.Time.Add(MaxNodeStartupTime).After(currentTime) {
current.NotStarted++
} else {
current.Unready++
}
@ -743,11 +735,10 @@ func (csr *ClusterStateRegistry) GetClusterReadiness() Readiness {
func buildHealthStatusNodeGroup(isReady bool, readiness Readiness, acceptable AcceptableRange, minSize, maxSize int) api.ClusterAutoscalerCondition {
condition := api.ClusterAutoscalerCondition{
Type: api.ClusterAutoscalerHealth,
Message: fmt.Sprintf("ready=%d unready=%d notStarted=%d longNotStarted=%d registered=%d longUnregistered=%d cloudProviderTarget=%d (minSize=%d, maxSize=%d)",
Message: fmt.Sprintf("ready=%d unready=%d notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d cloudProviderTarget=%d (minSize=%d, maxSize=%d)",
readiness.Ready,
readiness.Unready,
readiness.NotStarted,
readiness.LongNotStarted,
readiness.Registered,
readiness.LongUnregistered,
acceptable.CurrentTarget,
@ -798,11 +789,10 @@ func buildScaleDownStatusNodeGroup(candidates []string, lastProbed time.Time) ap
func buildHealthStatusClusterwide(isReady bool, readiness Readiness) api.ClusterAutoscalerCondition {
condition := api.ClusterAutoscalerCondition{
Type: api.ClusterAutoscalerHealth,
Message: fmt.Sprintf("ready=%d unready=%d notStarted=%d longNotStarted=%d registered=%d longUnregistered=%d",
Message: fmt.Sprintf("ready=%d unready=%d notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d",
readiness.Ready,
readiness.Unready,
readiness.NotStarted,
readiness.LongNotStarted,
readiness.Registered,
readiness.LongUnregistered,
),
@ -860,39 +850,6 @@ func buildScaleDownStatusClusterwide(candidates map[string][]string, lastProbed
return condition
}
func hasTaint(node *apiv1.Node, taintKey string) bool {
for _, taint := range node.Spec.Taints {
if taint.Key == taintKey {
return true
}
}
return false
}
func isNodeStillStarting(node *apiv1.Node) bool {
for _, condition := range node.Status.Conditions {
if condition.Type == apiv1.NodeReady {
notReady := condition.Status != apiv1.ConditionTrue || hasTaint(node, apiv1.TaintNodeNotReady)
if notReady && condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
return true
}
}
if condition.Type == apiv1.NodeDiskPressure {
notReady := condition.Status == apiv1.ConditionTrue || hasTaint(node, apiv1.TaintNodeDiskPressure)
if notReady && condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
return true
}
}
if condition.Type == apiv1.NodeNetworkUnavailable {
notReady := condition.Status == apiv1.ConditionTrue || hasTaint(node, apiv1.TaintNodeNetworkUnavailable)
if notReady && condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
return true
}
}
}
return false
}
func updateLastTransition(oldStatus, newStatus *api.ClusterAutoscalerStatus) {
newStatus.ClusterwideConditions = updateLastTransitionSingleList(
oldStatus.ClusterwideConditions, newStatus.ClusterwideConditions)
@ -955,7 +912,7 @@ func (csr *ClusterStateRegistry) GetUpcomingNodes() map[string]int {
readiness := csr.perNodeGroupReadiness[id]
ar := csr.acceptableRanges[id]
// newNodes is the number of nodes that
newNodes := ar.CurrentTarget - (readiness.Ready + readiness.Unready + readiness.LongNotStarted + readiness.LongUnregistered)
newNodes := ar.CurrentTarget - (readiness.Ready + readiness.Unready + readiness.LongUnregistered)
if newNodes <= 0 {
// Negative value is unlikely but theoretically possible.
continue
@ -1006,7 +963,7 @@ func (csr *ClusterStateRegistry) GetAutoscaledNodesCount() (currentSize, targetS
targetSize += accRange.CurrentTarget
}
for _, readiness := range csr.perNodeGroupReadiness {
currentSize += readiness.Registered - readiness.NotStarted - readiness.LongNotStarted
currentSize += readiness.Registered - readiness.NotStarted
}
return currentSize, targetSize
}

View File

@ -319,6 +319,79 @@ func TestTooManyUnready(t *testing.T) {
assert.True(t, clusterstate.IsNodeGroupHealthy("ng1"))
}
func TestUnreadyLongAfterCreation(t *testing.T) {
now := time.Now()
ng1_1 := BuildTestNode("ng1-1", 1000, 1000)
SetNodeReadyState(ng1_1, true, now.Add(-time.Minute))
ng2_1 := BuildTestNode("ng2-1", 1000, 1000)
SetNodeReadyState(ng2_1, false, now.Add(-time.Minute))
ng2_1.CreationTimestamp = metav1.Time{Time: now.Add(-30 * time.Minute)}
provider := testprovider.NewTestCloudProvider(nil, nil)
provider.AddNodeGroup("ng1", 1, 10, 1)
provider.AddNodeGroup("ng2", 1, 10, 1)
provider.AddNode("ng1", ng1_1)
provider.AddNode("ng2", ng2_1)
assert.NotNil(t, provider)
fakeClient := &fake.Clientset{}
fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "some-map")
clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{
MaxTotalUnreadyPercentage: 10,
OkTotalUnreadyCount: 1,
}, fakeLogRecorder, newBackoff())
err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1}, nil, now)
assert.NoError(t, err)
assert.Equal(t, 1, clusterstate.GetClusterReadiness().Unready)
assert.Equal(t, 0, clusterstate.GetClusterReadiness().NotStarted)
upcoming := clusterstate.GetUpcomingNodes()
assert.Equal(t, 0, upcoming["ng1"])
}
func TestNotStarted(t *testing.T) {
now := time.Now()
ng1_1 := BuildTestNode("ng1-1", 1000, 1000)
SetNodeReadyState(ng1_1, true, now.Add(-time.Minute))
ng2_1 := BuildTestNode("ng2-1", 1000, 1000)
SetNodeReadyState(ng2_1, false, now.Add(-4*time.Minute))
SetNodeNotReadyTaint(ng2_1)
ng2_1.CreationTimestamp = metav1.Time{Time: now.Add(-10 * time.Minute)}
provider := testprovider.NewTestCloudProvider(nil, nil)
provider.AddNodeGroup("ng1", 1, 10, 1)
provider.AddNodeGroup("ng2", 1, 10, 1)
provider.AddNode("ng1", ng1_1)
provider.AddNode("ng2", ng2_1)
assert.NotNil(t, provider)
fakeClient := &fake.Clientset{}
fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "some-map")
clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{
MaxTotalUnreadyPercentage: 10,
OkTotalUnreadyCount: 1,
}, fakeLogRecorder, newBackoff())
err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1}, nil, now)
assert.NoError(t, err)
assert.Equal(t, 1, clusterstate.GetClusterReadiness().NotStarted)
assert.Equal(t, 1, clusterstate.GetClusterReadiness().Ready)
// node ng2_1 moves condition to ready
SetNodeReadyState(ng2_1, true, now.Add(-4*time.Minute))
err = clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1}, nil, now)
assert.NoError(t, err)
assert.Equal(t, 1, clusterstate.GetClusterReadiness().NotStarted)
assert.Equal(t, 1, clusterstate.GetClusterReadiness().Ready)
// node ng2_1 no longer has the taint
RemoveNodeNotReadyTaint(ng2_1)
err = clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1}, nil, now)
assert.NoError(t, err)
assert.Equal(t, 0, clusterstate.GetClusterReadiness().NotStarted)
assert.Equal(t, 2, clusterstate.GetClusterReadiness().Ready)
}
func TestExpiredScaleUp(t *testing.T) {
now := time.Now()
@ -768,57 +841,6 @@ func TestUpdateScaleUp(t *testing.T) {
assert.Nil(t, clusterstate.scaleUpRequests["ng1"])
}
func TestIsNodeStillStarting(t *testing.T) {
testCases := []struct {
desc string
condition apiv1.NodeConditionType
status apiv1.ConditionStatus
taintKey string
expectedResult bool
}{
{"unready", apiv1.NodeReady, apiv1.ConditionFalse, "", true},
{"readiness unknown", apiv1.NodeReady, apiv1.ConditionUnknown, "", true},
{"out of disk", apiv1.NodeDiskPressure, apiv1.ConditionTrue, "", true},
{"network unavailable", apiv1.NodeNetworkUnavailable, apiv1.ConditionTrue, "", true},
{"started", apiv1.NodeReady, apiv1.ConditionTrue, "", false},
{"unready and unready taint", apiv1.NodeReady, apiv1.ConditionFalse, apiv1.TaintNodeNotReady, true},
{"readiness unknown and unready taint", apiv1.NodeReady, apiv1.ConditionUnknown, apiv1.TaintNodeNotReady, true},
{"disk pressure and disk pressure taint", apiv1.NodeDiskPressure, apiv1.ConditionTrue, apiv1.TaintNodeDiskPressure, true},
{"network unavailable and network unavailable taint", apiv1.NodeNetworkUnavailable, apiv1.ConditionTrue, apiv1.TaintNodeNetworkUnavailable, true},
{"ready but unready taint", apiv1.NodeReady, apiv1.ConditionTrue, apiv1.TaintNodeNotReady, true},
{"no disk pressure but disk pressure taint", apiv1.NodeDiskPressure, apiv1.ConditionFalse, apiv1.TaintNodeDiskPressure, true},
{"network available but network unavailable taint", apiv1.NodeNetworkUnavailable, apiv1.ConditionFalse, apiv1.TaintNodeNetworkUnavailable, true},
}
for _, tc := range testCases {
createTestNode := func(timeSinceCreation time.Duration) *apiv1.Node {
node := BuildTestNode("n1", 1000, 1000)
node.CreationTimestamp.Time = time.Time{}
testedTime := node.CreationTimestamp.Time.Add(timeSinceCreation)
SetNodeCondition(node, tc.condition, tc.status, testedTime)
if tc.taintKey != "" {
node.Spec.Taints = []apiv1.Taint{{
Key: tc.taintKey,
Effect: apiv1.TaintEffectNoSchedule,
TimeAdded: &metav1.Time{Time: testedTime},
}}
}
return node
}
t.Run("recent "+tc.desc, func(t *testing.T) {
node := createTestNode(1 * time.Minute)
assert.Equal(t, tc.expectedResult, isNodeStillStarting(node))
})
t.Run("long "+tc.desc, func(t *testing.T) {
node := createTestNode(30 * time.Minute)
// No matter what are the node's conditions, stop considering it not started after long enough.
assert.False(t, isNodeStillStarting(node))
})
}
}
func TestScaleUpFailures(t *testing.T) {
now := time.Now()

View File

@ -312,7 +312,7 @@ func UpdateClusterStateMetrics(csr *clusterstate.ClusterStateRegistry) {
}
metrics.UpdateClusterSafeToAutoscale(csr.IsClusterHealthy())
readiness := csr.GetClusterReadiness()
metrics.UpdateNodesCount(readiness.Ready, readiness.Unready+readiness.LongNotStarted, readiness.NotStarted, readiness.LongUnregistered, readiness.Unregistered)
metrics.UpdateNodesCount(readiness.Ready, readiness.Unready, readiness.NotStarted, readiness.LongUnregistered, readiness.Unregistered)
}
// GetOldestCreateTime returns oldest creation time out of the pods in the set

View File

@ -0,0 +1,75 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package kubernetes
import (
"testing"
"time"
"github.com/stretchr/testify/assert"
apiv1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
. "k8s.io/autoscaler/cluster-autoscaler/utils/test"
)
func TestGetReadiness(t *testing.T) {
testCases := []struct {
desc string
condition apiv1.NodeConditionType
status apiv1.ConditionStatus
taintKey string
expectedResult bool
}{
{"ready", apiv1.NodeReady, apiv1.ConditionTrue, "", true},
{"unready and unready taint", apiv1.NodeReady, apiv1.ConditionFalse, apiv1.TaintNodeNotReady, false},
{"readiness unknown and unready taint", apiv1.NodeReady, apiv1.ConditionUnknown, apiv1.TaintNodeNotReady, false},
{"disk pressure and disk pressure taint", apiv1.NodeDiskPressure, apiv1.ConditionTrue, apiv1.TaintNodeDiskPressure, false},
{"network unavailable and network unavailable taint", apiv1.NodeNetworkUnavailable, apiv1.ConditionTrue, apiv1.TaintNodeNetworkUnavailable, false},
{"ready but unready taint", apiv1.NodeReady, apiv1.ConditionTrue, apiv1.TaintNodeNotReady, false},
{"no disk pressure but disk pressure taint", apiv1.NodeDiskPressure, apiv1.ConditionFalse, apiv1.TaintNodeDiskPressure, false},
{"network available but network unavailable taint", apiv1.NodeNetworkUnavailable, apiv1.ConditionFalse, apiv1.TaintNodeNetworkUnavailable, false},
}
for _, tc := range testCases {
createTestNode := func(timeSinceCreation time.Duration) *apiv1.Node {
node := BuildTestNode("n1", 1000, 1000)
node.CreationTimestamp.Time = time.Time{}
testedTime := node.CreationTimestamp.Time.Add(timeSinceCreation)
SetNodeCondition(node, tc.condition, tc.status, testedTime)
if tc.condition != apiv1.NodeReady {
SetNodeCondition(node, apiv1.NodeReady, apiv1.ConditionTrue, testedTime)
}
if tc.taintKey != "" {
node.Spec.Taints = []apiv1.Taint{{
Key: tc.taintKey,
Effect: apiv1.TaintEffectNoSchedule,
TimeAdded: &metav1.Time{Time: testedTime},
}}
}
return node
}
t.Run(tc.desc, func(t *testing.T) {
node := createTestNode(1 * time.Minute)
isReady, _, err := GetReadinessState(node)
assert.NoError(t, err)
assert.Equal(t, tc.expectedResult, isReady)
})
}
}

View File

@ -162,6 +162,23 @@ func SetNodeReadyState(node *apiv1.Node, ready bool, lastTransition time.Time) {
}
}
// SetNodeNotReadyTaint sets the not ready taint on node.
func SetNodeNotReadyTaint(node *apiv1.Node) {
node.Spec.Taints = append(node.Spec.Taints, apiv1.Taint{Key: apiv1.TaintNodeNotReady, Effect: apiv1.TaintEffectNoSchedule})
}
// RemoveNodeNotReadyTaint removes the not ready taint.
func RemoveNodeNotReadyTaint(node *apiv1.Node) {
var final []apiv1.Taint
for i := range node.Spec.Taints {
if node.Spec.Taints[i].Key == apiv1.TaintNodeNotReady {
continue
}
final = append(final, node.Spec.Taints[i])
}
node.Spec.Taints = final
}
// SetNodeCondition sets node condition.
func SetNodeCondition(node *apiv1.Node, conditionType apiv1.NodeConditionType, status apiv1.ConditionStatus, lastTransition time.Time) {
for i := range node.Status.Conditions {