Merge pull request #4674 from x13n/nodestatus

Expose nodes with unready GPU in CA status
Kubernetes Prow Robot 2022-03-03 06:17:48 -08:00 committed by GitHub
commit 3e53cc4b8d
5 changed files with 66 additions and 14 deletions

View File

@@ -843,7 +843,7 @@ Most likely it's due to a problem with the cluster. Steps to debug:
 * Check if cluster autoscaler is up and running. In version 0.5 and later, it periodically publishes the kube-system/cluster-autoscaler-status config map. Check last update time annotation. It should be no more than 3 min (usually 10 sec old).
-* Check in the above config map if cluster and node groups are in the healthy state. If not, check if there are unready nodes.
+* Check in the above config map if cluster and node groups are in the healthy state. If not, check if there are unready nodes. If some nodes appear unready despite being Ready in the Node object, check `resourceUnready` count. If there are any nodes marked as `resourceUnready`, it is most likely a problem with the device driver failing to install a new resource (e.g. GPU). `resourceUnready` count is only available in CA version 1.24 and later.
 If both the cluster and CA appear healthy:
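For illustration only (the counts below are invented; only the format comes from the health message changed in this PR), an affected node group entry in the kube-system/cluster-autoscaler-status config map would read roughly like:

    ready=3 unready=1 (resourceUnready=1) notStarted=0 longNotStarted=0 registered=4 longUnregistered=0 cloudProviderTarget=4 (minSize=1, maxSize=10)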

View File

@@ -537,6 +537,10 @@ type Readiness struct {
     Unregistered int
     // Time when the readiness was measured.
     Time time.Time
+    // Number of nodes that are Unready due to missing resources.
+    // This field is only used for exposing information externally and
+    // doesn't influence CA behavior.
+    ResourceUnready int
 }

 func (csr *ClusterStateRegistry) updateReadinessStats(currentTime time.Time) {
@@ -544,23 +548,26 @@ func (csr *ClusterStateRegistry) updateReadinessStats(currentTime time.Time) {
     perNodeGroup := make(map[string]Readiness)
     total := Readiness{Time: currentTime}

-    update := func(current Readiness, node *apiv1.Node, ready bool) Readiness {
+    update := func(current Readiness, node *apiv1.Node, nr kube_util.NodeReadiness) Readiness {
         current.Registered++
         if deletetaint.HasToBeDeletedTaint(node) {
             current.Deleted++
-        } else if ready {
+        } else if nr.Ready {
             current.Ready++
         } else if node.CreationTimestamp.Time.Add(MaxNodeStartupTime).After(currentTime) {
             current.NotStarted++
         } else {
             current.Unready++
+            if nr.Reason == kube_util.ResourceUnready {
+                current.ResourceUnready++
+            }
         }
         return current
     }

     for _, node := range csr.nodes {
         nodeGroup, errNg := csr.cloudProvider.NodeGroupForNode(node)
-        ready, _, errReady := kube_util.GetReadinessState(node)
+        nr, errReady := kube_util.GetNodeReadiness(node)

         // Node is most likely not autoscaled, however check the errors.
         if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() {
@@ -571,9 +578,9 @@ func (csr *ClusterStateRegistry) updateReadinessStats(currentTime time.Time) {
                 klog.Warningf("Failed to get readiness info for %s: %v", node.Name, errReady)
             }
         } else {
-            perNodeGroup[nodeGroup.Id()] = update(perNodeGroup[nodeGroup.Id()], node, ready)
+            perNodeGroup[nodeGroup.Id()] = update(perNodeGroup[nodeGroup.Id()], node, nr)
         }
-        total = update(total, node, ready)
+        total = update(total, node, nr)
     }

     var longUnregisteredNodeNames []string
@@ -740,9 +747,10 @@ func (csr *ClusterStateRegistry) GetClusterReadiness() Readiness {
 func buildHealthStatusNodeGroup(isReady bool, readiness Readiness, acceptable AcceptableRange, minSize, maxSize int) api.ClusterAutoscalerCondition {
     condition := api.ClusterAutoscalerCondition{
         Type: api.ClusterAutoscalerHealth,
-        Message: fmt.Sprintf("ready=%d unready=%d notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d cloudProviderTarget=%d (minSize=%d, maxSize=%d)",
+        Message: fmt.Sprintf("ready=%d unready=%d (resourceUnready=%d) notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d cloudProviderTarget=%d (minSize=%d, maxSize=%d)",
             readiness.Ready,
             readiness.Unready,
+            readiness.ResourceUnready,
             readiness.NotStarted,
             readiness.Registered,
             readiness.LongUnregistered,
@@ -794,9 +802,10 @@ func buildScaleDownStatusNodeGroup(candidates []string, lastProbed time.Time) ap
 func buildHealthStatusClusterwide(isReady bool, readiness Readiness) api.ClusterAutoscalerCondition {
     condition := api.ClusterAutoscalerCondition{
         Type: api.ClusterAutoscalerHealth,
-        Message: fmt.Sprintf("ready=%d unready=%d notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d",
+        Message: fmt.Sprintf("ready=%d unready=%d (resourceUnready=%d) notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d",
             readiness.Ready,
             readiness.Unready,
+            readiness.ResourceUnready,
             readiness.NotStarted,
             readiness.Registered,
             readiness.LongUnregistered,
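A minimal sketch (not part of this PR) of how a caller might read the new counter through the existing GetClusterReadiness accessor; the clusterstate import path and the helper name are assumptions made for illustration:

package example

import (
    "fmt"

    "k8s.io/autoscaler/cluster-autoscaler/clusterstate"
)

// reportResourceUnready is a hypothetical helper showing how a caller holding a
// ClusterStateRegistry could surface the counter added by this PR.
func reportResourceUnready(csr *clusterstate.ClusterStateRegistry) string {
    r := csr.GetClusterReadiness() // Readiness now carries ResourceUnready
    return fmt.Sprintf("%d of %d unready nodes are blocked on a missing resource (e.g. GPU)",
        r.ResourceUnready, r.Unready)
}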

View File

@@ -49,7 +49,7 @@ func (p *GpuCustomResourcesProcessor) FilterOutNodesWithUnreadyResources(context
         if hasGpuLabel && (!hasGpuAllocatable || gpuAllocatable.IsZero()) {
             klog.V(3).Infof("Overriding status of node %v, which seems to have unready GPU",
                 node.Name)
-            nodesWithUnreadyGpu[node.Name] = kubernetes.GetUnreadyNodeCopy(node)
+            nodesWithUnreadyGpu[node.Name] = kubernetes.GetUnreadyNodeCopy(node, kubernetes.ResourceUnready)
         } else {
             newReadyNodes = append(newReadyNodes, node)
         }
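A small sketch of what the override above produces for a node that has the GPU label but no allocatable GPU; the helper name is hypothetical and the utils/kubernetes import path is assumed:

package example

import (
    "fmt"

    apiv1 "k8s.io/api/core/v1"
    "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
)

// printOverriddenCondition prints the NodeReady condition of the unready copy:
// Status is False and Reason carries the ResourceUnready marker.
func printOverriddenCondition(node *apiv1.Node) {
    unready := kubernetes.GetUnreadyNodeCopy(node, kubernetes.ResourceUnready)
    for _, cond := range unready.Status.Conditions {
        if cond.Type == apiv1.NodeReady {
            fmt.Println(cond.Status, cond.Reason) // False cluster-autoscaler.kubernetes.io/resource-not-ready
        }
    }
}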

View File

@@ -23,6 +23,24 @@ import (
     apiv1 "k8s.io/api/core/v1"
 )

+// NodeNotReadyReason represents a reason for a node to be unready. While it is
+// simply a string on the node object, custom type ensures no one accidentally
+// performs any string operation on variables of this type and allows them to
+// be treated as enums.
+type NodeNotReadyReason string
+
+const (
+    // ResourceUnready is a fake identifier used internally by Cluster Autoscaler
+    // to indicate nodes that appear Ready in the API, but are treated as
+    // still upcoming due to a missing resource (e.g. GPU).
+    ResourceUnready NodeNotReadyReason = "cluster-autoscaler.kubernetes.io/resource-not-ready"
+
+    // IgnoreTaint is a fake identifier used internally by Cluster Autoscaler
+    // to indicate nodes that appear Ready in the API, but are treated as
+    // still upcoming due to applied ignore taint.
+    IgnoreTaint NodeNotReadyReason = "cluster-autoscaler.kubernetes.io/ignore-taint"
+)
+
 // IsNodeReadyAndSchedulable returns true if the node is ready and schedulable.
 func IsNodeReadyAndSchedulable(node *apiv1.Node) bool {
     ready, _, _ := GetReadinessState(node)
@@ -36,10 +54,29 @@ func IsNodeReadyAndSchedulable(node *apiv1.Node) bool {
     return true
 }

+// NodeReadiness represents the last known node readiness.
+type NodeReadiness struct {
+    // Is the node ready or not.
+    Ready bool
+    // Time of the last state transition related to readiness.
+    LastTransitionTime time.Time
+    // Reason for the node to be unready. Defined only when Ready is false.
+    Reason NodeNotReadyReason
+}
+
 // GetReadinessState gets readiness state for the node
+//
+// Deprecated: Use GetNodeReadiness instead.
 func GetReadinessState(node *apiv1.Node) (isNodeReady bool, lastTransitionTime time.Time, err error) {
+    nr, err := GetNodeReadiness(node)
+    return nr.Ready, nr.LastTransitionTime, err
+}
+
+// GetNodeReadiness gets readiness for the node
+func GetNodeReadiness(node *apiv1.Node) (NodeReadiness, error) {
     canNodeBeReady, readyFound := true, false
-    lastTransitionTime = time.Time{}
+    lastTransitionTime := time.Time{}
+    var reason NodeNotReadyReason

     for _, cond := range node.Status.Conditions {
         switch cond.Type {
@@ -47,6 +84,7 @@ func GetReadinessState(node *apiv1.Node) (isNodeReady bool, lastTransitionTime t
             readyFound = true
             if cond.Status == apiv1.ConditionFalse || cond.Status == apiv1.ConditionUnknown {
                 canNodeBeReady = false
+                reason = NodeNotReadyReason(cond.Reason)
             }
             if lastTransitionTime.Before(cond.LastTransitionTime.Time) {
                 lastTransitionTime = cond.LastTransitionTime.Time
@@ -83,18 +121,23 @@ func GetReadinessState(node *apiv1.Node) (isNodeReady bool, lastTransitionTime t
     }

     if !readyFound {
-        return false, time.Time{}, fmt.Errorf("readiness information not found")
+        return NodeReadiness{}, fmt.Errorf("readiness information not found")
     }
-    return canNodeBeReady, lastTransitionTime, nil
+    return NodeReadiness{
+        Ready:              canNodeBeReady,
+        LastTransitionTime: lastTransitionTime,
+        Reason:             reason,
+    }, nil
 }

 // GetUnreadyNodeCopy create a copy of the given node and override its NodeReady condition to False
-func GetUnreadyNodeCopy(node *apiv1.Node) *apiv1.Node {
+func GetUnreadyNodeCopy(node *apiv1.Node, reason NodeNotReadyReason) *apiv1.Node {
     newNode := node.DeepCopy()
     newReadyCondition := apiv1.NodeCondition{
         Type:               apiv1.NodeReady,
         Status:             apiv1.ConditionFalse,
         LastTransitionTime: node.CreationTimestamp,
+        Reason:             string(reason),
     }
     newNodeConditions := []apiv1.NodeCondition{newReadyCondition}
     for _, condition := range newNode.Status.Conditions {
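A hedged usage sketch of the new GetNodeReadiness API; it loosely mirrors the update closure in updateReadinessStats above, with the import aliases, package paths, and function name chosen here for illustration:

package example

import (
    apiv1 "k8s.io/api/core/v1"
    kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
    "k8s.io/klog/v2"
)

// classify shows how an unready node whose Reason equals ResourceUnready is the
// one that ends up in the resourceUnready count exposed in the status config map.
func classify(node *apiv1.Node) {
    nr, err := kube_util.GetNodeReadiness(node)
    if err != nil {
        klog.Warningf("Failed to get readiness info for %s: %v", node.Name, err)
        return
    }
    switch {
    case nr.Ready:
        // counted as ready
    case nr.Reason == kube_util.ResourceUnready:
        // counted as unready and resourceUnready
    default:
        // counted as unready only
    }
}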

View File

@@ -115,7 +115,7 @@ func FilterOutNodesWithIgnoredTaints(ignoredTaints TaintKeySet, allNodes, readyN
             _, hasIgnoredTaint := ignoredTaints[t.Key]
             if hasIgnoredTaint || strings.HasPrefix(t.Key, IgnoreTaintPrefix) {
                 ready = false
-                nodesWithIgnoredTaints[node.Name] = kubernetes.GetUnreadyNodeCopy(node)
+                nodesWithIgnoredTaints[node.Name] = kubernetes.GetUnreadyNodeCopy(node, kubernetes.IgnoreTaint)
                 klog.V(3).Infof("Overriding status of node %v, which seems to have ignored taint %q", node.Name, t.Key)
                 break
             }