Merge pull request #4674 from x13n/nodestatus

Expose nodes with unready GPU in CA status
Kubernetes Prow Robot 2022-03-03 06:17:48 -08:00 committed by GitHub
commit 3e53cc4b8d
5 changed files with 66 additions and 14 deletions

View File

@@ -843,7 +843,7 @@ Most likely it's due to a problem with the cluster. Steps to debug:
 * Check if cluster autoscaler is up and running. In version 0.5 and later, it periodically publishes the kube-system/cluster-autoscaler-status config map. Check last update time annotation. It should be no more than 3 min (usually 10 sec old).
-* Check in the above config map if cluster and node groups are in the healthy state. If not, check if there are unready nodes.
+* Check in the above config map if cluster and node groups are in the healthy state. If not, check if there are unready nodes. If some nodes appear unready despite being Ready in the Node object, check `resourceUnready` count. If there are any nodes marked as `resourceUnready`, it is most likely a problem with the device driver failing to install a new resource (e.g. GPU). `resourceUnready` count is only available in CA version 1.24 and later.
 If both the cluster and CA appear healthy:
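For illustration only (the counts below are invented; only the format comes from the health message changed in this PR), an affected node group entry in the kube-system/cluster-autoscaler-status config map would read roughly like:

    ready=3 unready=1 (resourceUnready=1) notStarted=0 longNotStarted=0 registered=4 longUnregistered=0 cloudProviderTarget=4 (minSize=1, maxSize=10)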

View File

@@ -537,6 +537,10 @@ type Readiness struct {
     Unregistered int
     // Time when the readiness was measured.
     Time time.Time
+    // Number of nodes that are Unready due to missing resources.
+    // This field is only used for exposing information externally and
+    // doesn't influence CA behavior.
+    ResourceUnready int
 }

 func (csr *ClusterStateRegistry) updateReadinessStats(currentTime time.Time) {
@@ -544,23 +548,26 @@ func (csr *ClusterStateRegistry) updateReadinessStats(currentTime time.Time) {
     perNodeGroup := make(map[string]Readiness)
     total := Readiness{Time: currentTime}

-    update := func(current Readiness, node *apiv1.Node, ready bool) Readiness {
+    update := func(current Readiness, node *apiv1.Node, nr kube_util.NodeReadiness) Readiness {
         current.Registered++
         if deletetaint.HasToBeDeletedTaint(node) {
             current.Deleted++
-        } else if ready {
+        } else if nr.Ready {
             current.Ready++
         } else if node.CreationTimestamp.Time.Add(MaxNodeStartupTime).After(currentTime) {
             current.NotStarted++
         } else {
             current.Unready++
+            if nr.Reason == kube_util.ResourceUnready {
+                current.ResourceUnready++
+            }
         }
         return current
     }

     for _, node := range csr.nodes {
         nodeGroup, errNg := csr.cloudProvider.NodeGroupForNode(node)
-        ready, _, errReady := kube_util.GetReadinessState(node)
+        nr, errReady := kube_util.GetNodeReadiness(node)

         // Node is most likely not autoscaled, however check the errors.
         if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() {
@@ -571,9 +578,9 @@ func (csr *ClusterStateRegistry) updateReadinessStats(currentTime time.Time) {
                 klog.Warningf("Failed to get readiness info for %s: %v", node.Name, errReady)
             }
         } else {
-            perNodeGroup[nodeGroup.Id()] = update(perNodeGroup[nodeGroup.Id()], node, ready)
+            perNodeGroup[nodeGroup.Id()] = update(perNodeGroup[nodeGroup.Id()], node, nr)
         }
-        total = update(total, node, ready)
+        total = update(total, node, nr)
     }

     var longUnregisteredNodeNames []string
@@ -740,9 +747,10 @@ func (csr *ClusterStateRegistry) GetClusterReadiness() Readiness {
 func buildHealthStatusNodeGroup(isReady bool, readiness Readiness, acceptable AcceptableRange, minSize, maxSize int) api.ClusterAutoscalerCondition {
     condition := api.ClusterAutoscalerCondition{
         Type: api.ClusterAutoscalerHealth,
-        Message: fmt.Sprintf("ready=%d unready=%d notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d cloudProviderTarget=%d (minSize=%d, maxSize=%d)",
+        Message: fmt.Sprintf("ready=%d unready=%d (resourceUnready=%d) notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d cloudProviderTarget=%d (minSize=%d, maxSize=%d)",
             readiness.Ready,
             readiness.Unready,
+            readiness.ResourceUnready,
             readiness.NotStarted,
             readiness.Registered,
             readiness.LongUnregistered,
@@ -794,9 +802,10 @@ func buildScaleDownStatusNodeGroup(candidates []string, lastProbed time.Time) ap
 func buildHealthStatusClusterwide(isReady bool, readiness Readiness) api.ClusterAutoscalerCondition {
     condition := api.ClusterAutoscalerCondition{
         Type: api.ClusterAutoscalerHealth,
-        Message: fmt.Sprintf("ready=%d unready=%d notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d",
+        Message: fmt.Sprintf("ready=%d unready=%d (resourceUnready=%d) notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d",
             readiness.Ready,
             readiness.Unready,
+            readiness.ResourceUnready,
             readiness.NotStarted,
             readiness.Registered,
             readiness.LongUnregistered,
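A minimal sketch (not part of this PR) of how a caller might read the new counter through the existing GetClusterReadiness accessor; the clusterstate import path and the helper name are assumptions made for illustration:

package example

import (
    "fmt"

    "k8s.io/autoscaler/cluster-autoscaler/clusterstate"
)

// reportResourceUnready is a hypothetical helper showing how a caller holding a
// ClusterStateRegistry could surface the counter added by this PR.
func reportResourceUnready(csr *clusterstate.ClusterStateRegistry) string {
    r := csr.GetClusterReadiness() // Readiness now carries ResourceUnready
    return fmt.Sprintf("%d of %d unready nodes are blocked on a missing resource (e.g. GPU)",
        r.ResourceUnready, r.Unready)
}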

View File

@@ -49,7 +49,7 @@ func (p *GpuCustomResourcesProcessor) FilterOutNodesWithUnreadyResources(context
         if hasGpuLabel && (!hasGpuAllocatable || gpuAllocatable.IsZero()) {
             klog.V(3).Infof("Overriding status of node %v, which seems to have unready GPU",
                 node.Name)
-            nodesWithUnreadyGpu[node.Name] = kubernetes.GetUnreadyNodeCopy(node)
+            nodesWithUnreadyGpu[node.Name] = kubernetes.GetUnreadyNodeCopy(node, kubernetes.ResourceUnready)
         } else {
             newReadyNodes = append(newReadyNodes, node)
         }
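A small sketch of what the override above produces for a node that has the GPU label but no allocatable GPU; the helper name is hypothetical and the utils/kubernetes import path is assumed:

package example

import (
    "fmt"

    apiv1 "k8s.io/api/core/v1"
    "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
)

// printOverriddenCondition prints the NodeReady condition of the unready copy:
// Status is False and Reason carries the ResourceUnready marker.
func printOverriddenCondition(node *apiv1.Node) {
    unready := kubernetes.GetUnreadyNodeCopy(node, kubernetes.ResourceUnready)
    for _, cond := range unready.Status.Conditions {
        if cond.Type == apiv1.NodeReady {
            fmt.Println(cond.Status, cond.Reason) // False cluster-autoscaler.kubernetes.io/resource-not-ready
        }
    }
}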

View File

@@ -23,6 +23,24 @@ import (
     apiv1 "k8s.io/api/core/v1"
 )

+// NodeNotReadyReason represents a reason for a node to be unready. While it is
+// simply a string on the node object, custom type ensures no one accidentally
+// performs any string operation on variables of this type and allows them to
+// be treated as enums.
+type NodeNotReadyReason string
+
+const (
+    // ResourceUnready is a fake identifier used internally by Cluster Autoscaler
+    // to indicate nodes that appear Ready in the API, but are treated as
+    // still upcoming due to a missing resource (e.g. GPU).
+    ResourceUnready NodeNotReadyReason = "cluster-autoscaler.kubernetes.io/resource-not-ready"
+
+    // IgnoreTaint is a fake identifier used internally by Cluster Autoscaler
+    // to indicate nodes that appear Ready in the API, but are treated as
+    // still upcoming due to applied ignore taint.
+    IgnoreTaint NodeNotReadyReason = "cluster-autoscaler.kubernetes.io/ignore-taint"
+)
+
 // IsNodeReadyAndSchedulable returns true if the node is ready and schedulable.
 func IsNodeReadyAndSchedulable(node *apiv1.Node) bool {
     ready, _, _ := GetReadinessState(node)
@@ -36,10 +54,29 @@ func IsNodeReadyAndSchedulable(node *apiv1.Node) bool {
     return true
 }

+// NodeReadiness represents the last known node readiness.
+type NodeReadiness struct {
+    // Is the node ready or not.
+    Ready bool
+    // Time of the last state transition related to readiness.
+    LastTransitionTime time.Time
+    // Reason for the node to be unready. Defined only when Ready is false.
+    Reason NodeNotReadyReason
+}
+
 // GetReadinessState gets readiness state for the node
+//
+// Deprecated: Use GetNodeReadiness instead.
 func GetReadinessState(node *apiv1.Node) (isNodeReady bool, lastTransitionTime time.Time, err error) {
+    nr, err := GetNodeReadiness(node)
+    return nr.Ready, nr.LastTransitionTime, err
+}
+
+// GetNodeReadiness gets readiness for the node
+func GetNodeReadiness(node *apiv1.Node) (NodeReadiness, error) {
     canNodeBeReady, readyFound := true, false
-    lastTransitionTime = time.Time{}
+    lastTransitionTime := time.Time{}
+    var reason NodeNotReadyReason

     for _, cond := range node.Status.Conditions {
         switch cond.Type {
@@ -47,6 +84,7 @@ func GetReadinessState(node *apiv1.Node) (isNodeReady bool, lastTransitionTime t
             readyFound = true
             if cond.Status == apiv1.ConditionFalse || cond.Status == apiv1.ConditionUnknown {
                 canNodeBeReady = false
+                reason = NodeNotReadyReason(cond.Reason)
             }
             if lastTransitionTime.Before(cond.LastTransitionTime.Time) {
                 lastTransitionTime = cond.LastTransitionTime.Time
@@ -83,18 +121,23 @@ func GetReadinessState(node *apiv1.Node) (isNodeReady bool, lastTransitionTime t
     }

     if !readyFound {
-        return false, time.Time{}, fmt.Errorf("readiness information not found")
+        return NodeReadiness{}, fmt.Errorf("readiness information not found")
     }
-    return canNodeBeReady, lastTransitionTime, nil
+    return NodeReadiness{
+        Ready:              canNodeBeReady,
+        LastTransitionTime: lastTransitionTime,
+        Reason:             reason,
+    }, nil
 }

 // GetUnreadyNodeCopy create a copy of the given node and override its NodeReady condition to False
-func GetUnreadyNodeCopy(node *apiv1.Node) *apiv1.Node {
+func GetUnreadyNodeCopy(node *apiv1.Node, reason NodeNotReadyReason) *apiv1.Node {
     newNode := node.DeepCopy()
     newReadyCondition := apiv1.NodeCondition{
         Type:               apiv1.NodeReady,
         Status:             apiv1.ConditionFalse,
         LastTransitionTime: node.CreationTimestamp,
+        Reason:             string(reason),
     }
     newNodeConditions := []apiv1.NodeCondition{newReadyCondition}
     for _, condition := range newNode.Status.Conditions {
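A hedged usage sketch of the new GetNodeReadiness API; it loosely mirrors the update closure in updateReadinessStats above, with the import aliases, package paths, and function name chosen here for illustration:

package example

import (
    apiv1 "k8s.io/api/core/v1"
    kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
    "k8s.io/klog/v2"
)

// classify shows how an unready node whose Reason equals ResourceUnready is the
// one that ends up in the resourceUnready count exposed in the status config map.
func classify(node *apiv1.Node) {
    nr, err := kube_util.GetNodeReadiness(node)
    if err != nil {
        klog.Warningf("Failed to get readiness info for %s: %v", node.Name, err)
        return
    }
    switch {
    case nr.Ready:
        // counted as ready
    case nr.Reason == kube_util.ResourceUnready:
        // counted as unready and resourceUnready
    default:
        // counted as unready only
    }
}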

View File

@@ -115,7 +115,7 @@ func FilterOutNodesWithIgnoredTaints(ignoredTaints TaintKeySet, allNodes, readyN
             _, hasIgnoredTaint := ignoredTaints[t.Key]
             if hasIgnoredTaint || strings.HasPrefix(t.Key, IgnoreTaintPrefix) {
                 ready = false
-                nodesWithIgnoredTaints[node.Name] = kubernetes.GetUnreadyNodeCopy(node)
+                nodesWithIgnoredTaints[node.Name] = kubernetes.GetUnreadyNodeCopy(node, kubernetes.IgnoreTaint)
                 klog.V(3).Infof("Overriding status of node %v, which seems to have ignored taint %q", node.Name, t.Key)
                 break
             }