Mark nodes with uninitialized GPUs as unready
parent 4c83330c29
commit d81dca5991
@@ -102,21 +102,6 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
 		return errors.ToAutoscalerError(errors.CloudProviderError, err)
 	}
 
-	readyNodes, err := readyNodeLister.List()
-	if err != nil {
-		glog.Errorf("Failed to list ready nodes: %v", err)
-		return errors.ToAutoscalerError(errors.ApiCallError, err)
-	}
-	// Handle GPU case - allocatable GPU may be equal to 0 up to 15 minutes after
-	// node registers as ready. See https://github.com/kubernetes/kubernetes/issues/54959
-	// TODO: Remove this call when we handle dynamically provisioned resources.
-	readyNodes = gpu.SetGPUAllocatableToCapacity(readyNodes)
-	if len(readyNodes) == 0 {
-		glog.Warningf("No ready nodes in the cluster")
-		scaleDown.CleanUpUnneededNodes()
-		return nil
-	}
-
 	allNodes, err := allNodeLister.List()
 	if err != nil {
 		glog.Errorf("Failed to list all nodes: %v", err)
@@ -128,6 +113,23 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
 		return nil
 	}
 
+	readyNodes, err := readyNodeLister.List()
+	if err != nil {
+		glog.Errorf("Failed to list ready nodes: %v", err)
+		return errors.ToAutoscalerError(errors.ApiCallError, err)
+	}
+	// Handle GPU case - allocatable GPU may be equal to 0 up to 15 minutes after
+	// node registers as ready. See https://github.com/kubernetes/kubernetes/issues/54959
+	// Treat those nodes as unready until GPU actually becomes available and let
+	// our normal handling for booting up nodes deal with this.
+	// TODO: Remove this call when we handle dynamically provisioned resources.
+	allNodes, readyNodes = gpu.FilterOutNodesWithUnreadyGpus(allNodes, readyNodes)
+	if len(readyNodes) == 0 {
+		glog.Warningf("No ready nodes in the cluster")
+		scaleDown.CleanUpUnneededNodes()
+		return nil
+	}
+
 	err = a.ClusterStateRegistry.UpdateNodes(allNodes, currentTime)
 	if err != nil {
 		glog.Errorf("Failed to update node registry: %v", err)
@@ -26,29 +26,69 @@ import (
 const (
 	// ResourceNvidiaGPU is the name of the Nvidia GPU resource.
 	ResourceNvidiaGPU = "nvidia.com/gpu"
 	// GPULabel is the label added to nodes with GPU resource on GKE.
 	GPULabel = "cloud.google.com/gke-accelerator"
 )
 
-// SetGPUAllocatableToCapacity allows us to tolerate the fact that nodes with
-// GPUs can have allocatable set to 0 for multiple minutes after becoming ready
-// Without this workaround, Cluster Autoscaler will trigger an unnecessary
-// additional scale up before the node is fully operational.
-// TODO: Remove this once we handle dynamically privisioned resources well.
-func SetGPUAllocatableToCapacity(nodes []*apiv1.Node) []*apiv1.Node {
-	result := []*apiv1.Node{}
-	for _, node := range nodes {
-		newNode := node
-		if gpuCapacity, ok := node.Status.Capacity[ResourceNvidiaGPU]; ok {
-			if gpuAllocatable, ok := node.Status.Allocatable[ResourceNvidiaGPU]; !ok || gpuAllocatable.IsZero() {
-				nodeCopy, err := api.Scheme.DeepCopy(node)
-				if err != nil {
-					glog.Errorf("Failed to make a copy of node %v", node.ObjectMeta.Name)
-				} else {
-					newNode = nodeCopy.(*apiv1.Node)
-					newNode.Status.Allocatable[ResourceNvidiaGPU] = gpuCapacity.DeepCopy()
-				}
-			}
-		}
-		result = append(result, newNode)
-	}
-	return result
-}
+// FilterOutNodesWithUnreadyGpus removes nodes that should have GPU, but don't have it in allocatable
+// from ready nodes list and updates their status to unready on all nodes list.
+// This is a hack/workaround for nodes with GPU coming up without installed drivers, resulting
+// in GPU missing from their allocatable and capacity.
+func FilterOutNodesWithUnreadyGpus(allNodes, readyNodes []*apiv1.Node) ([]*apiv1.Node, []*apiv1.Node) {
+	newAllNodes := make([]*apiv1.Node, 0)
+	newReadyNodes := make([]*apiv1.Node, 0)
+	nodesWithUnreadyGpu := make(map[string]*apiv1.Node)
+	for _, node := range readyNodes {
+		isUnready := false
+		_, hasGpuLabel := node.Labels[GPULabel]
+		gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable[ResourceNvidiaGPU]
+		// We expect node to have GPU based on label, but it doesn't show up
+		// on node object. Assume the node is still not fully started (installing
+		// GPU drivers).
+		if hasGpuLabel && (!hasGpuAllocatable || gpuAllocatable.IsZero()) {
+			newNode, err := getUnreadyNodeCopy(node)
+			if err != nil {
+				glog.Errorf("Failed to override status of node %v with unready GPU: %v",
+					node.Name, err)
+			} else {
+				glog.V(3).Infof("Overriding status of node %v, which seems to have unready GPU",
+					node.Name)
+				nodesWithUnreadyGpu[newNode.Name] = newNode
+				isUnready = true
+			}
+		}
+		if !isUnready {
+			newReadyNodes = append(newReadyNodes, node)
+		}
+	}
+	// Override any node with unready GPU with its "unready" copy
+	for _, node := range allNodes {
+		if newNode, found := nodesWithUnreadyGpu[node.Name]; found {
+			newAllNodes = append(newAllNodes, newNode)
+		} else {
+			newAllNodes = append(newAllNodes, node)
+		}
+	}
+	return newAllNodes, newReadyNodes
+}
+
+func getUnreadyNodeCopy(node *apiv1.Node) (*apiv1.Node, error) {
+	nodeCopy, err := api.Scheme.DeepCopy(node)
+	if err != nil {
+		return nil, err
+	}
+	newNode := nodeCopy.(*apiv1.Node)
+	newReadyCondition := apiv1.NodeCondition{
+		Type:               apiv1.NodeReady,
+		Status:             apiv1.ConditionFalse,
+		LastTransitionTime: node.CreationTimestamp,
+	}
+	newNodeConditions := []apiv1.NodeCondition{newReadyCondition}
+	for _, condition := range newNode.Status.Conditions {
+		if condition.Type != apiv1.NodeReady {
+			newNodeConditions = append(newNodeConditions, condition)
+		}
+	}
+	newNode.Status.Conditions = newNodeConditions
+	return newNode, nil
+}
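A minimal usage sketch of the new function against the tree at this commit, assuming the import path k8s.io/autoscaler/cluster-autoscaler/utils/gpu (the node below is fabricated for illustration): a node carrying the GKE accelerator label but exposing no nvidia.com/gpu in allocatable is dropped from the ready list and replaced by an unready copy in the all-nodes list.

package main

import (
	"fmt"

	apiv1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"k8s.io/autoscaler/cluster-autoscaler/utils/gpu" // assumed import path
)

func main() {
	// A GPU node whose driver installation hasn't finished yet: the GPU
	// label is set, but nvidia.com/gpu is absent from allocatable.
	node := &apiv1.Node{
		ObjectMeta: metav1.ObjectMeta{
			Name:   "gpu-node-booting",
			Labels: map[string]string{gpu.GPULabel: "nvidia-tesla-k80"},
		},
		Status: apiv1.NodeStatus{
			Allocatable: apiv1.ResourceList{},
			Conditions: []apiv1.NodeCondition{
				{Type: apiv1.NodeReady, Status: apiv1.ConditionTrue},
			},
		},
	}

	all, ready := gpu.FilterOutNodesWithUnreadyGpus([]*apiv1.Node{node}, []*apiv1.Node{node})
	fmt.Println(len(ready))                         // 0: filtered out of the ready list
	fmt.Println(all[0].Status.Conditions[0].Status) // False: the overridden copy
}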
@@ -17,7 +17,9 @@ limitations under the License.
 package gpu
 
 import (
+	"fmt"
 	"testing"
+	"time"
 
 	apiv1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
@@ -26,30 +28,124 @@ import (
 	"github.com/stretchr/testify/assert"
 )
 
-func TestSetGPUAllocatableToCapacity(t *testing.T) {
-	nodeGPU := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: "nodeGpu"}, Status: apiv1.NodeStatus{Capacity: apiv1.ResourceList{}, Allocatable: apiv1.ResourceList{}}}
-	nodeGPU.Status.Allocatable[ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
-	nodeGPU.Status.Capacity[ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
-	nodeGPUUnready := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: "nodeGpuUnready"}, Status: apiv1.NodeStatus{Capacity: apiv1.ResourceList{}, Allocatable: apiv1.ResourceList{}}}
-	nodeGPUUnready.Status.Allocatable[ResourceNvidiaGPU] = *resource.NewQuantity(0, resource.DecimalSI)
-	nodeGPUUnready.Status.Capacity[ResourceNvidiaGPU] = *resource.NewQuantity(2, resource.DecimalSI)
-	nodeGPUNoAllocatable := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: "nodeGpuNoAllocatable"}, Status: apiv1.NodeStatus{Capacity: apiv1.ResourceList{}, Allocatable: apiv1.ResourceList{}}}
-	nodeGPUNoAllocatable.Status.Capacity[ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
-	nodeNoGPU := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: "nodeGpuUnready"}, Status: apiv1.NodeStatus{Capacity: apiv1.ResourceList{}, Allocatable: apiv1.ResourceList{}}}
-	nodeNoGPU.Status.Allocatable[apiv1.ResourceCPU] = *resource.NewQuantity(1, resource.DecimalSI)
-	nodeNoGPU.Status.Capacity[apiv1.ResourceCPU] = *resource.NewQuantity(2, resource.DecimalSI)
-	result := SetGPUAllocatableToCapacity([]*apiv1.Node{nodeGPU, nodeGPUUnready, nodeGPUNoAllocatable, nodeNoGPU})
-	assertAllocatableAndCapacity(t, ResourceNvidiaGPU, 1, 1, result[0])
-	assertAllocatableAndCapacity(t, ResourceNvidiaGPU, 2, 2, result[1])
-	assertAllocatableAndCapacity(t, ResourceNvidiaGPU, 1, 1, result[2])
-	assertAllocatableAndCapacity(t, apiv1.ResourceCPU, 1, 2, result[3])
-}
-
-func assertAllocatableAndCapacity(t *testing.T, resourceName apiv1.ResourceName, allocatable, capacity int64, node *apiv1.Node) {
-	allocatableResource := *resource.NewQuantity(allocatable, resource.DecimalSI)
-	capacityResource := *resource.NewQuantity(capacity, resource.DecimalSI)
-	assert.Equal(t, node.Status.Allocatable[resourceName], allocatableResource,
-		"Node %v, expected allocatable %v: %v got: %v", node.ObjectMeta.Name, resourceName, node.Status.Allocatable[resourceName], allocatableResource)
-	assert.Equal(t, node.Status.Capacity[resourceName], capacityResource,
-		"Node %v, expected capacity %v: %v got: %v", node.ObjectMeta.Name, resourceName, node.Status.Capacity[resourceName], capacityResource)
-}
+func TestFilterOutNodesWithUnreadyGpus(t *testing.T) {
+	start := time.Now()
+	later := start.Add(10 * time.Minute)
+	expectedReadiness := make(map[string]bool)
+	gpuLabels := map[string]string{
+		GPULabel: "nvidia-tesla-k80",
+	}
+	readyCondition := apiv1.NodeCondition{
+		Type:               apiv1.NodeReady,
+		Status:             apiv1.ConditionTrue,
+		LastTransitionTime: metav1.NewTime(later),
+	}
+	unreadyCondition := apiv1.NodeCondition{
+		Type:               apiv1.NodeReady,
+		Status:             apiv1.ConditionFalse,
+		LastTransitionTime: metav1.NewTime(later),
+	}
+
+	nodeGpuReady := &apiv1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:              "nodeGpuReady",
+			Labels:            gpuLabels,
+			CreationTimestamp: metav1.NewTime(start),
+		},
+		Status: apiv1.NodeStatus{
+			Capacity:    apiv1.ResourceList{},
+			Allocatable: apiv1.ResourceList{},
+			Conditions:  []apiv1.NodeCondition{readyCondition},
+		},
+	}
+	nodeGpuReady.Status.Allocatable[ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
+	nodeGpuReady.Status.Capacity[ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
+	expectedReadiness[nodeGpuReady.Name] = true
+
+	nodeGpuUnready := &apiv1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:              "nodeGpuUnready",
+			Labels:            gpuLabels,
+			CreationTimestamp: metav1.NewTime(start),
+		},
+		Status: apiv1.NodeStatus{
+			Capacity:    apiv1.ResourceList{},
+			Allocatable: apiv1.ResourceList{},
+			Conditions:  []apiv1.NodeCondition{readyCondition},
+		},
+	}
+	nodeGpuUnready.Status.Allocatable[ResourceNvidiaGPU] = *resource.NewQuantity(0, resource.DecimalSI)
+	nodeGpuUnready.Status.Capacity[ResourceNvidiaGPU] = *resource.NewQuantity(0, resource.DecimalSI)
+	expectedReadiness[nodeGpuUnready.Name] = false
+
+	nodeGpuUnready2 := &apiv1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:              "nodeGpuUnready2",
+			Labels:            gpuLabels,
+			CreationTimestamp: metav1.NewTime(start),
+		},
+		Status: apiv1.NodeStatus{
+			Conditions: []apiv1.NodeCondition{readyCondition},
+		},
+	}
+	expectedReadiness[nodeGpuUnready2.Name] = false
+
+	nodeNoGpuReady := &apiv1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:              "nodeNoGpuReady",
+			Labels:            make(map[string]string),
+			CreationTimestamp: metav1.NewTime(start),
+		},
+		Status: apiv1.NodeStatus{
+			Conditions: []apiv1.NodeCondition{readyCondition},
+		},
+	}
+	expectedReadiness[nodeNoGpuReady.Name] = true
+
+	nodeNoGpuUnready := &apiv1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:              "nodeNoGpuUnready",
+			Labels:            make(map[string]string),
+			CreationTimestamp: metav1.NewTime(start),
+		},
+		Status: apiv1.NodeStatus{
+			Conditions: []apiv1.NodeCondition{unreadyCondition},
+		},
+	}
+	expectedReadiness[nodeNoGpuUnready.Name] = false
+
+	initialReadyNodes := []*apiv1.Node{
+		nodeGpuReady,
+		nodeGpuUnready,
+		nodeGpuUnready2,
+		nodeNoGpuReady,
+	}
+	initialAllNodes := []*apiv1.Node{
+		nodeGpuReady,
+		nodeGpuUnready,
+		nodeGpuUnready2,
+		nodeNoGpuReady,
+		nodeNoGpuUnready,
+	}
+
+	newAllNodes, newReadyNodes := FilterOutNodesWithUnreadyGpus(initialAllNodes, initialReadyNodes)
+
+	foundInReady := make(map[string]bool)
+	for _, node := range newReadyNodes {
+		foundInReady[node.Name] = true
+		assert.True(t, expectedReadiness[node.Name], fmt.Sprintf("Node %s found in ready nodes list (it shouldn't be there)", node.Name))
+	}
+	for nodeName, expected := range expectedReadiness {
+		if expected {
+			assert.True(t, foundInReady[nodeName], fmt.Sprintf("Node %s expected ready, but not found in ready nodes list", nodeName))
+		}
+	}
+	for _, node := range newAllNodes {
+		assert.Equal(t, len(node.Status.Conditions), 1)
+		if expectedReadiness[node.Name] {
+			assert.Equal(t, node.Status.Conditions[0].Status, apiv1.ConditionTrue, fmt.Sprintf("Unexpected ready condition value for node %s", node.Name))
+		} else {
+			assert.Equal(t, node.Status.Conditions[0].Status, apiv1.ConditionFalse, fmt.Sprintf("Unexpected ready condition value for node %s", node.Name))
+		}
+	}
+}
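The start/later timestamps above mirror a subtlety of getUnreadyNodeCopy: the injected NodeReady=False condition is backdated to the node's CreationTimestamp, presumably so that anything measuring how long a node has been unready sees the full time since boot rather than the moment the override was applied. A standalone sketch of that effect (hypothetical helper, not part of this patch):

package main

import (
	"fmt"
	"time"

	apiv1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// unreadyFor reports how long a node has been unready according to its
// NodeReady condition, the kind of measurement that benefits from the
// backdated LastTransitionTime set by getUnreadyNodeCopy.
func unreadyFor(node *apiv1.Node, now time.Time) time.Duration {
	for _, condition := range node.Status.Conditions {
		if condition.Type == apiv1.NodeReady && condition.Status == apiv1.ConditionFalse {
			return now.Sub(condition.LastTransitionTime.Time)
		}
	}
	return 0
}

func main() {
	created := metav1.NewTime(time.Now().Add(-10 * time.Minute))
	node := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{CreationTimestamp: created}}
	node.Status.Conditions = []apiv1.NodeCondition{{
		Type:               apiv1.NodeReady,
		Status:             apiv1.ConditionFalse,
		LastTransitionTime: node.CreationTimestamp, // as getUnreadyNodeCopy does
	}}
	fmt.Println(unreadyFor(node, time.Now())) // ~10m, counted from node creation
}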