Mark nodes with uninitialized GPUs as unready

Maciej Pytel 2017-11-10 17:41:19 +01:00
parent 4c83330c29
commit d81dca5991
3 changed files with 198 additions and 60 deletions

cluster-autoscaler/core/static_autoscaler.go

@@ -102,21 +102,6 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
return errors.ToAutoscalerError(errors.CloudProviderError, err)
}
readyNodes, err := readyNodeLister.List()
if err != nil {
glog.Errorf("Failed to list ready nodes: %v", err)
return errors.ToAutoscalerError(errors.ApiCallError, err)
}
// Handle GPU case - allocatable GPU may be equal to 0 up to 15 minutes after
// node registers as ready. See https://github.com/kubernetes/kubernetes/issues/54959
// TODO: Remove this call when we handle dynamically provisioned resources.
readyNodes = gpu.SetGPUAllocatableToCapacity(readyNodes)
if len(readyNodes) == 0 {
glog.Warningf("No ready nodes in the cluster")
scaleDown.CleanUpUnneededNodes()
return nil
}
allNodes, err := allNodeLister.List()
if err != nil {
glog.Errorf("Failed to list all nodes: %v", err)
@@ -128,6 +113,23 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
return nil
}
readyNodes, err := readyNodeLister.List()
if err != nil {
glog.Errorf("Failed to list ready nodes: %v", err)
return errors.ToAutoscalerError(errors.ApiCallError, err)
}
// Handle GPU case - allocatable GPU may be equal to 0 up to 15 minutes after
// node registers as ready. See https://github.com/kubernetes/kubernetes/issues/54959
// Treat those nodes as unready until GPU actually becomes available and let
// our normal handling for booting up nodes deal with this.
// TODO: Remove this call when we handle dynamically provisioned resources.
allNodes, readyNodes = gpu.FilterOutNodesWithUnreadyGpus(allNodes, readyNodes)
if len(readyNodes) == 0 {
glog.Warningf("No ready nodes in the cluster")
scaleDown.CleanUpUnneededNodes()
return nil
}
err = a.ClusterStateRegistry.UpdateNodes(allNodes, currentTime)
if err != nil {
glog.Errorf("Failed to update node registry: %v", err)

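Not part of the commit, but to make the reordering above concrete: the ready-node listing now happens after the all-node listing because the new filter has to rewrite both lists consistently before ClusterStateRegistry.UpdateNodes runs. Below is a minimal sketch of that flow, using a simplified stand-in node type instead of *apiv1.Node (all names in it are illustrative, not taken from the autoscaler):

package main

import "fmt"

// node is a simplified stand-in for *apiv1.Node; only the fields the GPU
// filter looks at are modeled: the GPU label, the allocatable GPU count,
// and the Ready condition.
type node struct {
    name           string
    hasGPULabel    bool
    gpuAllocatable int64
    ready          bool
}

// filterGPUUnready mirrors the shape of gpu.FilterOutNodesWithUnreadyGpus:
// a node labeled for GPU whose allocatable GPU is still zero is dropped from
// the ready list and marked unready in the full list.
func filterGPUUnready(all, ready []node) (newAll, newReady []node) {
    unready := map[string]bool{}
    for _, n := range ready {
        if n.hasGPULabel && n.gpuAllocatable == 0 {
            unready[n.name] = true
            continue
        }
        newReady = append(newReady, n)
    }
    for _, n := range all {
        if unready[n.name] {
            n.ready = false // override the copy, as getUnreadyNodeCopy does below
        }
        newAll = append(newAll, n)
    }
    return newAll, newReady
}

func main() {
    // One GPU node still installing drivers, one fully started, one CPU-only.
    all := []node{
        {name: "gpu-booting", hasGPULabel: true, gpuAllocatable: 0, ready: true},
        {name: "gpu-ok", hasGPULabel: true, gpuAllocatable: 1, ready: true},
        {name: "cpu-only", ready: true},
    }
    // Both lists exist before filtering, so the filter can rewrite each
    // consistently; this is why the hunk moves the readyNodes listing down.
    newAll, newReady := filterGPUUnready(all, all)
    fmt.Println("ready nodes:", len(newReady)) // 2
    for _, n := range newAll {
        fmt.Printf("%s ready=%v\n", n.name, n.ready)
    }
}
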
cluster-autoscaler/utils/gpu/gpu.go

@@ -26,29 +26,69 @@ import (
const (
// ResourceNvidiaGPU is the name of the Nvidia GPU resource.
ResourceNvidiaGPU = "nvidia.com/gpu"
// GPULabel is the label added to nodes with GPU resource on GKE.
GPULabel = "cloud.google.com/gke-accelerator"
)
// SetGPUAllocatableToCapacity allows us to tolerate the fact that nodes with
// GPUs can have allocatable set to 0 for multiple minutes after becoming ready.
// Without this workaround, Cluster Autoscaler will trigger an unnecessary
// additional scale up before the node is fully operational.
// TODO: Remove this once we handle dynamically provisioned resources well.
func SetGPUAllocatableToCapacity(nodes []*apiv1.Node) []*apiv1.Node {
result := []*apiv1.Node{}
for _, node := range nodes {
newNode := node
if gpuCapacity, ok := node.Status.Capacity[ResourceNvidiaGPU]; ok {
if gpuAllocatable, ok := node.Status.Allocatable[ResourceNvidiaGPU]; !ok || gpuAllocatable.IsZero() {
nodeCopy, err := api.Scheme.DeepCopy(node)
if err != nil {
glog.Errorf("Failed to make a copy of node %v", node.ObjectMeta.Name)
} else {
newNode = nodeCopy.(*apiv1.Node)
newNode.Status.Allocatable[ResourceNvidiaGPU] = gpuCapacity.DeepCopy()
                }
            }
        }
        result = append(result, newNode)
    }
    return result
}
// FilterOutNodesWithUnreadyGpus removes nodes that should have a GPU, but don't
// have it in allocatable, from the ready nodes list and updates their status
// to unready on the all nodes list.
// This is a hack/workaround for nodes with GPU coming up without installed
// drivers, resulting in the GPU missing from their allocatable and capacity.
func FilterOutNodesWithUnreadyGpus(allNodes, readyNodes []*apiv1.Node) ([]*apiv1.Node, []*apiv1.Node) {
newAllNodes := make([]*apiv1.Node, 0)
newReadyNodes := make([]*apiv1.Node, 0)
nodesWithUnreadyGpu := make(map[string]*apiv1.Node)
for _, node := range readyNodes {
isUnready := false
_, hasGpuLabel := node.Labels[GPULabel]
gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable[ResourceNvidiaGPU]
// We expect node to have GPU based on label, but it doesn't show up
// on node object. Assume the node is still not fully started (installing
// GPU drivers).
if hasGpuLabel && (!hasGpuAllocatable || gpuAllocatable.IsZero()) {
newNode, err := getUnreadyNodeCopy(node)
if err != nil {
glog.Errorf("Failed to override status of node %v with unready GPU: %v",
node.Name, err)
} else {
glog.V(3).Infof("Overriding status of node %v, which seems to have unready GPU",
node.Name)
nodesWithUnreadyGpu[newNode.Name] = newNode
isUnready = true
}
        }
        if !isUnready {
            newReadyNodes = append(newReadyNodes, node)
        }
    }
// Override any node with unready GPU with its "unready" copy
for _, node := range allNodes {
if newNode, found := nodesWithUnreadyGpu[node.Name]; found {
newAllNodes = append(newAllNodes, newNode)
} else {
newAllNodes = append(newAllNodes, node)
}
}
return newAllNodes, newReadyNodes
}
func getUnreadyNodeCopy(node *apiv1.Node) (*apiv1.Node, error) {
nodeCopy, err := api.Scheme.DeepCopy(node)
if err != nil {
return nil, err
}
newNode := nodeCopy.(*apiv1.Node)
newReadyCondition := apiv1.NodeCondition{
Type: apiv1.NodeReady,
Status: apiv1.ConditionFalse,
LastTransitionTime: node.CreationTimestamp,
}
newNodeConditions := []apiv1.NodeCondition{newReadyCondition}
for _, condition := range newNode.Status.Conditions {
if condition.Type != apiv1.NodeReady {
newNodeConditions = append(newNodeConditions, condition)
}
}
newNode.Status.Conditions = newNodeConditions
return newNode, nil
}
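
For a sense of how the new helper behaves end to end, here is a hedged usage sketch (not from the commit; it assumes the package is importable as k8s.io/autoscaler/cluster-autoscaler/utils/gpu, and makeGPUNode is a hypothetical helper that builds nodes the same way the test below does):

package main

import (
    "fmt"

    apiv1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/api/resource"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

    "k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
)

// makeGPUNode builds a Ready node carrying the GKE accelerator label and the
// given allocatable GPU count.
func makeGPUNode(name string, allocatableGPUs int64) *apiv1.Node {
    return &apiv1.Node{
        ObjectMeta: metav1.ObjectMeta{
            Name:   name,
            Labels: map[string]string{gpu.GPULabel: "nvidia-tesla-k80"},
        },
        Status: apiv1.NodeStatus{
            Capacity: apiv1.ResourceList{},
            Allocatable: apiv1.ResourceList{
                gpu.ResourceNvidiaGPU: *resource.NewQuantity(allocatableGPUs, resource.DecimalSI),
            },
            Conditions: []apiv1.NodeCondition{{
                Type:   apiv1.NodeReady,
                Status: apiv1.ConditionTrue,
            }},
        },
    }
}

func main() {
    booting := makeGPUNode("gpu-booting", 0) // labeled for GPU, drivers not up yet
    running := makeGPUNode("gpu-running", 1) // GPU already allocatable

    allNodes := []*apiv1.Node{booting, running}
    readyNodes := []*apiv1.Node{booting, running}
    newAll, newReady := gpu.FilterOutNodesWithUnreadyGpus(allNodes, readyNodes)

    fmt.Println(len(newReady)) // 1: only gpu-running stays ready
    // The booting node is replaced in newAll by a copy whose NodeReady
    // condition getUnreadyNodeCopy flipped to ConditionFalse.
    fmt.Println(newAll[0].Status.Conditions[0].Status) // False
}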

cluster-autoscaler/utils/gpu/gpu_test.go

@@ -17,7 +17,9 @@ limitations under the License.
package gpu
import (
"fmt"
"testing"
"time"
apiv1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
@@ -26,30 +28,124 @@ import (
"github.com/stretchr/testify/assert"
)
func TestSetGPUAllocatableToCapacity(t *testing.T) {
nodeGPU := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: "nodeGpu"}, Status: apiv1.NodeStatus{Capacity: apiv1.ResourceList{}, Allocatable: apiv1.ResourceList{}}}
nodeGPU.Status.Allocatable[ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
nodeGPU.Status.Capacity[ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
nodeGPUUnready := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: "nodeGpuUnready"}, Status: apiv1.NodeStatus{Capacity: apiv1.ResourceList{}, Allocatable: apiv1.ResourceList{}}}
nodeGPUUnready.Status.Allocatable[ResourceNvidiaGPU] = *resource.NewQuantity(0, resource.DecimalSI)
nodeGPUUnready.Status.Capacity[ResourceNvidiaGPU] = *resource.NewQuantity(2, resource.DecimalSI)
nodeGPUNoAllocatable := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: "nodeGpuNoAllocatable"}, Status: apiv1.NodeStatus{Capacity: apiv1.ResourceList{}, Allocatable: apiv1.ResourceList{}}}
nodeGPUNoAllocatable.Status.Capacity[ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
    nodeNoGPU := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: "nodeNoGpu"}, Status: apiv1.NodeStatus{Capacity: apiv1.ResourceList{}, Allocatable: apiv1.ResourceList{}}}
nodeNoGPU.Status.Allocatable[apiv1.ResourceCPU] = *resource.NewQuantity(1, resource.DecimalSI)
nodeNoGPU.Status.Capacity[apiv1.ResourceCPU] = *resource.NewQuantity(2, resource.DecimalSI)
result := SetGPUAllocatableToCapacity([]*apiv1.Node{nodeGPU, nodeGPUUnready, nodeGPUNoAllocatable, nodeNoGPU})
assertAllocatableAndCapacity(t, ResourceNvidiaGPU, 1, 1, result[0])
assertAllocatableAndCapacity(t, ResourceNvidiaGPU, 2, 2, result[1])
assertAllocatableAndCapacity(t, ResourceNvidiaGPU, 1, 1, result[2])
assertAllocatableAndCapacity(t, apiv1.ResourceCPU, 1, 2, result[3])
}

func assertAllocatableAndCapacity(t *testing.T, resourceName apiv1.ResourceName, allocatable, capacity int64, node *apiv1.Node) {
    allocatableResource := *resource.NewQuantity(allocatable, resource.DecimalSI)
    capacityResource := *resource.NewQuantity(capacity, resource.DecimalSI)
    assert.Equal(t, node.Status.Allocatable[resourceName], allocatableResource,
        "Node %v, expected allocatable %v: %v got: %v", node.ObjectMeta.Name, resourceName, node.Status.Allocatable[resourceName], allocatableResource)
    assert.Equal(t, node.Status.Capacity[resourceName], capacityResource,
        "Node %v, expected capacity %v: %v got: %v", node.ObjectMeta.Name, resourceName, node.Status.Capacity[resourceName], capacityResource)
}

func TestFilterOutNodesWithUnreadyGpus(t *testing.T) {
start := time.Now()
later := start.Add(10 * time.Minute)
expectedReadiness := make(map[string]bool)
gpuLabels := map[string]string{
GPULabel: "nvidia-tesla-k80",
}
readyCondition := apiv1.NodeCondition{
Type: apiv1.NodeReady,
Status: apiv1.ConditionTrue,
LastTransitionTime: metav1.NewTime(later),
}
unreadyCondition := apiv1.NodeCondition{
Type: apiv1.NodeReady,
Status: apiv1.ConditionFalse,
LastTransitionTime: metav1.NewTime(later),
}
nodeGpuReady := &apiv1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "nodeGpuReady",
Labels: gpuLabels,
CreationTimestamp: metav1.NewTime(start),
},
Status: apiv1.NodeStatus{
Capacity: apiv1.ResourceList{},
Allocatable: apiv1.ResourceList{},
Conditions: []apiv1.NodeCondition{readyCondition},
},
}
nodeGpuReady.Status.Allocatable[ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
nodeGpuReady.Status.Capacity[ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
expectedReadiness[nodeGpuReady.Name] = true
nodeGpuUnready := &apiv1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "nodeGpuUnready",
Labels: gpuLabels,
CreationTimestamp: metav1.NewTime(start),
},
Status: apiv1.NodeStatus{
Capacity: apiv1.ResourceList{},
Allocatable: apiv1.ResourceList{},
Conditions: []apiv1.NodeCondition{readyCondition},
},
}
nodeGpuUnready.Status.Allocatable[ResourceNvidiaGPU] = *resource.NewQuantity(0, resource.DecimalSI)
nodeGpuUnready.Status.Capacity[ResourceNvidiaGPU] = *resource.NewQuantity(0, resource.DecimalSI)
expectedReadiness[nodeGpuUnready.Name] = false
nodeGpuUnready2 := &apiv1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "nodeGpuUnready2",
Labels: gpuLabels,
CreationTimestamp: metav1.NewTime(start),
},
Status: apiv1.NodeStatus{
Conditions: []apiv1.NodeCondition{readyCondition},
},
}
expectedReadiness[nodeGpuUnready2.Name] = false
nodeNoGpuReady := &apiv1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "nodeNoGpuReady",
Labels: make(map[string]string),
CreationTimestamp: metav1.NewTime(start),
},
Status: apiv1.NodeStatus{
Conditions: []apiv1.NodeCondition{readyCondition},
},
}
expectedReadiness[nodeNoGpuReady.Name] = true
nodeNoGpuUnready := &apiv1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "nodeNoGpuUnready",
Labels: make(map[string]string),
CreationTimestamp: metav1.NewTime(start),
},
Status: apiv1.NodeStatus{
Conditions: []apiv1.NodeCondition{unreadyCondition},
},
}
expectedReadiness[nodeNoGpuUnready.Name] = false
initialReadyNodes := []*apiv1.Node{
nodeGpuReady,
nodeGpuUnready,
nodeGpuUnready2,
nodeNoGpuReady,
}
initialAllNodes := []*apiv1.Node{
nodeGpuReady,
nodeGpuUnready,
nodeGpuUnready2,
nodeNoGpuReady,
nodeNoGpuUnready,
}
newAllNodes, newReadyNodes := FilterOutNodesWithUnreadyGpus(initialAllNodes, initialReadyNodes)
foundInReady := make(map[string]bool)
for _, node := range newReadyNodes {
foundInReady[node.Name] = true
assert.True(t, expectedReadiness[node.Name], fmt.Sprintf("Node %s found in ready nodes list (it shouldn't be there)", node.Name))
}
for nodeName, expected := range expectedReadiness {
if expected {
assert.True(t, foundInReady[nodeName], fmt.Sprintf("Node %s expected ready, but not found in ready nodes list", nodeName))
}
}
for _, node := range newAllNodes {
assert.Equal(t, len(node.Status.Conditions), 1)
if expectedReadiness[node.Name] {
assert.Equal(t, node.Status.Conditions[0].Status, apiv1.ConditionTrue, fmt.Sprintf("Unexpected ready condition value for node %s", node.Name))
} else {
assert.Equal(t, node.Status.Conditions[0].Status, apiv1.ConditionFalse, fmt.Sprintf("Unexpected ready condition value for node %s", node.Name))
}
}
}