Mark nodes with uninitialized GPUs as unready
parent 4c83330c29
commit d81dca5991
@@ -102,21 +102,6 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
 		return errors.ToAutoscalerError(errors.CloudProviderError, err)
 	}
 
-	readyNodes, err := readyNodeLister.List()
-	if err != nil {
-		glog.Errorf("Failed to list ready nodes: %v", err)
-		return errors.ToAutoscalerError(errors.ApiCallError, err)
-	}
-	// Handle GPU case - allocatable GPU may be equal to 0 up to 15 minutes after
-	// node registers as ready. See https://github.com/kubernetes/kubernetes/issues/54959
-	// TODO: Remove this call when we handle dynamically provisioned resources.
-	readyNodes = gpu.SetGPUAllocatableToCapacity(readyNodes)
-	if len(readyNodes) == 0 {
-		glog.Warningf("No ready nodes in the cluster")
-		scaleDown.CleanUpUnneededNodes()
-		return nil
-	}
-
 	allNodes, err := allNodeLister.List()
 	if err != nil {
 		glog.Errorf("Failed to list all nodes: %v", err)
@@ -128,6 +113,23 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
 		return nil
 	}
 
+	readyNodes, err := readyNodeLister.List()
+	if err != nil {
+		glog.Errorf("Failed to list ready nodes: %v", err)
+		return errors.ToAutoscalerError(errors.ApiCallError, err)
+	}
+	// Handle GPU case - allocatable GPU may be equal to 0 up to 15 minutes after
+	// node registers as ready. See https://github.com/kubernetes/kubernetes/issues/54959
+	// Treat those nodes as unready until GPU actually becomes available and let
+	// our normal handling for booting up nodes deal with this.
+	// TODO: Remove this call when we handle dynamically provisioned resources.
+	allNodes, readyNodes = gpu.FilterOutNodesWithUnreadyGpus(allNodes, readyNodes)
+	if len(readyNodes) == 0 {
+		glog.Warningf("No ready nodes in the cluster")
+		scaleDown.CleanUpUnneededNodes()
+		return nil
+	}
+
 	err = a.ClusterStateRegistry.UpdateNodes(allNodes, currentTime)
 	if err != nil {
 		glog.Errorf("Failed to update node registry: %v", err)
@@ -26,29 +26,69 @@ import (
 const (
 	// ResourceNvidiaGPU is the name of the Nvidia GPU resource.
 	ResourceNvidiaGPU = "nvidia.com/gpu"
 	// GPULabel is the label added to nodes with GPU resource on GKE.
 	GPULabel = "cloud.google.com/gke-accelerator"
 )
 
-// SetGPUAllocatableToCapacity allows us to tolerate the fact that nodes with
-// GPUs can have allocatable set to 0 for multiple minutes after becoming ready
-// Without this workaround, Cluster Autoscaler will trigger an unnecessary
-// additional scale up before the node is fully operational.
-// TODO: Remove this once we handle dynamically privisioned resources well.
-func SetGPUAllocatableToCapacity(nodes []*apiv1.Node) []*apiv1.Node {
-	result := []*apiv1.Node{}
-	for _, node := range nodes {
-		newNode := node
-		if gpuCapacity, ok := node.Status.Capacity[ResourceNvidiaGPU]; ok {
-			if gpuAllocatable, ok := node.Status.Allocatable[ResourceNvidiaGPU]; !ok || gpuAllocatable.IsZero() {
-				nodeCopy, err := api.Scheme.DeepCopy(node)
-				if err != nil {
-					glog.Errorf("Failed to make a copy of node %v", node.ObjectMeta.Name)
-				} else {
-					newNode = nodeCopy.(*apiv1.Node)
-					newNode.Status.Allocatable[ResourceNvidiaGPU] = gpuCapacity.DeepCopy()
-				}
-			}
-		}
-		result = append(result, newNode)
-	}
-	return result
-}
+// FilterOutNodesWithUnreadyGpus removes nodes that should have GPU, but don't have it in allocatable
+// from ready nodes list and updates their status to unready on all nodes list.
+// This is a hack/workaround for nodes with GPU coming up without installed drivers, resulting
+// in GPU missing from their allocatable and capacity.
+func FilterOutNodesWithUnreadyGpus(allNodes, readyNodes []*apiv1.Node) ([]*apiv1.Node, []*apiv1.Node) {
+	newAllNodes := make([]*apiv1.Node, 0)
+	newReadyNodes := make([]*apiv1.Node, 0)
+	nodesWithUnreadyGpu := make(map[string]*apiv1.Node)
+	for _, node := range readyNodes {
+		isUnready := false
+		_, hasGpuLabel := node.Labels[GPULabel]
+		gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable[ResourceNvidiaGPU]
+		// We expect node to have GPU based on label, but it doesn't show up
+		// on node object. Assume the node is still not fully started (installing
+		// GPU drivers).
+		if hasGpuLabel && (!hasGpuAllocatable || gpuAllocatable.IsZero()) {
+			newNode, err := getUnreadyNodeCopy(node)
+			if err != nil {
+				glog.Errorf("Failed to override status of node %v with unready GPU: %v",
+					node.Name, err)
+			} else {
+				glog.V(3).Infof("Overriding status of node %v, which seems to have unready GPU",
+					node.Name)
+				nodesWithUnreadyGpu[newNode.Name] = newNode
+				isUnready = true
+			}
+		}
+		if !isUnready {
+			newReadyNodes = append(newReadyNodes, node)
+		}
+	}
+	// Override any node with unready GPU with its "unready" copy
+	for _, node := range allNodes {
+		if newNode, found := nodesWithUnreadyGpu[node.Name]; found {
+			newAllNodes = append(newAllNodes, newNode)
+		} else {
+			newAllNodes = append(newAllNodes, node)
+		}
+	}
+	return newAllNodes, newReadyNodes
+}
+
+func getUnreadyNodeCopy(node *apiv1.Node) (*apiv1.Node, error) {
+	nodeCopy, err := api.Scheme.DeepCopy(node)
+	if err != nil {
+		return nil, err
+	}
+	newNode := nodeCopy.(*apiv1.Node)
+	newReadyCondition := apiv1.NodeCondition{
+		Type:               apiv1.NodeReady,
+		Status:             apiv1.ConditionFalse,
+		LastTransitionTime: node.CreationTimestamp,
+	}
+	newNodeConditions := []apiv1.NodeCondition{newReadyCondition}
+	for _, condition := range newNode.Status.Conditions {
+		if condition.Type != apiv1.NodeReady {
+			newNodeConditions = append(newNodeConditions, condition)
+		}
+	}
+	newNode.Status.Conditions = newNodeConditions
+	return newNode, nil
+}
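A minimal usage sketch of the new function against the tree at this commit, assuming the import path k8s.io/autoscaler/cluster-autoscaler/utils/gpu (the node below is fabricated for illustration): a node carrying the GKE accelerator label but exposing no nvidia.com/gpu in allocatable is dropped from the ready list and replaced by an unready copy in the all-nodes list.

package main

import (
	"fmt"

	apiv1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"k8s.io/autoscaler/cluster-autoscaler/utils/gpu" // assumed import path
)

func main() {
	// A GPU node whose driver installation hasn't finished yet: the GPU
	// label is set, but nvidia.com/gpu is absent from allocatable.
	node := &apiv1.Node{
		ObjectMeta: metav1.ObjectMeta{
			Name:   "gpu-node-booting",
			Labels: map[string]string{gpu.GPULabel: "nvidia-tesla-k80"},
		},
		Status: apiv1.NodeStatus{
			Allocatable: apiv1.ResourceList{},
			Conditions: []apiv1.NodeCondition{
				{Type: apiv1.NodeReady, Status: apiv1.ConditionTrue},
			},
		},
	}

	all, ready := gpu.FilterOutNodesWithUnreadyGpus([]*apiv1.Node{node}, []*apiv1.Node{node})
	fmt.Println(len(ready))                         // 0: filtered out of the ready list
	fmt.Println(all[0].Status.Conditions[0].Status) // False: the overridden copy
}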
@@ -17,7 +17,9 @@ limitations under the License.
 package gpu
 
 import (
+	"fmt"
 	"testing"
+	"time"
 
 	apiv1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
@@ -26,30 +28,124 @@ import (
 	"github.com/stretchr/testify/assert"
 )
 
-func TestSetGPUAllocatableToCapacity(t *testing.T) {
-	nodeGPU := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: "nodeGpu"}, Status: apiv1.NodeStatus{Capacity: apiv1.ResourceList{}, Allocatable: apiv1.ResourceList{}}}
-	nodeGPU.Status.Allocatable[ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
-	nodeGPU.Status.Capacity[ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
-	nodeGPUUnready := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: "nodeGpuUnready"}, Status: apiv1.NodeStatus{Capacity: apiv1.ResourceList{}, Allocatable: apiv1.ResourceList{}}}
-	nodeGPUUnready.Status.Allocatable[ResourceNvidiaGPU] = *resource.NewQuantity(0, resource.DecimalSI)
-	nodeGPUUnready.Status.Capacity[ResourceNvidiaGPU] = *resource.NewQuantity(2, resource.DecimalSI)
-	nodeGPUNoAllocatable := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: "nodeGpuNoAllocatable"}, Status: apiv1.NodeStatus{Capacity: apiv1.ResourceList{}, Allocatable: apiv1.ResourceList{}}}
-	nodeGPUNoAllocatable.Status.Capacity[ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
-	nodeNoGPU := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: "nodeGpuUnready"}, Status: apiv1.NodeStatus{Capacity: apiv1.ResourceList{}, Allocatable: apiv1.ResourceList{}}}
-	nodeNoGPU.Status.Allocatable[apiv1.ResourceCPU] = *resource.NewQuantity(1, resource.DecimalSI)
-	nodeNoGPU.Status.Capacity[apiv1.ResourceCPU] = *resource.NewQuantity(2, resource.DecimalSI)
-	result := SetGPUAllocatableToCapacity([]*apiv1.Node{nodeGPU, nodeGPUUnready, nodeGPUNoAllocatable, nodeNoGPU})
-	assertAllocatableAndCapacity(t, ResourceNvidiaGPU, 1, 1, result[0])
-	assertAllocatableAndCapacity(t, ResourceNvidiaGPU, 2, 2, result[1])
-	assertAllocatableAndCapacity(t, ResourceNvidiaGPU, 1, 1, result[2])
-	assertAllocatableAndCapacity(t, apiv1.ResourceCPU, 1, 2, result[3])
-}
-
-func assertAllocatableAndCapacity(t *testing.T, resourceName apiv1.ResourceName, allocatable, capacity int64, node *apiv1.Node) {
-	allocatableResource := *resource.NewQuantity(allocatable, resource.DecimalSI)
-	capacityResource := *resource.NewQuantity(capacity, resource.DecimalSI)
-	assert.Equal(t, node.Status.Allocatable[resourceName], allocatableResource,
-		"Node %v, expected allocatable %v: %v got: %v", node.ObjectMeta.Name, resourceName, node.Status.Allocatable[resourceName], allocatableResource)
-	assert.Equal(t, node.Status.Capacity[resourceName], capacityResource,
-		"Node %v, expected capacity %v: %v got: %v", node.ObjectMeta.Name, resourceName, node.Status.Capacity[resourceName], capacityResource)
-}
+func TestFilterOutNodesWithUnreadyGpus(t *testing.T) {
+	start := time.Now()
+	later := start.Add(10 * time.Minute)
+	expectedReadiness := make(map[string]bool)
+	gpuLabels := map[string]string{
+		GPULabel: "nvidia-tesla-k80",
+	}
+	readyCondition := apiv1.NodeCondition{
+		Type:               apiv1.NodeReady,
+		Status:             apiv1.ConditionTrue,
+		LastTransitionTime: metav1.NewTime(later),
+	}
+	unreadyCondition := apiv1.NodeCondition{
+		Type:               apiv1.NodeReady,
+		Status:             apiv1.ConditionFalse,
+		LastTransitionTime: metav1.NewTime(later),
+	}
+
+	nodeGpuReady := &apiv1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:              "nodeGpuReady",
+			Labels:            gpuLabels,
+			CreationTimestamp: metav1.NewTime(start),
+		},
+		Status: apiv1.NodeStatus{
+			Capacity:    apiv1.ResourceList{},
+			Allocatable: apiv1.ResourceList{},
+			Conditions:  []apiv1.NodeCondition{readyCondition},
+		},
+	}
+	nodeGpuReady.Status.Allocatable[ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
+	nodeGpuReady.Status.Capacity[ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
+	expectedReadiness[nodeGpuReady.Name] = true
+
+	nodeGpuUnready := &apiv1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:              "nodeGpuUnready",
+			Labels:            gpuLabels,
+			CreationTimestamp: metav1.NewTime(start),
+		},
+		Status: apiv1.NodeStatus{
+			Capacity:    apiv1.ResourceList{},
+			Allocatable: apiv1.ResourceList{},
+			Conditions:  []apiv1.NodeCondition{readyCondition},
+		},
+	}
+	nodeGpuUnready.Status.Allocatable[ResourceNvidiaGPU] = *resource.NewQuantity(0, resource.DecimalSI)
+	nodeGpuUnready.Status.Capacity[ResourceNvidiaGPU] = *resource.NewQuantity(0, resource.DecimalSI)
+	expectedReadiness[nodeGpuUnready.Name] = false
+
+	nodeGpuUnready2 := &apiv1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:              "nodeGpuUnready2",
+			Labels:            gpuLabels,
+			CreationTimestamp: metav1.NewTime(start),
+		},
+		Status: apiv1.NodeStatus{
+			Conditions: []apiv1.NodeCondition{readyCondition},
+		},
+	}
+	expectedReadiness[nodeGpuUnready2.Name] = false
+
+	nodeNoGpuReady := &apiv1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:              "nodeNoGpuReady",
+			Labels:            make(map[string]string),
+			CreationTimestamp: metav1.NewTime(start),
+		},
+		Status: apiv1.NodeStatus{
+			Conditions: []apiv1.NodeCondition{readyCondition},
+		},
+	}
+	expectedReadiness[nodeNoGpuReady.Name] = true
+
+	nodeNoGpuUnready := &apiv1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:              "nodeNoGpuUnready",
+			Labels:            make(map[string]string),
+			CreationTimestamp: metav1.NewTime(start),
+		},
+		Status: apiv1.NodeStatus{
+			Conditions: []apiv1.NodeCondition{unreadyCondition},
+		},
+	}
+	expectedReadiness[nodeNoGpuUnready.Name] = false
+
+	initialReadyNodes := []*apiv1.Node{
+		nodeGpuReady,
+		nodeGpuUnready,
+		nodeGpuUnready2,
+		nodeNoGpuReady,
+	}
+	initialAllNodes := []*apiv1.Node{
+		nodeGpuReady,
+		nodeGpuUnready,
+		nodeGpuUnready2,
+		nodeNoGpuReady,
+		nodeNoGpuUnready,
+	}
+
+	newAllNodes, newReadyNodes := FilterOutNodesWithUnreadyGpus(initialAllNodes, initialReadyNodes)
+
+	foundInReady := make(map[string]bool)
+	for _, node := range newReadyNodes {
+		foundInReady[node.Name] = true
+		assert.True(t, expectedReadiness[node.Name], fmt.Sprintf("Node %s found in ready nodes list (it shouldn't be there)", node.Name))
+	}
+	for nodeName, expected := range expectedReadiness {
+		if expected {
+			assert.True(t, foundInReady[nodeName], fmt.Sprintf("Node %s expected ready, but not found in ready nodes list", nodeName))
+		}
+	}
+	for _, node := range newAllNodes {
+		assert.Equal(t, len(node.Status.Conditions), 1)
+		if expectedReadiness[node.Name] {
+			assert.Equal(t, node.Status.Conditions[0].Status, apiv1.ConditionTrue, fmt.Sprintf("Unexpected ready condition value for node %s", node.Name))
+		} else {
+			assert.Equal(t, node.Status.Conditions[0].Status, apiv1.ConditionFalse, fmt.Sprintf("Unexpected ready condition value for node %s", node.Name))
+		}
+	}
+}
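The start/later timestamps above mirror a subtlety of getUnreadyNodeCopy: the injected NodeReady=False condition is backdated to the node's CreationTimestamp, presumably so that anything measuring how long a node has been unready sees the full time since boot rather than the moment the override was applied. A standalone sketch of that effect (hypothetical helper, not part of this patch):

package main

import (
	"fmt"
	"time"

	apiv1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// unreadyFor reports how long a node has been unready according to its
// NodeReady condition, the kind of measurement that benefits from the
// backdated LastTransitionTime set by getUnreadyNodeCopy.
func unreadyFor(node *apiv1.Node, now time.Time) time.Duration {
	for _, condition := range node.Status.Conditions {
		if condition.Type == apiv1.NodeReady && condition.Status == apiv1.ConditionFalse {
			return now.Sub(condition.LastTransitionTime.Time)
		}
	}
	return 0
}

func main() {
	created := metav1.NewTime(time.Now().Add(-10 * time.Minute))
	node := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{CreationTimestamp: created}}
	node.Status.Conditions = []apiv1.NodeCondition{{
		Type:               apiv1.NodeReady,
		Status:             apiv1.ConditionFalse,
		LastTransitionTime: node.CreationTimestamp, // as getUnreadyNodeCopy does
	}}
	fmt.Println(unreadyFor(node, time.Now())) // ~10m, counted from node creation
}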