Disable GPU resource processor for nodes using DRA for accelerator attachment
This commit is contained in:
parent 20f76e9875
commit 87901dcce9
@@ -98,9 +98,10 @@ const (
 // GpuConfig contains the label, type and the resource name for a GPU.
 type GpuConfig struct {
-	Label        string
-	Type         string
-	ResourceName apiv1.ResourceName
+	Label            string
+	Type             string
+	ResourceName     apiv1.ResourceName
+	AttachedUsingDRA bool
 }

 // CloudProvider contains configuration info and functions for interacting with
@@ -35,6 +35,8 @@ import (
 const (
 	// GPULabel is the label added to nodes with GPU resource.
 	GPULabel = "cloud.google.com/gke-accelerator"
+	// DraGPULabel is the label added to nodes with GPU resource attached using DRA.
+	DraGPULabel = "cloud.google.com/gke-gpu-dra-driver"
 )

 var (
@@ -82,9 +84,14 @@ func (gce *GceCloudProvider) GetAvailableGPUTypes() map[string]struct{} {
 }

 // GetNodeGpuConfig returns the label, type and resource name for the GPU added to node. If node doesn't have
-// any GPUs, it returns nil.
+// any GPUs, it returns nil. If the node has a GPU attached using DRA, the corresponding field in GpuConfig is populated.
 func (gce *GceCloudProvider) GetNodeGpuConfig(node *apiv1.Node) *cloudprovider.GpuConfig {
-	return gpu.GetNodeGPUFromCloudProvider(gce, node)
+	gpuConfig := gpu.GetNodeGPUFromCloudProvider(gce, node)
+	if gpuDraDriverEnabled(node) {
+		gpuConfig.AttachedUsingDRA = true
+	}
+
+	return gpuConfig
 }

 // NodeGroups returns all node groups configured for this cloud provider.
@@ -401,3 +408,12 @@ func BuildGCE(opts config.AutoscalingOptions, do cloudprovider.NodeGroupDiscover
 	RegisterMetrics()
 	return provider
 }
+
+// gpuDraDriverEnabled checks whether the GPU DRA driver is enabled on the node.
+func gpuDraDriverEnabled(node *apiv1.Node) bool {
+	if node.Labels == nil {
+		return false
+	}
+
+	return node.Labels[DraGPULabel] == "true"
+}
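For illustration only, not part of the commit: a minimal, self-contained sketch of the label check that gpuDraDriverEnabled adds above. The lowercase draGPULabel constant and the standalone main function are stand-ins for the real GceCloudProvider wiring.

package main

import (
	"fmt"

	apiv1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// draGPULabel mirrors the DraGPULabel constant added in this commit.
const draGPULabel = "cloud.google.com/gke-gpu-dra-driver"

// gpuDraDriverEnabled reproduces the helper added above: a node is treated as
// attaching its GPU through DRA only when the label exists and equals "true".
func gpuDraDriverEnabled(node *apiv1.Node) bool {
	if node.Labels == nil {
		return false
	}
	return node.Labels[draGPULabel] == "true"
}

func main() {
	draNode := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{
		Labels: map[string]string{draGPULabel: "true"},
	}}
	plainNode := &apiv1.Node{}

	fmt.Println(gpuDraDriverEnabled(draNode))   // true
	fmt.Println(gpuDraDriverEnabled(plainNode)) // false
}

Only the exact label value "true" enables the DRA path; a missing label or any other value leaves the node on the existing device-plugin flow.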
@@ -42,13 +42,25 @@ func (p *GpuCustomResourcesProcessor) FilterOutNodesWithUnreadyResources(context
 	newReadyNodes := make([]*apiv1.Node, 0)
 	nodesWithUnreadyGpu := make(map[string]*apiv1.Node)
 	for _, node := range readyNodes {
-		_, hasGpuLabel := node.Labels[context.CloudProvider.GPULabel()]
-		gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable[gpu.ResourceNvidiaGPU]
-		directXAllocatable, hasDirectXAllocatable := node.Status.Allocatable[gpu.ResourceDirectX]
+		gpuConfig := context.CloudProvider.GetNodeGpuConfig(node)
+		if gpuConfig == nil {
+			newReadyNodes = append(newReadyNodes, node)
+			continue
+		}
+
+		// Devices attached through DRA do not use node allocatable to
+		// confirm their attachment, so assume the node is ready; it will
+		// be checked by a separate processor.
+		if gpuConfig.AttachedUsingDRA {
+			newReadyNodes = append(newReadyNodes, node)
+			continue
+		}
+
+		allocatable, hasAllocatable := node.Status.Allocatable[gpuConfig.ResourceName]
 		// We expect node to have GPU based on label, but it doesn't show up
 		// on node object. Assume the node is still not fully started (installing
 		// GPU drivers).
-		if hasGpuLabel && ((!hasGpuAllocatable || gpuAllocatable.IsZero()) && (!hasDirectXAllocatable || directXAllocatable.IsZero())) {
+		if !hasAllocatable || allocatable.IsZero() {
 			klog.V(3).Infof("Overriding status of node %v, which seems to have unready GPU",
 				node.Name)
 			nodesWithUnreadyGpu[node.Name] = kubernetes.GetUnreadyNodeCopy(node, kubernetes.ResourceUnready)
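Again for illustration rather than as part of the diff: a self-contained sketch of the per-node decision that the reworked FilterOutNodesWithUnreadyResources loop now makes. The gpuConfig stand-in type, the treatAsReady helper, and the "nvidia.com/gpu" resource name are assumptions made for the example; the real processor works with cloudprovider.GpuConfig and the node's Status.Allocatable.

package main

import (
	"fmt"

	apiv1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// gpuConfig is a trimmed-down stand-in for cloudprovider.GpuConfig with the
// new AttachedUsingDRA field.
type gpuConfig struct {
	ResourceName     apiv1.ResourceName
	AttachedUsingDRA bool
}

// treatAsReady mirrors the per-node decision in the loop above: nodes without
// a GPU config or with a DRA-attached GPU stay ready; otherwise the node is
// ready only if the GPU resource is allocatable and non-zero.
func treatAsReady(cfg *gpuConfig, allocatable apiv1.ResourceList) bool {
	if cfg == nil || cfg.AttachedUsingDRA {
		return true
	}
	qty, ok := allocatable[cfg.ResourceName]
	return ok && !qty.IsZero()
}

func main() {
	nvidiaGPU := apiv1.ResourceName("nvidia.com/gpu")

	// DRA-attached GPU: skipped by this processor even with no allocatable entry.
	fmt.Println(treatAsReady(&gpuConfig{ResourceName: nvidiaGPU, AttachedUsingDRA: true}, nil)) // true

	// Device-plugin GPU with zero allocatable: node gets overridden to unready.
	fmt.Println(treatAsReady(&gpuConfig{ResourceName: nvidiaGPU},
		apiv1.ResourceList{nvidiaGPU: resource.MustParse("0")})) // false
}

The effect of the commit is the middle branch: a DRA-attached GPU no longer needs to appear in node allocatable for the node to stay ready, so readiness for those nodes is left to the separate processor mentioned in the comment.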