Disable GPU resource processor for nodes using DRA for accelerator attachment

This commit is contained in:
Maksym Fuhol 2025-09-18 07:50:07 +00:00
parent 20f76e9875
commit 87901dcce9
3 changed files with 38 additions and 9 deletions

View File

@ -98,9 +98,10 @@ const (
// GpuConfig contains the label, type and the resource name for a GPU.
type GpuConfig struct {
Label string
Type string
ResourceName apiv1.ResourceName
Label string
Type string
ResourceName apiv1.ResourceName
// AttachedUsingDRA marks GPUs attached via Dynamic Resource Allocation
// rather than exposed through node allocatable resources.
AttachedUsingDRA bool
}
// CloudProvider contains configuration info and functions for interacting with

View File

@ -35,6 +35,8 @@ import (
const (
// GPULabel is the label added to nodes with GPU resource.
GPULabel = "cloud.google.com/gke-accelerator"
// DraGPULabel is the label added to nodes with GPU resource attached using DRA.
DraGPULabel = "cloud.google.com/gke-gpu-dra-driver"
)
var (
@ -82,9 +84,14 @@ func (gce *GceCloudProvider) GetAvailableGPUTypes() map[string]struct{} {
}
// GetNodeGpuConfig returns the label, type and resource name for the GPU added to node. If node doesn't have
// any GPUs, it returns nil.
// any GPUs, it returns nil. If the node has a GPU attached using DRA, it populates the corresponding field in GpuConfig.
func (gce *GceCloudProvider) GetNodeGpuConfig(node *apiv1.Node) *cloudprovider.GpuConfig {
	gpuConfig := gpu.GetNodeGPUFromCloudProvider(gce, node)
	// GetNodeGPUFromCloudProvider returns nil for nodes without GPUs; guard
	// against a nil dereference when the node carries the DRA driver label
	// but no GPU config was resolved.
	if gpuConfig != nil && gpuDraDriverEnabled(node) {
		gpuConfig.AttachedUsingDRA = true
	}
	return gpuConfig
}
// NodeGroups returns all node groups configured for this cloud provider.
@ -401,3 +408,12 @@ func BuildGCE(opts config.AutoscalingOptions, do cloudprovider.NodeGroupDiscover
RegisterMetrics()
return provider
}
// gpuDraDriverEnabled reports whether the node's GPU is attached using DRA,
// as signaled by the DraGPULabel node label being set to "true".
func gpuDraDriverEnabled(node *apiv1.Node) bool {
	// Indexing a nil map yields the zero value in Go, so no explicit
	// node.Labels == nil check is needed.
	return node.Labels[DraGPULabel] == "true"
}

View File

@ -42,13 +42,25 @@ func (p *GpuCustomResourcesProcessor) FilterOutNodesWithUnreadyResources(context
newReadyNodes := make([]*apiv1.Node, 0)
nodesWithUnreadyGpu := make(map[string]*apiv1.Node)
for _, node := range readyNodes {
_, hasGpuLabel := node.Labels[context.CloudProvider.GPULabel()]
gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable[gpu.ResourceNvidiaGPU]
directXAllocatable, hasDirectXAllocatable := node.Status.Allocatable[gpu.ResourceDirectX]
gpuConfig := context.CloudProvider.GetNodeGpuConfig(node)
if gpuConfig == nil {
newReadyNodes = append(newReadyNodes, node)
continue
}
// Devices attached through DRA are not using node allocatable
// to confirm their attachment, assume that node is ready
// and will be checked in the separate processor
if gpuConfig.AttachedUsingDRA {
newReadyNodes = append(newReadyNodes, node)
continue
}
allocatable, hasAllocatable := node.Status.Allocatable[gpuConfig.ResourceName]
// We expect node to have GPU based on label, but it doesn't show up
// on node object. Assume the node is still not fully started (installing
// GPU drivers).
if hasGpuLabel && ((!hasGpuAllocatable || gpuAllocatable.IsZero()) && (!hasDirectXAllocatable || directXAllocatable.IsZero())) {
if !hasAllocatable || allocatable.IsZero() {
klog.V(3).Infof("Overriding status of node %v, which seems to have unready GPU",
node.Name)
nodesWithUnreadyGpu[node.Name] = kubernetes.GetUnreadyNodeCopy(node, kubernetes.ResourceUnready)