Disable GPU resource processor for nodes using DRA for accelerator attachment

This commit is contained in:
Maksym Fuhol 2025-09-18 07:50:07 +00:00
parent 20f76e9875
commit 87901dcce9
3 changed files with 38 additions and 9 deletions

View File

@ -98,9 +98,10 @@ const (
// GpuConfig contains the label, type and the resource name for a GPU.
type GpuConfig struct {
Label string
Type string
ResourceName apiv1.ResourceName
Label string
Type string
ResourceName apiv1.ResourceName
// AttachedUsingDRA marks GPUs attached via Dynamic Resource Allocation
// rather than exposed through node allocatable resources.
AttachedUsingDRA bool
}
// CloudProvider contains configuration info and functions for interacting with

View File

@ -35,6 +35,8 @@ import (
const (
// GPULabel is the label added to nodes with GPU resource.
GPULabel = "cloud.google.com/gke-accelerator"
// DraGPULabel is the label added to nodes with GPU resource attached using DRA.
DraGPULabel = "cloud.google.com/gke-gpu-dra-driver"
)
var (
@ -82,9 +84,14 @@ func (gce *GceCloudProvider) GetAvailableGPUTypes() map[string]struct{} {
}
// GetNodeGpuConfig returns the label, type and resource name for the GPU added to node. If node doesn't have
// any GPUs, it returns nil.
// any GPUs, it returns nil. If the node has a GPU attached using DRA, it populates the corresponding field in GpuConfig.
func (gce *GceCloudProvider) GetNodeGpuConfig(node *apiv1.Node) *cloudprovider.GpuConfig {
	gpuConfig := gpu.GetNodeGPUFromCloudProvider(gce, node)
	// GetNodeGPUFromCloudProvider returns nil for nodes without GPUs; guard
	// against a nil dereference when the node carries the DRA driver label
	// but no GPU config was resolved.
	if gpuConfig != nil && gpuDraDriverEnabled(node) {
		gpuConfig.AttachedUsingDRA = true
	}
	return gpuConfig
}
// NodeGroups returns all node groups configured for this cloud provider.
@ -401,3 +408,12 @@ func BuildGCE(opts config.AutoscalingOptions, do cloudprovider.NodeGroupDiscover
RegisterMetrics()
return provider
}
// gpuDraDriverEnabled reports whether the node's GPU is attached using DRA,
// as signaled by the DraGPULabel node label being set to "true".
func gpuDraDriverEnabled(node *apiv1.Node) bool {
	// Indexing a nil map yields the zero value in Go, so no explicit
	// node.Labels == nil check is needed.
	return node.Labels[DraGPULabel] == "true"
}

View File

@ -42,13 +42,25 @@ func (p *GpuCustomResourcesProcessor) FilterOutNodesWithUnreadyResources(context
newReadyNodes := make([]*apiv1.Node, 0)
nodesWithUnreadyGpu := make(map[string]*apiv1.Node)
for _, node := range readyNodes {
_, hasGpuLabel := node.Labels[context.CloudProvider.GPULabel()]
gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable[gpu.ResourceNvidiaGPU]
directXAllocatable, hasDirectXAllocatable := node.Status.Allocatable[gpu.ResourceDirectX]
gpuConfig := context.CloudProvider.GetNodeGpuConfig(node)
if gpuConfig == nil {
newReadyNodes = append(newReadyNodes, node)
continue
}
// Devices attached through DRA are not using node allocatable
// to confirm their attachment, assume that node is ready
// and will be checked in the separate processor
if gpuConfig.AttachedUsingDRA {
newReadyNodes = append(newReadyNodes, node)
continue
}
allocatable, hasAllocatable := node.Status.Allocatable[gpuConfig.ResourceName]
// We expect node to have GPU based on label, but it doesn't show up
// on node object. Assume the node is still not fully started (installing
// GPU drivers).
if hasGpuLabel && ((!hasGpuAllocatable || gpuAllocatable.IsZero()) && (!hasDirectXAllocatable || directXAllocatable.IsZero())) {
if !hasAllocatable || allocatable.IsZero() {
klog.V(3).Infof("Overriding status of node %v, which seems to have unready GPU",
node.Name)
nodesWithUnreadyGpu[node.Name] = kubernetes.GetUnreadyNodeCopy(node, kubernetes.ResourceUnready)