VRAM size getter for linux
Signed-off-by: Piotr Stankiewicz <piotr.stankiewicz@docker.com>
This commit is contained in:
parent
f90e4703f5
commit
d559e1b755
|
|
@ -27,7 +27,7 @@ COPY --link . .
|
|||
# Build the Go binary (cgo-enabled; links libc/libdl dynamically)
|
||||
RUN --mount=type=cache,target=/go/pkg/mod \
|
||||
--mount=type=cache,target=/root/.cache/go-build \
|
||||
CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o model-runner ./main.go
|
||||
CGO_ENABLED=1 GOOS=linux go build -ldflags="-s -w" -o model-runner ./main.go
|
||||
|
||||
# --- Get llama.cpp binary ---
|
||||
FROM docker/docker-model-backend-llamacpp:${LLAMA_SERVER_VERSION}-${LLAMA_SERVER_VARIANT} AS llama-server
|
||||
|
|
|
|||
|
|
@ -134,7 +134,7 @@ func newLoader(
|
|||
// Compute the amount of available memory.
|
||||
vramSize, err := getVRAMSize() // FIXME(p1-0tr): only implemented on macOS for now
|
||||
if err != nil {
|
||||
return nil // FIXME(p1-0tr): should forward the error
|
||||
log.Warnf("Could not read VRAM size: %s", err)
|
||||
}
|
||||
totalMemory := vramSize
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,17 @@
|
|||
package scheduling

/*
#cgo LDFLAGS: -ldl
#include "nvidia.h"
*/
import "C"
import "errors"

// getVRAMSize returns total system GPU memory in bytes.
// The C helper reports 0 on any failure (driver missing, NVML call
// failed), which is mapped to an error here.
func getVRAMSize() (uint64, error) {
	if total := C.getVRAMSize(); total != 0 {
		return uint64(total), nil
	}
	return 0, errors.New("could not get nvidia VRAM size")
}
|
||||
|
|
@ -0,0 +1,71 @@
|
|||
// +build linux

#include "nvidia.h"

// Minimal local declarations of the NVML types we need, so the NVIDIA
// SDK headers are not required at build time. Layouts must match the
// driver's ABI (nvmlMemory_t is three unsigned long long fields).
typedef enum {
    NVML_SUCCESS = 0
} nvmlReturn_t;

typedef struct {
    unsigned long long total;
    unsigned long long free;
    unsigned long long used;
} nvmlMemory_t;

typedef void* nvmlDevice_t;

// getVRAMSize returns the total memory, in bytes, of GPU 0 as reported
// by NVML, or 0 if the NVIDIA driver library cannot be loaded or any
// NVML call fails. NOTE(review): only device index 0 is queried;
// multi-GPU totals are not summed — confirm that is the intent.
size_t getVRAMSize(void) {
    void* handle;
    nvmlReturn_t (*nvmlInit)(void);
    nvmlReturn_t (*nvmlShutdown)(void);
    nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int index, nvmlDevice_t* device);
    nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t device, nvmlMemory_t* memory);

    nvmlReturn_t result;
    nvmlDevice_t device;
    nvmlMemory_t memory;

    // Try to load libnvidia-ml.so.1 first, then fallback to libnvidia-ml.so
    handle = dlopen("libnvidia-ml.so.1", RTLD_LAZY);
    if (!handle) {
        handle = dlopen("libnvidia-ml.so", RTLD_LAZY);
        if (!handle) {
            return 0;
        }
    }

    // Load required functions. ISO C does not allow implicitly
    // converting the void* returned by dlsym to a function pointer, so
    // cast explicitly (the idiom POSIX documents for dlsym).
    nvmlInit = (nvmlReturn_t (*)(void))dlsym(handle, "nvmlInit");
    nvmlShutdown = (nvmlReturn_t (*)(void))dlsym(handle, "nvmlShutdown");
    nvmlDeviceGetHandleByIndex =
        (nvmlReturn_t (*)(unsigned int, nvmlDevice_t*))dlsym(handle, "nvmlDeviceGetHandleByIndex");
    nvmlDeviceGetMemoryInfo =
        (nvmlReturn_t (*)(nvmlDevice_t, nvmlMemory_t*))dlsym(handle, "nvmlDeviceGetMemoryInfo");

    if (!nvmlInit || !nvmlShutdown || !nvmlDeviceGetHandleByIndex || !nvmlDeviceGetMemoryInfo) {
        dlclose(handle);
        return 0;
    }

    result = nvmlInit();
    if (result != NVML_SUCCESS) {
        dlclose(handle);
        return 0;
    }

    result = nvmlDeviceGetHandleByIndex(0, &device);
    if (result != NVML_SUCCESS) {
        nvmlShutdown();
        dlclose(handle);
        return 0;
    }

    result = nvmlDeviceGetMemoryInfo(device, &memory);
    if (result != NVML_SUCCESS) {
        nvmlShutdown();
        dlclose(handle);
        return 0;
    }

    nvmlShutdown();
    dlclose(handle);
    return memory.total;
}
|
||||
|
|
@ -0,0 +1,6 @@
|
|||
// +build linux

#ifndef NVIDIA_H
#define NVIDIA_H

#include <stddef.h>
#include <dlfcn.h>

// getVRAMSize returns the total GPU memory in bytes reported by the
// NVIDIA driver, or 0 if the driver is unavailable or the query fails.
size_t getVRAMSize(void);

#endif // NVIDIA_H
|
||||
Loading…
Reference in New Issue