Implement basic memory estimation in scheduler

First-pass implementation of memory estimation logic in the model scheduler. This change relies heavily on gguf-parser-go to calculate the estimated peak memory required to run inference with a given model. It adds GetRequiredMemoryForModel() to the Backend interface so that each backend can interpret its own configuration and calculate its required memory usage from it.

Signed-off-by: Piotr Stankiewicz <piotr.stankiewicz@docker.com>
parent 606aead0e5
commit a4dc5834d1
Makefile
@@ -17,7 +17,7 @@ LLAMA_ARGS ?=
 
 # Build the Go application
 build:
-	CGO_ENABLED=0 go build -ldflags="-s -w" -o $(APP_NAME) ./main.go
+	CGO_ENABLED=1 go build -ldflags="-s -w" -o $(APP_NAME) ./main.go
 
 # Run the application locally
 run: build
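Note: flipping CGO_ENABLED from 0 to 1 appears to be required by the Metal-backed VRAM query introduced later in this commit, which is called through cgo.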
@@ -34,6 +34,11 @@ type BackendConfiguration struct {
 	RuntimeFlags []string `json:"runtime-flags,omitempty"`
 }
 
+type RequiredMemory struct {
+	RAM  uint64
+	VRAM uint64 // TODO(p1-0tr): for now assume we are working with single GPU set-ups
+}
+
 // Backend is the interface implemented by inference engine backends. Backend
 // implementations need not be safe for concurrent invocation of the following
 // methods, though their underlying server implementations do need to support
@@ -76,4 +81,7 @@ type Backend interface {
 	Status() string
 	// GetDiskUsage returns the disk usage of the backend.
 	GetDiskUsage() (int64, error)
+	// GetRequiredMemoryForModel returns the required working memory for a given
+	// model.
+	GetRequiredMemoryForModel(model string, config *BackendConfiguration) (*RequiredMemory, error)
 }
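For orientation, a minimal sketch of how scheduler-side code could consume the new interface method; canFit and availableVRAM are illustrative names, not part of this commit:

package scheduling

import "github.com/docker/model-runner/pkg/inference"

// canFit is a hypothetical helper (not in this commit): estimate the model's
// working memory via the backend and compare it against a VRAM budget.
func canFit(b inference.Backend, model string, cfg *inference.BackendConfiguration, availableVRAM uint64) (bool, error) {
	req, err := b.GetRequiredMemoryForModel(model, cfg)
	if err != nil {
		return false, err
	}
	// The llama.cpp backend below reports RAM as 0 and only estimates VRAM.
	return req.VRAM <= availableVRAM, nil
}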
@@ -13,6 +13,8 @@ import (
 	"runtime"
 	"strings"
 
+	parser "github.com/gpustack/gguf-parser-go"
+
 	"github.com/docker/model-runner/pkg/diskusage"
 	"github.com/docker/model-runner/pkg/inference"
 	"github.com/docker/model-runner/pkg/inference/config"
@@ -213,3 +215,38 @@ func (l *llamaCpp) GetDiskUsage() (int64, error) {
 	}
 	return size, nil
 }
+
+func (l *llamaCpp) GetRequiredMemoryForModel(model string, config *inference.BackendConfiguration) (*inference.RequiredMemory, error) {
+	mdl, err := l.modelManager.GetModel(model)
+	if err != nil {
+		return nil, err
+	}
+	mdlPath, err := mdl.GGUFPath()
+	if err != nil {
+		return nil, err
+	}
+	mdlGguf, err := parser.ParseGGUFFile(mdlPath)
+	if err != nil {
+		return nil, err
+	}
+	mdlConfig, err := mdl.Config()
+	if err != nil {
+		return nil, err
+	}
+
+	contextSize := GetContextSize(&mdlConfig, config)
+
+	// FIXME(p1-0tr): for now assume we are running on GPU (single one) - Devices[1];
+	// sum up weights + kv cache + context for an estimate of total GPU memory needed
+	// while running inference with the given model
+	estimate := mdlGguf.EstimateLLaMACppRun(parser.WithLLaMACppContextSize(int32(contextSize)),
+		// FIXME(p1-0tr): add logic for resolving other param values, instead of hardcoding them
+		parser.WithLLaMACppLogicalBatchSize(2048),
+		parser.WithLLaMACppOffloadLayers(100))
+	memory := uint64(estimate.Devices[1].Weight.Sum() + estimate.Devices[1].KVCache.Sum() + estimate.Devices[1].Computation.Sum())
+
+	return &inference.RequiredMemory{
+		RAM:  0,
+		VRAM: memory,
+	}, nil
+}
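For reference, the gguf-parser-go estimation used above can also be exercised stand-alone. A rough sketch (the GGUF path is a placeholder; the option values mirror the hardcoded ones above):

package main

import (
	"fmt"

	parser "github.com/gpustack/gguf-parser-go"
)

func main() {
	// Placeholder path; point this at any local GGUF file.
	gf, err := parser.ParseGGUFFile("/tmp/model.gguf")
	if err != nil {
		panic(err)
	}
	est := gf.EstimateLLaMACppRun(
		parser.WithLLaMACppContextSize(4096),
		parser.WithLLaMACppLogicalBatchSize(2048),
		parser.WithLLaMACppOffloadLayers(100),
	)
	// Per the backend code above, Devices[1] is the (single) GPU;
	// Devices[0] is the host.
	for i, d := range est.Devices {
		fmt.Printf("device %d: weights=%v kv-cache=%v compute=%v\n",
			i, d.Weight.Sum(), d.KVCache.Sum(), d.Computation.Sum())
	}
}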
@@ -57,16 +57,10 @@ func (c *Config) GetArgs(model types.Model, socket string, mode inference.Backen
 		args = append(args, "--embeddings")
 	}
 
-	// Add arguments from model config
-	if modelCfg.ContextSize != nil {
-		args = append(args, "--ctx-size", strconv.FormatUint(*modelCfg.ContextSize, 10))
-	}
+	args = append(args, "--ctx-size", strconv.FormatUint(GetContextSize(&modelCfg, config), 10))
 
 	// Add arguments from backend config
 	if config != nil {
-		if config.ContextSize > 0 && !containsArg(args, "--ctx-size") {
-			args = append(args, "--ctx-size", strconv.FormatInt(config.ContextSize, 10))
-		}
 		args = append(args, config.RuntimeFlags...)
 	}
 
@@ -79,6 +73,19 @@ func (c *Config) GetArgs(model types.Model, socket string, mode inference.Backen
 	return args, nil
 }
 
+func GetContextSize(modelCfg *types.Config, backendCfg *inference.BackendConfiguration) uint64 {
+	// Model config takes precedence
+	if modelCfg != nil && modelCfg.ContextSize != nil {
+		return *modelCfg.ContextSize
+	}
+	// else use backend config
+	if backendCfg != nil && backendCfg.ContextSize > 0 {
+		return uint64(backendCfg.ContextSize)
+	}
+	// finally return default
+	return 4096 // llama.cpp default
+}
+
 // containsArg checks if the given argument is already in the args slice.
 func containsArg(args []string, arg string) bool {
 	for _, a := range args {
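To make the precedence concrete, an illustrative fragment (values are made up; the field names follow the configs used elsewhere in this diff):

ctxSize := uint64(8192)
modelCfg := &types.Config{ContextSize: &ctxSize}
backendCfg := &inference.BackendConfiguration{ContextSize: 2048}

GetContextSize(modelCfg, backendCfg)        // 8192: model config wins
GetContextSize(&types.Config{}, backendCfg) // 2048: falls back to backend config
GetContextSize(nil, nil)                    // 4096: llama.cpp default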
@@ -62,3 +62,7 @@ func (m *mlx) Status() string {
 func (m *mlx) GetDiskUsage() (int64, error) {
 	return 0, nil
 }
+
+func (m *mlx) GetRequiredMemoryForModel(model string, config *inference.BackendConfiguration) (*inference.RequiredMemory, error) {
+	return nil, errors.New("not implemented")
+}
@@ -62,3 +62,7 @@ func (v *vLLM) Status() string {
 func (v *vLLM) GetDiskUsage() (int64, error) {
 	return 0, nil
 }
+
+func (v *vLLM) GetRequiredMemoryForModel(model string, config *inference.BackendConfiguration) (*inference.RequiredMemory, error) {
+	return nil, errors.New("not implemented")
+}
@@ -132,21 +132,11 @@ func newLoader(
 	}
 
 	// Compute the amount of available memory.
-	//
-	// TODO: For now, we treat the system as having memory size 1 and all models
-	// as having size 1 (and thus we'll only load a single model at a time).
-	// However, the loader is designed to use "real" values for each and to
-	// schedule appropriately. Thus, we should switch to polling the system
-	// VRAM size here (and potentially even reserving a portion of it) and
-	// computing model size through estimation (using parameter count and
-	// quantization data type size).
-	//
-	// HACK: On GPU-enabled cloud engines, we'll bump this to 2. We can remove
-	// this once we have VRAM estimation.
-	totalMemory := uint64(1)
-	if isGPUEnabledCloudEnvironment {
-		totalMemory = 2
-	}
+	vramSize, err := getVRAMSize() // FIXME(p1-0tr): only implemented on macOS for now
+	if err != nil {
+		return nil // FIXME(p1-0tr): should forward the error
+	}
+	totalMemory := vramSize
 
 	// Create the loader.
 	l := &loader{
@@ -400,14 +390,16 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
 
 	// Estimate the amount of memory that will be used by the model and check
 	// that we're even capable of loading it.
-	//
-	// TODO: For now, we treat the system as having memory size 1 and all models
-	// as having size 1 (and thus we'll only load a single model at a time).
-	// However, the loader is designed to use "real" values for each and to
-	// schedule appropriately. Thus, we should switch to computing model size
-	// here through estimation (using parameter count and quantization data type
-	// size).
-	memory := uint64(1)
+	var runnerConfig *inference.BackendConfiguration
+	if rc, ok := l.runnerConfigs[runnerKey{backendName, model, mode}]; ok {
+		runnerConfig = &rc
+	}
+	reqMemory, err := backend.GetRequiredMemoryForModel(model, runnerConfig)
+	if err != nil {
+		return nil, err
+	}
+	memory := reqMemory.VRAM // FIXME(p1-0tr): extend loader to reason about RAM and VRAM (and multiple VRAM sets in the future)
+	l.log.Infof("Loading %s, which will require %dMB of working memory", model, memory/1024.0/1024.0)
 	if memory > l.totalMemory {
 		return nil, errModelTooBig
 	}
@@ -0,0 +1,17 @@
+package scheduling
+
+/*
+#cgo LDFLAGS: -framework Metal
+#include "metal.h"
+*/
+import "C"
+import "errors"
+
+// getVRAMSize returns total system GPU memory in bytes
+func getVRAMSize() (uint64, error) {
+	vramSize := C.getVRAMSize()
+	if vramSize == 0 {
+		return 0, errors.New("could not get metal VRAM size")
+	}
+	return uint64(vramSize), nil
+}
@@ -0,0 +1,5 @@
+// +build darwin
+
+#include <stddef.h>
+
+size_t getVRAMSize();
@@ -0,0 +1,13 @@
+// +build darwin
+
+#include <Metal/Metal.h>
+
+#include "metal.h"
+
+size_t getVRAMSize() {
+	id<MTLDevice> device = MTLCreateSystemDefaultDevice();
+	if (device) {
+		return [device recommendedMaxWorkingSetSize];
+	}
+	return 0;
+}
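The VRAM query above is only wired up for macOS (see the FIXME in newLoader). A non-darwin counterpart is not part of this commit; if one were added behind a build tag, a hypothetical stub could look like:

//go:build !darwin

package scheduling

import "errors"

// getVRAMSize stub for non-darwin platforms (hypothetical; this commit only
// ships the Metal-backed implementation above).
func getVRAMSize() (uint64, error) {
	return 0, errors.New("VRAM size detection is not implemented on this platform")
}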