inference: Ignore parse errors when estimating model memory

We will run into cases where our model runner is ahead of
gguf-parser-go, and we may want to load a model whose GGUF metadata
gguf-parser-go cannot yet parse. So, for now, ignore model parsing
errors in such cases and assume the model requires no resources. In the
future we should come up with a cleaner way of dealing with this (e.g.
ship a model memory estimator along with the llama-server).

Signed-off-by: Piotr Stankiewicz <piotr.stankiewicz@docker.com>
commit 1c13e4fc61
parent d61ffd5311
Piotr Stankiewicz, 2025-08-06 16:40:10 +02:00
3 changed files with 17 additions and 2 deletions
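
The change boils down to a package-level sentinel error plus an errors.Is fallback in the loader. Below is a minimal, self-contained Go sketch of that pattern; RequiredMemory and estimateMemory here are simplified stand-ins for the real inference types and backend method shown in the diffs, not code from this commit.

package main

import (
	"errors"
	"fmt"
)

// Sentinel error mirroring inference.ErrGGUFParse from the diff below.
var ErrGGUFParse = errors.New("failed to parse GGUF file")

// RequiredMemory is a simplified stand-in for inference.RequiredMemory.
type RequiredMemory struct {
	RAM  uint64
	VRAM uint64
}

// estimateMemory stands in for a backend's GetRequiredMemoryForModel and
// returns ErrGGUFParse when the model's GGUF metadata cannot be parsed.
func estimateMemory(model string) (*RequiredMemory, error) {
	if model == "too-new-model" { // hypothetical model the parser cannot handle
		return nil, ErrGGUFParse
	}
	return &RequiredMemory{RAM: 512 << 20, VRAM: 2 << 30}, nil
}

func main() {
	memory, err := estimateMemory("too-new-model")
	if errors.Is(err, ErrGGUFParse) {
		// Same fallback as the loader change: warn and assume zero cost.
		fmt.Println("could not parse model, ignoring memory checks")
		memory = &RequiredMemory{RAM: 0, VRAM: 0}
	} else if err != nil {
		panic(err)
	}
	fmt.Printf("RAM: %dMB, VRAM: %dMB\n", memory.RAM/1024/1024, memory.VRAM/1024/1024)
}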

@@ -2,6 +2,7 @@ package inference
 import (
 	"context"
+	"errors"
 	"net/http"
 )
@@ -17,6 +18,10 @@ const (
 	BackendModeEmbedding
 )
+
+var (
+	ErrGGUFParse = errors.New("failed to parse GGUF file")
+)
 // String implements Stringer.String for BackendMode.
 func (m BackendMode) String() string {
 	switch m {
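
Because ErrGGUFParse is an exported, package-level sentinel, callers outside the inference package can detect it with errors.Is even when intermediate layers wrap it. A small illustration (not part of the diff):

	wrapped := fmt.Errorf("llama.cpp backend: %w", inference.ErrGGUFParse)
	fmt.Println(errors.Is(wrapped, inference.ErrGGUFParse)) // prints true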

@@ -234,7 +234,8 @@ func (l *llamaCpp) GetRequiredMemoryForModel(model string, config *inference.Bac
 	}
 	mdlGguf, err := parser.ParseGGUFFile(mdlPath)
 	if err != nil {
-		return nil, fmt.Errorf("parsing gguf(%s): %w", mdlPath, err)
+		l.log.Warnf("Failed to parse gguf(%s): %s", mdlPath, err)
+		return nil, inference.ErrGGUFParse
 	}
 	mdlConfig, err := mdl.Config()
 	if err != nil {
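
One trade-off in this hunk: returning the bare sentinel drops the underlying parser error from the return value, so the detail survives only in the warning log. If callers ever need it, a possible alternative (not what the commit does) would be to wrap the sentinel so errors.Is still matches:

	return nil, fmt.Errorf("parsing gguf(%s): %v: %w", mdlPath, err, inference.ErrGGUFParse)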

@@ -421,7 +421,16 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
 		runnerConfig = &rc
 	}
 	memory, err := backend.GetRequiredMemoryForModel(modelID, runnerConfig)
-	if err != nil {
+	if errors.Is(err, inference.ErrGGUFParse) {
+		// TODO(p1-0tr): For now override memory checks in case model can't be parsed
+		// e.g. model is too new for gguf-parser-go to know. We should provide a cleaner
+		// way to bypass these checks.
+		l.log.Warnf("Could not parse model(%s), memory checks will be ignored for it.", modelID)
+		memory = &inference.RequiredMemory{
+			RAM:  0,
+			VRAM: 0,
+		}
+	} else if err != nil {
 		return nil, err
 	}
 	l.log.Infof("Loading %s, which will require %dMB RAM and %dMB VRAM", modelID, memory.RAM/1024/1024, memory.VRAM/1024/1024)