inference: Ignore parse errors when estimating model memory

We will run into cases where our model runner is ahead of
gguf-parser-go, and we may want to load a model whose GGUF metadata
gguf-parser-go cannot yet parse. So, for now, ignore model parsing
errors in such cases and assume the model requires no resources. In the
future we should come up with a cleaner way of dealing with this (e.g.
ship a model memory estimator along with the llama-server).

Signed-off-by: Piotr Stankiewicz <piotr.stankiewicz@docker.com>
commit 1c13e4fc61
parent d61ffd5311
Piotr Stankiewicz, 2025-08-06 16:40:10 +02:00
3 changed files with 17 additions and 2 deletions
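
The change boils down to a package-level sentinel error plus an errors.Is fallback in the loader. Below is a minimal, self-contained Go sketch of that pattern; RequiredMemory and estimateMemory here are simplified stand-ins for the real inference types and backend method shown in the diffs, not code from this commit.

package main

import (
	"errors"
	"fmt"
)

// Sentinel error mirroring inference.ErrGGUFParse from the diff below.
var ErrGGUFParse = errors.New("failed to parse GGUF file")

// RequiredMemory is a simplified stand-in for inference.RequiredMemory.
type RequiredMemory struct {
	RAM  uint64
	VRAM uint64
}

// estimateMemory stands in for a backend's GetRequiredMemoryForModel and
// returns ErrGGUFParse when the model's GGUF metadata cannot be parsed.
func estimateMemory(model string) (*RequiredMemory, error) {
	if model == "too-new-model" { // hypothetical model the parser cannot handle
		return nil, ErrGGUFParse
	}
	return &RequiredMemory{RAM: 512 << 20, VRAM: 2 << 30}, nil
}

func main() {
	memory, err := estimateMemory("too-new-model")
	if errors.Is(err, ErrGGUFParse) {
		// Same fallback as the loader change: warn and assume zero cost.
		fmt.Println("could not parse model, ignoring memory checks")
		memory = &RequiredMemory{RAM: 0, VRAM: 0}
	} else if err != nil {
		panic(err)
	}
	fmt.Printf("RAM: %dMB, VRAM: %dMB\n", memory.RAM/1024/1024, memory.VRAM/1024/1024)
}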

@@ -2,6 +2,7 @@ package inference
 import (
 	"context"
+	"errors"
 	"net/http"
 )
@@ -17,6 +18,10 @@ const (
 	BackendModeEmbedding
 )
+
+var (
+	ErrGGUFParse = errors.New("failed to parse GGUF file")
+)
 // String implements Stringer.String for BackendMode.
 func (m BackendMode) String() string {
 	switch m {
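
Because ErrGGUFParse is an exported, package-level sentinel, callers outside the inference package can detect it with errors.Is even when intermediate layers wrap it. A small illustration (not part of the diff):

	wrapped := fmt.Errorf("llama.cpp backend: %w", inference.ErrGGUFParse)
	fmt.Println(errors.Is(wrapped, inference.ErrGGUFParse)) // prints true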

@@ -234,7 +234,8 @@ func (l *llamaCpp) GetRequiredMemoryForModel(model string, config *inference.Bac
 	}
 	mdlGguf, err := parser.ParseGGUFFile(mdlPath)
 	if err != nil {
-		return nil, fmt.Errorf("parsing gguf(%s): %w", mdlPath, err)
+		l.log.Warnf("Failed to parse gguf(%s): %s", mdlPath, err)
+		return nil, inference.ErrGGUFParse
 	}
 	mdlConfig, err := mdl.Config()
 	if err != nil {
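
One trade-off in this hunk: returning the bare sentinel drops the underlying parser error from the return value, so the detail survives only in the warning log. If callers ever need it, a possible alternative (not what the commit does) would be to wrap the sentinel so errors.Is still matches:

	return nil, fmt.Errorf("parsing gguf(%s): %v: %w", mdlPath, err, inference.ErrGGUFParse)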

@@ -421,7 +421,16 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
 		runnerConfig = &rc
 	}
 	memory, err := backend.GetRequiredMemoryForModel(modelID, runnerConfig)
-	if err != nil {
+	if errors.Is(err, inference.ErrGGUFParse) {
+		// TODO(p1-0tr): For now override memory checks in case model can't be parsed
+		// e.g. model is too new for gguf-parser-go to know. We should provide a cleaner
+		// way to bypass these checks.
+		l.log.Warnf("Could not parse model(%s), memory checks will be ignored for it.", modelID)
+		memory = &inference.RequiredMemory{
+			RAM:  0,
+			VRAM: 0,
+		}
+	} else if err != nil {
 		return nil, err
 	}
 	l.log.Infof("Loading %s, which will require %dMB RAM and %dMB VRAM", modelID, memory.RAM/1024/1024, memory.VRAM/1024/1024)