diff --git a/pkg/inference/backend.go b/pkg/inference/backend.go
index 944ec12..3d857f8 100644
--- a/pkg/inference/backend.go
+++ b/pkg/inference/backend.go
@@ -2,6 +2,7 @@ package inference
 
 import (
 	"context"
+	"errors"
 	"net/http"
 )
 
@@ -17,6 +18,10 @@ const (
 	BackendModeEmbedding
 )
 
+var (
+	ErrGGUFParse = errors.New("failed to parse GGUF file")
+)
+
 // String implements Stringer.String for BackendMode.
 func (m BackendMode) String() string {
 	switch m {
diff --git a/pkg/inference/backends/llamacpp/llamacpp.go b/pkg/inference/backends/llamacpp/llamacpp.go
index 09de11f..f53b26a 100644
--- a/pkg/inference/backends/llamacpp/llamacpp.go
+++ b/pkg/inference/backends/llamacpp/llamacpp.go
@@ -234,7 +234,8 @@ func (l *llamaCpp) GetRequiredMemoryForModel(model string, config *inference.Bac
 	}
 	mdlGguf, err := parser.ParseGGUFFile(mdlPath)
 	if err != nil {
-		return nil, fmt.Errorf("parsing gguf(%s): %w", mdlPath, err)
+		l.log.Warnf("Failed to parse gguf(%s): %s", mdlPath, err)
+		return nil, inference.ErrGGUFParse
 	}
 	mdlConfig, err := mdl.Config()
 	if err != nil {
diff --git a/pkg/inference/scheduling/loader.go b/pkg/inference/scheduling/loader.go
index ec7e1f5..ebbdd33 100644
--- a/pkg/inference/scheduling/loader.go
+++ b/pkg/inference/scheduling/loader.go
@@ -421,7 +421,16 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
 		runnerConfig = &rc
 	}
 	memory, err := backend.GetRequiredMemoryForModel(modelID, runnerConfig)
-	if err != nil {
+	if errors.Is(err, inference.ErrGGUFParse) {
+		// TODO(p1-0tr): For now, override memory checks when the model can't be parsed,
+		// e.g. when the model is too new for gguf-parser-go to recognize. We should
+		// provide a cleaner way to bypass these checks.
+		l.log.Warnf("Could not parse model(%s), memory checks will be ignored for it.", modelID)
+		memory = &inference.RequiredMemory{
+			RAM:  0,
+			VRAM: 0,
+		}
+	} else if err != nil {
 		return nil, err
 	}
 	l.log.Infof("Loading %s, which will require %dMB RAM and %dMB VRAM", modelID, memory.RAM/1024/1024, memory.VRAM/1024/1024)
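
Aside: a minimal, self-contained sketch of the sentinel-error pattern this diff introduces, assuming only the names it adds (ErrGGUFParse, RequiredMemory with RAM/VRAM fields). The parseModel and loadModel functions below are hypothetical stand-ins for GetRequiredMemoryForModel and the loader, not the real backend code:

package main

import (
	"errors"
	"fmt"
)

// ErrGGUFParse mirrors the sentinel declared in pkg/inference/backend.go.
var ErrGGUFParse = errors.New("failed to parse GGUF file")

// RequiredMemory mirrors inference.RequiredMemory as used in the diff.
type RequiredMemory struct {
	RAM  uint64
	VRAM uint64
}

// parseModel is a hypothetical stand-in for the backend: it hides the
// underlying parser failure behind the sentinel, so callers can match on
// errors.Is without depending on the parser package's error types.
func parseModel(path string) (*RequiredMemory, error) {
	if path == "unknown.gguf" {
		return nil, ErrGGUFParse
	}
	return &RequiredMemory{RAM: 1 << 30, VRAM: 1 << 31}, nil
}

// loadModel mirrors the loader's fallback: a parse failure downgrades to a
// warning and zeroed memory requirements; any other error still aborts.
func loadModel(path string) error {
	memory, err := parseModel(path)
	if errors.Is(err, ErrGGUFParse) {
		fmt.Printf("warn: could not parse %s, skipping memory checks\n", path)
		memory = &RequiredMemory{RAM: 0, VRAM: 0}
	} else if err != nil {
		return err
	}
	fmt.Printf("loading %s (RAM=%dMB, VRAM=%dMB)\n", path, memory.RAM/1024/1024, memory.VRAM/1024/1024)
	return nil
}

func main() {
	_ = loadModel("known.gguf")   // normal path: real requirements enforced
	_ = loadModel("unknown.gguf") // fallback path: memory checks bypassed
}

One trade-off worth noting: because the backend returns the bare sentinel rather than wrapping the parser error (fmt.Errorf("...: %w", ..., err) would preserve it), the original failure detail survives only in the backend's warn log, not in the error chain seen by the scheduler.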