inference: Ignore parse errors when estimating model memory
We will run into cases where our model runner is ahead of gguf-parser-go, and we may want to load a model that gguf-parser-go cannot yet parse. For now, ignore model parsing errors in such cases and assume the model requires no resources. In the future we should come up with a cleaner way of dealing with this (e.g. ship a model memory estimator alongside llama-server).

Signed-off-by: Piotr Stankiewicz <piotr.stankiewicz@docker.com>
parent d61ffd5311
commit 1c13e4fc61
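The change boils down to a sentinel error plus an errors.Is fallback at the call site. Below is a minimal, self-contained sketch of that pattern — the names are borrowed from the diff that follows, while the stub parser and main function are hypothetical scaffolding:

package main

import (
    "errors"
    "fmt"
)

// Sentinel reported when GGUF metadata cannot be parsed.
var ErrGGUFParse = errors.New("failed to parse GGUF file")

// RequiredMemory mirrors the shape used in the diff below.
type RequiredMemory struct {
    RAM, VRAM uint64
}

// getRequiredMemory stands in for the backend's GetRequiredMemoryForModel:
// any parser failure surfaces as the sentinel rather than a wrapped error.
func getRequiredMemory(path string) (*RequiredMemory, error) {
    return nil, ErrGGUFParse // pretend gguf-parser-go choked on this file
}

func main() {
    memory, err := getRequiredMemory("too-new-model.gguf")
    if errors.Is(err, ErrGGUFParse) {
        // Model is newer than the parser understands: skip memory checks.
        memory = &RequiredMemory{RAM: 0, VRAM: 0}
    } else if err != nil {
        panic(err)
    }
    fmt.Printf("RAM: %d bytes, VRAM: %d bytes\n", memory.RAM, memory.VRAM)
}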
@@ -2,6 +2,7 @@ package inference
 
 import (
     "context"
+    "errors"
     "net/http"
 )
 
@@ -17,6 +18,10 @@ const (
     BackendModeEmbedding
 )
 
+var (
+    ErrGGUFParse = errors.New("failed to parse GGUF file")
+)
+
 // String implements Stringer.String for BackendMode.
 func (m BackendMode) String() string {
     switch m {
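One property worth noting about this sentinel: the loader matches it with errors.Is rather than ==, so the check would keep working even if an intermediate layer wrapped it with %w for extra context. A quick, hypothetical illustration (the commit itself returns the sentinel unwrapped):

package main

import (
    "errors"
    "fmt"
)

var ErrGGUFParse = errors.New("failed to parse GGUF file")

func main() {
    // Wrapping with %w preserves the error chain that errors.Is walks.
    wrapped := fmt.Errorf("parsing gguf(%s): %w", "/models/foo.gguf", ErrGGUFParse)
    fmt.Println(errors.Is(wrapped, ErrGGUFParse)) // prints: true
}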
@@ -234,7 +234,8 @@ func (l *llamaCpp) GetRequiredMemoryForModel(model string, config *inference.Bac
     }
     mdlGguf, err := parser.ParseGGUFFile(mdlPath)
     if err != nil {
-        return nil, fmt.Errorf("parsing gguf(%s): %w", mdlPath, err)
+        l.log.Warnf("Failed to parse gguf(%s): %s", mdlPath, err)
+        return nil, inference.ErrGGUFParse
     }
     mdlConfig, err := mdl.Config()
     if err != nil {
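Returning the bare sentinel discards the parser's underlying error, which is why this hunk logs it via Warnf before returning. A hypothetical alternative (not what this commit does) is to wrap the cause into the sentinel, keeping both errors.Is matching and the detail in a single value:

package main

import (
    "errors"
    "fmt"
)

var ErrGGUFParse = errors.New("failed to parse GGUF file")

// parse stands in for the real gguf-parser-go call site.
func parse(path string) error {
    cause := errors.New("unsupported tensor type") // hypothetical parser failure
    // Putting the sentinel first with %w keeps errors.Is working while
    // the cause is retained in the message instead of only in a log line.
    return fmt.Errorf("%w: parsing gguf(%s): %v", ErrGGUFParse, path, cause)
}

func main() {
    err := parse("/models/foo.gguf")
    fmt.Println(errors.Is(err, ErrGGUFParse)) // prints: true
    fmt.Println(err)
}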
@@ -421,7 +421,16 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
         runnerConfig = &rc
     }
     memory, err := backend.GetRequiredMemoryForModel(modelID, runnerConfig)
-    if err != nil {
+    if errors.Is(err, inference.ErrGGUFParse) {
+        // TODO(p1-0tr): For now override memory checks in case model can't be parsed
+        // e.g. model is too new for gguf-parser-go to know. We should provide a cleaner
+        // way to bypass these checks.
+        l.log.Warnf("Could not parse model(%s), memory checks will be ignored for it.", modelID)
+        memory = &inference.RequiredMemory{
+            RAM:  0,
+            VRAM: 0,
+        }
+    } else if err != nil {
         return nil, err
     }
     l.log.Infof("Loading %s, which will require %dMB RAM and %dMB VRAM", modelID, memory.RAM/1024/1024, memory.VRAM/1024/1024)
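With the zeroed RequiredMemory, the Infof line above reports 0MB for both values and any comparison against available memory passes trivially — exactly the "assume the model requires no resources" behavior described in the commit message. As an aside, the conversion in that log line is integer division by 1024², i.e. mebibytes despite the "MB" label; a small sketch (the byte unit of RequiredMemory.RAM is an assumption):

package main

import "fmt"

func main() {
    var ram uint64 = 8 * 1024 * 1024 * 1024 // 8 GiB in bytes
    // Same conversion as the Infof call: bytes -> MiB via integer division.
    fmt.Printf("will require %dMB RAM\n", ram/1024/1024) // prints: will require 8192MB RAM
}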