package llamacpp

import (
	"bufio"
	"context"
	"errors"
	"fmt"
	"io"
	"io/fs"
	"net/http"
	"os"
	"os/exec"
	"path/filepath"
	"regexp"
	"runtime"
	"strings"

	v1 "github.com/google/go-containerregistry/pkg/v1"
	parser "github.com/gpustack/gguf-parser-go"

	"github.com/docker/model-distribution/types"

	"github.com/docker/model-runner/pkg/diskusage"
	"github.com/docker/model-runner/pkg/inference"
	"github.com/docker/model-runner/pkg/inference/config"
	"github.com/docker/model-runner/pkg/inference/models"
	"github.com/docker/model-runner/pkg/logging"
	"github.com/docker/model-runner/pkg/tailbuffer"
)

const (
	// Name is the backend name.
	Name = "llama.cpp"
)

// llamaCpp is the llama.cpp-based backend implementation.
type llamaCpp struct {
	// log is the associated logger.
	log logging.Logger
	// modelManager is the shared model manager.
	modelManager *models.Manager
	// serverLog is the logger to use for the llama.cpp server process.
	serverLog logging.Logger
	// updatedLlamaCpp indicates whether an updated com.docker.llama-server was
	// downloaded successfully and should be used instead of the vendored one.
	updatedLlamaCpp bool
	// vendoredServerStoragePath is the parent path of the vendored version of com.docker.llama-server.
	vendoredServerStoragePath string
	// updatedServerStoragePath is the parent path of the updated version of com.docker.llama-server.
	// It is also where updates will be stored when downloaded.
	updatedServerStoragePath string
	// status is the state that the llama.cpp backend is currently in.
	status string
	// config is the configuration for the llama.cpp backend.
	config config.BackendConfig
	// gpuSupported indicates whether the underlying llama-server is built with GPU support.
	gpuSupported bool
}

// New creates a new llama.cpp-based backend.
func New(
	log logging.Logger,
	modelManager *models.Manager,
	serverLog logging.Logger,
	vendoredServerStoragePath string,
	updatedServerStoragePath string,
	conf config.BackendConfig,
) (inference.Backend, error) {
	// If no config is provided, use the default configuration
	if conf == nil {
		conf = NewDefaultLlamaCppConfig()
	}

	return &llamaCpp{
		log:                       log,
		modelManager:              modelManager,
		serverLog:                 serverLog,
		vendoredServerStoragePath: vendoredServerStoragePath,
		updatedServerStoragePath:  updatedServerStoragePath,
		config:                    conf,
	}, nil
}

// Name implements inference.Backend.Name.
func (l *llamaCpp) Name() string {
	return Name
}

// UsesExternalModelManagement implements
// inference.Backend.UsesExternalModelManagement.
func (l *llamaCpp) UsesExternalModelManagement() bool {
	return false
}

// Install implements inference.Backend.Install.
func (l *llamaCpp) Install(ctx context.Context, httpClient *http.Client) error {
	l.updatedLlamaCpp = false

	// We don't support this backend on Intel Macs, and on Windows we only
	// support the amd64 and arm64 architectures.
	if (runtime.GOOS == "darwin" && runtime.GOARCH == "amd64") ||
		(runtime.GOOS == "windows" && !(runtime.GOARCH == "amd64" || runtime.GOARCH == "arm64")) {
		return errors.New("platform not supported")
	}

	llamaServerBin := "com.docker.llama-server"
	if runtime.GOOS == "windows" {
		llamaServerBin = "com.docker.llama-server.exe"
	}

	l.status = "installing"

	// Temporary workaround for dynamically downloading llama.cpp from Docker Hub.
	// Internet access and an available docker/docker-model-backend-llamacpp:latest on Docker Hub are required.
	// Even if docker/docker-model-backend-llamacpp:latest has been downloaded before, we still require its
	// digest to be equal to the one on Docker Hub.
	llamaCppPath := filepath.Join(l.updatedServerStoragePath, llamaServerBin)
	if err := l.ensureLatestLlamaCpp(ctx, l.log, httpClient, llamaCppPath, l.vendoredServerStoragePath); err != nil {
		l.log.Infof("failed to ensure latest llama.cpp: %v\n", err)
		if !(errors.Is(err, errLlamaCppUpToDate) || errors.Is(err, errLlamaCppUpdateDisabled)) {
			l.status = fmt.Sprintf("failed to install llama.cpp: %v", err)
		}
		if errors.Is(err, context.Canceled) {
			return err
		}
	} else {
		l.updatedLlamaCpp = true
	}

	l.gpuSupported = l.checkGPUSupport(ctx)
	l.log.Infof("installed llama-server with gpuSupport=%t", l.gpuSupported)

	return nil
}

// Run implements inference.Backend.Run.
func (l *llamaCpp) Run(ctx context.Context, socket, model string, mode inference.BackendMode, config *inference.BackendConfiguration) error {
	mdl, err := l.modelManager.GetModel(model)
	if err != nil {
		return fmt.Errorf("failed to get model: %w", err)
	}

	if err := os.RemoveAll(socket); err != nil && !errors.Is(err, fs.ErrNotExist) {
		l.log.Warnf("failed to remove socket file %s: %v\n", socket, err)
		l.log.Warnln("llama.cpp may not be able to start")
	}

	binPath := l.vendoredServerStoragePath
	if l.updatedLlamaCpp {
		binPath = l.updatedServerStoragePath
	}

	args, err := l.config.GetArgs(mdl, socket, mode, config)
	if err != nil {
		return fmt.Errorf("failed to get args for llama.cpp: %w", err)
	}

	l.log.Infof("llamaCppArgs: %v", args)
	llamaCppProcess := exec.CommandContext(
		ctx,
		filepath.Join(binPath, "com.docker.llama-server"),
		args...,
	)
	llamaCppProcess.Cancel = func() error {
		if runtime.GOOS == "windows" {
			return llamaCppProcess.Process.Kill()
		}
		return llamaCppProcess.Process.Signal(os.Interrupt)
	}
	tailBuf := tailbuffer.NewTailBuffer(1024)
	serverLogStream := l.serverLog.Writer()
	out := io.MultiWriter(serverLogStream, tailBuf)
	llamaCppProcess.Stdout = serverLogStream
	llamaCppProcess.Stderr = out

	if err := llamaCppProcess.Start(); err != nil {
		return fmt.Errorf("unable to start llama.cpp: %w", err)
	}

	llamaCppErrors := make(chan error, 1)
	go func() {
		llamaCppErr := llamaCppProcess.Wait()
		serverLogStream.Close()

		errOutput := new(strings.Builder)
		if _, err := io.Copy(errOutput, tailBuf); err != nil {
			l.log.Warnf("failed to read server output tail: %v", err)
		}

		if len(errOutput.String()) != 0 {
			llamaCppErr = fmt.Errorf("llama.cpp exit status: %w\nwith output: %s", llamaCppErr, errOutput.String())
		} else {
			llamaCppErr = fmt.Errorf("llama.cpp exit status: %w", llamaCppErr)
		}

		llamaCppErrors <- llamaCppErr
		close(llamaCppErrors)
		if err := os.Remove(socket); err != nil && !errors.Is(err, fs.ErrNotExist) {
			l.log.Warnf("failed to remove socket file %s on exit: %v\n", socket, err)
		}
	}()
	defer func() {
		<-llamaCppErrors
	}()

	select {
	case <-ctx.Done():
		return nil
	case llamaCppErr := <-llamaCppErrors:
		select {
		case <-ctx.Done():
			return nil
		default:
		}
		return fmt.Errorf("llama.cpp terminated unexpectedly: %w", llamaCppErr)
	}
}

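// Status returns the backend's current status string (e.g. "installing" or an
// installation failure message), as set during Install.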
func (l *llamaCpp) Status() string {
	return l.status
}

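// GetDiskUsage reports the size on disk of the updated llama-server storage path.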
func (l *llamaCpp) GetDiskUsage() (int64, error) {
	size, err := diskusage.Size(l.updatedServerStoragePath)
	if err != nil {
		return 0, fmt.Errorf("error while getting store size: %v", err)
	}
	return size, nil
}

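// GetRequiredMemoryForModel estimates the RAM and VRAM needed to run the given
// model, based on its GGUF metadata (parsed from the local store when available,
// remotely otherwise) and the configured context size.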
func (l *llamaCpp) GetRequiredMemoryForModel(ctx context.Context, model string, config *inference.BackendConfiguration) (*inference.RequiredMemory, error) {
	var mdlGguf *parser.GGUFFile
	var mdlConfig types.Config
	inStore, err := l.modelManager.IsModelInStore(model)
	if err != nil {
		return nil, fmt.Errorf("checking if model is in local store: %w", err)
	}
	if inStore {
		mdlGguf, mdlConfig, err = l.parseLocalModel(model)
		if err != nil {
			return nil, &inference.ErrGGUFParse{Err: err}
		}
	} else {
		mdlGguf, mdlConfig, err = l.parseRemoteModel(ctx, model)
		if err != nil {
			return nil, &inference.ErrGGUFParse{Err: err}
		}
	}

	contextSize := GetContextSize(&mdlConfig, config)

	ngl := uint64(0)
	if l.gpuSupported {
		if runtime.GOOS == "windows" && runtime.GOARCH == "arm64" && mdlConfig.Quantization != "Q4_0" {
			ngl = 0 // only Q4_0 models can be accelerated on Adreno
		} else {
			ngl = 100
		}
	}

	// TODO(p1-0tr): for now assume we are running on GPU (single one) - Devices[1];
	// sum up weights + kv cache + context for an estimate of total GPU memory needed
	// while running inference with the given model
	estimate := mdlGguf.EstimateLLaMACppRun(parser.WithLLaMACppContextSize(int32(contextSize)),
		// TODO(p1-0tr): add logic for resolving other param values, instead of hardcoding them
		parser.WithLLaMACppLogicalBatchSize(2048),
		parser.WithLLaMACppOffloadLayers(ngl))
	ram := uint64(estimate.Devices[0].Weight.Sum() + estimate.Devices[0].KVCache.Sum() + estimate.Devices[0].Computation.Sum())
	var vram uint64
	if len(estimate.Devices) > 1 {
		vram = uint64(estimate.Devices[1].Weight.Sum() + estimate.Devices[1].KVCache.Sum() + estimate.Devices[1].Computation.Sum())
	}

	if runtime.GOOS == "windows" && runtime.GOARCH == "arm64" {
		// TODO(p1-0tr): for now, on windows/arm64, stick to the old behaviour of allowing
		// one model at a time. This workaround requires gpuinfo.GetVRAMSize to return 1.
		vram = 1
	}

	return &inference.RequiredMemory{
		RAM:  ram,
		VRAM: vram,
	}, nil
}

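// parseLocalModel parses the GGUF file and configuration of a model present in
// the local store.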
func (l *llamaCpp) parseLocalModel(model string) (*parser.GGUFFile, types.Config, error) {
	mdl, err := l.modelManager.GetModel(model)
	if err != nil {
		return nil, types.Config{}, fmt.Errorf("getting model(%s): %w", model, err)
	}
	mdlPath, err := mdl.GGUFPath()
	if err != nil {
		return nil, types.Config{}, fmt.Errorf("getting gguf path for model(%s): %w", model, err)
	}
	mdlGguf, err := parser.ParseGGUFFile(mdlPath)
	if err != nil {
		return nil, types.Config{}, fmt.Errorf("parsing gguf(%s): %w", mdlPath, err)
	}
	mdlConfig, err := mdl.Config()
	if err != nil {
		return nil, types.Config{}, fmt.Errorf("accessing model(%s) config: %w", model, err)
	}
	return mdlGguf, mdlConfig, nil
}

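// parseRemoteModel parses the GGUF metadata and configuration of a model that
// is not in the local store by reading its single GGUF layer directly from the
// remote registry.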
func (l *llamaCpp) parseRemoteModel(ctx context.Context, model string) (*parser.GGUFFile, types.Config, error) {
	mdl, err := l.modelManager.GetRemoteModel(ctx, model)
	if err != nil {
		return nil, types.Config{}, fmt.Errorf("getting remote model(%s): %w", model, err)
	}
	layers, err := mdl.Layers()
	if err != nil {
		return nil, types.Config{}, fmt.Errorf("getting layers of model(%s): %w", model, err)
	}
	ggufLayers := getGGUFLayers(layers)
	if len(ggufLayers) != 1 {
		return nil, types.Config{}, fmt.Errorf(
			"remote memory estimation only supported for models with single GGUF layer, found %d layers", len(ggufLayers),
		)
	}
	ggufDigest, err := ggufLayers[0].Digest()
	if err != nil {
		return nil, types.Config{}, fmt.Errorf("getting digest of GGUF layer for model(%s): %w", model, err)
	}
	if ggufDigest.String() == "" {
		return nil, types.Config{}, fmt.Errorf("model(%s) has no GGUF layer", model)
	}
	blobURL, err := l.modelManager.GetRemoteModelBlobURL(model, ggufDigest)
	if err != nil {
		return nil, types.Config{}, fmt.Errorf("getting GGUF blob URL for model(%s): %w", model, err)
	}
	tok, err := l.modelManager.BearerTokenForModel(ctx, model)
	if err != nil {
		return nil, types.Config{}, fmt.Errorf("getting bearer token for model(%s): %w", model, err)
	}
	mdlGguf, err := parser.ParseGGUFFileRemote(ctx, blobURL, parser.UseBearerAuth(tok))
	if err != nil {
		return nil, types.Config{}, fmt.Errorf("parsing GGUF for model(%s): %w", model, err)
	}
	config, err := mdl.Config()
	if err != nil {
		return nil, types.Config{}, fmt.Errorf("getting config for model(%s): %w", model, err)
	}
	return mdlGguf, config, nil
}

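// getGGUFLayers filters the given layers down to those with the GGUF media type.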
func getGGUFLayers(layers []v1.Layer) []v1.Layer {
	var filtered []v1.Layer
	for _, layer := range layers {
		mt, err := layer.MediaType()
		if err != nil {
			continue
		}
		if mt == types.MediaTypeGGUF {
			filtered = append(filtered, layer)
		}
	}
	return filtered
}

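// checkGPUSupport runs com.docker.llama-server with --list-devices and reports
// whether at least one device is listed, indicating that the binary was built
// with GPU support.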
func (l *llamaCpp) checkGPUSupport(ctx context.Context) bool {
	binPath := l.vendoredServerStoragePath
	if l.updatedLlamaCpp {
		binPath = l.updatedServerStoragePath
	}
	out, err := exec.CommandContext(
		ctx,
		filepath.Join(binPath, "com.docker.llama-server"),
		"--list-devices",
	).CombinedOutput()
	if err != nil {
		l.log.Warnf("Failed to determine if llama-server is built with GPU support: %s", err)
		return false
	}
	sc := bufio.NewScanner(strings.NewReader(string(out)))
	expectDev := false
	devRe := regexp.MustCompile(`\s{2}.*:\s`)
	ndevs := 0
	for sc.Scan() {
		if expectDev {
			if devRe.MatchString(sc.Text()) {
				ndevs++
			}
		} else {
			expectDev = strings.HasPrefix(sc.Text(), "Available devices:")
		}
	}
	return ndevs > 0
}