Add Status to Backend interface
Signed-off-by: Dorin Geman <dorin.geman@docker.com>
This commit is contained in:
parent 5e4719501a
commit e5d5ccf2dd
@@ -67,4 +67,6 @@ type Backend interface {
     // instead load only the specified model. Backends should still respond to
     // OpenAI API requests for other models with a 421 error code.
     Run(ctx context.Context, socket, model string, mode BackendMode) error
+    // Status returns in which state the backend is in.
+    Status() string
 }
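To make the new contract concrete, here is a minimal, self-contained sketch of a type satisfying just the added method; statusReporter, stubBackend, and its status field are illustrative only and not part of this commit.

package main

import "fmt"

// statusReporter mirrors only the piece of the Backend interface added here:
// a human-readable description of the backend's current state.
type statusReporter interface {
	Status() string
}

// stubBackend is a hypothetical implementation used purely for illustration.
type stubBackend struct {
	status string
}

// Status returns the last recorded state, e.g. "installing" or "running".
func (b *stubBackend) Status() string {
	return b.status
}

func main() {
	var r statusReporter = &stubBackend{status: "installing"}
	fmt.Println(r.Status()) // prints: installing
}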
@@ -23,9 +23,12 @@ const (
     hubRepo = "docker-model-backend-llamacpp"
 )
 
-var ShouldUseGPUVariant bool
+var (
+    ShouldUseGPUVariant bool
+    errLlamaCppUpToDate = errors.New("bundled llama.cpp version is up to date, no need to update")
+)
 
-func downloadLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient *http.Client,
+func (l *llamaCpp) downloadLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient *http.Client,
     llamaCppPath, vendoredServerStoragePath, desiredVersion, desiredVariant string,
 ) error {
     url := fmt.Sprintf("https://hub.docker.com/v2/namespaces/%s/repositories/%s/tags", hubNamespace, hubRepo)
@@ -71,7 +74,9 @@ func downloadLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient
     if err != nil {
         return fmt.Errorf("failed to read bundled llama.cpp version: %w", err)
     } else if strings.TrimSpace(string(data)) == latest {
-        return errors.New("bundled llama.cpp version is up to date, no need to update")
+        l.status = fmt.Sprintf("running llama.cpp %s (%s) version: %s",
+            desiredTag, latest, getLlamaCppVersion(log, filepath.Join(vendoredServerStoragePath, "com.docker.llama-server")))
+        return errLlamaCppUpToDate
     }
 
     data, err = os.ReadFile(currentVersionFile)
@@ -81,6 +86,8 @@ func downloadLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient
     } else if strings.TrimSpace(string(data)) == latest {
         log.Infoln("current llama.cpp version is already up to date")
         if _, err := os.Stat(llamaCppPath); err == nil {
+            l.status = fmt.Sprintf("running llama.cpp %s (%s) version: %s",
+                desiredTag, latest, getLlamaCppVersion(log, llamaCppPath))
             return nil
         }
         log.Infoln("llama.cpp binary must be updated, proceeding to update it")
@@ -95,6 +102,7 @@ func downloadLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient
     }
     defer os.RemoveAll(downloadDir)
 
+    l.status = fmt.Sprintf("downloading %s (%s) variant of llama.cpp", desiredTag, latest)
     if err := extractFromImage(ctx, log, image, runtime.GOOS, runtime.GOARCH, downloadDir); err != nil {
         return fmt.Errorf("could not extract image: %w", err)
     }
@@ -130,7 +138,8 @@ func downloadLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient
     }
 
     log.Infoln("successfully updated llama.cpp binary")
-    log.Infoln("running llama.cpp version:", getLlamaCppVersion(log, llamaCppPath))
+    l.status = fmt.Sprintf("running llama.cpp %s (%s) version: %s", desiredTag, latest, getLlamaCppVersion(log, llamaCppPath))
+    log.Infoln(l.status)
 
     if err := os.WriteFile(currentVersionFile, []byte(latest), 0o644); err != nil {
         log.Warnf("failed to save llama.cpp version: %v", err)
@@ -7,11 +7,11 @@ import (
     "github.com/docker/model-runner/pkg/logging"
 )
 
-func ensureLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient *http.Client,
+func (l *llamaCpp) ensureLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient *http.Client,
     llamaCppPath, vendoredServerStoragePath string,
 ) error {
     desiredVersion := "latest"
     desiredVariant := "metal"
-    return downloadLatestLlamaCpp(ctx, log, httpClient, llamaCppPath, vendoredServerStoragePath, desiredVersion,
+    return l.downloadLatestLlamaCpp(ctx, log, httpClient, llamaCppPath, vendoredServerStoragePath, desiredVersion,
         desiredVariant)
 }
@@ -8,7 +8,7 @@ import (
     "github.com/docker/model-runner/pkg/logging"
 )
 
-func ensureLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient *http.Client,
+func (l *llamaCpp) ensureLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient *http.Client,
     llamaCppPath, vendoredServerStoragePath string,
 ) error {
     return errors.New("platform is not supported")
@@ -9,7 +9,7 @@ import (
     "github.com/docker/model-runner/pkg/logging"
 )
 
-func ensureLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient *http.Client,
+func (l *llamaCpp) ensureLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient *http.Client,
     llamaCppPath, vendoredServerStoragePath string,
 ) error {
     nvGPUInfoBin := filepath.Join(vendoredServerStoragePath, "com.docker.nv-gpu-info.exe")
@@ -18,6 +18,7 @@ func ensureLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient *h
     if ShouldUseGPUVariant {
         canUseCUDA11, err = hasCUDA11CapableGPU(ctx, nvGPUInfoBin)
         if err != nil {
+            l.status = fmt.Sprintf("failed to check CUDA 11 capability: %v", err)
             return fmt.Errorf("failed to check CUDA 11 capability: %w", err)
         }
     }
@@ -26,6 +27,7 @@ func ensureLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient *h
     if canUseCUDA11 {
         desiredVariant = "cuda"
     }
-    return downloadLatestLlamaCpp(ctx, log, httpClient, llamaCppPath, vendoredServerStoragePath, desiredVersion,
+    l.status = fmt.Sprintf("looking for updates for %s variant", desiredVariant)
+    return l.downloadLatestLlamaCpp(ctx, log, httpClient, llamaCppPath, vendoredServerStoragePath, desiredVersion,
         desiredVariant)
 }
@@ -34,6 +34,8 @@ type llamaCpp struct {
     // updatedServerStoragePath is the parent path of the updated version of com.docker.llama-server.
     // It is also where updates will be stored when downloaded.
     updatedServerStoragePath string
+    // status is the state in which the llama.cpp backend is in.
+    status string
 }
 
 // New creates a new llama.cpp-based backend.
@@ -81,13 +83,18 @@ func (l *llamaCpp) Install(ctx context.Context, httpClient *http.Client) error {
         llamaServerBin = "com.docker.llama-server.exe"
     }
 
+    l.status = "installing"
+
     // Temporary workaround for dynamically downloading llama.cpp from Docker Hub.
     // Internet access and an available docker/docker-model-backend-llamacpp:latest on Docker Hub are required.
     // Even if docker/docker-model-backend-llamacpp:latest has been downloaded before, we still require its
     // digest to be equal to the one on Docker Hub.
     llamaCppPath := filepath.Join(l.updatedServerStoragePath, llamaServerBin)
-    if err := ensureLatestLlamaCpp(ctx, l.log, httpClient, llamaCppPath, l.vendoredServerStoragePath); err != nil {
+    if err := l.ensureLatestLlamaCpp(ctx, l.log, httpClient, llamaCppPath, l.vendoredServerStoragePath); err != nil {
         l.log.Infof("failed to ensure latest llama.cpp: %v\n", err)
+        if !errors.Is(err, errLlamaCppUpToDate) {
+            l.status = fmt.Sprintf("failed to install llama.cpp: %v", err)
+        }
         if errors.Is(err, context.Canceled) {
             return err
         }
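The errLlamaCppUpToDate sentinel lets Install tell "nothing to do" apart from a genuine failure without matching error strings. Below is a standalone sketch of the same errors.Is pattern, with illustrative names (errUpToDate, update) rather than the repository's.

package main

import (
	"errors"
	"fmt"
)

// errUpToDate plays the role of errLlamaCppUpToDate: a package-level sentinel
// that callers can detect with errors.Is even after wrapping.
var errUpToDate = errors.New("already up to date")

// update pretends the version check found nothing to do and wraps the sentinel.
func update() error {
	return fmt.Errorf("checking for updates: %w", errUpToDate)
}

func main() {
	if err := update(); err != nil {
		if errors.Is(err, errUpToDate) {
			fmt.Println("benign:", err) // don't mark the install as failed
			return
		}
		fmt.Println("real failure:", err)
	}
}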
@@ -167,3 +174,7 @@ func (l *llamaCpp) Run(ctx context.Context, socket, model string, mode inference
         return fmt.Errorf("llama.cpp terminated unexpectedly: %w", llamaCppErr)
     }
 }
+
+func (l *llamaCpp) Status() string {
+    return l.status
+}
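One design note: status is written while Install and the download path run, and Status() can be called from the scheduler's HTTP handler at any time. If those calls may overlap, a lock-guarded variant along these lines would avoid a data race; this is a sketch under that assumption, not code from the commit.

package main

import (
	"fmt"
	"sync"
)

// guardedStatus shows one way to make a status string safe for concurrent
// readers and writers; the llamaCpp struct in this commit stores a bare string.
type guardedStatus struct {
	mu     sync.RWMutex
	status string
}

// set records a new state under the write lock.
func (g *guardedStatus) set(s string) {
	g.mu.Lock()
	defer g.mu.Unlock()
	g.status = s
}

// Status returns the current state under the read lock.
func (g *guardedStatus) Status() string {
	g.mu.RLock()
	defer g.mu.RUnlock()
	return g.status
}

func main() {
	var g guardedStatus
	g.set("installing")
	fmt.Println(g.Status()) // prints: installing
}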
@@ -54,3 +54,7 @@ func (m *mlx) Run(ctx context.Context, socket, model string, mode inference.Back
     m.log.Warn("MLX backend is not yet supported")
     return errors.New("not implemented")
 }
+
+func (m *mlx) Status() string {
+    return "not running"
+}
@@ -54,3 +54,7 @@ func (v *vLLM) Run(ctx context.Context, socket, model string, mode inference.Bac
     v.log.Warn("vLLM backend is not yet supported")
     return errors.New("not implemented")
 }
+
+func (v *vLLM) Status() string {
+    return "not running"
+}
@@ -59,16 +59,16 @@ func NewScheduler(
         http.Error(w, "not found", http.StatusNotFound)
     })
 
-    for _, route := range s.GetRoutes() {
-        s.router.HandleFunc(route, s.handleOpenAIInference)
+    for route, handler := range s.routeHandlers() {
+        s.router.HandleFunc(route, handler)
     }
 
     // Scheduler successfully initialized.
     return s
 }
 
-func (s *Scheduler) GetRoutes() []string {
-    return []string{
+func (s *Scheduler) routeHandlers() map[string]http.HandlerFunc {
+    openAIRoutes := []string{
         "POST " + inference.InferencePrefix + "/{backend}/v1/chat/completions",
         "POST " + inference.InferencePrefix + "/{backend}/v1/completions",
         "POST " + inference.InferencePrefix + "/{backend}/v1/embeddings",
@@ -76,6 +76,21 @@ func (s *Scheduler) GetRoutes() []string {
         "POST " + inference.InferencePrefix + "/v1/completions",
         "POST " + inference.InferencePrefix + "/v1/embeddings",
     }
+    m := make(map[string]http.HandlerFunc)
+    for _, route := range openAIRoutes {
+        m[route] = s.handleOpenAIInference
+    }
+    m["GET "+inference.InferencePrefix+"/status"] = s.GetBackendStatus
+    return m
+}
+
+func (s *Scheduler) GetRoutes() []string {
+    routeHandlers := s.routeHandlers()
+    routes := make([]string, 0, len(routeHandlers))
+    for route := range routeHandlers {
+        routes = append(routes, route)
+    }
+    return routes
 }
 
 // Run is the scheduler's main run loop. By the time it returns, all inference
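The map keys combine an HTTP method, inference.InferencePrefix, and a path, e.g. "GET <prefix>/status". Assuming s.router is Go's standard *http.ServeMux (whose patterns since Go 1.22 accept a method prefix and {name} wildcards), registering such a map looks roughly like the sketch below; the prefix value and the handlers are placeholders, not the scheduler's real ones.

package main

import (
	"fmt"
	"net/http"
)

// inferencePrefix stands in for inference.InferencePrefix; the value is illustrative.
const inferencePrefix = "/engines"

func main() {
	handlers := map[string]http.HandlerFunc{
		// Method-prefixed patterns and {backend} wildcards are understood by
		// Go 1.22+'s http.ServeMux.
		"POST " + inferencePrefix + "/{backend}/v1/chat/completions": func(w http.ResponseWriter, r *http.Request) {
			fmt.Fprintf(w, "backend=%s\n", r.PathValue("backend"))
		},
		"GET " + inferencePrefix + "/status": func(w http.ResponseWriter, r *http.Request) {
			fmt.Fprintln(w, "{}")
		},
	}

	// Register every route exactly as NewScheduler does with its own map.
	mux := http.NewServeMux()
	for pattern, handler := range handlers {
		mux.HandleFunc(pattern, handler)
	}

	_ = http.ListenAndServe("localhost:8080", mux)
}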
@@ -196,6 +211,15 @@ func (s *Scheduler) handleOpenAIInference(w http.ResponseWriter, r *http.Request
     runner.ServeHTTP(w, upstreamRequest)
 }
 
+func (s *Scheduler) GetBackendStatus(w http.ResponseWriter, r *http.Request) {
+    status := make(map[string]string)
+    for backendName, backend := range s.backends {
+        status[backendName] = backend.Status()
+    }
+    w.Header().Set("Content-Type", "application/json")
+    json.NewEncoder(w).Encode(status)
+}
+
 func (s *Scheduler) ResetInstaller(httpClient *http.Client) {
     s.installer = newInstaller(s.log, s.backends, httpClient)
 }
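GetBackendStatus encodes a map of backend name to status string as JSON. A small client sketch that consumes it is below; the address, path, and example backend names are assumptions, since the real route is "GET " + inference.InferencePrefix + "/status" on whatever address the model runner listens on.

package main

import (
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// Hypothetical address and prefix for illustration only.
	resp, err := http.Get("http://localhost:8080/engines/status")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// The handler writes a JSON object mapping backend name to status,
	// e.g. {"llama.cpp":"installing","mlx":"not running"} (names illustrative).
	status := make(map[string]string)
	if err := json.NewDecoder(resp.Body).Decode(&status); err != nil {
		panic(err)
	}
	for backend, state := range status {
		fmt.Printf("%s: %s\n", backend, state)
	}
}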