Dockerize (#22)

* Adds Makefile for local development
* Fix chat completions example request
* Added delete example
* Dockerize model-runner
* WIP Run container with host access to socket
* Dockerize model-runner
* WIP Run container with host access to socket
* Debugging
* Run in Docker container with TCP port access
* mounted model storage
* - Remove duplication in .gitignore
  - Do not use alpine in builder image
  - NVIDIA seems to use Ubuntu in all of their CDI docs and produces Ubuntu tags for nvidia/cuda but not Debian. So use Ubuntu for our final image.
  For more details: https://github.com/docker/model-runner/pull/22
* - Add MODELS_PATH environment variable to configure model storage location
  - Default to $HOME/.docker/models when MODELS_PATH is not set
  - Update Docker container to use /models as the default storage path
  - Update Makefile to pass MODELS_PATH to container
  - Update Dockerfile to create and set permissions for /models directory
  This change allows users to:
  - Override the model storage location via MODELS_PATH
  - Maintain backward compatibility with default $HOME/.docker/models path
  - Use a more idiomatic folder for /models
* Removes unneeded logs
This commit is contained in:
parent 4239791795
commit dbbb7afe9f
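The MODELS_PATH behaviour described in the commit message reduces to a simple environment-variable fallback. The sketch below restates that lookup outside the full main.go context; it is illustrative only, and `resolveModelsPath` is a hypothetical helper name, not part of this change.

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
)

// resolveModelsPath returns the model storage root: MODELS_PATH if set,
// otherwise the backward-compatible default $HOME/.docker/models.
// (Hypothetical helper; the actual change inlines this logic in main.go.)
func resolveModelsPath() (string, error) {
	if p := os.Getenv("MODELS_PATH"); p != "" {
		return p, nil
	}
	home, err := os.UserHomeDir()
	if err != nil {
		return "", err
	}
	return filepath.Join(home, ".docker", "models"), nil
}

func main() {
	path, err := resolveModelsPath()
	if err != nil {
		panic(err)
	}
	fmt.Println("model storage root:", path)
}
```

In the container the Makefile passes MODELS_PATH=/models, so the home-directory fallback is only hit when running the binary directly on the host.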
.dockerignore (new file)
@@ -0,0 +1,76 @@
# Version control
.git/
.gitignore

# IDE and editor files
.idea/
.vscode/
*.swp
*.swo

# Go build and cache artifacts
/vendor/
*.test
*.out
*.exe
*.dll
*.so
*.dylib
*.a
*.o
*.obj
*.cgo*
*.coverprofile
*.prof
*.tmp
*.log

# Go tool cache
/go-build/
/go-cache/

# Test and development artifacts
test/
tests/
*_test.go

# Build outputs
dist/
build/
out/
debug/

# Temporary and local files
tmp/
temp/
.local/
local/

# Environment and secrets
.env*
*.env
*.pem
*.key
*.crt
config.local.*
*.local.yml

# Documentation and markdown
docs/
*.md
README*
LICENSE

# Docker files
Dockerfile*

# Miscellaneous
*.bak
*.old
*.orig
*.rej
*.DS_Store
.Spotlight-V100
.Trashes

# Exclude nothing else; keep source code and necessary files for build
Dockerfile (new file)
@@ -0,0 +1,63 @@
# syntax=docker/dockerfile:1

ARG GO_VERSION=1.24.2
ARG LLAMA_SERVER_VERSION=latest
ARG LLAMA_BINARY_PATH=/com.docker.llama-server.native.linux.cpu.amd64

FROM golang:${GO_VERSION}-bookworm AS builder

# Install git for go mod download if needed
RUN apt-get update && apt-get install -y --no-install-recommends git && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy go mod/sum first for better caching
COPY --link go.mod go.sum ./

# Download dependencies (with cache mounts)
RUN --mount=type=cache,target=/go/pkg/mod \
    --mount=type=cache,target=/root/.cache/go-build \
    go mod download

# Copy the rest of the source code
COPY --link . .

# Build the Go binary (static build)
RUN --mount=type=cache,target=/go/pkg/mod \
    --mount=type=cache,target=/root/.cache/go-build \
    CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o model-runner ./main.go

# --- Get llama.cpp binary ---
FROM docker/docker-model-backend-llamacpp:${LLAMA_SERVER_VERSION} AS llama-server

# --- Final image ---
FROM ubuntu:24.04 AS final

# Create non-root user
RUN groupadd --system modelrunner && useradd --system --gid modelrunner modelrunner

# Install ca-certificates for HTTPS
RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Create directories for the socket file and llama.cpp binary, and set proper permissions
RUN mkdir -p /var/run/model-runner /app/bin /models && \
    chown -R modelrunner:modelrunner /var/run/model-runner /app/bin /models && \
    chmod -R 755 /models

# Copy the built binary from builder
COPY --from=builder /app/model-runner /app/model-runner

# Copy the llama.cpp binary from the llama-server stage
ARG LLAMA_BINARY_PATH
COPY --from=llama-server ${LLAMA_BINARY_PATH} /app/bin/com.docker.llama-server

USER modelrunner

# Set the environment variable for the socket path and LLaMA server binary path
ENV MODEL_RUNNER_SOCK=/var/run/model-runner/model-runner.sock
ENV LLAMA_SERVER_PATH=/app/bin
ENV HOME=/home/modelrunner

ENTRYPOINT ["/app/model-runner"]
Makefile (49)
@@ -1,9 +1,17 @@
# Project variables
APP_NAME := model-runner
GO_VERSION := 1.23.7
LLAMA_SERVER_VERSION := v0.0.4-rc2-cpu
TARGET_OS := linux
TARGET_ARCH := amd64
ACCEL := cpu
DOCKER_IMAGE := go-model-runner:latest
LLAMA_BINARY := /com.docker.llama-server.native.$(TARGET_OS).$(ACCEL).$(TARGET_ARCH)
PORT := 8080
MODELS_PATH := $(shell pwd)/models

# Main targets
.PHONY: build run clean help
.PHONY: build run clean test docker-build docker-run help

# Default target
.DEFAULT_GOAL := help

@@ -20,11 +28,42 @@ run: build
clean:
	rm -f $(APP_NAME)
	rm -f model-runner.sock
	rm -rf $(MODELS_PATH)

# Run tests
test:
	go test -v ./...

# Build Docker image
docker-build:
	docker build --platform linux/amd64 \
		--build-arg LLAMA_SERVER_VERSION=$(LLAMA_SERVER_VERSION) \
		--build-arg LLAMA_BINARY_PATH=$(LLAMA_BINARY) \
		-t $(DOCKER_IMAGE) .

# Run in Docker container with TCP port access and mounted model storage
docker-run: docker-build
	@echo ""
	@echo "Starting service on port $(PORT) with model storage at $(MODELS_PATH)..."
	@echo "Service will be available at: http://localhost:$(PORT)"
	@echo "Example usage: curl http://localhost:$(PORT)/models"
	@echo ""
	mkdir -p $(MODELS_PATH)
	docker run --rm \
		-p $(PORT):$(PORT) \
		-v "$(MODELS_PATH):/models" \
		-e MODEL_RUNNER_PORT=$(PORT) \
		-e LLAMA_SERVER_PATH=/app/bin \
		-e MODELS_PATH=/models \
		$(DOCKER_IMAGE)

# Show help
help:
	@echo "Available targets:"
	@echo " build - Build the Go application"
	@echo " run - Run the application locally"
	@echo " clean - Clean build artifacts"
	@echo " help - Show this help message"
	@echo " build - Build the Go application"
	@echo " run - Run the application locally"
	@echo " clean - Clean build artifacts"
	@echo " test - Run tests"
	@echo " docker-build - Build Docker image"
	@echo " docker-run - Run in Docker container with TCP port access and mounted model storage"
	@echo " help - Show this help message"
README.md (92)
@@ -23,61 +23,80 @@ The Makefile provides the following targets:
- `build` - Build the Go application
- `run` - Run the application locally
- `clean` - Clean build artifacts
- `test` - Run tests
- `docker-build` - Build the Docker image
- `docker-run` - Run the application in a Docker container with TCP port access and mounted model storage
- `help` - Show available targets

### Examples
### Running in Docker

The application can be run in Docker with the following features enabled by default:
- TCP port access (default port 8080)
- Persistent model storage in a local `models` directory

```sh
# Build the application
make build
# Run with default settings
make docker-run

# Run the application locally
make run

# Show all available targets
make help
# Customize port and model storage location
make docker-run PORT=3000 MODELS_PATH=/path/to/your/models
```

This will:
- Create a `models` directory in your current working directory (or use the specified path)
- Mount this directory into the container
- Start the service on port 8080 (or the specified port)
- All models downloaded will be stored in the host's `models` directory and will persist between container runs

### llama.cpp integration

The Docker image includes the llama.cpp server binary from the `docker/docker-model-backend-llamacpp` image. You can specify the version of the image to use by setting the `LLAMA_SERVER_VERSION` variable. Additionally, you can configure the target OS, architecture, and acceleration type:

```sh
# Build with a specific llama.cpp server version
LLAMA_SERVER_VERSION=v0.0.4-rc2-cpu make docker-build

# Specify all parameters
LLAMA_SERVER_VERSION=v0.0.4-rc2-cpu TARGET_OS=linux TARGET_ARCH=amd64 ACCEL=cpu make docker-build
```

Default values:
- `LLAMA_SERVER_VERSION`: v0.0.4-rc2-cpu
- `TARGETOS`: linux
- `TARGETARCH`: amd64
- `ACCEL`: cpu

The binary path in the image follows this pattern: `/com.docker.llama-server.native.${TARGETOS}.${ACCEL}.${TARGETARCH}`

## API Examples

The Model Runner exposes a REST API over a Unix socket. You can interact with it using curl commands with the `--unix-socket` option.
The Model Runner exposes a REST API that can be accessed via TCP port. You can interact with it using curl commands.

### Listing Models
### Using the API

To list all available models:
When running with `docker-run`, you can use regular HTTP requests:

```sh
curl --unix-socket model-runner.sock localhost/models
```
# List all available models
curl http://localhost:8080/models

### Creating a Model
# Create a new model
curl http://localhost:8080/models/create -X POST -d '{"from": "ai/smollm2"}'

To create a new model:
# Get information about a specific model
curl http://localhost:8080/models/ai/smollm2

```sh
curl --unix-socket model-runner.sock localhost/models/create -X POST -d '{"from": "ai/smollm2"}'
```

### Getting Model Information

To get information about a specific model:

```sh
curl --unix-socket model-runner.sock localhost/models/ai/smollm2
```

### Chatting with a Model

To chat with a model, you can send a POST request to the model's chat endpoint:

```sh
curl --unix-socket model-runner.sock localhost/engines/llama.cpp/v1/chat/completions -X POST -d '{
# Chat with a model
curl http://localhost:8080/engines/llama.cpp/v1/chat/completions -X POST -d '{
  "model": "ai/smollm2",
  "messages": [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello, how are you?"}
  ]
}'

# Delete a model
curl http://localhost:8080/models/ai/smollm2 -X DELETE
```

The response will contain the model's reply:

@@ -105,10 +124,3 @@ The response will contain the model's reply:
  }
}
```

### Deleting a model
To delete a model from the server, send a DELETE request to the model's endpoint:

```sh
curl --unix-socket model-runner.sock localhost/models/ai/smollm2 -X DELETE
```
go.mod (2)
@@ -5,7 +5,7 @@ go 1.23.7
require (
	github.com/containerd/containerd/v2 v2.0.4
	github.com/containerd/platforms v1.0.0-rc.1
	github.com/docker/model-distribution v0.0.0-20250418222450-8f6c05a5ffa4
	github.com/docker/model-distribution v0.0.0-20250423075433-587f475e591d
	github.com/jaypipes/ghw v0.16.0
	github.com/opencontainers/go-digest v1.0.0
	github.com/opencontainers/image-spec v1.1.1
go.sum (4)
@@ -39,6 +39,8 @@ github.com/docker/docker-credential-helpers v0.8.2 h1:bX3YxiGzFP5sOXWc3bTPEXdEaZ
github.com/docker/docker-credential-helpers v0.8.2/go.mod h1:P3ci7E3lwkZg6XiHdRKft1KckHiO9a2rNtyFbZ/ry9M=
github.com/docker/model-distribution v0.0.0-20250418222450-8f6c05a5ffa4 h1:v7DyBUd08t4XQe4NC2Rb8ZSWOiWVXD0TbdgIOQZ/IQo=
github.com/docker/model-distribution v0.0.0-20250418222450-8f6c05a5ffa4/go.mod h1:fd9H/2KAY0+ByxbohWfCkxp0rxf0pqSlHEtFTjTXbLk=
github.com/docker/model-distribution v0.0.0-20250423075433-587f475e591d h1:lyxdxjNHTSyQ2w1rjbuu5pbgX42AD3kmxWLNv3mdqQ4=
github.com/docker/model-distribution v0.0.0-20250423075433-587f475e591d/go.mod h1:dThpO9JoG5Px3i+rTluAeZcqLGw8C0qepuEL4gL2o/c=
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=

@@ -59,6 +61,8 @@ github.com/google/go-containerregistry v0.20.3 h1:oNx7IdTI936V8CQRveCjaxOiegWwvM
github.com/google/go-containerregistry v0.20.3/go.mod h1:w00pIgBRDVUDFM6bq+Qx8lwNWK+cxgCuX1vd3PIBDNI=
github.com/gpustack/gguf-parser-go v0.14.0 h1:9kMdbJ9pHhvEeATx163WE9q+O74bi05KPJLEtVBUjw0=
github.com/gpustack/gguf-parser-go v0.14.0/go.mod h1:GvHh1Kvvq5ojCOsJ5UpwiJJmIjFw3Qk5cW7R+CZ3IJo=
github.com/gpustack/gguf-parser-go v0.14.1 h1:tmz2eTnSEFfE52V10FESqo9oAUquZ6JKQFntWC/wrEg=
github.com/gpustack/gguf-parser-go v0.14.1/go.mod h1:GvHh1Kvvq5ojCOsJ5UpwiJJmIjFw3Qk5cW7R+CZ3IJo=
github.com/henvic/httpretty v0.1.4 h1:Jo7uwIRWVFxkqOnErcoYfH90o3ddQyVrSANeS4cxYmU=
github.com/henvic/httpretty v0.1.4/go.mod h1:Dn60sQTZfbt2dYsdUSNsCljyF4AfdqnuJFDLJA1I4AM=
github.com/jaypipes/ghw v0.16.0 h1:3HurCTS38VNpeQLo5fIdZsySuo/qAfpPSJ5t05QBFPM=
main.go (56)
@@ -32,16 +32,28 @@ func main() {
		log.Fatalf("Failed to get user home directory: %v", err)
	}

	modelPath := os.Getenv("MODELS_PATH")
	if modelPath == "" {
		modelPath = filepath.Join(userHomeDir, ".docker", "models")
	}

	modelManager := models.NewManager(log, models.ClientConfig{
		StoreRootPath: filepath.Join(userHomeDir, ".docker", "models"),
		StoreRootPath: modelPath,
		Logger:        log.WithFields(logrus.Fields{"component": "model-manager"}),
	})

	llamaServerPath := os.Getenv("LLAMA_SERVER_PATH")
	if llamaServerPath == "" {
		llamaServerPath = "/Applications/Docker.app/Contents/Resources/bin"
	}

	log.Infof("LLAMA_SERVER_PATH: %s", llamaServerPath)

	llamaCppBackend, err := llamacpp.New(
		log,
		modelManager,
		log.WithFields(logrus.Fields{"component": "llama.cpp"}),
		"/Applications/Docker.app/Contents/Resources/bin",
		llamaServerPath,
		func() string { wd, _ := os.Getwd(); return wd }(),
	)
	if err != nil {

@@ -64,22 +76,34 @@ func main() {
		router.Handle(route, scheduler)
	}

	if err := os.Remove(sockName); err != nil {
		if !os.IsNotExist(err) {
			log.Fatalf("Failed to remove existing socket: %v", err)
		}
	}
	ln, err := net.ListenUnix("unix", &net.UnixAddr{Name: sockName, Net: "unix"})
	if err != nil {
		log.Fatalf("Failed to listen on socket: %v", err)
	}

	server := &http.Server{Handler: router}
	serverErrors := make(chan error, 1)
	go func() {
		serverErrors <- server.Serve(ln)
	}()
	defer server.Close()

	// Check if we should use TCP port instead of Unix socket
	tcpPort := os.Getenv("MODEL_RUNNER_PORT")
	if tcpPort != "" {
		// Use TCP port
		addr := ":" + tcpPort
		log.Infof("Listening on TCP port %s", tcpPort)
		server.Addr = addr
		go func() {
			serverErrors <- server.ListenAndServe()
		}()
	} else {
		// Use Unix socket
		if err := os.Remove(sockName); err != nil {
			if !os.IsNotExist(err) {
				log.Fatalf("Failed to remove existing socket: %v", err)
			}
		}
		ln, err := net.ListenUnix("unix", &net.UnixAddr{Name: sockName, Net: "unix"})
		if err != nil {
			log.Fatalf("Failed to listen on socket: %v", err)
		}
		go func() {
			serverErrors <- server.Serve(ln)
		}()
	}

	schedulerErrors := make(chan error, 1)
	go func() {
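With the main.go change above, the server listens on TCP when MODEL_RUNNER_PORT is set and otherwise keeps the original Unix-socket listener. Talking to the socket from Go needs a custom dialer; the following is a minimal, illustrative sketch, where the `model-runner.sock` path mirrors the README examples and is an assumption about where the socket lives:

```go
package main

import (
	"context"
	"fmt"
	"io"
	"net"
	"net/http"
)

func main() {
	// Route every request of this client through the runner's Unix socket
	// instead of a TCP address. "model-runner.sock" matches the path used
	// in the README's --unix-socket examples (assumption, not enforced here).
	client := &http.Client{
		Transport: &http.Transport{
			DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) {
				return (&net.Dialer{}).DialContext(ctx, "unix", "model-runner.sock")
			},
		},
	}

	// The host part is ignored when dialing a Unix socket; "localhost" is a placeholder.
	resp, err := client.Get("http://localhost/models")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	body, _ := io.ReadAll(resp.Body)
	fmt.Println(string(body))
}
```

When MODEL_RUNNER_PORT is set, as the Makefile's docker-run target does, the same request is simply an HTTP GET against http://localhost:8080/models.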
@@ -33,6 +33,7 @@ var (
func (l *llamaCpp) downloadLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient *http.Client,
	llamaCppPath, vendoredServerStoragePath, desiredVersion, desiredVariant string,
) error {
	log.Infof("downloadLatestLlamaCpp: %s, %s, %s, %s", desiredVersion, desiredVariant, vendoredServerStoragePath, llamaCppPath)
	desiredTag := desiredVersion + "-" + desiredVariant
	url := fmt.Sprintf("https://hub.docker.com/v2/namespaces/%s/repositories/%s/tags/%s", hubNamespace, hubRepo, desiredTag)
	resp, err := httpClient.Get(url)
@@ -2,7 +2,6 @@ package llamacpp

import (
	"context"
	"errors"
	"net/http"

	"github.com/docker/model-runner/pkg/logging"

@@ -11,5 +10,5 @@ import (
func (l *llamaCpp) ensureLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient *http.Client,
	llamaCppPath, vendoredServerStoragePath string,
) error {
	return errors.New("platform is not supported")
	return errLlamaCppUpToDate
}
@@ -71,11 +71,9 @@ func (l *llamaCpp) UsesExternalModelManagement() bool {
func (l *llamaCpp) Install(ctx context.Context, httpClient *http.Client) error {
	l.updatedLlamaCpp = false

	// We don't currently support this backend on Windows or Linux. We'll likely
	// We don't currently support this backend on Windows. We'll likely
	// never support it on Intel Macs.
	if runtime.GOOS == "linux" {
		return errors.New("not implemented")
	} else if (runtime.GOOS == "darwin" && runtime.GOARCH == "amd64") ||
	if (runtime.GOOS == "darwin" && runtime.GOARCH == "amd64") ||
		(runtime.GOOS == "windows" && !(runtime.GOARCH == "amd64" || runtime.GOARCH == "arm64")) {
		return errors.New("platform not supported")
	}