Merge remote-tracking branch 'origin/main' into shards

commit 8d5f251df7
@@ -13,6 +13,9 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v4

+      - name: Verify vendor/ is not present
+        run: stat vendor && exit 1 || exit 0
+
       - name: Set up Go
         uses: actions/setup-go@v5
         with:
@@ -5,3 +5,4 @@ model-runner.sock
 models-store/
 # Directory where we store the updated llama.cpp
 updated-inference/
+vendor/
main.go | 25
@@ -14,6 +14,7 @@ import (
 	"github.com/docker/model-runner/pkg/inference"
 	"github.com/docker/model-runner/pkg/inference/backends/llamacpp"
 	"github.com/docker/model-runner/pkg/inference/config"
+	"github.com/docker/model-runner/pkg/inference/memory"
 	"github.com/docker/model-runner/pkg/inference/models"
 	"github.com/docker/model-runner/pkg/inference/scheduling"
 	"github.com/docker/model-runner/pkg/metrics"
@@ -54,6 +55,20 @@ func main() {
 		llamacpp.SetDesiredServerVersion(desiredSeverVersion)
 	}

+	llamaServerPath := os.Getenv("LLAMA_SERVER_PATH")
+	if llamaServerPath == "" {
+		llamaServerPath = "/Applications/Docker.app/Contents/Resources/model-runner/bin"
+	}
+
+	gpuInfo := gpuinfo.New(llamaServerPath)
+
+	sysMemInfo, err := memory.NewSystemMemoryInfo(log, gpuInfo)
+	if err != nil {
+		log.Fatalf("unable to initialize system memory info: %v", err)
+	}
+
+	memEstimator := memory.NewEstimator(sysMemInfo)
+
 	modelManager := models.NewManager(
 		log,
 		models.ClientConfig{
@@ -61,13 +76,9 @@ func main() {
 			Logger: log.WithFields(logrus.Fields{"component": "model-manager"}),
 		},
 		nil,
+		memEstimator,
 	)

-	llamaServerPath := os.Getenv("LLAMA_SERVER_PATH")
-	if llamaServerPath == "" {
-		llamaServerPath = "/Applications/Docker.app/Contents/Resources/model-runner/bin"
-	}
-
 	log.Infof("LLAMA_SERVER_PATH: %s", llamaServerPath)

 	// Create llama.cpp configuration from environment variables
@@ -90,7 +101,7 @@ func main() {
 		log.Fatalf("unable to initialize %s backend: %v", llamacpp.Name, err)
 	}

-	gpuInfo := gpuinfo.New(llamaServerPath)
+	memEstimator.SetDefaultBackend(llamaCppBackend)

 	scheduler := scheduling.NewScheduler(
 		log,
@@ -105,7 +116,7 @@ func main() {
 			"",
 			false,
 		),
-		gpuInfo,
+		sysMemInfo,
 	)

 	router := routing.NewNormalizedServeMux()
@@ -2,7 +2,6 @@ package inference

 import (
 	"context"
-	"errors"
 	"net/http"
 )

@@ -18,9 +17,13 @@ const (
 	BackendModeEmbedding
 )

-var (
-	ErrGGUFParse = errors.New("failed to parse GGUF file")
-)
+type ErrGGUFParse struct {
+	Err error
+}
+
+func (e *ErrGGUFParse) Error() string {
+	return "failed to parse GGUF: " + e.Err.Error()
+}

 // String implements Stringer.String for BackendMode.
 func (m BackendMode) String() string {
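With ErrGGUFParse now a wrapper type rather than a sentinel value, callers move from errors.Is to errors.As, which is what the scheduling/loader hunk further down does. A minimal, self-contained sketch of that calling pattern (the type is copied locally so the snippet compiles on its own; in the repository it lives in pkg/inference):

package main

import (
	"errors"
	"fmt"
)

// Local copy of the new error type, for illustration only; the real one is
// inference.ErrGGUFParse.
type ErrGGUFParse struct {
	Err error
}

func (e *ErrGGUFParse) Error() string {
	return "failed to parse GGUF: " + e.Err.Error()
}

// estimate stands in for a backend call that fails to parse a GGUF file.
func estimate() error {
	return &ErrGGUFParse{Err: errors.New("unknown model architecture")}
}

func main() {
	err := estimate()
	var parseErr *ErrGGUFParse
	if errors.As(err, &parseErr) {
		// Previously this would have been errors.Is(err, ErrGGUFParse); the
		// struct form also carries the underlying cause for logging.
		fmt.Println("GGUF parse failed:", parseErr.Err)
	}
}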
@@ -88,5 +91,5 @@ type Backend interface {
 	GetDiskUsage() (int64, error)
 	// GetRequiredMemoryForModel returns the required working memory for a given
 	// model.
-	GetRequiredMemoryForModel(model string, config *BackendConfiguration) (*RequiredMemory, error)
+	GetRequiredMemoryForModel(ctx context.Context, model string, config *BackendConfiguration) (*RequiredMemory, error)
 }
@@ -15,6 +15,8 @@ import (
 	"runtime"
 	"strings"

+	"github.com/docker/model-distribution/types"
+	v1 "github.com/google/go-containerregistry/pkg/v1"
 	parser "github.com/gpustack/gguf-parser-go"

 	"github.com/docker/model-runner/pkg/diskusage"
@@ -223,23 +225,30 @@ func (l *llamaCpp) GetDiskUsage() (int64, error) {
 	return size, nil
 }

-func (l *llamaCpp) GetRequiredMemoryForModel(model string, config *inference.BackendConfiguration) (*inference.RequiredMemory, error) {
-	bundle, err := l.modelManager.GetBundle(model)
+func (l *llamaCpp) GetRequiredMemoryForModel(ctx context.Context, model string, config *inference.BackendConfiguration) (*inference.RequiredMemory, error) {
+	var mdlGguf *parser.GGUFFile
+	var mdlConfig types.Config
+	inStore, err := l.modelManager.IsModelInStore(model)
 	if err != nil {
-		return nil, fmt.Errorf("getting model(%s): %w", model, err)
+		return nil, fmt.Errorf("checking if model is in local store: %w", err)
+	}
+	if inStore {
+		mdlGguf, mdlConfig, err = l.parseLocalModel(model)
+		if err != nil {
+			return nil, &inference.ErrGGUFParse{Err: err}
+		}
+	} else {
+		mdlGguf, mdlConfig, err = l.parseRemoteModel(ctx, model)
+		if err != nil {
+			return nil, &inference.ErrGGUFParse{Err: err}
+		}
 	}

-	mdlGGUF, err := parser.ParseGGUFFile(bundle.GGUFPath())
-	if err != nil {
-		l.log.Warnf("Failed to parse gguf(%s): %s", bundle.GGUFPath(), err)
-		return nil, inference.ErrGGUFParse
-	}
-
-	contextSize := GetContextSize(bundle.RuntimeConfig(), config)
+	contextSize := GetContextSize(mdlConfig, config)

 	ngl := uint64(0)
 	if l.gpuSupported {
-		if runtime.GOOS == "windows" && runtime.GOARCH == "arm64" && strings.TrimSpace(mdlGGUF.Metadata().FileType.String()) != "Q4_0" {
+		if runtime.GOOS == "windows" && runtime.GOARCH == "arm64" && mdlConfig.Quantization != "Q4_0" {
 			ngl = 0 // only Q4_0 models can be accelerated on Adreno
 		}
 		ngl = 100
@@ -248,7 +257,7 @@ func (l *llamaCpp) GetRequiredMemoryForModel(model string, config *inference.Bac
 	// TODO(p1-0tr): for now assume we are running on GPU (single one) - Devices[1];
 	// sum up weights + kv cache + context for an estimate of total GPU memory needed
 	// while running inference with the given model
-	estimate := mdlGGUF.EstimateLLaMACppRun(parser.WithLLaMACppContextSize(int32(contextSize)),
+	estimate := mdlGguf.EstimateLLaMACppRun(parser.WithLLaMACppContextSize(int32(contextSize)),
 		// TODO(p1-0tr): add logic for resolving other param values, instead of hardcoding them
 		parser.WithLLaMACppLogicalBatchSize(2048),
 		parser.WithLLaMACppOffloadLayers(ngl))
@@ -270,6 +279,63 @@ func (l *llamaCpp) GetRequiredMemoryForModel(model string, config *inference.Bac
 	}, nil
 }

+func (l *llamaCpp) parseLocalModel(model string) (*parser.GGUFFile, types.Config, error) {
+	bundle, err := l.modelManager.GetBundle(model)
+	if err != nil {
+		return nil, types.Config{}, fmt.Errorf("getting model(%s): %w", model, err)
+	}
+	modelGGUF, err := parser.ParseGGUFFile(bundle.GGUFPath())
+	if err != nil {
+		return nil, types.Config{}, fmt.Errorf("parsing gguf(%s): %w", bundle.GGUFPath(), err)
+	}
+	return modelGGUF, bundle.RuntimeConfig(), nil
+}
+
+func (l *llamaCpp) parseRemoteModel(ctx context.Context, model string) (*parser.GGUFFile, types.Config, error) {
+	mdl, err := l.modelManager.GetRemoteModel(ctx, model)
+	if err != nil {
+		return nil, types.Config{}, fmt.Errorf("getting remote model(%s): %w", model, err)
+	}
+	layers, err := mdl.Layers()
+	if err != nil {
+		return nil, types.Config{}, fmt.Errorf("getting layers of model(%s): %w", model, err)
+	}
+	var ggufDigest v1.Hash
+	for _, layer := range layers {
+		mt, err := layer.MediaType()
+		if err != nil {
+			return nil, types.Config{}, fmt.Errorf("getting media type of model(%s) layer: %w", model, err)
+		}
+		if mt == types.MediaTypeGGUF {
+			ggufDigest, err = layer.Digest()
+			if err != nil {
+				return nil, types.Config{}, fmt.Errorf("getting digest of GGUF layer for model(%s): %w", model, err)
+			}
+			break
+		}
+	}
+	if ggufDigest.String() == "" {
+		return nil, types.Config{}, fmt.Errorf("model(%s) has no GGUF layer", model)
+	}
+	blobURL, err := l.modelManager.GetRemoteModelBlobURL(model, ggufDigest)
+	if err != nil {
+		return nil, types.Config{}, fmt.Errorf("getting GGUF blob URL for model(%s): %w", model, err)
+	}
+	tok, err := l.modelManager.BearerTokenForModel(ctx, model)
+	if err != nil {
+		return nil, types.Config{}, fmt.Errorf("getting bearer token for model(%s): %w", model, err)
+	}
+	mdlGguf, err := parser.ParseGGUFFileRemote(ctx, blobURL, parser.UseBearerAuth(tok))
+	if err != nil {
+		return nil, types.Config{}, fmt.Errorf("parsing GGUF for model(%s): %w", model, err)
+	}
+	config, err := mdl.Config()
+	if err != nil {
+		return nil, types.Config{}, fmt.Errorf("getting config for model(%s): %w", model, err)
+	}
+	return mdlGguf, config, nil
+}
+
 func (l *llamaCpp) checkGPUSupport(ctx context.Context) bool {
 	binPath := l.vendoredServerStoragePath
 	if l.updatedLlamaCpp {
@@ -63,6 +63,6 @@ func (m *mlx) GetDiskUsage() (int64, error) {
 	return 0, nil
 }

-func (m *mlx) GetRequiredMemoryForModel(model string, config *inference.BackendConfiguration) (*inference.RequiredMemory, error) {
+func (m *mlx) GetRequiredMemoryForModel(ctx context.Context, model string, config *inference.BackendConfiguration) (*inference.RequiredMemory, error) {
 	return nil, errors.New("not implemented")
 }
@@ -63,6 +63,6 @@ func (v *vLLM) GetDiskUsage() (int64, error) {
 	return 0, nil
 }

-func (v *vLLM) GetRequiredMemoryForModel(model string, config *inference.BackendConfiguration) (*inference.RequiredMemory, error) {
+func (v *vLLM) GetRequiredMemoryForModel(ctx context.Context, model string, config *inference.BackendConfiguration) (*inference.RequiredMemory, error) {
 	return nil, errors.New("not implemented")
 }
@@ -0,0 +1,48 @@
+package memory
+
+import (
+	"context"
+	"errors"
+	"fmt"
+
+	"github.com/docker/model-runner/pkg/inference"
+)
+
+type MemoryEstimator interface {
+	SetDefaultBackend(MemoryEstimatorBackend)
+	GetRequiredMemoryForModel(context.Context, string, *inference.BackendConfiguration) (*inference.RequiredMemory, error)
+	HaveSufficientMemoryForModel(ctx context.Context, model string, config *inference.BackendConfiguration) (bool, error)
+}
+
+type MemoryEstimatorBackend interface {
+	GetRequiredMemoryForModel(context.Context, string, *inference.BackendConfiguration) (*inference.RequiredMemory, error)
+}
+
+type memoryEstimator struct {
+	systemMemoryInfo SystemMemoryInfo
+	defaultBackend   MemoryEstimatorBackend
+}
+
+func NewEstimator(systemMemoryInfo SystemMemoryInfo) MemoryEstimator {
+	return &memoryEstimator{systemMemoryInfo: systemMemoryInfo}
+}
+
+func (m *memoryEstimator) SetDefaultBackend(backend MemoryEstimatorBackend) {
+	m.defaultBackend = backend
+}
+
+func (m *memoryEstimator) GetRequiredMemoryForModel(ctx context.Context, model string, config *inference.BackendConfiguration) (*inference.RequiredMemory, error) {
+	if m.defaultBackend == nil {
+		return nil, errors.New("default backend not configured")
+	}
+
+	return m.defaultBackend.GetRequiredMemoryForModel(ctx, model, config)
+}
+
+func (m *memoryEstimator) HaveSufficientMemoryForModel(ctx context.Context, model string, config *inference.BackendConfiguration) (bool, error) {
+	req, err := m.GetRequiredMemoryForModel(ctx, model, config)
+	if err != nil {
+		return false, fmt.Errorf("estimating required memory for model: %w", err)
+	}
+	return m.systemMemoryInfo.HaveSufficientMemory(*req), nil
+}
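The estimator above is deliberately thin: it is built before any inference backend exists, the default backend is injected afterwards (main.go calls memEstimator.SetDefaultBackend(llamaCppBackend) once the llama.cpp backend has been constructed), and the resulting estimate is compared against SystemMemoryInfo. A self-contained toy of that deferred-injection flow, with the repository's interfaces replaced by simplified stand-ins:

package main

import (
	"context"
	"errors"
	"fmt"
)

// Simplified stand-ins for inference.RequiredMemory and the estimator/backend
// interfaces defined in the new file above.
type RequiredMemory struct{ RAM, VRAM uint64 }

type estimatorBackend interface {
	GetRequiredMemoryForModel(ctx context.Context, model string) (*RequiredMemory, error)
}

type estimator struct {
	total   RequiredMemory // what SystemMemoryInfo would report
	backend estimatorBackend
}

func (e *estimator) SetDefaultBackend(b estimatorBackend) { e.backend = b }

func (e *estimator) HaveSufficientMemoryForModel(ctx context.Context, model string) (bool, error) {
	if e.backend == nil {
		return false, errors.New("default backend not configured")
	}
	req, err := e.backend.GetRequiredMemoryForModel(ctx, model)
	if err != nil {
		return false, err
	}
	return req.RAM <= e.total.RAM && req.VRAM <= e.total.VRAM, nil
}

// fakeBackend plays the role of the llama.cpp backend's GGUF-based estimate.
type fakeBackend struct{}

func (fakeBackend) GetRequiredMemoryForModel(_ context.Context, _ string) (*RequiredMemory, error) {
	return &RequiredMemory{RAM: 8 << 30, VRAM: 6 << 30}, nil // hypothetical figures
}

func main() {
	// Built first, before any backend exists (as in main()).
	est := &estimator{total: RequiredMemory{RAM: 16 << 30, VRAM: 8 << 30}}
	// Injected later, once the backend has been constructed.
	est.SetDefaultBackend(fakeBackend{})
	ok, err := est.HaveSufficientMemoryForModel(context.Background(), "ai/example")
	fmt.Println(ok, err) // true <nil>
}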
@@ -0,0 +1,55 @@
+package memory
+
+import (
+	"github.com/docker/model-runner/pkg/gpuinfo"
+	"github.com/docker/model-runner/pkg/inference"
+	"github.com/docker/model-runner/pkg/logging"
+	"github.com/elastic/go-sysinfo"
+)
+
+type SystemMemoryInfo interface {
+	HaveSufficientMemory(inference.RequiredMemory) bool
+	GetTotalMemory() inference.RequiredMemory
+}
+
+type systemMemoryInfo struct {
+	log         logging.Logger
+	totalMemory inference.RequiredMemory
+}
+
+func NewSystemMemoryInfo(log logging.Logger, gpuInfo *gpuinfo.GPUInfo) (SystemMemoryInfo, error) {
+	// Compute the amount of available memory.
+	// TODO(p1-0tr): improve error handling
+	vramSize, err := gpuInfo.GetVRAMSize()
+	if err != nil {
+		vramSize = 1
+		log.Warnf("Could not read VRAM size: %s", err)
+	} else {
+		log.Infof("Running on system with %d MB VRAM", vramSize/1024/1024)
+	}
+	ramSize := uint64(1)
+	hostInfo, err := sysinfo.Host()
+	if err != nil {
+		log.Warnf("Could not read host info: %s", err)
+	} else {
+		ram, err := hostInfo.Memory()
+		if err != nil {
+			log.Warnf("Could not read host RAM size: %s", err)
+		} else {
+			ramSize = ram.Total
+			log.Infof("Running on system with %d MB RAM", ramSize/1024/1024)
+		}
+	}
+	return &systemMemoryInfo{
+		log:         log,
+		totalMemory: inference.RequiredMemory{RAM: ramSize, VRAM: vramSize},
+	}, nil
+}
+
+func (s *systemMemoryInfo) HaveSufficientMemory(req inference.RequiredMemory) bool {
+	return req.RAM <= s.totalMemory.RAM && req.VRAM <= s.totalMemory.VRAM
+}
+
+func (s *systemMemoryInfo) GetTotalMemory() inference.RequiredMemory {
+	return s.totalMemory
+}
@@ -14,6 +14,9 @@ import (
 type ModelCreateRequest struct {
 	// From is the name of the model to pull.
 	From string `json:"from"`
+	// IgnoreRuntimeMemoryCheck indicates whether the server should check if it has sufficient
+	// memory to run the given model (assuming default configuration).
+	IgnoreRuntimeMemoryCheck bool `json:"ignore-runtime-memory-check,omitempty"`
 }

 // ToOpenAIList converts the model list to its OpenAI API representation. This function never
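The new field is opt-out on the wire: a create request that skips the memory check sends ignore-runtime-memory-check alongside from (the tests in this diff POST such bodies to /models/create, and when the check runs and fails, the handleCreateModel hunk below answers 507 Insufficient Storage). A small stand-alone sketch of the request body, using a local mirror of the struct and a hypothetical model name:

package main

import (
	"encoding/json"
	"fmt"
)

// Local mirror of ModelCreateRequest's JSON shape from the hunk above.
type modelCreateRequest struct {
	From                     string `json:"from"`
	IgnoreRuntimeMemoryCheck bool   `json:"ignore-runtime-memory-check,omitempty"`
}

func main() {
	body, err := json.Marshal(modelCreateRequest{
		From:                     "ai/example-model", // hypothetical model reference
		IgnoreRuntimeMemoryCheck: true,               // skip the pre-pull memory check
	})
	if err != nil {
		panic(err)
	}
	fmt.Println(string(body))
	// Output: {"from":"ai/example-model","ignore-runtime-memory-check":true}
}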
@@ -15,11 +15,12 @@ import (
 	"github.com/docker/model-distribution/distribution"
 	"github.com/docker/model-distribution/registry"
 	"github.com/docker/model-distribution/types"
-	"github.com/sirupsen/logrus"
-
 	"github.com/docker/model-runner/pkg/diskusage"
 	"github.com/docker/model-runner/pkg/inference"
+	"github.com/docker/model-runner/pkg/inference/memory"
 	"github.com/docker/model-runner/pkg/logging"
+	v1 "github.com/google/go-containerregistry/pkg/v1"
+	"github.com/sirupsen/logrus"
 )

 const (
@@ -43,6 +44,8 @@ type Manager struct {
 	registryClient *registry.Client
 	// lock is used to synchronize access to the models manager's router.
 	lock sync.RWMutex
+	// memoryEstimator is used to calculate runtime memory requirements for models.
+	memoryEstimator memory.MemoryEstimator
 }

 type ClientConfig struct {
@@ -57,7 +60,7 @@ type ClientConfig struct {
 }

 // NewManager creates a new model's manager.
-func NewManager(log logging.Logger, c ClientConfig, allowedOrigins []string) *Manager {
+func NewManager(log logging.Logger, c ClientConfig, allowedOrigins []string, memoryEstimator memory.MemoryEstimator) *Manager {
 	// Create the model distribution client.
 	distributionClient, err := distribution.NewClient(
 		distribution.WithStoreRootPath(c.StoreRootPath),
@@ -84,6 +87,7 @@ func NewManager(log logging.Logger, c ClientConfig, allowedOrigins []string) *Ma
 		router:             http.NewServeMux(),
 		distributionClient: distributionClient,
 		registryClient:     registryClient,
+		memoryEstimator:    memoryEstimator,
 	}

 	// Register routes.
@@ -164,6 +168,20 @@ func (m *Manager) handleCreateModel(w http.ResponseWriter, r *http.Request) {

 	// Pull the model. In the future, we may support additional operations here
 	// besides pulling (such as model building).
+	if !request.IgnoreRuntimeMemoryCheck {
+		m.log.Infof("Will estimate memory required for %q", request.From)
+		proceed, err := m.memoryEstimator.HaveSufficientMemoryForModel(r.Context(), request.From, nil)
+		if err != nil {
+			m.log.Warnf("Failed to calculate memory required for model %q: %s", request.From, err)
+			// Prefer staying functional in case of unexpected estimation errors.
+			proceed = true
+		}
+		if !proceed {
+			m.log.Warnf("Runtime memory requirement for model %q exceeds total system memory", request.From)
+			http.Error(w, "Runtime memory requirement for model exceeds total system memory", http.StatusInsufficientStorage)
+			return
+		}
+	}
 	if err := m.PullModel(request.From, r, w); err != nil {
 		if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
 			m.log.Infof("Request canceled/timed out while pulling model %q", request.From)
@@ -563,6 +581,11 @@ func (m *Manager) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 	m.router.ServeHTTP(w, r)
 }

+// IsModelInStore checks if a given model is in the local store.
+func (m *Manager) IsModelInStore(ref string) (bool, error) {
+	return m.distributionClient.IsModelInStore(ref)
+}
+
 // GetModel returns a single model.
 func (m *Manager) GetModel(ref string) (types.Model, error) {
 	model, err := m.distributionClient.GetModel(ref)
@@ -572,6 +595,33 @@ func (m *Manager) GetModel(ref string) (types.Model, error) {
 	return model, err
 }

+// GetRemoteModel returns a single remote model.
+func (m *Manager) GetRemoteModel(ctx context.Context, ref string) (types.ModelArtifact, error) {
+	model, err := m.registryClient.Model(ctx, ref)
+	if err != nil {
+		return nil, fmt.Errorf("error while getting remote model: %w", err)
+	}
+	return model, nil
+}
+
+// GetRemoteModelBlobURL returns the URL of a given model blob.
+func (m *Manager) GetRemoteModelBlobURL(ref string, digest v1.Hash) (string, error) {
+	blobURL, err := m.registryClient.BlobURL(ref, digest)
+	if err != nil {
+		return "", fmt.Errorf("error while getting remote model blob URL: %w", err)
+	}
+	return blobURL, nil
+}
+
+// BearerTokenForModel returns the bearer token needed to pull a given model.
+func (m *Manager) BearerTokenForModel(ctx context.Context, ref string) (string, error) {
+	tok, err := m.registryClient.BearerToken(ctx, ref)
+	if err != nil {
+		return "", fmt.Errorf("error while getting bearer token for model: %w", err)
+	}
+	return tok, nil
+}
+
 // GetBundle returns model bundle.
 func (m *Manager) GetBundle(ref string) (types.ModelBundle, error) {
 	bundle, err := m.distributionClient.GetBundle(ref)
@@ -16,10 +16,23 @@ import (
 	"github.com/docker/model-distribution/builder"
 	reg "github.com/docker/model-distribution/registry"
 	"github.com/docker/model-runner/pkg/inference"
+	"github.com/docker/model-runner/pkg/inference/memory"

 	"github.com/sirupsen/logrus"
 )

+type mockMemoryEstimator struct{}
+
+func (me *mockMemoryEstimator) SetDefaultBackend(_ memory.MemoryEstimatorBackend) {}
+
+func (me *mockMemoryEstimator) GetRequiredMemoryForModel(_ context.Context, _ string, _ *inference.BackendConfiguration) (*inference.RequiredMemory, error) {
+	return &inference.RequiredMemory{RAM: 0, VRAM: 0}, nil
+}
+
+func (me *mockMemoryEstimator) HaveSufficientMemoryForModel(_ context.Context, _ string, _ *inference.BackendConfiguration) (bool, error) {
+	return true, nil
+}
+
 // getProjectRoot returns the absolute path to the project root directory
 func getProjectRoot(t *testing.T) string {
 	// Start from the current test file's directory
@@ -109,10 +122,11 @@ func TestPullModel(t *testing.T) {
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			log := logrus.NewEntry(logrus.StandardLogger())
+			memEstimator := &mockMemoryEstimator{}
 			m := NewManager(log, ClientConfig{
 				StoreRootPath: tempDir,
 				Logger:        log.WithFields(logrus.Fields{"component": "model-manager"}),
-			}, nil)
+			}, nil, memEstimator)

 			r := httptest.NewRequest("POST", "/models/create", strings.NewReader(`{"from": "`+tag+`"}`))
 			if tt.acceptHeader != "" {
@@ -219,12 +233,13 @@ func TestHandleGetModel(t *testing.T) {
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			log := logrus.NewEntry(logrus.StandardLogger())
+			memEstimator := &mockMemoryEstimator{}
 			m := NewManager(log, ClientConfig{
 				StoreRootPath: tempDir,
 				Logger:        log.WithFields(logrus.Fields{"component": "model-manager"}),
 				Transport:     http.DefaultTransport,
 				UserAgent:     "test-agent",
-			}, nil)
+			}, nil, memEstimator)

 			// First pull the model if we're testing local access
 			if !tt.remote && !strings.Contains(tt.modelName, "nonexistent") {
@@ -10,12 +10,11 @@ import (
 	"time"

 	"github.com/docker/model-runner/pkg/environment"
-	"github.com/docker/model-runner/pkg/gpuinfo"
 	"github.com/docker/model-runner/pkg/inference"
+	"github.com/docker/model-runner/pkg/inference/memory"
 	"github.com/docker/model-runner/pkg/inference/models"
 	"github.com/docker/model-runner/pkg/logging"
 	"github.com/docker/model-runner/pkg/metrics"
-	"github.com/elastic/go-sysinfo"
 )

 const (
@@ -113,7 +112,7 @@ func newLoader(
 	backends map[string]inference.Backend,
 	modelManager *models.Manager,
 	openAIRecorder *metrics.OpenAIRecorder,
-	gpuInfo *gpuinfo.GPUInfo,
+	sysMemInfo memory.SystemMemoryInfo,
 ) *loader {
 	// Compute the number of runner slots to allocate. Because of RAM and VRAM
 	// limitations, it's unlikely that we'll ever be able to fully populate
@@ -135,32 +134,7 @@ func newLoader(
 	}

 	// Compute the amount of available memory.
-	// TODO(p1-0tr): improve error handling
-	vramSize, err := gpuInfo.GetVRAMSize()
-	if err != nil {
-		vramSize = 1
-		log.Warnf("Could not read VRAM size: %s", err)
-	} else {
-		log.Infof("Running on system with %dMB VRAM", vramSize/1024/1024)
-	}
-	ramSize := uint64(1)
-	hostInfo, err := sysinfo.Host()
-	if err != nil {
-		log.Warnf("Could not read host info: %s", err)
-	} else {
-		ram, err := hostInfo.Memory()
-		if err != nil {
-			log.Warnf("Could not read host RAM size: %s", err)
-		} else {
-			ramSize = ram.Total
-			log.Infof("Running on system with %dMB RAM", ramSize/1024/1024)
-		}
-	}
-
-	totalMemory := inference.RequiredMemory{
-		RAM:  ramSize,
-		VRAM: vramSize,
-	}
+	totalMemory := sysMemInfo.GetTotalMemory()

 	// Create the loader.
 	l := &loader{
@@ -420,12 +394,13 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
 	if rc, ok := l.runnerConfigs[runnerKey{backendName, modelID, mode}]; ok {
 		runnerConfig = &rc
 	}
-	memory, err := backend.GetRequiredMemoryForModel(modelID, runnerConfig)
-	if errors.Is(err, inference.ErrGGUFParse) {
+	memory, err := backend.GetRequiredMemoryForModel(ctx, modelID, runnerConfig)
+	var parseErr *inference.ErrGGUFParse
+	if errors.As(err, &parseErr) {
 		// TODO(p1-0tr): For now override memory checks in case model can't be parsed
 		// e.g. model is too new for gguf-parser-go to know. We should provide a cleaner
 		// way to bypass these checks.
-		l.log.Warnf("Could not parse model(%s), memory checks will be ignored for it.", modelID)
+		l.log.Warnf("Could not parse model(%s), memory checks will be ignored for it. Error: %s", modelID, parseErr)
 		memory = &inference.RequiredMemory{
 			RAM:  0,
 			VRAM: 0,
@@ -433,7 +408,7 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
 	} else if err != nil {
 		return nil, err
 	}
-	l.log.Infof("Loading %s, which will require %dMB RAM and %dMB VRAM", modelID, memory.RAM/1024/1024, memory.VRAM/1024/1024)
+	l.log.Infof("Loading %s, which will require %d MB RAM and %d MB VRAM on a system with %d MB RAM and %d MB VRAM", modelID, memory.RAM/1024/1024, memory.VRAM/1024/1024, l.totalMemory.RAM/1024/1024, l.totalMemory.VRAM/1024/1024)
 	if l.totalMemory.RAM == 1 {
 		l.log.Warnf("RAM size unknown. Assume model will fit, but only one.")
 		memory.RAM = 1
@@ -13,8 +13,8 @@ import (
 	"time"

 	"github.com/docker/model-distribution/distribution"
-	"github.com/docker/model-runner/pkg/gpuinfo"
 	"github.com/docker/model-runner/pkg/inference"
+	"github.com/docker/model-runner/pkg/inference/memory"
 	"github.com/docker/model-runner/pkg/inference/models"
 	"github.com/docker/model-runner/pkg/logging"
 	"github.com/docker/model-runner/pkg/metrics"
@@ -56,7 +56,7 @@ func NewScheduler(
 	httpClient *http.Client,
 	allowedOrigins []string,
 	tracker *metrics.Tracker,
-	gpuInfo *gpuinfo.GPUInfo,
+	sysMemInfo memory.SystemMemoryInfo,
 ) *Scheduler {
 	openAIRecorder := metrics.NewOpenAIRecorder(log.WithField("component", "openai-recorder"), modelManager)

@@ -67,7 +67,7 @@ func NewScheduler(
 		defaultBackend: defaultBackend,
 		modelManager:   modelManager,
 		installer:      newInstaller(log, backends, httpClient),
-		loader:         newLoader(log, backends, modelManager, openAIRecorder, gpuInfo),
+		loader:         newLoader(log, backends, modelManager, openAIRecorder, sysMemInfo),
 		router:         http.NewServeMux(),
 		tracker:        tracker,
 		openAIRecorder: openAIRecorder,