inference: Support disabling pre-pull memory checks

Signed-off-by: Piotr Stankiewicz <piotr.stankiewicz@docker.com>
2025-08-19 16:20:04 +02:00 · 2025-08-19 16:20:04 +02:00 · 64c85dcd83
parent 15e31feb30
commit 64c85dcd83
2 changed files with 16 additions and 10 deletions
--- a/pkg/inference/models/api.go
+++ b/pkg/inference/models/api.go
@ -14,6 +14,9 @@ import (
 type ModelCreateRequest struct {
 	// From is the name of the model to pull.
 	From string `json:"from"`
 	// IgnoreRuntimeMemoryCheck, when true, tells the server to skip the pre-pull check of
 	// whether it has sufficient memory to run the given model (assuming default
 	// configuration). When false or omitted, the check is performed before pulling.
 	IgnoreRuntimeMemoryCheck bool `json:"ignore-runtime-memory-check,omitempty"`
 }
 // ToOpenAIList converts the model list to its OpenAI API representation. This function never
--- a/pkg/inference/models/manager.go
+++ b/pkg/inference/models/manager.go
@ -168,16 +168,19 @@ func (m *Manager) handleCreateModel(w http.ResponseWriter, r *http.Request) {
 	// Pull the model. In the future, we may support additional operations here
 	// besides pulling (such as model building).
-	proceed, err := m.memoryEstimator.HaveSufficientMemoryForModel(r.Context(), request.From, nil)
+	if !request.IgnoreRuntimeMemoryCheck {
-	if err != nil {
+		m.log.Infof("Will estimate memory required for %q", request.From)
-		m.log.Warnf("Failed to calculate memory required for model %q: %s", request.From, err)
+		proceed, err := m.memoryEstimator.HaveSufficientMemoryForModel(r.Context(), request.From, nil)
-		// Prefer staying functional in case of unexpected estimation errors.
+		if err != nil {
-		proceed = true
+			m.log.Warnf("Failed to calculate memory required for model %q: %s", request.From, err)
-	}
+			// Prefer staying functional in case of unexpected estimation errors.
-	if !proceed {
+			proceed = true
-		m.log.Warnf("Runtime memory requirement for model %q exceeds total system memory", request.From)
+		}
-		http.Error(w, "Runtime memory requirement for model exceeds total system memory", http.StatusInsufficientStorage)
+		if !proceed {
-		return
+			m.log.Warnf("Runtime memory requirement for model %q exceeds total system memory", request.From)
 			http.Error(w, "Runtime memory requirement for model exceeds total system memory", http.StatusInsufficientStorage)
 			return
 		}
 	}
 	if err := m.PullModel(request.From, r, w); err != nil {
 		if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {