inference: Support disabling pre-pull memory checks

Signed-off-by: Piotr Stankiewicz <piotr.stankiewicz@docker.com>
This commit is contained in:
Piotr Stankiewicz 2025-08-19 16:20:04 +02:00 committed by Piotr
parent 15e31feb30
commit 64c85dcd83
2 changed files with 16 additions and 10 deletions

View File

@ -14,6 +14,9 @@ import (
type ModelCreateRequest struct { type ModelCreateRequest struct {
// From is the name of the model to pull. // From is the name of the model to pull.
From string `json:"from"` From string `json:"from"`
// IgnoreRuntimeMemoryCheck, when true, makes the server skip the check that it has sufficient
// memory to run the given model (assuming default configuration) before pulling it.
IgnoreRuntimeMemoryCheck bool `json:"ignore-runtime-memory-check,omitempty"`
} }
// ToOpenAIList converts the model list to its OpenAI API representation. This function never // ToOpenAIList converts the model list to its OpenAI API representation. This function never

View File

@ -168,16 +168,19 @@ func (m *Manager) handleCreateModel(w http.ResponseWriter, r *http.Request) {
// Pull the model. In the future, we may support additional operations here // Pull the model. In the future, we may support additional operations here
// besides pulling (such as model building). // besides pulling (such as model building).
proceed, err := m.memoryEstimator.HaveSufficientMemoryForModel(r.Context(), request.From, nil) if !request.IgnoreRuntimeMemoryCheck {
if err != nil { m.log.Infof("Will estimate memory required for %q", request.From)
m.log.Warnf("Failed to calculate memory required for model %q: %s", request.From, err) proceed, err := m.memoryEstimator.HaveSufficientMemoryForModel(r.Context(), request.From, nil)
// Prefer staying functional in case of unexpected estimation errors. if err != nil {
proceed = true m.log.Warnf("Failed to calculate memory required for model %q: %s", request.From, err)
} // Prefer staying functional in case of unexpected estimation errors.
if !proceed { proceed = true
m.log.Warnf("Runtime memory requirement for model %q exceeds total system memory", request.From) }
http.Error(w, "Runtime memory requirement for model exceeds total system memory", http.StatusInsufficientStorage) if !proceed {
return m.log.Warnf("Runtime memory requirement for model %q exceeds total system memory", request.From)
http.Error(w, "Runtime memory requirement for model exceeds total system memory", http.StatusInsufficientStorage)
return
}
} }
if err := m.PullModel(request.From, r, w); err != nil { if err := m.PullModel(request.From, r, w); err != nil {
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {