diff --git a/pkg/inference/models/api.go b/pkg/inference/models/api.go index a21864d..196d64f 100644 --- a/pkg/inference/models/api.go +++ b/pkg/inference/models/api.go @@ -14,6 +14,9 @@ import ( type ModelCreateRequest struct { // From is the name of the model to pull. From string `json:"from"` + // IgnoreRuntimeMemoryCheck, when true, makes the server skip its check for sufficient + // memory to run the given model (assuming default configuration). + IgnoreRuntimeMemoryCheck bool `json:"ignore-runtime-memory-check,omitempty"` } // ToOpenAIList converts the model list to its OpenAI API representation. This function never diff --git a/pkg/inference/models/manager.go b/pkg/inference/models/manager.go index 90946f4..4f79dcf 100644 --- a/pkg/inference/models/manager.go +++ b/pkg/inference/models/manager.go @@ -168,16 +168,19 @@ func (m *Manager) handleCreateModel(w http.ResponseWriter, r *http.Request) { // Pull the model. In the future, we may support additional operations here // besides pulling (such as model building). - proceed, err := m.memoryEstimator.HaveSufficientMemoryForModel(r.Context(), request.From, nil) - if err != nil { - m.log.Warnf("Failed to calculate memory required for model %q: %s", request.From, err) - // Prefer staying functional in case of unexpected estimation errors. - proceed = true - } - if !proceed { - m.log.Warnf("Runtime memory requirement for model %q exceeds total system memory", request.From) - http.Error(w, "Runtime memory requirement for model exceeds total system memory", http.StatusInsufficientStorage) - return + if !request.IgnoreRuntimeMemoryCheck { + m.log.Infof("Will estimate memory required for %q", request.From) + proceed, err := m.memoryEstimator.HaveSufficientMemoryForModel(r.Context(), request.From, nil) + if err != nil { + m.log.Warnf("Failed to calculate memory required for model %q: %s", request.From, err) + // Prefer staying functional in case of unexpected estimation errors. 
+ proceed = true + } + if !proceed { + m.log.Warnf("Runtime memory requirement for model %q exceeds total system memory", request.From) + http.Error(w, "Runtime memory requirement for model exceeds total system memory", http.StatusInsufficientStorage) + return + } } if err := m.PullModel(request.From, r, w); err != nil { if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {