package scheduling

import (
	"strings"
	"time"

	"github.com/docker/model-runner/pkg/inference"
)

const (
	// maximumOpenAIInferenceRequestSize is the maximum OpenAI API embedding or
	// completion request size that the Scheduler will allow. This should be
	// large enough to encompass any real-world request but also small enough
	// to avoid DoS attacks.
	maximumOpenAIInferenceRequestSize = 10 * 1024 * 1024
)

// trimRequestPathToOpenAIRoot trims a request path to start at the first
// instance of /v1/ that appears in the path.
func trimRequestPathToOpenAIRoot(path string) string {
	index := strings.Index(path, "/v1/")
	if index == -1 {
		return path
	}
	return path[index:]
}

// backendModeForRequest determines the backend operation mode needed to handle
// an OpenAI inference request. Its second return value is true if and only if
// a valid mode could be determined.
func backendModeForRequest(path string) (inference.BackendMode, bool) {
	if strings.HasSuffix(path, "/v1/chat/completions") ||
		strings.HasSuffix(path, "/v1/completions") {
		return inference.BackendModeCompletion, true
	} else if strings.HasSuffix(path, "/v1/embeddings") {
		return inference.BackendModeEmbedding, true
	}
	return inference.BackendMode(0), false
}

// OpenAIInferenceRequest is used to extract the model specification from
// either a chat completion or embedding request in the OpenAI API.
type OpenAIInferenceRequest struct {
	// Model is the requested model name.
	Model string `json:"model"`
}

// BackendStatus represents information about a running backend.
type BackendStatus struct {
	// BackendName is the name of the backend.
	BackendName string `json:"backend_name"`
	// ModelName is the name of the model loaded in the backend.
	ModelName string `json:"model_name"`
	// Mode is the mode in which the backend is operating.
	Mode string `json:"mode"`
	// LastUsed records when this (backend, model, mode) tuple was last used.
	LastUsed time.Time `json:"last_used,omitempty"`
}
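
// The two sketches below are illustrative additions, not part of the upstream
// scheduler API; the names resolveBackendMode and idleFor are assumptions made
// for documentation purposes. They compile against this file's existing
// imports.

// resolveBackendMode shows how the path helpers above are meant to compose:
// normalize the path to its OpenAI root, then map the endpoint to a backend
// mode. For example, "/engines/llama.cpp/v1/embeddings" resolves to
// (inference.BackendModeEmbedding, true).
func resolveBackendMode(rawPath string) (inference.BackendMode, bool) {
	return backendModeForRequest(trimRequestPathToOpenAIRoot(rawPath))
}

// idleFor reports whether this (backend, model, mode) tuple has gone unused
// for at least the given duration, the kind of check a scheduler might apply
// when picking an eviction candidate. A zero LastUsed is treated as always
// idle, matching the omitempty tag, which drops the field from JSON when it
// was never set.
func (s BackendStatus) idleFor(threshold time.Duration) bool {
	return s.LastUsed.IsZero() || time.Since(s.LastUsed) >= threshold
}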