58 lines
1.9 KiB
Go
58 lines
1.9 KiB
Go
package scheduling
|
|
|
|
import (
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/docker/model-runner/pkg/inference"
|
|
)
|
|
|
|
const (
	// maximumOpenAIInferenceRequestSize caps the size of an OpenAI API
	// embedding or completion request that Scheduler will accept. The limit is
	// meant to be generous enough for any real-world request while remaining
	// small enough to blunt DoS attacks.
	maximumOpenAIInferenceRequestSize = 10 << 20 // 10 * 1024 * 1024
)
|
|
|
|
// trimRequestPathToOpenAIRoot drops everything in path that precedes the first
// occurrence of "/v1/", so the result begins at that marker. A path that does
// not contain "/v1/" is returned unchanged.
func trimRequestPathToOpenAIRoot(path string) string {
	if root := strings.Index(path, "/v1/"); root >= 0 {
		return path[root:]
	}
	return path
}
|
|
|
|
// backendModeForRequest determines the backend operation mode to handle an
|
|
// OpenAI inference request. Its second parameter is true if and only if a valid
|
|
// mode could be determined.
|
|
func backendModeForRequest(path string) (inference.BackendMode, bool) {
|
|
if strings.HasSuffix(path, "/v1/chat/completions") || strings.HasSuffix(path, "/v1/completions") {
|
|
return inference.BackendModeCompletion, true
|
|
} else if strings.HasSuffix(path, "/v1/embeddings") {
|
|
return inference.BackendModeEmbedding, true
|
|
}
|
|
return inference.BackendMode(0), false
|
|
}
|
|
|
|
// OpenAIInferenceRequest captures the one field needed to read the model
// specification out of an OpenAI API chat completion or embedding request.
type OpenAIInferenceRequest struct {
	// Model is the name of the model the request asks for, carried under the
	// "model" key of the JSON body.
	Model string `json:"model"`
}
|
|
|
|
// BackendStatus represents information about a running backend. The field
// tags define its JSON representation.
type BackendStatus struct {
	// BackendName is the name of the backend.
	BackendName string `json:"backend_name"`
	// ModelName is the name of the model loaded in the backend.
	ModelName string `json:"model_name"`
	// Mode is the mode the backend is operating in.
	Mode string `json:"mode"`
	// LastUsed represents when this (backend, model, mode) tuple was last
	// used. A zero time is left out of the JSON encoding. omitzero (Go 1.24+)
	// is used rather than omitempty because encoding/json never treats a
	// struct value such as time.Time as empty, so omitempty had no effect
	// here. Older toolchains ignore the option and always emit the field.
	LastUsed time.Time `json:"last_used,omitzero"`
}
|