model-runner/pkg/inference/scheduling/api.go

package scheduling
import (
	"strings"
	"time"

	"github.com/docker/model-runner/pkg/inference"
)

const (
	// maximumOpenAIInferenceRequestSize is the maximum OpenAI API embedding or
	// completion request size that the Scheduler will allow. It should be
	// large enough to encompass any real-world request but small enough to
	// mitigate denial-of-service attacks.
	maximumOpenAIInferenceRequestSize = 10 * 1024 * 1024
)
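
// As an illustrative sketch (not the scheduler's actual enforcement code), a
// limit like this is typically applied by wrapping the request body before
// decoding, e.g. with the standard library's http.MaxBytesReader, where w and
// r are the handler's http.ResponseWriter and *http.Request:
//
//	body := http.MaxBytesReader(w, r.Body, maximumOpenAIInferenceRequestSize)
//	var request OpenAIInferenceRequest
//	err := json.NewDecoder(body).Decode(&request)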

// trimRequestPathToOpenAIRoot trims a request path to start at the first
// instance of /v1/ to appear in the path.
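//
// For example (the engine prefix below is illustrative):
//
//	trimRequestPathToOpenAIRoot("/engines/llama.cpp/v1/chat/completions") // "/v1/chat/completions"
//	trimRequestPathToOpenAIRoot("/models")                                // "/models" (no /v1/, returned unchanged)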
func trimRequestPathToOpenAIRoot(path string) string {
	index := strings.Index(path, "/v1/")
	if index == -1 {
		return path
	}
	return path[index:]
}

// backendModeForRequest determines the backend operation mode needed to handle
// an OpenAI inference request. Its second return value is true if and only if
// a valid mode could be determined.
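//
// For example (paths illustrative):
//
//	backendModeForRequest("/engines/llama.cpp/v1/embeddings") // inference.BackendModeEmbedding, true
//	backendModeForRequest("/engines/llama.cpp/v1/models")     // inference.BackendMode(0), false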
func backendModeForRequest(path string) (inference.BackendMode, bool) {
	if strings.HasSuffix(path, "/v1/chat/completions") || strings.HasSuffix(path, "/v1/completions") {
		return inference.BackendModeCompletion, true
	} else if strings.HasSuffix(path, "/v1/embeddings") {
		return inference.BackendModeEmbedding, true
	}
	return inference.BackendMode(0), false
}

// OpenAIInferenceRequest is used to extract the model specification from
// either a chat completion or embedding request in the OpenAI API.
type OpenAIInferenceRequest struct {
	// Model is the requested model name.
	Model string `json:"model"`
}
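
// As a sketch of intended use (model name illustrative; other request fields
// are deliberately ignored during decoding):
//
//	var request OpenAIInferenceRequest
//	_ = json.Unmarshal([]byte(`{"model": "ai/llama3.2", "stream": true}`), &request)
//	// request.Model == "ai/llama3.2"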

// BackendStatus represents information about a running backend.
type BackendStatus struct {
	// BackendName is the name of the backend.
	BackendName string `json:"backend_name"`
	// ModelName is the name of the model loaded in the backend.
	ModelName string `json:"model_name"`
	// Mode is the mode the backend is operating in.
	Mode string `json:"mode"`
	// LastUsed represents when this (backend, model, mode) tuple was last
	// used.
	LastUsed time.Time `json:"last_used,omitempty"`
}