model-runner/pkg/inference/scheduling/api.go

package scheduling
import (
	"strings"
	"time"

	"github.com/docker/model-runner/pkg/inference"
)

const (
	// maximumOpenAIInferenceRequestSize is the maximum OpenAI API embedding or
	// completion request size that the Scheduler will allow. It should be
	// large enough to encompass any real-world request but small enough to
	// mitigate denial-of-service attacks.
	maximumOpenAIInferenceRequestSize = 10 * 1024 * 1024
)
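
// As an illustrative sketch (not the scheduler's actual enforcement code), a
// limit like this is typically applied by wrapping the request body before
// decoding, e.g. with the standard library's http.MaxBytesReader, where w and
// r are the handler's http.ResponseWriter and *http.Request:
//
//	body := http.MaxBytesReader(w, r.Body, maximumOpenAIInferenceRequestSize)
//	var request OpenAIInferenceRequest
//	err := json.NewDecoder(body).Decode(&request)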

// trimRequestPathToOpenAIRoot trims a request path to start at the first
// instance of /v1/ to appear in the path.
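//
// For example (the engine prefix below is illustrative):
//
//	trimRequestPathToOpenAIRoot("/engines/llama.cpp/v1/chat/completions") // "/v1/chat/completions"
//	trimRequestPathToOpenAIRoot("/models")                                // "/models" (no /v1/, returned unchanged)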
func trimRequestPathToOpenAIRoot(path string) string {
	index := strings.Index(path, "/v1/")
	if index == -1 {
		return path
	}
	return path[index:]
}

// backendModeForRequest determines the backend operation mode needed to handle
// an OpenAI inference request. Its second return value is true if and only if
// a valid mode could be determined.
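//
// For example (paths illustrative):
//
//	backendModeForRequest("/engines/llama.cpp/v1/embeddings") // inference.BackendModeEmbedding, true
//	backendModeForRequest("/engines/llama.cpp/v1/models")     // inference.BackendMode(0), false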
func backendModeForRequest(path string) (inference.BackendMode, bool) {
	if strings.HasSuffix(path, "/v1/chat/completions") || strings.HasSuffix(path, "/v1/completions") {
		return inference.BackendModeCompletion, true
	} else if strings.HasSuffix(path, "/v1/embeddings") {
		return inference.BackendModeEmbedding, true
	}
	return inference.BackendMode(0), false
}

// OpenAIInferenceRequest is used to extract the model specification from
// either a chat completion or embedding request in the OpenAI API.
type OpenAIInferenceRequest struct {
	// Model is the requested model name.
	Model string `json:"model"`
}
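
// As a sketch of intended use (model name illustrative; other request fields
// are deliberately ignored during decoding):
//
//	var request OpenAIInferenceRequest
//	_ = json.Unmarshal([]byte(`{"model": "ai/llama3.2", "stream": true}`), &request)
//	// request.Model == "ai/llama3.2"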

// BackendStatus represents information about a running backend.
type BackendStatus struct {
	// BackendName is the name of the backend.
	BackendName string `json:"backend_name"`
	// ModelName is the name of the model loaded in the backend.
	ModelName string `json:"model_name"`
	// Mode is the mode the backend is operating in.
	Mode string `json:"mode"`
	// LastUsed represents when this (backend, model, mode) tuple was last
	// used.
	LastUsed time.Time `json:"last_used,omitempty"`
}