Implement basic memory estimation in scheduler

First-pass implementation of memory estimation logic in the model scheduler. This change relies heavily on gguf-parser-go to calculate the estimated peak memory required to run inference with a given model. It adds GetRequiredMemoryForModel() to the Backend interface so that each backend can interpret its own configuration and calculate its required memory usage from it.

Signed-off-by: Piotr Stankiewicz <piotr.stankiewicz@docker.com>
parent 606aead0e5
commit a4dc5834d1
Makefile
@@ -17,7 +17,7 @@ LLAMA_ARGS ?=
 
 # Build the Go application
 build:
-	CGO_ENABLED=0 go build -ldflags="-s -w" -o $(APP_NAME) ./main.go
+	CGO_ENABLED=1 go build -ldflags="-s -w" -o $(APP_NAME) ./main.go
 
 # Run the application locally
 run: build
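Note: flipping CGO_ENABLED from 0 to 1 appears to be required by the Metal-backed VRAM query introduced later in this commit, which is called through cgo.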
@@ -34,6 +34,11 @@ type BackendConfiguration struct {
 	RuntimeFlags []string `json:"runtime-flags,omitempty"`
 }
 
+type RequiredMemory struct {
+	RAM  uint64
+	VRAM uint64 // TODO(p1-0tr): for now assume we are working with single GPU set-ups
+}
+
 // Backend is the interface implemented by inference engine backends. Backend
 // implementations need not be safe for concurrent invocation of the following
 // methods, though their underlying server implementations do need to support
@@ -76,4 +81,7 @@ type Backend interface {
 	Status() string
 	// GetDiskUsage returns the disk usage of the backend.
 	GetDiskUsage() (int64, error)
+	// GetRequiredMemoryForModel returns the required working memory for a given
+	// model.
+	GetRequiredMemoryForModel(model string, config *BackendConfiguration) (*RequiredMemory, error)
 }
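For orientation, a minimal sketch of how scheduler-side code could consume the new interface method; canFit and availableVRAM are illustrative names, not part of this commit:

package scheduling

import "github.com/docker/model-runner/pkg/inference"

// canFit is a hypothetical helper (not in this commit): estimate the model's
// working memory via the backend and compare it against a VRAM budget.
func canFit(b inference.Backend, model string, cfg *inference.BackendConfiguration, availableVRAM uint64) (bool, error) {
	req, err := b.GetRequiredMemoryForModel(model, cfg)
	if err != nil {
		return false, err
	}
	// The llama.cpp backend below reports RAM as 0 and only estimates VRAM.
	return req.VRAM <= availableVRAM, nil
}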
@@ -13,6 +13,8 @@ import (
 	"runtime"
 	"strings"
 
+	parser "github.com/gpustack/gguf-parser-go"
+
 	"github.com/docker/model-runner/pkg/diskusage"
 	"github.com/docker/model-runner/pkg/inference"
 	"github.com/docker/model-runner/pkg/inference/config"
@@ -213,3 +215,38 @@ func (l *llamaCpp) GetDiskUsage() (int64, error) {
 	}
 	return size, nil
 }
+
+func (l *llamaCpp) GetRequiredMemoryForModel(model string, config *inference.BackendConfiguration) (*inference.RequiredMemory, error) {
+	mdl, err := l.modelManager.GetModel(model)
+	if err != nil {
+		return nil, err
+	}
+	mdlPath, err := mdl.GGUFPath()
+	if err != nil {
+		return nil, err
+	}
+	mdlGguf, err := parser.ParseGGUFFile(mdlPath)
+	if err != nil {
+		return nil, err
+	}
+	mdlConfig, err := mdl.Config()
+	if err != nil {
+		return nil, err
+	}
+
+	contextSize := GetContextSize(&mdlConfig, config)
+
+	// FIXME(p1-0tr): for now assume we are running on GPU (single one) - Devices[1];
+	// sum up weights + kv cache + context for an estimate of total GPU memory needed
+	// while running inference with the given model
+	estimate := mdlGguf.EstimateLLaMACppRun(parser.WithLLaMACppContextSize(int32(contextSize)),
+		// FIXME(p1-0tr): add logic for resolving other param values, instead of hardcoding them
+		parser.WithLLaMACppLogicalBatchSize(2048),
+		parser.WithLLaMACppOffloadLayers(100))
+	memory := uint64(estimate.Devices[1].Weight.Sum() + estimate.Devices[1].KVCache.Sum() + estimate.Devices[1].Computation.Sum())
+
+	return &inference.RequiredMemory{
+		RAM:  0,
+		VRAM: memory,
+	}, nil
+}
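For reference, the gguf-parser-go estimation used above can also be exercised stand-alone. A rough sketch (the GGUF path is a placeholder; the option values mirror the hardcoded ones above):

package main

import (
	"fmt"

	parser "github.com/gpustack/gguf-parser-go"
)

func main() {
	// Placeholder path; point this at any local GGUF file.
	gf, err := parser.ParseGGUFFile("/tmp/model.gguf")
	if err != nil {
		panic(err)
	}
	est := gf.EstimateLLaMACppRun(
		parser.WithLLaMACppContextSize(4096),
		parser.WithLLaMACppLogicalBatchSize(2048),
		parser.WithLLaMACppOffloadLayers(100),
	)
	// Per the backend code above, Devices[1] is the (single) GPU;
	// Devices[0] is the host.
	for i, d := range est.Devices {
		fmt.Printf("device %d: weights=%v kv-cache=%v compute=%v\n",
			i, d.Weight.Sum(), d.KVCache.Sum(), d.Computation.Sum())
	}
}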
@@ -57,16 +57,10 @@ func (c *Config) GetArgs(model types.Model, socket string, mode inference.Backen
 		args = append(args, "--embeddings")
 	}
 
-	// Add arguments from model config
-	if modelCfg.ContextSize != nil {
-		args = append(args, "--ctx-size", strconv.FormatUint(*modelCfg.ContextSize, 10))
-	}
+	args = append(args, "--ctx-size", strconv.FormatUint(GetContextSize(&modelCfg, config), 10))
 
 	// Add arguments from backend config
 	if config != nil {
-		if config.ContextSize > 0 && !containsArg(args, "--ctx-size") {
-			args = append(args, "--ctx-size", strconv.FormatInt(config.ContextSize, 10))
-		}
 		args = append(args, config.RuntimeFlags...)
 	}
 
@@ -79,6 +73,19 @@ func (c *Config) GetArgs(model types.Model, socket string, mode inference.Backen
 	return args, nil
 }
 
+func GetContextSize(modelCfg *types.Config, backendCfg *inference.BackendConfiguration) uint64 {
+	// Model config takes precedence
+	if modelCfg != nil && modelCfg.ContextSize != nil {
+		return *modelCfg.ContextSize
+	}
+	// else use backend config
+	if backendCfg != nil && backendCfg.ContextSize > 0 {
+		return uint64(backendCfg.ContextSize)
+	}
+	// finally return default
+	return 4096 // llama.cpp default
+}
+
 // containsArg checks if the given argument is already in the args slice.
 func containsArg(args []string, arg string) bool {
 	for _, a := range args {
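To make the precedence concrete, an illustrative fragment (values are made up; the field names follow the configs used elsewhere in this diff):

ctxSize := uint64(8192)
modelCfg := &types.Config{ContextSize: &ctxSize}
backendCfg := &inference.BackendConfiguration{ContextSize: 2048}

GetContextSize(modelCfg, backendCfg)        // 8192: model config wins
GetContextSize(&types.Config{}, backendCfg) // 2048: falls back to backend config
GetContextSize(nil, nil)                    // 4096: llama.cpp default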
@@ -62,3 +62,7 @@ func (m *mlx) Status() string {
 func (m *mlx) GetDiskUsage() (int64, error) {
 	return 0, nil
 }
+
+func (m *mlx) GetRequiredMemoryForModel(model string, config *inference.BackendConfiguration) (*inference.RequiredMemory, error) {
+	return nil, errors.New("not implemented")
+}
@@ -62,3 +62,7 @@ func (v *vLLM) Status() string {
 func (v *vLLM) GetDiskUsage() (int64, error) {
 	return 0, nil
 }
+
+func (v *vLLM) GetRequiredMemoryForModel(model string, config *inference.BackendConfiguration) (*inference.RequiredMemory, error) {
+	return nil, errors.New("not implemented")
+}
@@ -132,21 +132,11 @@ func newLoader(
 	}
 
 	// Compute the amount of available memory.
-	//
-	// TODO: For now, we treat the system as having memory size 1 and all models
-	// as having size 1 (and thus we'll only load a single model at a time).
-	// However, the loader is designed to use "real" values for each and to
-	// schedule appropriately. Thus, we should switch to polling the system
-	// VRAM size here (and potentially even reserving a portion of it) and
-	// computing model size through estimation (using parameter count and
-	// quantization data type size).
-	//
-	// HACK: On GPU-enabled cloud engines, we'll bump this to 2. We can remove
-	// this once we have VRAM estimation.
-	totalMemory := uint64(1)
-	if isGPUEnabledCloudEnvironment {
-		totalMemory = 2
-	}
+	vramSize, err := getVRAMSize() // FIXME(p1-0tr): only implemented on macOS for now
+	if err != nil {
+		return nil // FIXME(p1-0tr): should forward the error
+	}
+	totalMemory := vramSize
 
 	// Create the loader.
 	l := &loader{
@@ -400,14 +390,16 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
 
 	// Estimate the amount of memory that will be used by the model and check
 	// that we're even capable of loading it.
-	//
-	// TODO: For now, we treat the system as having memory size 1 and all models
-	// as having size 1 (and thus we'll only load a single model at a time).
-	// However, the loader is designed to use "real" values for each and to
-	// schedule appropriately. Thus, we should switch to computing model size
-	// here through estimation (using parameter count and quantization data type
-	// size).
-	memory := uint64(1)
+	var runnerConfig *inference.BackendConfiguration
+	if rc, ok := l.runnerConfigs[runnerKey{backendName, model, mode}]; ok {
+		runnerConfig = &rc
+	}
+	reqMemory, err := backend.GetRequiredMemoryForModel(model, runnerConfig)
+	if err != nil {
+		return nil, err
+	}
+	memory := reqMemory.VRAM // FIXME(p1-0tr): extend loader to reason about RAM and VRAM (and multiple VRAM sets in the future)
+	l.log.Infof("Loading %s, which will require %dMB of working memory", model, memory/1024.0/1024.0)
 	if memory > l.totalMemory {
 		return nil, errModelTooBig
 	}
@@ -0,0 +1,17 @@
+package scheduling
+
+/*
+#cgo LDFLAGS: -framework Metal
+#include "metal.h"
+*/
+import "C"
+import "errors"
+
+// getVRAMSize returns total system GPU memory in bytes
+func getVRAMSize() (uint64, error) {
+	vramSize := C.getVRAMSize()
+	if vramSize == 0 {
+		return 0, errors.New("could not get metal VRAM size")
+	}
+	return uint64(vramSize), nil
+}
@@ -0,0 +1,5 @@
+// +build darwin
+
+#include <stddef.h>
+
+size_t getVRAMSize();
@@ -0,0 +1,13 @@
+// +build darwin
+
+#include <Metal/Metal.h>
+
+#include "metal.h"
+
+size_t getVRAMSize() {
+	id<MTLDevice> device = MTLCreateSystemDefaultDevice();
+	if (device) {
+		return [device recommendedMaxWorkingSetSize];
+	}
+	return 0;
+}
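The VRAM query above is only wired up for macOS (see the FIXME in newLoader). A non-darwin counterpart is not part of this commit; if one were added behind a build tag, a hypothetical stub could look like:

//go:build !darwin

package scheduling

import "errors"

// getVRAMSize stub for non-darwin platforms (hypothetical; this commit only
// ships the Metal-backed implementation above).
func getVRAMSize() (uint64, error) {
	return 0, errors.New("VRAM size detection is not implemented on this platform")
}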