diff --git a/main.go b/main.go
index 95ad930..f024039 100644
--- a/main.go
+++ b/main.go
@@ -116,7 +116,7 @@ func main() {
 			"",
 			false,
 		),
-		gpuInfo,
+		sysMemInfo,
 	)
 
 	router := routing.NewNormalizedServeMux()
diff --git a/pkg/inference/memory/system.go b/pkg/inference/memory/system.go
index 5931d43..fb0a384 100644
--- a/pkg/inference/memory/system.go
+++ b/pkg/inference/memory/system.go
@@ -9,6 +9,7 @@ import (
 
 type SystemMemoryInfo interface {
 	HaveSufficientMemory(inference.RequiredMemory) bool
+	GetTotalMemory() inference.RequiredMemory
 }
 
 type systemMemoryInfo struct {
@@ -48,3 +49,7 @@ func NewSystemMemoryInfo(log logging.Logger, gpuInfo *gpuinfo.GPUInfo) (SystemMe
 func (s *systemMemoryInfo) HaveSufficientMemory(req inference.RequiredMemory) bool {
 	return req.RAM <= s.totalMemory.RAM && req.VRAM <= s.totalMemory.VRAM
 }
+
+func (s *systemMemoryInfo) GetTotalMemory() inference.RequiredMemory {
+	return s.totalMemory
+}
diff --git a/pkg/inference/scheduling/loader.go b/pkg/inference/scheduling/loader.go
index 7ed06a2..69166e0 100644
--- a/pkg/inference/scheduling/loader.go
+++ b/pkg/inference/scheduling/loader.go
@@ -10,12 +10,11 @@ import (
 	"time"
 
 	"github.com/docker/model-runner/pkg/environment"
-	"github.com/docker/model-runner/pkg/gpuinfo"
 	"github.com/docker/model-runner/pkg/inference"
+	"github.com/docker/model-runner/pkg/inference/memory"
 	"github.com/docker/model-runner/pkg/inference/models"
 	"github.com/docker/model-runner/pkg/logging"
 	"github.com/docker/model-runner/pkg/metrics"
-	"github.com/elastic/go-sysinfo"
 )
 
 const (
@@ -113,7 +112,7 @@ func newLoader(
 	backends map[string]inference.Backend,
 	modelManager *models.Manager,
 	openAIRecorder *metrics.OpenAIRecorder,
-	gpuInfo *gpuinfo.GPUInfo,
+	sysMemInfo memory.SystemMemoryInfo,
 ) *loader {
 	// Compute the number of runner slots to allocate. Because of RAM and VRAM
 	// limitations, it's unlikely that we'll ever be able to fully populate
@@ -135,32 +134,7 @@ func newLoader(
 	}
 
 	// Compute the amount of available memory.
-	// TODO(p1-0tr): improve error handling
-	vramSize, err := gpuInfo.GetVRAMSize()
-	if err != nil {
-		vramSize = 1
-		log.Warnf("Could not read VRAM size: %s", err)
-	} else {
-		log.Infof("Running on system with %dMB VRAM", vramSize/1024/1024)
-	}
-	ramSize := uint64(1)
-	hostInfo, err := sysinfo.Host()
-	if err != nil {
-		log.Warnf("Could not read host info: %s", err)
-	} else {
-		ram, err := hostInfo.Memory()
-		if err != nil {
-			log.Warnf("Could not read host RAM size: %s", err)
-		} else {
-			ramSize = ram.Total
-			log.Infof("Running on system with %dMB RAM", ramSize/1024/1024)
-		}
-	}
-
-	totalMemory := inference.RequiredMemory{
-		RAM:  ramSize,
-		VRAM: vramSize,
-	}
+	totalMemory := sysMemInfo.GetTotalMemory()
 
 	// Create the loader.
 	l := &loader{
@@ -434,7 +408,7 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
 	} else if err != nil {
 		return nil, err
 	}
-	l.log.Infof("Loading %s, which will require %dMB RAM and %dMB VRAM", modelID, memory.RAM/1024/1024, memory.VRAM/1024/1024)
+	l.log.Infof("Loading %s, which will require %d MB RAM and %d MB VRAM on a system with %d MB RAM and %d MB VRAM", modelID, memory.RAM/1024/1024, memory.VRAM/1024/1024, l.totalMemory.RAM/1024/1024, l.totalMemory.VRAM/1024/1024)
 	if l.totalMemory.RAM == 1 {
 		l.log.Warnf("RAM size unknown. Assume model will fit, but only one.")
 		memory.RAM = 1
diff --git a/pkg/inference/scheduling/scheduler.go b/pkg/inference/scheduling/scheduler.go
index 3f716e9..b99db11 100644
--- a/pkg/inference/scheduling/scheduler.go
+++ b/pkg/inference/scheduling/scheduler.go
@@ -13,8 +13,8 @@ import (
 	"time"
 
 	"github.com/docker/model-distribution/distribution"
-	"github.com/docker/model-runner/pkg/gpuinfo"
 	"github.com/docker/model-runner/pkg/inference"
+	"github.com/docker/model-runner/pkg/inference/memory"
 	"github.com/docker/model-runner/pkg/inference/models"
 	"github.com/docker/model-runner/pkg/logging"
 	"github.com/docker/model-runner/pkg/metrics"
@@ -56,7 +56,7 @@ func NewScheduler(
 	httpClient *http.Client,
 	allowedOrigins []string,
 	tracker *metrics.Tracker,
-	gpuInfo *gpuinfo.GPUInfo,
+	sysMemInfo memory.SystemMemoryInfo,
 ) *Scheduler {
 
 	openAIRecorder := metrics.NewOpenAIRecorder(log.WithField("component", "openai-recorder"), modelManager)
@@ -67,7 +67,7 @@ func NewScheduler(
 		defaultBackend: defaultBackend,
 		modelManager:   modelManager,
 		installer:      newInstaller(log, backends, httpClient),
-		loader:         newLoader(log, backends, modelManager, openAIRecorder, gpuInfo),
+		loader:         newLoader(log, backends, modelManager, openAIRecorder, sysMemInfo),
 		router:         http.NewServeMux(),
 		tracker:        tracker,
 		openAIRecorder: openAIRecorder,
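
Note on testing (illustration, not part of the patch): because newLoader and NewScheduler now accept the memory.SystemMemoryInfo interface rather than the concrete *gpuinfo.GPUInfo, the memory budget can be faked in unit tests without probing the host or GPU. Below is a minimal sketch of such a stub, assuming only the interface shape and the RAM/VRAM fields of inference.RequiredMemory visible in this diff; stubMemInfo is a hypothetical name, not part of the change.

	package scheduling

	import (
		"github.com/docker/model-runner/pkg/inference"
	)

	// stubMemInfo is a hypothetical memory.SystemMemoryInfo implementation
	// that reports a fixed memory budget instead of querying the system.
	type stubMemInfo struct {
		total inference.RequiredMemory
	}

	// HaveSufficientMemory mirrors the comparison in the real implementation.
	func (s stubMemInfo) HaveSufficientMemory(req inference.RequiredMemory) bool {
		return req.RAM <= s.total.RAM && req.VRAM <= s.total.VRAM
	}

	// GetTotalMemory satisfies the interface method added in this diff.
	func (s stubMemInfo) GetTotalMemory() inference.RequiredMemory {
		return s.total
	}

An in-package test could then pass, for example, stubMemInfo{total: inference.RequiredMemory{RAM: 8 << 30, VRAM: 4 << 30}} to newLoader, exercising the loader's memory accounting without gpuinfo or go-sysinfo.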