Merge pull request #87 from docker/cloud-ttl-8hour

Temporarily bump GPU-enabled cloud idle timeout to 8 hours.
Jacob Howard 2025-06-18 07:49:17 -06:00 committed by GitHub
commit ccc2a2bde6
1 changed file with 38 additions and 21 deletions


@@ -21,9 +21,9 @@ const (
     // being it is almost certainly greater than the number of models that most
     // developers' systems will be able to load.
     maximumRunnerSlots = 8
-    // runnerIdleTimeout is the maximum amount of time that a runner can sit
-    // idle (i.e. without any requests) before being terminated.
-    runnerIdleTimeout = 5 * time.Minute
+    // defaultRunnerIdleTimeout is the default maximum amount of time that a
+    // runner can sit idle (i.e. without any requests) before being terminated.
+    defaultRunnerIdleTimeout = 5 * time.Minute
 )
 
 var (
@@ -59,6 +59,8 @@ type loader struct {
     backends map[string]inference.Backend
     // modelManager is the shared model manager.
     modelManager *models.Manager
+    // runnerIdleTimeout is the loader-specific default runner idle timeout.
+    runnerIdleTimeout time.Duration
     // totalMemory is the total system memory allocated to the loader.
     totalMemory uint64
     // idleCheck is used to signal the run loop when timestamps have updated.
@@ -104,6 +106,19 @@ func newLoader(
     // tune this heuristic for systems with enormous amounts of VRAM.
     nSlots := min(runtime.NumCPU(), maximumRunnerSlots)
 
+    // Check if we have a special environment.
+    isGPUEnabledCloudEnvironment := environment.Get() == environment.EnvironmentCloud &&
+        os.Getenv("NVIDIA_VISIBLE_DEVICES") != ""
+
+    // Compute the idle runner timeout.
+    //
+    // HACK: On GPU-enabled cloud engines, we'll bump this to 8 hours. We can
+    // remove this once we have configurable TTLs.
+    runnerIdleTimeout := defaultRunnerIdleTimeout
+    if isGPUEnabledCloudEnvironment {
+        runnerIdleTimeout = 8 * time.Hour
+    }
+
     // Compute the amount of available memory.
     //
     // TODO: For now, we treat the system as having memory size 1 and all models
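The net effect of this hunk: on a GPU-enabled cloud engine, runners now stay warm for 8 hours instead of 5 minutes. A standalone sketch of the selection logic (hypothetical helper name; the real code reads the repo's environment package rather than taking a flag):

package main

import (
    "fmt"
    "os"
    "time"
)

const defaultRunnerIdleTimeout = 5 * time.Minute

// selectRunnerIdleTimeout is a hypothetical stand-in for the logic in
// newLoader: cloud engines with GPUs exposed via the NVIDIA container
// runtime keep runners warm for 8 hours; everything else reaps them
// after 5 idle minutes.
func selectRunnerIdleTimeout(isCloud bool) time.Duration {
    if isCloud && os.Getenv("NVIDIA_VISIBLE_DEVICES") != "" {
        return 8 * time.Hour
    }
    return defaultRunnerIdleTimeout
}

func main() {
    // Prints 8h0m0s on a GPU cloud engine, 5m0s otherwise.
    fmt.Println(selectRunnerIdleTimeout(true))
}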
@@ -114,28 +129,30 @@ func newLoader(
     // computing model size through estimation (using parameter count and
     // quantization data type size).
     //
-    // HACK: On GPU-enabled cloud engines, we'll temporarily bump this to 2.
+    // HACK: On GPU-enabled cloud engines, we'll bump this to 2. We can remove
+    // this once we have VRAM estimation.
     totalMemory := uint64(1)
-    if environment.Get() == environment.EnvironmentCloud && os.Getenv("NVIDIA_VISIBLE_DEVICES") != "" {
+    if isGPUEnabledCloudEnvironment {
         totalMemory = 2
     }
 
     // Create the loader.
     l := &loader{
-        log:             log,
-        backends:        backends,
-        modelManager:    modelManager,
-        totalMemory:     totalMemory,
-        idleCheck:       make(chan struct{}, 1),
-        guard:           make(chan struct{}, 1),
-        availableMemory: totalMemory,
-        waiters:         make(map[chan<- struct{}]bool),
-        runners:         make(map[runnerKey]int, nSlots),
-        slots:           make([]*runner, nSlots),
-        references:      make([]uint, nSlots),
-        allocations:     make([]uint64, nSlots),
-        timestamps:      make([]time.Time, nSlots),
-        runnerConfigs:   make(map[runnerKey]inference.BackendConfiguration),
+        log:               log,
+        backends:          backends,
+        modelManager:      modelManager,
+        runnerIdleTimeout: runnerIdleTimeout,
+        totalMemory:       totalMemory,
+        idleCheck:         make(chan struct{}, 1),
+        guard:             make(chan struct{}, 1),
+        availableMemory:   totalMemory,
+        waiters:           make(map[chan<- struct{}]bool),
+        runners:           make(map[runnerKey]int, nSlots),
+        slots:             make([]*runner, nSlots),
+        references:        make([]uint, nSlots),
+        allocations:       make([]uint64, nSlots),
+        timestamps:        make([]time.Time, nSlots),
+        runnerConfigs:     make(map[runnerKey]inference.BackendConfiguration),
     }
     l.guard <- struct{}{}
     return l
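Worth noting for reviewers: the loader's memory accounting is currently unitless, so totalMemory = 2 just means two unit-sized models can be resident at once on GPU cloud engines. A minimal sketch of that placeholder accounting (hypothetical type, not the loader's actual allocation path):

package main

import "fmt"

// accounting mirrors the "every model costs one unit" placeholder from
// the TODO above: load fails when no units remain, signalling that an
// idle runner must be evicted first.
type accounting struct {
    available uint64 // 1 on most systems, 2 on GPU-enabled cloud engines
}

func (a *accounting) load() bool {
    if a.available == 0 {
        return false
    }
    a.available--
    return true
}

func (a *accounting) unload() { a.available++ }

func main() {
    gpuCloud := &accounting{available: 2}
    fmt.Println(gpuCloud.load(), gpuCloud.load(), gpuCloud.load()) // true true false
}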
@@ -176,7 +193,7 @@ func (l *loader) evict(idleOnly bool) int {
     now := time.Now()
     for r, slot := range l.runners {
         unused := l.references[slot] == 0
-        idle := unused && now.Sub(l.timestamps[slot]) > runnerIdleTimeout
+        idle := unused && now.Sub(l.timestamps[slot]) > l.runnerIdleTimeout
         defunct := false
         select {
         case <-l.slots[slot].done:
@@ -283,7 +300,7 @@ func (l *loader) idleCheckDuration() time.Duration {
     // Compute the remaining duration. If negative, check immediately, otherwise
     // wait until 100 milliseconds after expiration time (to avoid checking
     // right on the expiration boundary).
-    if remaining := runnerIdleTimeout - time.Since(oldest); remaining < 0 {
+    if remaining := l.runnerIdleTimeout - time.Since(oldest); remaining < 0 {
         return 0
     } else {
         return remaining + 100*time.Millisecond
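A worked example of the branch above under the new 8-hour cloud timeout (standalone sketch, not the loader's actual run loop):

package main

import (
    "fmt"
    "time"
)

func main() {
    idleTimeout := 8 * time.Hour
    // A runner last used 7h59m ago: the run loop should wake again in
    // about 1 minute, plus the 100ms grace that keeps the check off the
    // exact expiration boundary.
    oldest := time.Now().Add(-7*time.Hour - 59*time.Minute)
    if remaining := idleTimeout - time.Since(oldest); remaining < 0 {
        fmt.Println("check immediately")
    } else {
        fmt.Println("sleep for", remaining+100*time.Millisecond) // ≈ 1m0.1s
    }
}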