|
|
|
@ -177,7 +177,7 @@ func (s *Scheduler) processPending(ctx context.Context) { |
|
|
|
} |
|
|
|
// Trigger an expiration to unload once it's done
|
|
|
|
runnerToExpire.refMu.Lock() |
|
|
|
slog.Debug("resetting model to expire immediately to make room", "model", runnerToExpire.model, "refCount", runnerToExpire.refCount) |
|
|
|
slog.Debug("resetting model to expire immediately to make room", "modelPath", runnerToExpire.modelPath, "refCount", runnerToExpire.refCount) |
|
|
|
if runnerToExpire.expireTimer != nil { |
|
|
|
runnerToExpire.expireTimer.Stop() |
|
|
|
runnerToExpire.expireTimer = nil |
|
|
|
@ -190,13 +190,13 @@ func (s *Scheduler) processPending(ctx context.Context) { |
|
|
|
// Wait for the unload to happen
|
|
|
|
// Note: at this point we're queueing up all incoming requests, even if they were for
|
|
|
|
// a different model that's loaded and not scheduled to be removed.
|
|
|
|
slog.Debug("waiting for pending requests to complete and unload to occur", "model", runnerToExpire.model) |
|
|
|
slog.Debug("waiting for pending requests to complete and unload to occur", "modelPath", runnerToExpire.modelPath) |
|
|
|
select { |
|
|
|
case <-ctx.Done(): |
|
|
|
slog.Debug("shutting down scheduler pending loop") |
|
|
|
return |
|
|
|
case <-s.unloadedCh: |
|
|
|
slog.Debug("unload completed", "model", runnerToExpire.model) |
|
|
|
slog.Debug("unload completed", "modelPath", runnerToExpire.modelPath) |
|
|
|
continue |
|
|
|
} |
|
|
|
} |
|
|
|
@ -219,23 +219,23 @@ func (s *Scheduler) processCompleted(ctx context.Context) { |
|
|
|
runner := s.loaded[finished.model.ModelPath] |
|
|
|
s.loadedMu.Unlock() |
|
|
|
if runner == nil { |
|
|
|
slog.Error("finished requeset signal received after model unloaded", "model", finished.model.ModelPath) |
|
|
|
slog.Error("finished requeset signal received after model unloaded", "modelPath", finished.model.ModelPath) |
|
|
|
continue |
|
|
|
} |
|
|
|
runner.refMu.Lock() |
|
|
|
runner.refCount-- |
|
|
|
if runner.refCount <= 0 { |
|
|
|
if runner.sessionDuration <= 0 { |
|
|
|
slog.Debug("runner with zero duration has gone idle, expiring to unload", "model", runner.model) |
|
|
|
slog.Debug("runner with zero duration has gone idle, expiring to unload", "modelPath", runner.modelPath) |
|
|
|
if runner.expireTimer != nil { |
|
|
|
runner.expireTimer.Stop() |
|
|
|
runner.expireTimer = nil |
|
|
|
} |
|
|
|
s.expiredCh <- runner |
|
|
|
} else if runner.expireTimer == nil { |
|
|
|
slog.Debug("runner with non-zero duration has gone idle, adding timer", "model", runner.model, "duration", runner.sessionDuration) |
|
|
|
slog.Debug("runner with non-zero duration has gone idle, adding timer", "modelPath", runner.modelPath, "duration", runner.sessionDuration) |
|
|
|
runner.expireTimer = time.AfterFunc(runner.sessionDuration, func() { |
|
|
|
slog.Debug("timer expired, expiring to unload", "model", runner.model) |
|
|
|
slog.Debug("timer expired, expiring to unload", "modelPath", runner.modelPath) |
|
|
|
runner.refMu.Lock() |
|
|
|
defer runner.refMu.Unlock() |
|
|
|
if runner.expireTimer != nil { |
|
|
|
@ -244,19 +244,21 @@ func (s *Scheduler) processCompleted(ctx context.Context) { |
|
|
|
} |
|
|
|
s.expiredCh <- runner |
|
|
|
}) |
|
|
|
runner.expiresAt = time.Now().Add(runner.sessionDuration) |
|
|
|
} else { |
|
|
|
slog.Debug("runner with non-zero duration has gone idle, resetting timer", "model", runner.model, "duration", runner.sessionDuration) |
|
|
|
slog.Debug("runner with non-zero duration has gone idle, resetting timer", "modelPath", runner.modelPath, "duration", runner.sessionDuration) |
|
|
|
runner.expireTimer.Reset(runner.sessionDuration) |
|
|
|
runner.expiresAt = time.Now().Add(runner.sessionDuration) |
|
|
|
} |
|
|
|
} |
|
|
|
slog.Debug("after processing request finished event", "model", runner.model, "refCount", runner.refCount) |
|
|
|
slog.Debug("after processing request finished event", "modelPath", runner.modelPath, "refCount", runner.refCount) |
|
|
|
runner.refMu.Unlock() |
|
|
|
case runner := <-s.expiredCh: |
|
|
|
slog.Debug("runner expired event received", "model", runner.model) |
|
|
|
slog.Debug("runner expired event received", "modelPath", runner.modelPath) |
|
|
|
runner.refMu.Lock() |
|
|
|
if runner.refCount > 0 { |
|
|
|
// Shouldn't happen, but safeguard to ensure no leaked runners
|
|
|
|
slog.Debug("expired event with positive ref count, retrying", "model", runner.model, "refCount", runner.refCount) |
|
|
|
slog.Debug("expired event with positive ref count, retrying", "modelPath", runner.modelPath, "refCount", runner.refCount) |
|
|
|
go func(runner *runnerRef) { |
|
|
|
// We can't unload yet, but want to as soon as the current request completes
|
|
|
|
// So queue up another expired event
|
|
|
|
@ -268,16 +270,16 @@ func (s *Scheduler) processCompleted(ctx context.Context) { |
|
|
|
} |
|
|
|
|
|
|
|
s.loadedMu.Lock() |
|
|
|
slog.Debug("got lock to unload", "model", runner.model) |
|
|
|
slog.Debug("got lock to unload", "modelPath", runner.modelPath) |
|
|
|
finished := runner.waitForVRAMRecovery() |
|
|
|
runner.unload() |
|
|
|
delete(s.loaded, runner.model) |
|
|
|
delete(s.loaded, runner.modelPath) |
|
|
|
s.loadedMu.Unlock() |
|
|
|
slog.Debug("runner released", "model", runner.model) |
|
|
|
slog.Debug("runner released", "modelPath", runner.modelPath) |
|
|
|
runner.refMu.Unlock() |
|
|
|
|
|
|
|
<-finished |
|
|
|
slog.Debug("sending an unloaded event", "model", runner.model) |
|
|
|
slog.Debug("sending an unloaded event", "modelPath", runner.modelPath) |
|
|
|
s.unloadedCh <- struct{}{} |
|
|
|
} |
|
|
|
} |
|
|
|
@ -316,18 +318,20 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) |
|
|
|
req.errCh <- err |
|
|
|
return |
|
|
|
} |
|
|
|
runner := &runnerRef{} |
|
|
|
runner.model = req.model.ModelPath |
|
|
|
runner.adapters = req.model.AdapterPaths |
|
|
|
runner.projectors = req.model.ProjectorPaths |
|
|
|
runner.llama = llama |
|
|
|
runner.Options = &req.opts |
|
|
|
runner.sessionDuration = req.sessionDuration |
|
|
|
runner.gpus = gpus |
|
|
|
runner.estimatedVRAM = llama.EstimatedVRAM() |
|
|
|
runner.loading = true |
|
|
|
runner.refCount = 1 |
|
|
|
runner := &runnerRef{ |
|
|
|
model: req.model, |
|
|
|
modelPath: req.model.ModelPath, |
|
|
|
llama: llama, |
|
|
|
Options: &req.opts, |
|
|
|
sessionDuration: req.sessionDuration, |
|
|
|
gpus: gpus, |
|
|
|
estimatedVRAM: llama.EstimatedVRAM(), |
|
|
|
estimatedTotal: llama.EstimatedTotal(), |
|
|
|
loading: true, |
|
|
|
refCount: 1, |
|
|
|
} |
|
|
|
runner.refMu.Lock() |
|
|
|
|
|
|
|
s.loadedMu.Lock() |
|
|
|
s.loaded[req.model.ModelPath] = runner |
|
|
|
slog.Info("loaded runners", "count", len(s.loaded)) |
|
|
|
@ -339,7 +343,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) |
|
|
|
slog.Error("error loading llama server", "error", err) |
|
|
|
runner.refCount-- |
|
|
|
req.errCh <- err |
|
|
|
slog.Debug("triggering expiration for failed load", "model", runner.model) |
|
|
|
slog.Debug("triggering expiration for failed load", "model", runner.modelPath) |
|
|
|
s.expiredCh <- runner |
|
|
|
return |
|
|
|
} |
|
|
|
@ -412,13 +416,14 @@ type runnerRef struct { |
|
|
|
loading bool // True only during initial load, then false forever
|
|
|
|
gpus gpu.GpuInfoList // Recorded at time of provisioning
|
|
|
|
estimatedVRAM uint64 |
|
|
|
estimatedTotal uint64 |
|
|
|
|
|
|
|
sessionDuration time.Duration |
|
|
|
expireTimer *time.Timer |
|
|
|
expiresAt time.Time |
|
|
|
|
|
|
|
model string |
|
|
|
adapters []string |
|
|
|
projectors []string |
|
|
|
model *Model |
|
|
|
modelPath string |
|
|
|
*api.Options |
|
|
|
} |
|
|
|
|
|
|
|
@ -431,9 +436,8 @@ func (runner *runnerRef) unload() { |
|
|
|
if runner.llama != nil { |
|
|
|
runner.llama.Close() |
|
|
|
} |
|
|
|
runner.model = nil |
|
|
|
runner.llama = nil |
|
|
|
runner.adapters = nil |
|
|
|
runner.projectors = nil |
|
|
|
runner.Options = nil |
|
|
|
runner.gpus = nil |
|
|
|
} |
|
|
|
@ -462,8 +466,8 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool |
|
|
|
|
|
|
|
ctx, cancel := context.WithTimeout(ctx, timeout) |
|
|
|
defer cancel() |
|
|
|
if !reflect.DeepEqual(runner.adapters, req.model.AdapterPaths) || // have the adapters changed?
|
|
|
|
!reflect.DeepEqual(runner.projectors, req.model.ProjectorPaths) || // have the projectors changed?
|
|
|
|
if !reflect.DeepEqual(runner.model.AdapterPaths, req.model.AdapterPaths) || // have the adapters changed?
|
|
|
|
!reflect.DeepEqual(runner.model.ProjectorPaths, req.model.ProjectorPaths) || // have the projectors changed?
|
|
|
|
!reflect.DeepEqual(optsExisting, optsNew) || // have the runner options changed?
|
|
|
|
runner.llama.Ping(ctx) != nil { |
|
|
|
return true |
|
|
|
|