@@ -15,9 +15,9 @@ import (
 	"time"
 
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llm"
 )
@@ -41,10 +41,10 @@ type Scheduler struct {
 	loaded   map[string]*runnerRef
 	loadedMu sync.Mutex
 
-	loadFn       func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int)
-	newServerFn  func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
-	getGpuFn     func() gpu.GpuInfoList
-	getCpuFn     func() gpu.GpuInfoList
+	loadFn       func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int)
+	newServerFn  func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
+	getGpuFn     func() discover.GpuInfoList
+	getCpuFn     func() discover.GpuInfoList
 	reschedDelay time.Duration
 }
 
@@ -69,8 +69,8 @@ func InitScheduler(ctx context.Context) *Scheduler {
 		unloadedCh:    make(chan interface{}, maxQueue),
 		loaded:        make(map[string]*runnerRef),
 		newServerFn:   llm.NewLlamaServer,
-		getGpuFn:      gpu.GetGPUInfo,
-		getCpuFn:      gpu.GetCPUInfo,
+		getGpuFn:      discover.GetGPUInfo,
+		getCpuFn:      discover.GetCPUInfo,
 		reschedDelay:  250 * time.Millisecond,
 	}
 	sched.loadFn = sched.load
@@ -157,7 +157,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 			} else {
 				// Either no models are loaded or below envconfig.MaxRunners
 				// Get a refreshed GPU list
-				var gpus gpu.GpuInfoList
+				var gpus discover.GpuInfoList
 				if pending.opts.NumGPU == 0 {
 					gpus = s.getCpuFn()
 				} else {
@@ -409,7 +409,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 	}()
 }
 
-func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
 	if numParallel < 1 {
 		numParallel = 1
 	}
@@ -470,7 +470,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList,
 	}()
 }
 
-func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
+func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
 	type predKey struct {
 		Library string
 		ID      string
@@ -513,8 +513,8 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
 // to avoid scheduling another model on the same GPU(s) that haven't stabilized.
 // This routine returns the set of GPUs that do not have an active loading model.
 // If all GPUs have loading models, an empty list will be returned (not a single CPU entry)
-func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList {
-	ret := append(gpu.GpuInfoList{}, allGpus...)
+func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus discover.GpuInfoList) discover.GpuInfoList {
+	ret := append(discover.GpuInfoList{}, allGpus...)
 	s.loadedMu.Lock()
 	defer s.loadedMu.Unlock()
 	for _, runner := range s.loaded {
@@ -542,7 +542,7 @@ type runnerRef struct {
 
 	llama          llm.LlamaServer
 	loading        bool                 // True only during initial load, then false forever
-	gpus           gpu.GpuInfoList      // Recorded at time of provisioning
+	gpus           discover.GpuInfoList // Recorded at time of provisioning
 	estimatedVRAM  uint64
 	estimatedTotal uint64
 
@@ -630,7 +630,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
 	start := time.Now()
 
 	// Establish a baseline before we unload
-	gpusBefore := gpu.GetGPUInfo()
+	gpusBefore := discover.GetGPUInfo()
 	var totalMemoryBefore, freeMemoryBefore uint64
 	for _, gpu := range gpusBefore {
 		totalMemoryBefore += gpu.TotalMemory
@@ -648,7 +648,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
 			}
 
 			// Query GPUs, look for free to go back up
-			gpusNow := gpu.GetGPUInfo()
+			gpusNow := discover.GetGPUInfo()
 			var totalMemoryNow, freeMemoryNow uint64
 			for _, gpu := range gpusNow {
 				totalMemoryNow += gpu.TotalMemory
@@ -685,7 +685,7 @@ func (a ByDuration) Less(i, j int) bool {
 // If the model can not be fit fully within the available GPU(s) nil is returned
 // If numParallel is <= 0, this will attempt try to optimize parallism based on available VRAM, and adjust
 // opts.NumCtx accordingly
-func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
 	var estimatedVRAM uint64
 
 	var numParallelToTry []int
@@ -698,22 +698,22 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 	for _, gl := range gpus.ByLibrary() {
 		var ok bool
-		sgl := append(make(gpu.GpuInfoList, 0, len(gl)), gl...)
+		sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
 
 		// TODO - potentially sort by performance capability, existing models loaded, etc.
 		// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
 		// Note: at present, this will favor more VRAM over faster GPU speed in mixed setups
-		sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl)))
+		sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
 
 		// First attempt to fit the model into a single GPU
 		for _, p := range numParallelToTry {
 			req.opts.NumCtx = req.origNumCtx * p
 			if !envconfig.SchedSpread() {
 				for _, g := range sgl {
-					if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+					if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 						slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
 						*numParallel = p
-						return []gpu.GpuInfo{g}
+						return []discover.GpuInfo{g}
 					}
 				}
 			}
@@ -737,7 +737,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 }
 
 // If multiple Libraries are detected, pick the Library which loads the most layers for the model
-func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
 	if *numParallel <= 0 {
 		*numParallel = 1
 		req.opts.NumCtx = req.origNumCtx
@@ -822,7 +822,7 @@ func (s *Scheduler) expireRunner(model *Model) {
 
 // If other runners are loaded, make sure the pending request will fit in system memory
 // If not, pick a runner to unload, else return nil and the request can be loaded
-func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef {
+func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList) *runnerRef {
 	slog.Debug("evaluating if CPU model load will fit in available system memory")
 	estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
 	if estimate.TotalSize <= gpus[0].FreeMemory {
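For reference, a minimal sketch of what a caller of the renamed package looks like after this change. It uses only identifiers that appear in this diff (discover.GetGPUInfo, the ID, FreeMemory, and TotalMemory fields, and format.HumanBytes2); the standalone main wrapper is illustrative and not part of the patch:

package main

import (
	"fmt"

	"github.com/ollama/ollama/discover"
	"github.com/ollama/ollama/format"
)

func main() {
	// Enumerate GPUs through the renamed discover package
	// (formerly gpu.GetGPUInfo) and report per-device memory.
	for _, g := range discover.GetGPUInfo() {
		fmt.Printf("gpu %s: %s free / %s total\n",
			g.ID,
			format.HumanBytes2(g.FreeMemory),
			format.HumanBytes2(g.TotalMemory))
	}
}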