|
|
|
@ -1,9 +1,12 @@ |
|
|
|
package llm |
|
|
|
|
|
|
|
import ( |
|
|
|
"cmp" |
|
|
|
"fmt" |
|
|
|
"log/slog" |
|
|
|
"maps" |
|
|
|
"os" |
|
|
|
"slices" |
|
|
|
"strconv" |
|
|
|
"strings" |
|
|
|
|
|
|
|
@ -120,12 +123,10 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin |
|
|
|
} |
|
|
|
|
|
|
|
layers := f.Tensors().GroupLayers() |
|
|
|
// add one layer worth of memory as a buffer
|
|
|
|
if blk0, ok := layers["blk.0"]; ok { |
|
|
|
layerSize = blk0.Size() |
|
|
|
} else { |
|
|
|
slog.Warn("model missing blk.0 layer size") |
|
|
|
} |
|
|
|
// add one layer (choosing the max layer) worth of memory as a buffer
|
|
|
|
layerSize = slices.MaxFunc(slices.Collect(maps.Values(layers)), func(a, b ggml.Layer) int { |
|
|
|
return cmp.Compare(a.Size(), b.Size()) |
|
|
|
}).Size() |
|
|
|
|
|
|
|
var kvct string |
|
|
|
if envconfig.FlashAttention() && |
|
|
|
@ -219,7 +220,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin |
|
|
|
} |
|
|
|
|
|
|
|
// For all the layers, find where they can fit on the GPU(s)
|
|
|
|
for i := range int(f.KV().BlockCount()) { |
|
|
|
for i := int(f.KV().BlockCount()) - 1; i >= 0; i-- { |
|
|
|
// Some models have inconsistent layer sizes
|
|
|
|
if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok { |
|
|
|
layerSize = blk.Size() |
|
|
|
@ -229,6 +230,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin |
|
|
|
|
|
|
|
if opts.NumGPU >= 0 && layerCount >= opts.NumGPU { |
|
|
|
// Stop allocating on GPU(s) once we hit the user's target NumGPU
|
|
|
|
overflow += layerSize |
|
|
|
continue |
|
|
|
} |
|
|
|
|
|
|
|
@ -245,13 +247,13 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin |
|
|
|
gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
if len(gpusWithSpace) == 0 { |
|
|
|
overflow += layerSize |
|
|
|
} |
|
|
|
} |
|
|
|
if layerCount >= int(f.KV().BlockCount()) { |
|
|
|
fullyLoaded = true |
|
|
|
} else { |
|
|
|
for i := layerCount; i < int(f.KV().BlockCount()); i++ { |
|
|
|
overflow += layerSize |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
// Determine if we need to consider output then find where it fits
|
|
|
|
|