Browse Source
api: remove unused or unsupported api options (#10574)
Some options listed in api/types.go are not supported in
newer models, or have been deprecated in the past. This is
the first of a series of PRs to clean up the API options.
brucemacd/model-forward-test-ext
Jeffrey Morgan
11 months ago
committed by
GitHub
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with
8 additions and
24 deletions
api/types.go
docs/api.md
llama/llama.go
llm/server.go
parser/parser.go
parser/parser_test.go
runner/llamarunner/runner.go
runner/ollamarunner/runner.go
@ -283,12 +283,7 @@ type Runner struct {
NumBatch int ` json:"num_batch,omitempty" `
NumGPU int ` json:"num_gpu,omitempty" `
MainGPU int ` json:"main_gpu,omitempty" `
LowVRAM bool ` json:"low_vram,omitempty" `
F16KV bool ` json:"f16_kv,omitempty" ` // Deprecated: This option is ignored
LogitsAll bool ` json:"logits_all,omitempty" `
VocabOnly bool ` json:"vocab_only,omitempty" `
UseMMap * bool ` json:"use_mmap,omitempty" `
UseMLock bool ` json:"use_mlock,omitempty" `
NumThread int ` json:"num_thread,omitempty" `
}
@ -671,8 +666,6 @@ func DefaultOptions() Options {
NumBatch : 512 ,
NumGPU : - 1 , // -1 here indicates that NumGPU should be set dynamically
NumThread : 0 , // let the runtime decide
LowVRAM : false ,
UseMLock : false ,
UseMMap : nil ,
} ,
}
@ -404,10 +404,7 @@ curl http://localhost:11434/api/generate -d '{
"num_batch": 2,
"num_gpu": 1,
"main_gpu": 0,
"low_vram": false,
"vocab_only": false,
"use_mmap": true,
"use_mlock": false,
"num_thread": 8
}
}'
@ -199,7 +199,6 @@ type ModelParams struct {
NumGpuLayers int
MainGpu int
UseMmap bool
UseMlock bool
TensorSplit [ ] float32
Progress func ( float32 )
VocabOnly bool
@ -218,7 +217,6 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
cparams . n_gpu_layers = C . int ( params . NumGpuLayers )
cparams . main_gpu = C . int32_t ( params . MainGpu )
cparams . use_mmap = C . bool ( params . UseMmap )
cparams . use_mlock = C . bool ( params . UseMlock )
cparams . vocab_only = C . bool ( params . VocabOnly )
if len ( params . TensorSplit ) > 0 {
@ -217,10 +217,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
params = append ( params , "--no-mmap" )
}
if opts . UseMLock {
params = append ( params , "--mlock" )
}
// TODO - NUMA support currently doesn't work properly
params = append ( params , "--parallel" , strconv . Itoa ( numParallel ) )
@ -39,7 +39,14 @@ func (f Modelfile) String() string {
return sb . String ( )
}
var deprecatedParameters = [ ] string { "penalize_newline" }
var deprecatedParameters = [ ] string {
"penalize_newline" ,
"low_vram" ,
"f16_kv" ,
"logits_all" ,
"vocab_only" ,
"use_mlock" ,
}
// CreateRequest creates a new *api.CreateRequest from an existing Modelfile
func ( f Modelfile ) CreateRequest ( relativeDir string ) ( * api . CreateRequest , error ) {
@ -478,11 +478,7 @@ func TestParseFileParameters(t *testing.T) {
"num_gqa 1" : { "num_gqa" , "1" } ,
"num_gpu 1" : { "num_gpu" , "1" } ,
"main_gpu 1" : { "main_gpu" , "1" } ,
"low_vram true" : { "low_vram" , "true" } ,
"logits_all true" : { "logits_all" , "true" } ,
"vocab_only true" : { "vocab_only" , "true" } ,
"use_mmap true" : { "use_mmap" , "true" } ,
"use_mlock true" : { "use_mlock" , "true" } ,
"num_thread 1" : { "num_thread" , "1" } ,
"num_keep 1" : { "num_keep" , "1" } ,
"seed 1" : { "seed" , "1" } ,
@ -820,7 +820,6 @@ func Execute(args []string) error {
threads := fs . Int ( "threads" , runtime . NumCPU ( ) , "Number of threads to use during generation" )
verbose := fs . Bool ( "verbose" , false , "verbose output (default: disabled)" )
noMmap := fs . Bool ( "no-mmap" , false , "do not memory-map model (slower load but may reduce pageouts if not using mlock)" )
mlock := fs . Bool ( "mlock" , false , "force system to keep model in RAM rather than swapping or compressing" )
tensorSplit := fs . String ( "tensor-split" , "" , "fraction of the model to offload to each GPU, comma-separated list of proportions" )
multiUserCache := fs . Bool ( "multiuser-cache" , false , "optimize input cache algorithm for multiple users" )
@ -876,7 +875,6 @@ func Execute(args []string) error {
NumGpuLayers : * nGpuLayers ,
MainGpu : * mainGpu ,
UseMmap : ! * noMmap && lpaths . String ( ) == "" ,
UseMlock : * mlock ,
TensorSplit : tensorSplitFloats ,
Progress : func ( progress float32 ) {
server . progress = progress
@ -818,7 +818,6 @@ func Execute(args []string) error {
threads := fs . Int ( "threads" , runtime . NumCPU ( ) , "Number of threads to use during generation" )
verbose := fs . Bool ( "verbose" , false , "verbose output (default: disabled)" )
_ = fs . Bool ( "no-mmap" , false , "do not memory-map model (slower load but may reduce pageouts if not using mlock)" )
_ = fs . Bool ( "mlock" , false , "force system to keep model in RAM rather than swapping or compressing" )
tensorSplit := fs . String ( "tensor-split" , "" , "fraction of the model to offload to each GPU, comma-separated list of proportions" )
multiUserCache := fs . Bool ( "multiuser-cache" , false , "optimize input cache algorithm for multiple users" )