ollama/convert/reader.go

package convert

import (
	"errors"
	"io"
	"io/fs"
	"strings"
)

// Tensor is the element type of the slice returned by parseTensors. It
// exposes a tensor's metadata and a way to write the tensor to an io.Writer.
type Tensor interface {
	// Name returns the tensor's name.
	Name() string
	// Shape returns the tensor's dimensions.
	Shape() []uint64
	// Kind returns a numeric type code; see the tensorKind* constants.
	Kind() uint32
	// SetRepacker installs a Repacker on the tensor.
	// NOTE(review): presumably the repacker is applied to the data during
	// WriteTo — confirm against the concrete implementations.
	SetRepacker(Repacker)
	// WriteTo has the same signature as io.WriterTo: it writes to the given
	// writer and returns an int64 count and an error.
	WriteTo(io.Writer) (int64, error)
	// Clone returns another Tensor.
	// NOTE(review): whether this is a deep or shallow copy depends on the
	// implementation — confirm before relying on either.
	Clone() Tensor
}

// tensorBase holds the metadata that the Name, Shape, Kind and SetRepacker
// methods in this file operate on.
type tensorBase struct {
	name     string   // returned by Name; also inspected by Kind
	shape    []uint64 // returned by Shape; its length drives Kind
	repacker Repacker // assigned by SetRepacker; not read in this file
}

// Name returns the name stored in the tensorBase.
func (t tensorBase) Name() string {
	return t.name
}

// Shape returns the stored shape slice. The slice is returned as-is, not
// copied, so writes to its elements are visible through the tensorBase.
func (t tensorBase) Shape() []uint64 {
	return t.shape
}

// Type codes returned by Kind.
//
// tensorKindFP32 (0) and tensorKindFP16 (1) are typed uint32 via iota. The
// explicit "= 4" and "= 30" expressions stop the implicit repetition of the
// first line, so tensorKindMXFP4 and tensorKindBF16 are untyped integer
// constants; they still convert implicitly wherever a uint32 is expected.
//
// NOTE(review): the numeric values presumably mirror the type IDs of the
// output file format — confirm against the writer before changing them.
const (
	tensorKindFP32 uint32 = iota
	tensorKindFP16
	tensorKindMXFP4 = 4
	tensorKindBF16  = 30
)

// Kind picks the type code for the tensor from its name and rank.
//
// Any name ending in ".ffn_gate_inp.weight", and a fixed list of exact
// names, always yields tensorKindFP32. For every other tensor the result
// depends on len(shape): one dimension yields tensorKindFP32, two or more
// yield tensorKindFP16.
//
// Kind panics with "invalid tensor shape" when the shape is empty and the
// name is not one of the always-F32 names.
func (t tensorBase) Kind() uint32 {
	// these tensors are always F32, whatever their rank
	if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") {
		return tensorKindFP32
	}

	switch t.name {
	case "token_types.weight",
		"v.positional_embedding_vlm",
		"v.tile_position_embd.weight",
		"v.pre_tile_position_embd.weight",
		"v.post_tile_position_embd.weight":
		return tensorKindFP32
	}

	rank := len(t.shape)
	if rank == 0 {
		panic("invalid tensor shape")
	}
	if rank == 1 {
		return tensorKindFP32
	}

	return tensorKindFP16
}

// SetRepacker stores fn in the repacker field, replacing any previous value.
// It uses a pointer receiver so the assignment persists on the caller's
// tensorBase.
func (t *tensorBase) SetRepacker(fn Repacker) {
	t.repacker = fn
}

// Repacker is a function that takes a string, a []float32 and a []uint64 and
// returns a []float32 or an error.
// NOTE(review): the arguments are presumably (tensor name, tensor data,
// tensor shape) and the result the repacked data — confirm against callers.
type Repacker func(string, []float32, []uint64) ([]float32, error)

// parseTensors finds the tensor files in fsys and hands them to the parser
// that matches their file-name pattern.
//
// The patterns are tried in a fixed order: "*.safetensors" first, then the
// three PyTorch patterns. The first pattern with at least one match wins:
// its parser receives fsys, replacer and every matching path, and its result
// is returned unchanged. Later patterns are not consulted.
//
// An error from fs.Glob is returned as-is. If no pattern matches anything,
// the error is "unknown tensor format".
func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) {
	type tensorParser func(fs.FS, *strings.Replacer, ...string) ([]Tensor, error)

	// Order matters: earlier entries take priority over later ones.
	formats := []struct {
		glob  string
		parse tensorParser
	}{
		{glob: "*.safetensors", parse: parseSafetensors},
		{glob: "pytorch_model-*-of-*.bin", parse: parseTorch},
		{glob: "pytorch_model.bin", parse: parseTorch},
		{glob: "consolidated.*.pth", parse: parseTorch},
	}

	for i := range formats {
		files, err := fs.Glob(fsys, formats[i].glob)
		if err != nil {
			return nil, err
		}
		if len(files) == 0 {
			continue
		}

		return formats[i].parse(fsys, replacer, files...)
	}

	return nil, errors.New("unknown tensor format")
}
refactor convert 2 years ago			`package convert`

			`import (`
			`"errors"`
			`"io"`
convert: only extract large files 2 years ago			`"io/fs"`
refactor convert 2 years ago			`"strings"`
			`)`

			`type Tensor interface {`
			`Name() string`
			`Shape() []uint64`
			`Kind() uint32`
llama4 1 year ago			`SetRepacker(Repacker)`
refactor convert 2 years ago			`WriteTo(io.Writer) (int64, error)`
llama4 1 year ago			`Clone() Tensor`
refactor convert 2 years ago			`}`

			`type tensorBase struct {`
llama4 1 year ago			`name string`
			`shape []uint64`
			`repacker Repacker`
refactor convert 2 years ago			`}`

			`func (t tensorBase) Name() string {`
			`return t.name`
			`}`

			`func (t tensorBase) Shape() []uint64 {`
			`return t.shape`
			`}`

comments 2 years ago			`const (`
gpt-oss (#11672) * bf16 * tests * gpt-oss * enable gptoss for engine * rough estimate * convert to mxfp4 * handle safetensors U8 * clamp glu/linear * update tokenizer * MXFP4 support This implements the Open Compute Microscaling (MX) FP4 format as a tensor type with backend implementations focusing on mulmat and mulmatid on CPU, CUDA, and Metal. * Unit tests for MXFP4 support This exercises various operations and shapes on both CPU and GPU (if detected on the system) * cuda graph * unit test adjustments * cuda: optimize memory access Read 4 bytes at a time (8 elements) when performing mul_mat_vec_mxfp4 * mac: fix crash on old macos versions cblas_sgemm is only supported on v13.3 and up, however bf16 is only supported on v14+ so we were falling back to ggml-blas and crashing on bf16 tensors. Checking for the function being null seems to be the simplest way to condittionally avoid registering the backend. * server: Minimum context length for gptoss This model requires a minimum context length of 8192 to function effectively. Users can set higher values through all normal mechanisms but lower values will be silently reset. * ggml: Multiply by numParallel for gptoss sliding window When computing the graph size estimate, the context size is already multiplied by numParallel so estimates reflect that. However, since sliding window models use a smaller, fixed context size, they need to manually take numParallel into account. * gpt-oss integration includes harmony parser and thinking levels, etc. * fix sync * fix tests * fix lint --------- Co-authored-by: Daniel Hiltgen <daniel@ollama.com> Co-authored-by: Jesse Gross <jesse@ollama.com> Co-authored-by: Devon Rifkin <drifkin@drifkin.net> 8 months ago			`tensorKindFP32 uint32 = iota`
			`tensorKindFP16`
			`tensorKindMXFP4 = 4`
			`tensorKindBF16 = 30`
comments 2 years ago			`)`

refactor convert 2 years ago			`func (t tensorBase) Kind() uint32 {`
convert gemma2 2 years ago			`if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") \|\|`
llama4 1 year ago			`t.name == "token_types.weight" \|\|`
chore: update mllama to use ollama engine (#10637) 11 months ago			`t.name == "v.positional_embedding_vlm" \|\|`
			`t.name == "v.tile_position_embd.weight" \|\|`
			`t.name == "v.pre_tile_position_embd.weight" \|\|`
			`t.name == "v.post_tile_position_embd.weight" {`
convert gemma2 2 years ago			`// these tensors are always F32`
gpt-oss (#11672) * bf16 * tests * gpt-oss * enable gptoss for engine * rough estimate * convert to mxfp4 * handle safetensors U8 * clamp glu/linear * update tokenizer * MXFP4 support This implements the Open Compute Microscaling (MX) FP4 format as a tensor type with backend implementations focusing on mulmat and mulmatid on CPU, CUDA, and Metal. * Unit tests for MXFP4 support This exercises various operations and shapes on both CPU and GPU (if detected on the system) * cuda graph * unit test adjustments * cuda: optimize memory access Read 4 bytes at a time (8 elements) when performing mul_mat_vec_mxfp4 * mac: fix crash on old macos versions cblas_sgemm is only supported on v13.3 and up, however bf16 is only supported on v14+ so we were falling back to ggml-blas and crashing on bf16 tensors. Checking for the function being null seems to be the simplest way to condittionally avoid registering the backend. * server: Minimum context length for gptoss This model requires a minimum context length of 8192 to function effectively. Users can set higher values through all normal mechanisms but lower values will be silently reset. * ggml: Multiply by numParallel for gptoss sliding window When computing the graph size estimate, the context size is already multiplied by numParallel so estimates reflect that. However, since sliding window models use a smaller, fixed context size, they need to manually take numParallel into account. * gpt-oss integration includes harmony parser and thinking levels, etc. * fix sync * fix tests * fix lint --------- Co-authored-by: Daniel Hiltgen <daniel@ollama.com> Co-authored-by: Jesse Gross <jesse@ollama.com> Co-authored-by: Devon Rifkin <drifkin@drifkin.net> 8 months ago			`return tensorKindFP32`
refactor convert 2 years ago			`}`

			`switch len(t.shape) {`
			`case 0:`
			`panic("invalid tensor shape")`
			`case 1:`
gpt-oss (#11672) * bf16 * tests * gpt-oss * enable gptoss for engine * rough estimate * convert to mxfp4 * handle safetensors U8 * clamp glu/linear * update tokenizer * MXFP4 support This implements the Open Compute Microscaling (MX) FP4 format as a tensor type with backend implementations focusing on mulmat and mulmatid on CPU, CUDA, and Metal. * Unit tests for MXFP4 support This exercises various operations and shapes on both CPU and GPU (if detected on the system) * cuda graph * unit test adjustments * cuda: optimize memory access Read 4 bytes at a time (8 elements) when performing mul_mat_vec_mxfp4 * mac: fix crash on old macos versions cblas_sgemm is only supported on v13.3 and up, however bf16 is only supported on v14+ so we were falling back to ggml-blas and crashing on bf16 tensors. Checking for the function being null seems to be the simplest way to condittionally avoid registering the backend. * server: Minimum context length for gptoss This model requires a minimum context length of 8192 to function effectively. Users can set higher values through all normal mechanisms but lower values will be silently reset. * ggml: Multiply by numParallel for gptoss sliding window When computing the graph size estimate, the context size is already multiplied by numParallel so estimates reflect that. However, since sliding window models use a smaller, fixed context size, they need to manually take numParallel into account. * gpt-oss integration includes harmony parser and thinking levels, etc. * fix sync * fix tests * fix lint --------- Co-authored-by: Daniel Hiltgen <daniel@ollama.com> Co-authored-by: Jesse Gross <jesse@ollama.com> Co-authored-by: Devon Rifkin <drifkin@drifkin.net> 8 months ago			`return tensorKindFP32`
refactor convert 2 years ago			`default:`
gpt-oss (#11672) * bf16 * tests * gpt-oss * enable gptoss for engine * rough estimate * convert to mxfp4 * handle safetensors U8 * clamp glu/linear * update tokenizer * MXFP4 support This implements the Open Compute Microscaling (MX) FP4 format as a tensor type with backend implementations focusing on mulmat and mulmatid on CPU, CUDA, and Metal. * Unit tests for MXFP4 support This exercises various operations and shapes on both CPU and GPU (if detected on the system) * cuda graph * unit test adjustments * cuda: optimize memory access Read 4 bytes at a time (8 elements) when performing mul_mat_vec_mxfp4 * mac: fix crash on old macos versions cblas_sgemm is only supported on v13.3 and up, however bf16 is only supported on v14+ so we were falling back to ggml-blas and crashing on bf16 tensors. Checking for the function being null seems to be the simplest way to condittionally avoid registering the backend. * server: Minimum context length for gptoss This model requires a minimum context length of 8192 to function effectively. Users can set higher values through all normal mechanisms but lower values will be silently reset. * ggml: Multiply by numParallel for gptoss sliding window When computing the graph size estimate, the context size is already multiplied by numParallel so estimates reflect that. However, since sliding window models use a smaller, fixed context size, they need to manually take numParallel into account. * gpt-oss integration includes harmony parser and thinking levels, etc. * fix sync * fix tests * fix lint --------- Co-authored-by: Daniel Hiltgen <daniel@ollama.com> Co-authored-by: Jesse Gross <jesse@ollama.com> Co-authored-by: Devon Rifkin <drifkin@drifkin.net> 8 months ago			`return tensorKindFP16`
refactor convert 2 years ago			`}`
			`}`

llama4 1 year ago			`func (t *tensorBase) SetRepacker(fn Repacker) {`
refactor convert 2 years ago			`t.repacker = fn`
			`}`

llama4 1 year ago			`type Repacker func(string, []float32, []uint64) ([]float32, error)`
refactor convert 2 years ago
convert gemma2 2 years ago			`func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) {`
convert: fix parse functions 2 years ago			`patterns := []struct {`
			`Pattern string`
convert gemma2 2 years ago			`Func func(fs.FS, *strings.Replacer, ...string) ([]Tensor, error)`
convert: fix parse functions 2 years ago			`}{`
model: support for mistral-small in the ollama runner Mistral is a popular research lab making open source models. This updates the forward pass of llama architecture models to support both llama models and mistral models by accounting for additional metadata present in mistral models, and finding the correct dimensions for the output projection. 1 year ago			`{"*.safetensors", parseSafetensors},`
convert: fix parse functions 2 years ago			`{"pytorch_model-*-of-*.bin", parseTorch},`
			`{"pytorch_model.bin", parseTorch},`
			`{"consolidated.*.pth", parseTorch},`
refactor convert 2 years ago			`}`

convert: fix parse functions 2 years ago			`for _, pattern := range patterns {`
			`matches, err := fs.Glob(fsys, pattern.Pattern)`
refactor convert 2 years ago			`if err != nil {`
			`return nil, err`
			`}`

			`if len(matches) > 0 {`
convert gemma2 2 years ago			`return pattern.Func(fsys, replacer, matches...)`
refactor convert 2 years ago			`}`
			`}`

			`return nil, errors.New("unknown tensor format")`
			`}`