mirror of https://gitee.com/namelin2022/ollama
Browse Source
* Move quantization logic to GGML via new backend This moves the model aware logic to Go code and calls GGMLs quantization code for model creation. * Remove "add model quantizations" This is no longer needed now that quantization is implemented in Go+GGML code directly.brucemacd/model-forward-test-ext
committed by
GitHub
39 changed files with 1850 additions and 436 deletions
@ -1,185 +1,341 @@ |
|||
package ggml |
|||
|
|||
import "fmt" |
|||
import ( |
|||
"fmt" |
|||
"log/slog" |
|||
"strings" |
|||
) |
|||
|
|||
type fileType uint32 |
|||
// FileType is the Go equivalent to llama_ftype used for gguf file typing
|
|||
type FileType uint32 |
|||
|
|||
const ( |
|||
fileTypeF32 fileType = iota |
|||
fileTypeF16 |
|||
fileTypeQ4_0 |
|||
fileTypeQ4_1 |
|||
fileTypeQ4_1_F16 |
|||
fileTypeQ4_2 // unused
|
|||
fileTypeQ4_3 // unused
|
|||
fileTypeQ8_0 |
|||
fileTypeQ5_0 |
|||
fileTypeQ5_1 |
|||
fileTypeQ2_K |
|||
fileTypeQ3_K_S |
|||
fileTypeQ3_K_M |
|||
fileTypeQ3_K_L |
|||
fileTypeQ4_K_S |
|||
fileTypeQ4_K_M |
|||
fileTypeQ5_K_S |
|||
fileTypeQ5_K_M |
|||
fileTypeQ6_K |
|||
fileTypeIQ2_XXS |
|||
fileTypeIQ2_XS |
|||
fileTypeQ2_K_S |
|||
fileTypeIQ3_XS |
|||
fileTypeIQ3_XXS |
|||
fileTypeIQ1_S |
|||
fileTypeIQ4_NL |
|||
fileTypeIQ3_S |
|||
fileTypeIQ3_M |
|||
fileTypeIQ2_S |
|||
fileTypeIQ2_M |
|||
fileTypeIQ4_XS |
|||
fileTypeIQ1_M |
|||
fileTypeBF16 |
|||
FileTypeF32 FileType = iota |
|||
FileTypeF16 |
|||
FileTypeQ4_0 |
|||
FileTypeQ4_1 |
|||
fileTypeQ4_1_F16 // unused by GGML
|
|||
fileTypeQ4_2 // unused by GGML
|
|||
fileTypeQ4_3 // unused by GGML
|
|||
FileTypeQ8_0 |
|||
FileTypeQ5_0 |
|||
FileTypeQ5_1 |
|||
FileTypeQ2_K |
|||
FileTypeQ3_K_S |
|||
FileTypeQ3_K_M |
|||
FileTypeQ3_K_L |
|||
FileTypeQ4_K_S |
|||
FileTypeQ4_K_M |
|||
FileTypeQ5_K_S |
|||
FileTypeQ5_K_M |
|||
FileTypeQ6_K |
|||
fileTypeIQ2_XXS // not supported by ollama
|
|||
fileTypeIQ2_XS // not supported by ollama
|
|||
FileTypeQ2_K_S |
|||
fileTypeIQ3_XS // not supported by ollama
|
|||
fileTypeIQ3_XXS // not supported by ollama
|
|||
fileTypeIQ1_S // not supported by ollama
|
|||
fileTypeIQ4_NL // not supported by ollama
|
|||
fileTypeIQ3_S // not supported by ollama
|
|||
fileTypeIQ3_M // not supported by ollama
|
|||
fileTypeIQ2_S // not supported by ollama
|
|||
fileTypeIQ2_M // not supported by ollama
|
|||
fileTypeIQ4_XS // not supported by ollama
|
|||
fileTypeIQ1_M // not supported by ollama
|
|||
FileTypeBF16 |
|||
fileTypeQ4_0_4_4 // unused by GGML
|
|||
fileTypeQ4_0_4_8 // unused by GGML
|
|||
fileTypeQ4_0_8_8 // unused by GGML
|
|||
fileTypeTQ1_0 // not supported by ollama
|
|||
fileTypeTQ2_0 // not supported by ollama
|
|||
|
|||
fileTypeUnknown |
|||
FileTypeUnknown = 1024 |
|||
) |
|||
|
|||
func ParseFileType(s string) (fileType, error) { |
|||
// ParseFileType parses the provided GGUF file type
|
|||
// Only Ollama supported types are considered valid
|
|||
func ParseFileType(s string) (FileType, error) { |
|||
switch s { |
|||
case "F32": |
|||
return fileTypeF32, nil |
|||
return FileTypeF32, nil |
|||
case "F16": |
|||
return fileTypeF16, nil |
|||
return FileTypeF16, nil |
|||
case "Q4_0": |
|||
return fileTypeQ4_0, nil |
|||
return FileTypeQ4_0, nil |
|||
case "Q4_1": |
|||
return fileTypeQ4_1, nil |
|||
case "Q4_1_F16": |
|||
return fileTypeQ4_1_F16, nil |
|||
return FileTypeQ4_1, nil |
|||
case "Q8_0": |
|||
return fileTypeQ8_0, nil |
|||
return FileTypeQ8_0, nil |
|||
case "Q5_0": |
|||
return fileTypeQ5_0, nil |
|||
return FileTypeQ5_0, nil |
|||
case "Q5_1": |
|||
return fileTypeQ5_1, nil |
|||
return FileTypeQ5_1, nil |
|||
case "Q2_K": |
|||
return fileTypeQ2_K, nil |
|||
return FileTypeQ2_K, nil |
|||
case "Q3_K_S": |
|||
return fileTypeQ3_K_S, nil |
|||
return FileTypeQ3_K_S, nil |
|||
case "Q3_K_M": |
|||
return fileTypeQ3_K_M, nil |
|||
return FileTypeQ3_K_M, nil |
|||
case "Q3_K_L": |
|||
return fileTypeQ3_K_L, nil |
|||
return FileTypeQ3_K_L, nil |
|||
case "Q4_K_S": |
|||
return fileTypeQ4_K_S, nil |
|||
case "Q4_K_M": |
|||
return fileTypeQ4_K_M, nil |
|||
return FileTypeQ4_K_S, nil |
|||
case "Q4_K_M", "Q4_K": |
|||
return FileTypeQ4_K_M, nil |
|||
case "Q5_K_S": |
|||
return fileTypeQ5_K_S, nil |
|||
case "Q5_K_M": |
|||
return fileTypeQ5_K_M, nil |
|||
return FileTypeQ5_K_S, nil |
|||
case "Q5_K_M", "Q5_K": |
|||
return FileTypeQ5_K_M, nil |
|||
case "Q6_K": |
|||
return fileTypeQ6_K, nil |
|||
case "IQ2_XXS": |
|||
return fileTypeIQ2_XXS, nil |
|||
case "IQ2_XS": |
|||
return fileTypeIQ2_XS, nil |
|||
return FileTypeQ6_K, nil |
|||
case "Q2_K_S": |
|||
return fileTypeQ2_K_S, nil |
|||
case "IQ3_XS": |
|||
return fileTypeIQ3_XS, nil |
|||
case "IQ3_XXS": |
|||
return fileTypeIQ3_XXS, nil |
|||
case "IQ1_S": |
|||
return fileTypeIQ1_S, nil |
|||
case "IQ4_NL": |
|||
return fileTypeIQ4_NL, nil |
|||
case "IQ3_S": |
|||
return fileTypeIQ3_S, nil |
|||
case "IQ3_M": |
|||
return fileTypeIQ3_M, nil |
|||
case "IQ2_S": |
|||
return fileTypeIQ2_S, nil |
|||
case "IQ2_M": |
|||
return fileTypeIQ2_M, nil |
|||
case "IQ4_XS": |
|||
return fileTypeIQ4_XS, nil |
|||
case "IQ1_M": |
|||
return fileTypeIQ1_M, nil |
|||
return FileTypeQ2_K_S, nil |
|||
case "BF16": |
|||
return fileTypeBF16, nil |
|||
return FileTypeBF16, nil |
|||
default: |
|||
return fileTypeUnknown, fmt.Errorf("unknown fileType: %s", s) |
|||
supportedFileTypes := []FileType{ |
|||
FileTypeF32, |
|||
FileTypeF16, |
|||
FileTypeQ4_K_S, |
|||
FileTypeQ4_K_M, |
|||
FileTypeQ8_0, |
|||
// fsggml.FileTypeBF16, // TODO
|
|||
} |
|||
strs := make([]string, len(supportedFileTypes)) |
|||
for i := range supportedFileTypes { |
|||
strs[i] = supportedFileTypes[i].String() |
|||
} |
|||
|
|||
return FileTypeUnknown, fmt.Errorf("unsupported quantization type %s - supported types are %s", s, strings.Join(strs, ", ")) |
|||
} |
|||
} |
|||
|
|||
func (t fileType) String() string { |
|||
func (t FileType) String() string { |
|||
switch t { |
|||
case fileTypeF32: |
|||
case FileTypeF32: |
|||
return "F32" |
|||
case fileTypeF16: |
|||
case FileTypeF16: |
|||
return "F16" |
|||
case fileTypeQ4_0: |
|||
case FileTypeQ4_0: |
|||
return "Q4_0" |
|||
case fileTypeQ4_1: |
|||
case FileTypeQ4_1: |
|||
return "Q4_1" |
|||
case fileTypeQ4_1_F16: |
|||
return "Q4_1_F16" |
|||
case fileTypeQ8_0: |
|||
case FileTypeQ8_0: |
|||
return "Q8_0" |
|||
case fileTypeQ5_0: |
|||
case FileTypeQ5_0: |
|||
return "Q5_0" |
|||
case fileTypeQ5_1: |
|||
case FileTypeQ5_1: |
|||
return "Q5_1" |
|||
case fileTypeQ2_K: |
|||
case FileTypeQ2_K: |
|||
return "Q2_K" |
|||
case fileTypeQ3_K_S: |
|||
case FileTypeQ3_K_S: |
|||
return "Q3_K_S" |
|||
case fileTypeQ3_K_M: |
|||
case FileTypeQ3_K_M: |
|||
return "Q3_K_M" |
|||
case fileTypeQ3_K_L: |
|||
case FileTypeQ3_K_L: |
|||
return "Q3_K_L" |
|||
case fileTypeQ4_K_S: |
|||
case FileTypeQ4_K_S: |
|||
return "Q4_K_S" |
|||
case fileTypeQ4_K_M: |
|||
case FileTypeQ4_K_M: |
|||
return "Q4_K_M" |
|||
case fileTypeQ5_K_S: |
|||
case FileTypeQ5_K_S: |
|||
return "Q5_K_S" |
|||
case fileTypeQ5_K_M: |
|||
case FileTypeQ5_K_M: |
|||
return "Q5_K_M" |
|||
case fileTypeQ6_K: |
|||
case FileTypeQ6_K: |
|||
return "Q6_K" |
|||
case fileTypeIQ2_XXS: |
|||
return "IQ2_XXS" |
|||
case fileTypeIQ2_XS: |
|||
return "IQ2_XS" |
|||
case fileTypeQ2_K_S: |
|||
case FileTypeQ2_K_S: |
|||
return "Q2_K_S" |
|||
case fileTypeIQ3_XS: |
|||
return "IQ3_XS" |
|||
case fileTypeIQ3_XXS: |
|||
return "IQ3_XXS" |
|||
case fileTypeIQ1_S: |
|||
return "IQ1_S" |
|||
case fileTypeIQ4_NL: |
|||
return "IQ4_NL" |
|||
case fileTypeIQ3_S: |
|||
return "IQ3_S" |
|||
case fileTypeIQ3_M: |
|||
return "IQ3_M" |
|||
case fileTypeIQ2_S: |
|||
return "IQ2_S" |
|||
case fileTypeIQ4_XS: |
|||
return "IQ4_XS" |
|||
case fileTypeIQ2_M: |
|||
return "IQ2_M" |
|||
case fileTypeIQ1_M: |
|||
return "IQ1_M" |
|||
case fileTypeBF16: |
|||
case FileTypeBF16: |
|||
return "BF16" |
|||
default: |
|||
return "unknown" |
|||
} |
|||
} |
|||
|
|||
func (t fileType) Value() uint32 { |
|||
func (t FileType) Value() uint32 { |
|||
return uint32(t) |
|||
} |
|||
|
|||
func (ftype FileType) ToTensorType() TensorType { |
|||
switch ftype { |
|||
case FileTypeF32: |
|||
return TensorTypeF32 |
|||
case FileTypeF16: |
|||
return TensorTypeF16 |
|||
case FileTypeQ4_0: |
|||
return TensorTypeQ4_0 |
|||
case FileTypeQ4_1: |
|||
return TensorTypeQ4_1 |
|||
case FileTypeQ8_0: |
|||
return TensorTypeQ8_0 |
|||
case FileTypeQ5_0: |
|||
return TensorTypeQ5_0 |
|||
case FileTypeQ5_1: |
|||
return TensorTypeQ5_1 |
|||
case FileTypeQ2_K: |
|||
return TensorTypeQ2_K |
|||
case FileTypeQ3_K_S: |
|||
return TensorTypeQ3_K |
|||
case FileTypeQ3_K_M: |
|||
return TensorTypeQ3_K |
|||
case FileTypeQ3_K_L: |
|||
return TensorTypeQ3_K |
|||
case FileTypeQ4_K_S: |
|||
return TensorTypeQ4_K |
|||
case FileTypeQ4_K_M: |
|||
return TensorTypeQ4_K |
|||
case FileTypeQ5_K_S: |
|||
return TensorTypeQ5_K |
|||
case FileTypeQ5_K_M: |
|||
return TensorTypeQ5_K |
|||
case FileTypeQ6_K: |
|||
return TensorTypeQ6_K |
|||
case FileTypeQ2_K_S: |
|||
return TensorTypeQ2_K |
|||
case FileTypeBF16: |
|||
return TensorTypeBF16 |
|||
default: |
|||
slog.Warn("unsupported file type", "type", ftype) |
|||
return 0 // F32
|
|||
} |
|||
} |
|||
|
|||
// TensorType is equivalent to ggml_type for individual tensor types
|
|||
// Note: these are not the same as FileType
|
|||
type TensorType uint32 |
|||
|
|||
const ( |
|||
TensorTypeF32 TensorType = iota |
|||
TensorTypeF16 |
|||
TensorTypeQ4_0 |
|||
TensorTypeQ4_1 |
|||
tensorTypeQ4_2 // unused by GGML
|
|||
tensorTypeQ4_3 // unused by GGML
|
|||
TensorTypeQ5_0 |
|||
TensorTypeQ5_1 |
|||
TensorTypeQ8_0 |
|||
TensorTypeQ8_1 |
|||
TensorTypeQ2_K |
|||
TensorTypeQ3_K |
|||
TensorTypeQ4_K |
|||
TensorTypeQ5_K |
|||
TensorTypeQ6_K |
|||
TensorTypeQ8_K |
|||
tensorTypeIQ2_XXS // not supported by ollama
|
|||
tensorTypeIQ2_XS // not supported by ollama
|
|||
tensorTypeIQ3_XXS // not supported by ollama
|
|||
tensorTypeIQ1_S // not supported by ollama
|
|||
tensorTypeIQ4_NL // not supported by ollama
|
|||
tensorTypeIQ3_S // not supported by ollama
|
|||
tensorTypeIQ2_S // not supported by ollama
|
|||
tensorTypeIQ4_XS // not supported by ollama
|
|||
TensorTypeI8 |
|||
TensorTypeI16 |
|||
TensorTypeI32 |
|||
TensorTypeI64 |
|||
TensorTypeF64 |
|||
tensorTypeIQ1_M // not supported by ollama
|
|||
TensorTypeBF16 |
|||
tensorTypeQ4_0_4_4 // unused by GGML
|
|||
tensorTypeQ4_0_4_8 // unused by GGML
|
|||
tensorTypeQ4_0_8_8 // unused by GGML
|
|||
tensorTypeTQ1_0 // not supported by ollama
|
|||
tensorTypeTQ2_0 // not supported by ollama
|
|||
tensorTypeIQ4_NL_4_4 // unused by GGML
|
|||
tensorTypeIQ4_NL_4_8 // unused by GGML
|
|||
tensorTypeIQ4_NL_8_8 // unused by GGML
|
|||
) |
|||
|
|||
// ParseFileType parses the provided GGUF file type
|
|||
// Only Ollama supported types are considered valid
|
|||
func ParseTensorType(s string) (TensorType, error) { |
|||
switch s { |
|||
case "F32": |
|||
return TensorTypeF32, nil |
|||
case "F16": |
|||
return TensorTypeF16, nil |
|||
case "Q4_0": |
|||
return TensorTypeQ4_0, nil |
|||
case "Q4_1": |
|||
return TensorTypeQ4_1, nil |
|||
case "Q5_0": |
|||
return TensorTypeQ5_0, nil |
|||
case "Q5_1": |
|||
return TensorTypeQ5_1, nil |
|||
case "Q8_0": |
|||
return TensorTypeQ8_0, nil |
|||
case "Q8_1": |
|||
return TensorTypeQ8_1, nil |
|||
case "Q2_K": |
|||
return TensorTypeQ2_K, nil |
|||
case "Q3_K": |
|||
return TensorTypeQ3_K, nil |
|||
case "Q4_K": |
|||
return TensorTypeQ4_K, nil |
|||
case "Q5_K": |
|||
return TensorTypeQ5_K, nil |
|||
case "Q6_K": |
|||
return TensorTypeQ6_K, nil |
|||
case "Q8_K": |
|||
return TensorTypeQ8_K, nil |
|||
case "F64": |
|||
return TensorTypeF64, nil |
|||
case "BF16": |
|||
return TensorTypeBF16, nil |
|||
default: |
|||
return 0, fmt.Errorf("unsupported quantization type %s", s) |
|||
} |
|||
} |
|||
|
|||
func (t TensorType) IsQuantized() bool { |
|||
switch t { |
|||
case TensorTypeF32, TensorTypeF16, TensorTypeBF16: |
|||
return false |
|||
default: |
|||
return true |
|||
} |
|||
} |
|||
|
|||
func (t TensorType) RowSize(ne uint64) uint64 { |
|||
return t.TypeSize() * ne / t.BlockSize() |
|||
} |
|||
|
|||
func (t TensorType) String() string { |
|||
switch t { |
|||
case TensorTypeF32: |
|||
return "F32" |
|||
case TensorTypeF16: |
|||
return "F16" |
|||
case TensorTypeQ4_0: |
|||
return "Q4_0" |
|||
case TensorTypeQ4_1: |
|||
return "Q4_1" |
|||
case TensorTypeQ5_0: |
|||
return "Q5_0" |
|||
case TensorTypeQ5_1: |
|||
return "Q5_1" |
|||
case TensorTypeQ8_0: |
|||
return "Q8_0" |
|||
case TensorTypeQ8_1: |
|||
return "Q8_1" |
|||
case TensorTypeQ2_K: |
|||
return "Q2_K" |
|||
case TensorTypeQ3_K: |
|||
return "Q3_K" |
|||
case TensorTypeQ4_K: |
|||
return "Q4_K" |
|||
case TensorTypeQ5_K: |
|||
return "Q5_K" |
|||
case TensorTypeQ6_K: |
|||
return "Q6_K" |
|||
case TensorTypeQ8_K: |
|||
return "Q8_K" |
|||
case TensorTypeF64: |
|||
return "F64" |
|||
case TensorTypeBF16: |
|||
return "BF16" |
|||
default: |
|||
return "unknown" |
|||
} |
|||
} |
|||
|
|||
@ -0,0 +1,130 @@ |
|||
//go:build integration && models
|
|||
|
|||
package integration |
|||
|
|||
import ( |
|||
"bytes" |
|||
"context" |
|||
"fmt" |
|||
"log/slog" |
|||
"strings" |
|||
"testing" |
|||
"time" |
|||
|
|||
"github.com/ollama/ollama/api" |
|||
) |
|||
|
|||
func TestQuantization(t *testing.T) { |
|||
sourceModels := []string{ |
|||
"qwen2.5:0.5b-instruct-fp16", |
|||
} |
|||
quantizations := []string{ |
|||
"Q8_0", |
|||
"Q4_K_S", |
|||
"Q4_K_M", |
|||
"Q4_K", |
|||
} |
|||
softTimeout, hardTimeout := getTimeouts(t) |
|||
started := time.Now() |
|||
slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout) |
|||
ctx, cancel := context.WithTimeout(context.Background(), hardTimeout) |
|||
defer cancel() |
|||
client, _, cleanup := InitServerConnection(ctx, t) |
|||
defer cleanup() |
|||
|
|||
for _, base := range sourceModels { |
|||
if err := PullIfMissing(ctx, client, base); err != nil { |
|||
t.Fatalf("pull failed %s", err) |
|||
} |
|||
for _, quant := range quantizations { |
|||
newName := fmt.Sprintf("%s__%s", base, quant) |
|||
t.Run(newName, func(t *testing.T) { |
|||
if time.Now().Sub(started) > softTimeout { |
|||
t.Skip("skipping remaining tests to avoid excessive runtime") |
|||
} |
|||
req := &api.CreateRequest{ |
|||
Model: newName, |
|||
Quantization: quant, |
|||
From: base, |
|||
} |
|||
fn := func(resp api.ProgressResponse) error { |
|||
// fmt.Print(".")
|
|||
return nil |
|||
} |
|||
t.Logf("quantizing: %s -> %s", base, quant) |
|||
if err := client.Create(ctx, req, fn); err != nil { |
|||
t.Fatalf("create failed %s", err) |
|||
} |
|||
defer func() { |
|||
req := &api.DeleteRequest{ |
|||
Model: newName, |
|||
} |
|||
t.Logf("deleting: %s -> %s", base, quant) |
|||
if err := client.Delete(ctx, req); err != nil { |
|||
t.Logf("failed to clean up %s: %s", req.Model, err) |
|||
} |
|||
}() |
|||
// Check metadata on the model
|
|||
resp, err := client.Show(ctx, &api.ShowRequest{Name: newName}) |
|||
if err != nil { |
|||
t.Fatalf("unable to show model: %s", err) |
|||
} |
|||
if !strings.Contains(resp.Details.QuantizationLevel, quant) { |
|||
t.Fatalf("unexpected quantization for %s:\ngot: %s", newName, resp.Details.QuantizationLevel) |
|||
} |
|||
|
|||
stream := true |
|||
genReq := api.GenerateRequest{ |
|||
Model: newName, |
|||
Prompt: "why is the sky blue?", |
|||
KeepAlive: &api.Duration{Duration: 3 * time.Second}, |
|||
Options: map[string]any{ |
|||
"seed": 42, |
|||
"temperature": 0.0, |
|||
}, |
|||
Stream: &stream, |
|||
} |
|||
t.Logf("verifying: %s -> %s", base, quant) |
|||
|
|||
// Some smaller quantizations can cause models to have poor quality
|
|||
// or get stuck in repetition loops, so we stop as soon as we have any matches
|
|||
anyResp := []string{"rayleigh", "scattering", "day", "sun", "moon", "color", "nitrogen", "oxygen"} |
|||
reqCtx, reqCancel := context.WithCancel(ctx) |
|||
atLeastOne := false |
|||
var buf bytes.Buffer |
|||
genfn := func(response api.GenerateResponse) error { |
|||
buf.Write([]byte(response.Response)) |
|||
fullResp := strings.ToLower(buf.String()) |
|||
for _, resp := range anyResp { |
|||
if strings.Contains(fullResp, resp) { |
|||
atLeastOne = true |
|||
t.Log(fullResp) |
|||
reqCancel() |
|||
break |
|||
} |
|||
} |
|||
return nil |
|||
} |
|||
|
|||
done := make(chan int) |
|||
var genErr error |
|||
go func() { |
|||
genErr = client.Generate(reqCtx, &genReq, genfn) |
|||
done <- 0 |
|||
}() |
|||
|
|||
select { |
|||
case <-done: |
|||
if genErr != nil && !atLeastOne { |
|||
t.Fatalf("failed with %s request prompt %s ", genReq.Model, genReq.Prompt) |
|||
} |
|||
case <-ctx.Done(): |
|||
t.Error("outer test context done while waiting for generate") |
|||
} |
|||
|
|||
t.Logf("passed") |
|||
|
|||
}) |
|||
} |
|||
} |
|||
} |
|||
@ -1,96 +0,0 @@ |
|||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
|||
From: jmorganca <jmorganca@gmail.com> |
|||
Date: Tue, 8 Apr 2025 20:39:32 -0700 |
|||
Subject: [PATCH] add model quantizations |
|||
|
|||
a temporary patch to add model quantization for |
|||
models not supported in llama.cpp |
|||
---
|
|||
src/llama-arch.cpp | 17 +++++++++++++++++ |
|||
src/llama-arch.h | 1 + |
|||
src/llama-model.cpp | 2 ++ |
|||
src/llama-quant.cpp | 4 ++++ |
|||
4 files changed, 24 insertions(+) |
|||
|
|||
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
|
|||
index eb7b5325..df42d1a5 100644
|
|||
--- a/src/llama-arch.cpp
|
|||
+++ b/src/llama-arch.cpp
|
|||
@@ -74,6 +74,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|||
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" }, |
|||
{ LLM_ARCH_PLM, "plm" }, |
|||
{ LLM_ARCH_BAILINGMOE, "bailingmoe" }, |
|||
+ { LLM_ARCH_MISTRAL3, "mistral3" },
|
|||
{ LLM_ARCH_UNKNOWN, "(unknown)" }, |
|||
}; |
|||
|
|||
@@ -1606,6 +1607,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
|||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, |
|||
}, |
|||
}, |
|||
+ {
|
|||
+ LLM_ARCH_MISTRAL3,
|
|||
+ {
|
|||
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
|||
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
|||
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
|||
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
|||
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
|||
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
|||
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
|||
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
|||
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
|||
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
|||
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
|||
+ }
|
|||
+ },
|
|||
{ |
|||
LLM_ARCH_UNKNOWN, |
|||
{ |
|||
diff --git a/src/llama-arch.h b/src/llama-arch.h
|
|||
index bc8a4f0b..bda9d071 100644
|
|||
--- a/src/llama-arch.h
|
|||
+++ b/src/llama-arch.h
|
|||
@@ -76,6 +76,7 @@ enum llm_arch {
|
|||
LLM_ARCH_CHAMELEON, |
|||
LLM_ARCH_SOLAR, |
|||
LLM_ARCH_WAVTOKENIZER_DEC, |
|||
+ LLM_ARCH_MISTRAL3,
|
|||
LLM_ARCH_PLM, |
|||
LLM_ARCH_BAILINGMOE, |
|||
LLM_ARCH_UNKNOWN, |
|||
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
|
|||
index 9d099f11..ef70486d 100644
|
|||
--- a/src/llama-model.cpp
|
|||
+++ b/src/llama-model.cpp
|
|||
@@ -1437,6 +1437,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
default: type = LLM_TYPE_UNKNOWN; |
|||
} |
|||
} break; |
|||
+ case LLM_ARCH_MISTRAL3: break;
|
|||
default: throw std::runtime_error("unsupported model architecture"); |
|||
} |
|||
|
|||
@@ -13751,6 +13752,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|||
case LLM_ARCH_CHAMELEON: |
|||
case LLM_ARCH_SOLAR: |
|||
case LLM_ARCH_BAILINGMOE: |
|||
+ case LLM_ARCH_MISTRAL3:
|
|||
return LLAMA_ROPE_TYPE_NORM; |
|||
|
|||
// the pairs of head values are offset by n_rot/2 |
|||
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
|
|||
index 223e1f3f..8ae6dde8 100644
|
|||
--- a/src/llama-quant.cpp
|
|||
+++ b/src/llama-quant.cpp
|
|||
@@ -744,6 +744,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
// This used to be a regex, but <regex> has an extreme cost to compile times. |
|||
bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? |
|||
|
|||
+ // don't quantize vision stuff
|
|||
+ quantize &= name.find("v.") == std::string::npos;
|
|||
+ quantize &= name.find("mm.") == std::string::npos;
|
|||
+
|
|||
// quantize only 2D and 3D tensors (experts) |
|||
quantize &= (ggml_n_dims(tensor) >= 2); |
|||
|
|||
@ -0,0 +1,83 @@ |
|||
package ggml |
|||
|
|||
// #cgo CPPFLAGS: -I${SRCDIR}/ggml/src
|
|||
// #include <stdlib.h>
|
|||
// #include <stdint.h>
|
|||
// #include "ggml.h"
|
|||
// #include "ggml-cpu.h"
|
|||
// #include "ggml-backend.h"
|
|||
// #include "ggml-quants.h"
|
|||
import "C" |
|||
|
|||
import ( |
|||
"unsafe" |
|||
|
|||
fsggml "github.com/ollama/ollama/fs/ggml" |
|||
) |
|||
|
|||
// convertToF32 converts (dequantizes) the raw data to F32 so we can then quantize it
|
|||
func ConvertToF32(data []byte, dtype uint32, nelements uint64) []float32 { |
|||
f32s := make([]float32, nelements) |
|||
elems := C.int64_t(nelements) |
|||
switch dtype { |
|||
case C.GGML_TYPE_F16: |
|||
C.ggml_fp16_to_fp32_row((*C.uint16_t)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) |
|||
case C.GGML_TYPE_Q4_0: |
|||
C.dequantize_row_q4_0((*C.block_q4_0)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) |
|||
case C.GGML_TYPE_Q4_1: |
|||
C.dequantize_row_q4_1((*C.block_q4_1)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) |
|||
case C.GGML_TYPE_Q5_0: |
|||
C.dequantize_row_q5_0((*C.block_q5_0)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) |
|||
case C.GGML_TYPE_Q5_1: |
|||
C.dequantize_row_q5_1((*C.block_q5_1)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) |
|||
case C.GGML_TYPE_Q8_0: |
|||
C.dequantize_row_q8_0((*C.block_q8_0)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) |
|||
case C.GGML_TYPE_Q2_K: |
|||
C.dequantize_row_q2_K((*C.block_q2_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) |
|||
case C.GGML_TYPE_Q3_K: |
|||
C.dequantize_row_q3_K((*C.block_q3_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) |
|||
case C.GGML_TYPE_Q4_K: |
|||
C.dequantize_row_q4_K((*C.block_q4_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) |
|||
case C.GGML_TYPE_Q5_K: |
|||
C.dequantize_row_q5_K((*C.block_q5_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) |
|||
case C.GGML_TYPE_Q6_K: |
|||
C.dequantize_row_q6_K((*C.block_q6_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) |
|||
case C.GGML_TYPE_BF16: |
|||
C.ggml_bf16_to_fp32_row((*C.ggml_bf16_t)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) |
|||
default: |
|||
panic("unsupported quantization format") |
|||
} |
|||
return f32s |
|||
} |
|||
|
|||
func Quantize(newType fsggml.TensorType, f32s []float32, shape []uint64) []byte { |
|||
buf := make([]byte, len(f32s)*4) // upper bound on size
|
|||
nPerRow := C.int64_t(shape[0]) |
|||
nrows := C.int64_t(1) |
|||
if len(shape) > 1 { |
|||
nrows = C.int64_t(shape[1]) |
|||
} |
|||
shape2 := C.int64_t(1) |
|||
if len(shape) > 2 { |
|||
shape2 = C.int64_t(shape[2]) |
|||
} |
|||
nelements_matrix := nPerRow * nrows |
|||
newSize := C.size_t(0) |
|||
for i03 := C.int64_t(0); i03 < shape2; i03++ { |
|||
f32s_03 := i03 * nelements_matrix |
|||
buf_03 := C.int64_t(C.ggml_row_size(uint32(newType), nPerRow)) * i03 * nrows |
|||
newSize += C.ggml_quantize_chunk( |
|||
uint32(newType), |
|||
(*C.float)(&f32s[f32s_03]), |
|||
unsafe.Pointer((uintptr)(unsafe.Pointer(&buf[0]))+uintptr(buf_03)), |
|||
0, |
|||
nrows, |
|||
nPerRow, |
|||
nil) |
|||
} |
|||
return buf[:newSize] |
|||
} |
|||
|
|||
func QuantizationVersion() uint32 { |
|||
return uint32(C.GGML_QNT_VERSION) |
|||
} |
|||
@ -0,0 +1,274 @@ |
|||
package server |
|||
|
|||
import ( |
|||
"fmt" |
|||
"io" |
|||
"log/slog" |
|||
"maps" |
|||
"os" |
|||
"strings" |
|||
"unsafe" |
|||
|
|||
fsggml "github.com/ollama/ollama/fs/ggml" |
|||
"github.com/ollama/ollama/ml/backend/ggml" |
|||
) |
|||
|
|||
type quantizer struct { |
|||
*os.File |
|||
offset uint64 |
|||
from, to *fsggml.Tensor |
|||
progressFn func(n uint64) |
|||
} |
|||
|
|||
func (q quantizer) WriteTo(w io.Writer) (int64, error) { |
|||
quantize := q.from.Kind != q.to.Kind |
|||
sr := io.NewSectionReader(q, int64(q.offset), int64(q.from.Size())) |
|||
if !quantize { |
|||
n, err := io.Copy(w, sr) |
|||
q.progressFn(q.from.Size()) |
|||
return n, err |
|||
} |
|||
data, err := io.ReadAll(sr) |
|||
if err != nil { |
|||
slog.Warn("file read error", "tensor", q.from.Name, "file", q.Name(), "error", err) |
|||
return 0, fmt.Errorf("unable to read tensor %s from %s: %s", q.from.Name, q.Name(), err) |
|||
} |
|||
var f32s []float32 |
|||
newType := fsggml.TensorType(q.to.Kind) |
|||
if fsggml.TensorType(q.from.Kind) == fsggml.TensorTypeF32 { |
|||
f32s = unsafe.Slice((*float32)(unsafe.Pointer(&data[0])), q.from.Elements()) |
|||
} else { |
|||
f32s = ggml.ConvertToF32(data, q.from.Kind, q.from.Elements()) |
|||
} |
|||
data = ggml.Quantize(newType, f32s, q.from.Shape) |
|||
n, err := w.Write(data) |
|||
q.progressFn(q.from.Size()) |
|||
return int64(n), err |
|||
} |
|||
|
|||
type quantizeState struct { |
|||
nAttnV int // Number of attn_*v* weight tensors
|
|||
nFfnDown int // Number of ffn_down tensors
|
|||
iAttnV int // Running counter of number of attn_v tensors that have been processed
|
|||
iFfnDown int // Running counter of number of ffn_down tensors that have been processed
|
|||
hasOutput bool // used to figure out if a model shares tok_embd with the output weight
|
|||
} |
|||
|
|||
func useMoreBits(iLayer, nLayers int) bool { |
|||
return iLayer < (nLayers/8) || iLayer >= 7*nLayers/8 || (iLayer-nLayers/8)%3 == 2 |
|||
} |
|||
|
|||
func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType, name string, shape []uint64, ftype fsggml.FileType) fsggml.TensorType { |
|||
// Ported from llama_tensor_get_type, removed unsupported quantization types
|
|||
nExperts := max(1, kv.Uint("expert_count", 0)) |
|||
if name == "output.weight" || name == "output_norm.weight" || (!qs.hasOutput && name == "token_embd.weight") { |
|||
nx := shape[0] |
|||
qk_k := newType.BlockSize() |
|||
if nx%qk_k != 0 { |
|||
newType = fsggml.TensorTypeQ8_0 |
|||
} else if newType != fsggml.TensorTypeQ8_0 { |
|||
newType = fsggml.TensorTypeQ6_K |
|||
} |
|||
} else if strings.Contains(name, "attn_v.weight") { |
|||
if ftype == fsggml.FileTypeQ2_K { |
|||
if kv.GQA() >= 4 { |
|||
newType = fsggml.TensorTypeQ4_K |
|||
} else { |
|||
newType = fsggml.TensorTypeQ3_K |
|||
} |
|||
} else if ftype == fsggml.FileTypeQ2_K_S && kv.GQA() >= 4 { |
|||
newType = fsggml.TensorTypeQ4_K |
|||
} else if ftype == fsggml.FileTypeQ3_K_M { |
|||
if qs.iAttnV < 2 { |
|||
newType = fsggml.TensorTypeQ5_K |
|||
} else { |
|||
newType = fsggml.TensorTypeQ4_K |
|||
} |
|||
} else if ftype == fsggml.FileTypeQ3_K_L { |
|||
newType = fsggml.TensorTypeQ5_K |
|||
} else if (ftype == fsggml.FileTypeQ4_K_M || ftype == fsggml.FileTypeQ5_K_M) && |
|||
useMoreBits(qs.iAttnV, qs.nAttnV) { |
|||
newType = fsggml.TensorTypeQ6_K |
|||
} else if ftype == fsggml.FileTypeQ4_K_S && qs.iAttnV < 4 { |
|||
newType = fsggml.TensorTypeQ5_K |
|||
} |
|||
|
|||
// TODO
|
|||
// if (qs.model.type == LLM_TYPE_70B) {
|
|||
// // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
|
|||
// // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
|
|||
// // nearly negligible increase in model size by quantizing this tensor with more bits:
|
|||
// if (newType == GGML_TYPE_Q3_K || newType == GGML_TYPE_Q4_K) newType = GGML_TYPE_Q5_K;
|
|||
// }
|
|||
|
|||
if nExperts == 8 { |
|||
// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
|
|||
newType = fsggml.TensorTypeQ8_0 |
|||
} |
|||
qs.iAttnV++ |
|||
} else if strings.Contains(name, "attn_k.weight") { |
|||
if nExperts == 8 { |
|||
// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
|
|||
newType = fsggml.TensorTypeQ8_0 |
|||
} |
|||
} else if strings.Contains(name, "ffn_down") { |
|||
iLayer := qs.iFfnDown |
|||
n_layer := qs.nFfnDown |
|||
if ftype == fsggml.FileTypeQ2_K { |
|||
newType = fsggml.TensorTypeQ3_K |
|||
} else if ftype == fsggml.FileTypeQ2_K_S { |
|||
if iLayer < n_layer/8 { |
|||
newType = fsggml.TensorTypeQ4_K |
|||
} |
|||
} else if ftype == fsggml.FileTypeQ3_K_M { |
|||
if iLayer < n_layer/16 { |
|||
newType = fsggml.TensorTypeQ5_K |
|||
} else if useMoreBits(iLayer, n_layer) { |
|||
newType = fsggml.TensorTypeQ4_K |
|||
} else { |
|||
newType = fsggml.TensorTypeQ3_K |
|||
} |
|||
} else if ftype == fsggml.FileTypeQ3_K_L { |
|||
newType = fsggml.TensorTypeQ5_K |
|||
} else if ftype == fsggml.FileTypeQ4_K_M { |
|||
if useMoreBits(iLayer, n_layer) { |
|||
newType = fsggml.TensorTypeQ6_K |
|||
} |
|||
} else if ftype == fsggml.FileTypeQ5_K_M && useMoreBits(iLayer, n_layer) { |
|||
newType = fsggml.TensorTypeQ6_K |
|||
} else if ftype == fsggml.FileTypeQ4_K_S && iLayer < n_layer/8 { |
|||
newType = fsggml.TensorTypeQ5_K |
|||
} |
|||
qs.iFfnDown++ |
|||
} else if strings.Contains(name, "attn_output.weight") { |
|||
if nExperts == 8 { |
|||
if ftype == fsggml.FileTypeQ2_K || ftype == fsggml.FileTypeQ3_K_S || ftype == fsggml.FileTypeQ3_K_M || |
|||
ftype == fsggml.FileTypeQ4_K_S || ftype == fsggml.FileTypeQ4_K_M { |
|||
newType = fsggml.TensorTypeQ5_K |
|||
} |
|||
} else { |
|||
if ftype == fsggml.FileTypeQ2_K { |
|||
newType = fsggml.TensorTypeQ3_K |
|||
} else if ftype == fsggml.FileTypeQ3_K_M { |
|||
newType = fsggml.TensorTypeQ4_K |
|||
} else if ftype == fsggml.FileTypeQ3_K_L { |
|||
newType = fsggml.TensorTypeQ5_K |
|||
} |
|||
} |
|||
} else if strings.Contains(name, "attn_qkv.weight") { |
|||
if ftype == fsggml.FileTypeQ3_K_M || ftype == fsggml.FileTypeQ3_K_L { |
|||
newType = fsggml.TensorTypeQ4_K |
|||
} else if ftype == fsggml.FileTypeQ4_K_M { |
|||
newType = fsggml.TensorTypeQ5_K |
|||
} else if ftype == fsggml.FileTypeQ5_K_M { |
|||
newType = fsggml.TensorTypeQ6_K |
|||
} |
|||
} |
|||
|
|||
if newType.IsQuantized() { |
|||
nx := shape[0] |
|||
ny := uint64(1) |
|||
if len(shape) > 1 { |
|||
ny = shape[1] |
|||
} |
|||
qk_k := newType.BlockSize() |
|||
if nx%qk_k != 0 { |
|||
slog.Warn(fmt.Sprintf("tensor cols %d x %d are not divisible by %d, required for %s. Falling back to quantization %s", nx, ny, qk_k, newType.String(), fsggml.TensorTypeF16.String())) |
|||
newType = fsggml.TensorTypeF16 |
|||
} |
|||
} |
|||
return newType |
|||
} |
|||
|
|||
func quantize(in, out *os.File, orig *fsggml.GGML, newFileType fsggml.FileType, progressFn func(n uint64)) error { |
|||
kv := maps.Clone(orig.KV()) |
|||
kv["general.file_type"] = newFileType |
|||
// kv["general.quantization_version"] = ggml.QuantizationVersion()
|
|||
qs := &quantizeState{} |
|||
// Build up the quantize state so newType can adjust types
|
|||
layerCount := 0 |
|||
for k, l := range orig.Tensors().GroupLayers() { |
|||
if strings.HasPrefix(k, "blk.") { |
|||
layerCount++ |
|||
} |
|||
for _, tensor := range l { |
|||
if strings.Contains(tensor.Name, "attn_v.weight") || |
|||
strings.Contains(tensor.Name, "attn_qkv.weight") || |
|||
strings.Contains(tensor.Name, "attn_kv_b.weight") { |
|||
qs.nAttnV++ |
|||
} else if tensor.Name == "output.weight" { |
|||
qs.hasOutput = true |
|||
} |
|||
} |
|||
} |
|||
qs.nFfnDown = layerCount |
|||
|
|||
origTensors := orig.Tensors().Items() |
|||
outputTensors := make([]*fsggml.Tensor, len(origTensors)) |
|||
for i, tensor := range origTensors { |
|||
tensor := tensor |
|||
newType := newType(tensor, kv, qs, newFileType) |
|||
newTensor := &fsggml.Tensor{ |
|||
Name: tensor.Name, |
|||
Shape: tensor.Shape, |
|||
Kind: uint32(newType), |
|||
} |
|||
outputTensors[i] = newTensor |
|||
outputTensors[i].WriterTo = quantizer{ |
|||
File: in, |
|||
offset: orig.Tensors().Offset + tensor.Offset, |
|||
from: tensor, |
|||
to: newTensor, |
|||
progressFn: progressFn, |
|||
} |
|||
} |
|||
return fsggml.WriteGGUF(out, kv, outputTensors) |
|||
} |
|||
|
|||
func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.FileType) fsggml.TensorType { |
|||
defaultType := ftype.ToTensorType() |
|||
name := t.Name |
|||
quantize := strings.HasSuffix(name, "weight") |
|||
|
|||
// don't quantize vision stuff
|
|||
quantize = quantize && (!strings.Contains(name, "v.") || strings.Contains(name, "_v.")) |
|||
quantize = quantize && !strings.Contains(name, "mm.") |
|||
|
|||
// quantize only 2D and 3D tensors (experts)
|
|||
quantize = quantize && (len(t.Shape) >= 2) |
|||
|
|||
// do not quantize norm tensors
|
|||
quantize = quantize && !strings.Contains(name, "_norm.weight") |
|||
|
|||
// do not quantize expert gating tensors
|
|||
quantize = quantize && !strings.Contains(name, "ffn_gate_inp.weight") |
|||
|
|||
// do not quantize positional embeddings and token types (BERT)
|
|||
quantize = quantize && (name != "position_embd.weight") |
|||
quantize = quantize && (name != "token_types.weight") |
|||
|
|||
// do not quantize Mamba's small yet 2D weights
|
|||
// NOTE: can't use LLM_TN here because the layer number is not known
|
|||
quantize = quantize && !strings.Contains(name, "ssm_conv1d.weight") |
|||
|
|||
// do not quantize RWKV's time_mix_first tensors
|
|||
quantize = quantize && !strings.Contains(name, "time_mix_first.weight") |
|||
quantize = quantize && !strings.Contains(name, "time_mix_w1.weight") |
|||
quantize = quantize && !strings.Contains(name, "time_mix_w2.weight") |
|||
quantize = quantize && !strings.Contains(name, "time_mix_decay_w1.weight") |
|||
quantize = quantize && !strings.Contains(name, "time_mix_decay_w2.weight") |
|||
quantize = quantize && !strings.Contains(name, "time_mix_lerp_fused.weight") |
|||
|
|||
// do not quantize relative position bias (T5)
|
|||
quantize = quantize && !strings.Contains(name, "attn_rel_b.weight") |
|||
|
|||
newType := fsggml.TensorType(t.Kind) |
|||
if quantize { |
|||
// get more optimal quantization type based on the tensor shape, layer, etc.
|
|||
newType = getTensorNewType(kv, qs, defaultType, t.Name, t.Shape, ftype) |
|||
if newType != defaultType { |
|||
slog.Debug("tensor quantization adjusted for better quality", "name", t.Name, "requested", defaultType, "quantization", newType) |
|||
} |
|||
} |
|||
return newType |
|||
} |
|||
@ -0,0 +1,882 @@ |
|||
package server |
|||
|
|||
import ( |
|||
"bytes" |
|||
"fmt" |
|||
"math" |
|||
"os" |
|||
"strings" |
|||
"testing" |
|||
|
|||
fsggml "github.com/ollama/ollama/fs/ggml" |
|||
"github.com/ollama/ollama/ml/backend/ggml" |
|||
) |
|||
|
|||
func TestGetTensorNewType(t *testing.T) { |
|||
cases := []struct { |
|||
name string |
|||
kv map[string]any |
|||
qs quantizeState |
|||
newType fsggml.TensorType |
|||
tensor_name string |
|||
shape []uint64 |
|||
ftype fsggml.FileType |
|||
expected fsggml.TensorType |
|||
expectedPanic string |
|||
}{ |
|||
{ |
|||
name: "output_unsupported", |
|||
kv: map[string]any{}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "output.weight", |
|||
shape: []uint64{100, 100}, |
|||
ftype: fsggml.FileTypeF32, |
|||
expected: fsggml.TensorTypeF16, |
|||
}, |
|||
{ |
|||
name: "output_Q8", |
|||
kv: map[string]any{}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "output.weight", |
|||
shape: []uint64{1024, 1024}, |
|||
ftype: fsggml.FileTypeF32, |
|||
expected: fsggml.TensorTypeQ6_K, |
|||
}, |
|||
{ |
|||
name: "attn_v.weight_q4_k", |
|||
kv: map[string]any{ |
|||
"general.architecture": "foo", |
|||
"foo.attention.head_count": uint32(4), |
|||
"foo.attention.head_count_kv": uint32(1), |
|||
}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "blk.0.attn_v.weight", |
|||
shape: []uint64{256}, |
|||
ftype: fsggml.FileTypeQ2_K, |
|||
expected: fsggml.TensorTypeQ4_K, |
|||
}, |
|||
{ |
|||
name: "attn_v.weight_q3_k", |
|||
kv: map[string]any{}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "blk.0.attn_v.weight", |
|||
shape: []uint64{256}, |
|||
ftype: fsggml.FileTypeQ2_K, |
|||
expected: fsggml.TensorTypeQ3_K, |
|||
}, |
|||
{ |
|||
name: "attn_v.weight_q2_k_s_q4_k", |
|||
kv: map[string]any{ |
|||
"general.architecture": "foo", |
|||
"foo.attention.head_count": uint32(4), |
|||
"foo.attention.head_count_kv": uint32(1), |
|||
}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "blk.0.attn_v.weight", |
|||
shape: []uint64{256}, |
|||
ftype: fsggml.FileTypeQ2_K_S, |
|||
expected: fsggml.TensorTypeQ4_K, |
|||
}, |
|||
{ |
|||
name: "attn_v.weight_q3_k_m", |
|||
kv: map[string]any{}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "blk.0.attn_v.weight", |
|||
shape: []uint64{256}, |
|||
ftype: fsggml.FileTypeQ3_K_M, |
|||
expected: fsggml.TensorTypeQ5_K, |
|||
}, |
|||
{ |
|||
name: "attn_v.weight_q3_k_m_i", |
|||
qs: quantizeState{ |
|||
iAttnV: 2, |
|||
}, |
|||
kv: map[string]any{}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "blk.0.attn_v.weight", |
|||
shape: []uint64{256}, |
|||
ftype: fsggml.FileTypeQ3_K_M, |
|||
expected: fsggml.TensorTypeQ4_K, |
|||
}, |
|||
{ |
|||
name: "attn_v.weight_q3_k_l", |
|||
kv: map[string]any{}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "blk.0.attn_v.weight", |
|||
shape: []uint64{256}, |
|||
ftype: fsggml.FileTypeQ3_K_L, |
|||
expected: fsggml.TensorTypeQ5_K, |
|||
}, |
|||
{ |
|||
name: "attn_v.weight_q4_k_m", |
|||
qs: quantizeState{ |
|||
iAttnV: 2, |
|||
nAttnV: 3 * 8, |
|||
}, |
|||
kv: map[string]any{}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "blk.0.attn_v.weight", |
|||
shape: []uint64{256}, |
|||
ftype: fsggml.FileTypeQ4_K_M, |
|||
expected: fsggml.TensorTypeQ6_K, |
|||
}, |
|||
{ |
|||
name: "attn_v.weight_q4_k_s", |
|||
qs: quantizeState{}, |
|||
kv: map[string]any{}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "blk.0.attn_v.weight", |
|||
shape: []uint64{256}, |
|||
ftype: fsggml.FileTypeQ4_K_S, |
|||
expected: fsggml.TensorTypeQ5_K, |
|||
}, |
|||
{ |
|||
name: "attn_v.weight_8_expert", |
|||
qs: quantizeState{}, |
|||
kv: map[string]any{ |
|||
"general.architecture": "foo", |
|||
"foo.expert_count": uint32(8), |
|||
}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "blk.0.attn_v.weight", |
|||
shape: []uint64{256}, |
|||
ftype: fsggml.FileTypeF32, |
|||
expected: fsggml.TensorTypeQ8_0, |
|||
}, |
|||
{ |
|||
name: "attn_k.weight_8_expert", |
|||
qs: quantizeState{}, |
|||
kv: map[string]any{ |
|||
"general.architecture": "foo", |
|||
"foo.expert_count": uint32(8), |
|||
}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "blk.0.attn_k.weight", |
|||
shape: []uint64{256}, |
|||
ftype: fsggml.FileTypeF32, |
|||
expected: fsggml.TensorTypeQ8_0, |
|||
}, |
|||
{ |
|||
name: "ffn_down_q2_k", |
|||
qs: quantizeState{}, |
|||
kv: map[string]any{}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "ffn_down", |
|||
shape: []uint64{256}, |
|||
ftype: fsggml.FileTypeQ2_K, |
|||
expected: fsggml.TensorTypeQ3_K, |
|||
}, |
|||
{ |
|||
name: "ffn_down_q2_k_s", |
|||
qs: quantizeState{}, |
|||
kv: map[string]any{}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "ffn_down", |
|||
shape: []uint64{256}, |
|||
ftype: fsggml.FileTypeQ2_K_S, |
|||
expected: fsggml.TensorTypeQ4_0, |
|||
}, |
|||
{ |
|||
name: "ffn_down_q2_k_s_layers", |
|||
qs: quantizeState{ |
|||
iFfnDown: 2, |
|||
nFfnDown: 3 * 8, |
|||
}, |
|||
kv: map[string]any{}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "ffn_down", |
|||
shape: []uint64{256}, |
|||
ftype: fsggml.FileTypeQ2_K_S, |
|||
expected: fsggml.TensorTypeQ4_K, |
|||
}, |
|||
{ |
|||
name: "ffn_down_q3_k_m_base", |
|||
qs: quantizeState{ |
|||
iFfnDown: 1, |
|||
nFfnDown: 8, |
|||
}, |
|||
kv: map[string]any{}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "ffn_down", |
|||
shape: []uint64{256}, |
|||
ftype: fsggml.FileTypeQ3_K_M, |
|||
expected: fsggml.TensorTypeQ3_K, |
|||
}, |
|||
{ |
|||
name: "ffn_down_q3_k_m_16", |
|||
qs: quantizeState{ |
|||
iFfnDown: 2, |
|||
nFfnDown: 3 * 16, |
|||
}, |
|||
kv: map[string]any{}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "ffn_down", |
|||
shape: []uint64{256}, |
|||
ftype: fsggml.FileTypeQ3_K_M, |
|||
expected: fsggml.TensorTypeQ5_K, |
|||
}, |
|||
{ |
|||
name: "ffn_down_q3_k_m_8", |
|||
qs: quantizeState{ |
|||
iFfnDown: 2, |
|||
nFfnDown: 3 * 8, |
|||
}, |
|||
kv: map[string]any{}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "ffn_down", |
|||
shape: []uint64{256}, |
|||
ftype: fsggml.FileTypeQ3_K_M, |
|||
expected: fsggml.TensorTypeQ4_K, |
|||
}, |
|||
{ |
|||
name: "ffn_down_q3_k_l", |
|||
qs: quantizeState{}, |
|||
kv: map[string]any{}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "ffn_down", |
|||
shape: []uint64{256}, |
|||
ftype: fsggml.FileTypeQ3_K_L, |
|||
expected: fsggml.TensorTypeQ5_K, |
|||
}, |
|||
{ |
|||
name: "ffn_down_q4_k_m", |
|||
qs: quantizeState{ |
|||
iFfnDown: 1, |
|||
nFfnDown: 8, |
|||
}, |
|||
kv: map[string]any{}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "ffn_down", |
|||
shape: []uint64{256}, |
|||
ftype: fsggml.FileTypeQ4_K_M, |
|||
expected: fsggml.TensorTypeQ4_0, |
|||
}, |
|||
{ |
|||
name: "ffn_down_q4_k_m_6", |
|||
qs: quantizeState{ |
|||
iFfnDown: 2, |
|||
nFfnDown: 3 * 8, |
|||
}, |
|||
kv: map[string]any{}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "ffn_down", |
|||
shape: []uint64{256}, |
|||
ftype: fsggml.FileTypeQ4_K_M, |
|||
expected: fsggml.TensorTypeQ6_K, |
|||
}, |
|||
{ |
|||
name: "ffn_down_q5_k_m", |
|||
qs: quantizeState{ |
|||
iFfnDown: 2, |
|||
nFfnDown: 3 * 8, |
|||
}, |
|||
kv: map[string]any{}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "ffn_down", |
|||
shape: []uint64{256}, |
|||
ftype: fsggml.FileTypeQ5_K_M, |
|||
expected: fsggml.TensorTypeQ6_K, |
|||
}, |
|||
{ |
|||
name: "ffn_down_q4_k_s", |
|||
qs: quantizeState{ |
|||
iFfnDown: 2, |
|||
nFfnDown: 3 * 8, |
|||
}, |
|||
kv: map[string]any{}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "ffn_down", |
|||
shape: []uint64{256}, |
|||
ftype: fsggml.FileTypeQ4_K_S, |
|||
expected: fsggml.TensorTypeQ5_K, |
|||
}, |
|||
{ |
|||
name: "attn_output.weight_8_expert", |
|||
qs: quantizeState{}, |
|||
kv: map[string]any{ |
|||
"general.architecture": "foo", |
|||
"foo.expert_count": uint32(8), |
|||
}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "blk.0.attn_output.weight", |
|||
shape: []uint64{256}, |
|||
ftype: fsggml.FileTypeQ2_K, |
|||
expected: fsggml.TensorTypeQ5_K, |
|||
}, |
|||
{ |
|||
name: "attn_output.weight_q2", |
|||
qs: quantizeState{}, |
|||
kv: map[string]any{}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "blk.0.attn_output.weight", |
|||
shape: []uint64{256}, |
|||
ftype: fsggml.FileTypeQ2_K, |
|||
expected: fsggml.TensorTypeQ3_K, |
|||
}, |
|||
{ |
|||
name: "attn_output.weight_q3_k_m", |
|||
qs: quantizeState{}, |
|||
kv: map[string]any{}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "blk.0.attn_output.weight", |
|||
shape: []uint64{256}, |
|||
ftype: fsggml.FileTypeQ3_K_M, |
|||
expected: fsggml.TensorTypeQ4_K, |
|||
}, |
|||
{ |
|||
name: "attn_output.weight_q3_k_l", |
|||
qs: quantizeState{}, |
|||
kv: map[string]any{}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "blk.0.attn_output.weight", |
|||
shape: []uint64{256}, |
|||
ftype: fsggml.FileTypeQ3_K_L, |
|||
expected: fsggml.TensorTypeQ5_K, |
|||
}, |
|||
{ |
|||
name: "attn_qkv.weight_q3_k_m", |
|||
qs: quantizeState{}, |
|||
kv: map[string]any{}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "blk.0.attn_qkv.weight", |
|||
shape: []uint64{256}, |
|||
ftype: fsggml.FileTypeQ3_K_M, |
|||
expected: fsggml.TensorTypeQ4_K, |
|||
}, |
|||
{ |
|||
name: "attn_qkv.weight_q4_k_m", |
|||
qs: quantizeState{}, |
|||
kv: map[string]any{}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "blk.0.attn_qkv.weight", |
|||
shape: []uint64{256}, |
|||
ftype: fsggml.FileTypeQ4_K_M, |
|||
expected: fsggml.TensorTypeQ5_K, |
|||
}, |
|||
{ |
|||
name: "attn_qkv.weight_q5_k_m", |
|||
qs: quantizeState{}, |
|||
kv: map[string]any{}, |
|||
newType: fsggml.TensorTypeQ4_0, |
|||
tensor_name: "blk.0.attn_qkv.weight", |
|||
shape: []uint64{256}, |
|||
ftype: fsggml.FileTypeQ5_K_M, |
|||
expected: fsggml.TensorTypeQ6_K, |
|||
}, |
|||
} |
|||
for _, tt := range cases { |
|||
t.Run(tt.name, func(t *testing.T) { |
|||
if tt.expectedPanic != "" { |
|||
defer func() { |
|||
e := recover() |
|||
if !strings.Contains(fmt.Sprintf("%v", e), tt.expectedPanic) { |
|||
t.Fatalf("incorrect panic\ngot: %v\nexpected: %s", e, tt.expectedPanic) |
|||
} |
|||
}() |
|||
} else { |
|||
defer func() { |
|||
e := recover() |
|||
if e != nil { |
|||
t.Fatalf("hit unexpected panic %v", e) |
|||
} |
|||
}() |
|||
} |
|||
ret := getTensorNewType(tt.kv, &tt.qs, tt.newType, tt.tensor_name, tt.shape, tt.ftype) |
|||
if ret != tt.expected { |
|||
t.Fatalf("incorrect type returned\ngot: %d\nexpected: %d", ret, tt.expected) |
|||
} |
|||
}) |
|||
} |
|||
} |
|||
|
|||
func TestQuantizeModel(t *testing.T) { |
|||
cases := []struct { |
|||
name string |
|||
kv map[string]any |
|||
tensors []*fsggml.Tensor |
|||
newType string |
|||
expectedTensorTypes map[string]fsggml.TensorType |
|||
}{ |
|||
{ |
|||
name: "f16_q4_k", |
|||
kv: map[string]any{ |
|||
"general.architecture": "foo", |
|||
}, |
|||
tensors: []*fsggml.Tensor{ |
|||
{ |
|||
Name: "blk.0.attn.weight", Kind: uint32(fsggml.TensorTypeF16), |
|||
Offset: uint64(0), Shape: []uint64{512, 2}, |
|||
WriterTo: bytes.NewReader( |
|||
append(append(append(quantBytes[fsggml.TensorTypeF16], quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), |
|||
), |
|||
}, |
|||
{ |
|||
Name: "output.weight", Kind: uint32(fsggml.TensorTypeF16), |
|||
Offset: uint64(0), Shape: []uint64{256, 4}, |
|||
WriterTo: bytes.NewReader( |
|||
append(append(append(quantBytes[fsggml.TensorTypeF16], quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), |
|||
), |
|||
}, |
|||
}, |
|||
newType: "Q4_K", |
|||
expectedTensorTypes: map[string]fsggml.TensorType{ |
|||
"blk.0.attn.weight": fsggml.TensorTypeQ4_K, |
|||
"output.weight": fsggml.TensorTypeQ6_K, |
|||
}, |
|||
}, |
|||
{ |
|||
name: "f32_q4_k", |
|||
kv: map[string]any{ |
|||
"general.architecture": "foo", |
|||
}, |
|||
tensors: []*fsggml.Tensor{ |
|||
{ |
|||
Name: "blk.0.attn_v.weight", Kind: uint32(fsggml.TensorTypeF32), |
|||
Offset: uint64(0), Shape: []uint64{512, 2}, |
|||
WriterTo: bytes.NewReader( |
|||
append(append(append(quantBytes[fsggml.TensorTypeF32], quantBytes[fsggml.TensorTypeF32]...), quantBytes[fsggml.TensorTypeF32]...), quantBytes[fsggml.TensorTypeF32]...), |
|||
), |
|||
}, |
|||
{ |
|||
Name: "output.weight", Kind: uint32(fsggml.TensorTypeF32), |
|||
Offset: uint64(0), Shape: []uint64{512}, |
|||
WriterTo: bytes.NewReader(append(quantBytes[fsggml.TensorTypeF32], quantBytes[fsggml.TensorTypeF32]...)), |
|||
}, |
|||
}, |
|||
newType: "Q4_K", |
|||
expectedTensorTypes: map[string]fsggml.TensorType{ |
|||
"blk.0.attn_v.weight": fsggml.TensorTypeQ6_K, |
|||
"output.weight": fsggml.TensorTypeF32, |
|||
}, |
|||
}, |
|||
{ |
|||
name: "f16_q8_0", |
|||
kv: map[string]any{ |
|||
"general.architecture": "foo", |
|||
}, |
|||
tensors: []*fsggml.Tensor{ |
|||
{ |
|||
Name: "blk.0.attn.weight", Kind: uint32(fsggml.TensorTypeF16), |
|||
Offset: uint64(0), Shape: []uint64{32, 16, 2}, |
|||
WriterTo: bytes.NewReader( |
|||
append(append(append(quantBytes[fsggml.TensorTypeF16], quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), |
|||
), |
|||
}, |
|||
{ |
|||
Name: "output.weight", Kind: uint32(fsggml.TensorTypeF16), |
|||
Offset: uint64(0), Shape: []uint64{256, 4}, |
|||
WriterTo: bytes.NewReader( |
|||
append(append(append(quantBytes[fsggml.TensorTypeF16], quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), |
|||
), |
|||
}, |
|||
}, |
|||
newType: "Q8_0", |
|||
expectedTensorTypes: map[string]fsggml.TensorType{ |
|||
"blk.0.attn.weight": fsggml.TensorTypeQ8_0, |
|||
"output.weight": fsggml.TensorTypeQ8_0, |
|||
}, |
|||
}, |
|||
} |
|||
|
|||
for _, tt := range cases { |
|||
t.Run(tt.name, func(t *testing.T) { |
|||
f, err := os.CreateTemp(t.TempDir(), tt.name) |
|||
if err != nil { |
|||
t.Fatal(err.Error()) |
|||
} |
|||
defer f.Close() |
|||
err = fsggml.WriteGGUF(f, tt.kv, tt.tensors) |
|||
if err != nil { |
|||
t.Fatalf("failed to create initial model: %s", err) |
|||
} |
|||
fp, err := os.Open(f.Name()) |
|||
if err != nil { |
|||
t.Fatal(err.Error()) |
|||
} |
|||
defer fp.Close() |
|||
meta, _, err := fsggml.Decode(fp, -1) |
|||
if err != nil { |
|||
t.Fatal(err.Error()) |
|||
} |
|||
progressCalled := false |
|||
progress := func(n uint64) { |
|||
// fmt.Fprintf(os.Stderr, "progress: %f\n", p)
|
|||
progressCalled = true |
|||
} |
|||
tmp, err := os.CreateTemp(t.TempDir(), tt.name+".out") |
|||
if err != nil { |
|||
t.Fatal(err.Error()) |
|||
} |
|||
defer tmp.Close() |
|||
ftype, err := fsggml.ParseFileType(tt.newType) |
|||
if err != nil { |
|||
t.Fatal(err.Error()) |
|||
} |
|||
|
|||
err = quantize(fp, tmp, meta, ftype, progress) |
|||
if err != nil { |
|||
t.Fatalf("error during quantize: %s", err) |
|||
} |
|||
if !progressCalled { |
|||
t.Fatalf("progress was not reported") |
|||
} |
|||
// Now attempt to load it back and make sure types match expected
|
|||
fpNew, err := os.Open(tmp.Name()) |
|||
if err != nil { |
|||
t.Fatalf("failed to load the quantized model %s: %s", tmp.Name(), err) |
|||
} |
|||
defer fpNew.Close() |
|||
newMeta, _, err := fsggml.Decode(fpNew, -1) |
|||
if err != nil { |
|||
t.Fatalf("failed to load the quantized model %s: %s", tmp.Name(), err) |
|||
} |
|||
tensors := newMeta.Tensors() |
|||
for _, l := range tensors.GroupLayers() { |
|||
for _, tensor := range l { |
|||
if fsggml.TensorType(tensor.Kind) != tt.expectedTensorTypes[tensor.Name] { |
|||
t.Fatalf("incorrect output type for %s\ngot:%s\nexpected:%s", tensor.Name, fsggml.TensorType(tensor.Kind), tt.expectedTensorTypes[tensor.Name]) |
|||
} |
|||
} |
|||
} |
|||
}) |
|||
} |
|||
} |
|||
|
|||
func TestConvertToF32(t *testing.T) { |
|||
expected := make([]float32, 256) |
|||
for i := range expected { |
|||
expected[i] = float32(i) |
|||
} |
|||
for dtype, data := range quantBytes { |
|||
// Skip the no-op
|
|||
if dtype == fsggml.TensorTypeF32 { |
|||
continue |
|||
} |
|||
t.Run(dtype.String(), func(t *testing.T) { |
|||
fp32 := ggml.ConvertToF32(data, uint32(dtype), 256) |
|||
similarity := cosineSimilarity(expected, fp32) |
|||
if similarity < 0.999 { |
|||
t.Fatalf("Results not similar enough: %s %f", dtype.String(), similarity) |
|||
} |
|||
}) |
|||
} |
|||
} |
|||
|
|||
func dotProduct[V float32 | float64](v1, v2 []V) V { |
|||
var result V = 0 |
|||
for i := range v1 { |
|||
result += v1[i] * v2[i] |
|||
} |
|||
return result |
|||
} |
|||
|
|||
func magnitude[V float32 | float64](v []V) V { |
|||
var result V = 0 |
|||
for _, val := range v { |
|||
result += val * val |
|||
} |
|||
return V(math.Sqrt(float64(result))) |
|||
} |
|||
|
|||
func cosineSimilarity[V float32 | float64](v1, v2 []V) V { |
|||
return dotProduct(v1, v2) / (magnitude(v1) * magnitude(v2)) |
|||
} |
|||
|
|||
// Precomputed quantized data - arange 256
|
|||
// # For gguf-py supported types
|
|||
// import gguf
|
|||
// import numpy as np
|
|||
// print(repr(gguf.quantize(np.arange(256, dtype=np.float16), gguf.GGMLQuantizationType.Q4_0)))
|
|||
//
|
|||
// For types not supported by gguf-py converted via ggml_fp32_to_fp16_row and quantize_XXX
|
|||
//
|
|||
// data := make([]byte, 256*2)
|
|||
// fp32 := make([]float32, 256)
|
|||
// for i := range 256 {
|
|||
// fp32[i] = float32(i)
|
|||
// }
|
|||
// l := C.quantize_q6_K((*C.float)(&fp32[0]), unsafe.Pointer(&data[0]), 1, 256, nil)
|
|||
// for i := range data[:int(l)] {
|
|||
// fmt.Printf("%d, ", data[i])
|
|||
// }
|
|||
var ( |
|||
quantBytes = map[fsggml.TensorType][]byte{ |
|||
fsggml.TensorTypeQ4_0: { |
|||
192, 195, 72, 72, 55, 55, 55, 55, 38, 38, 38, 38, 21, |
|||
21, 21, 21, 4, 4, 224, 199, 36, 36, 36, 36, 19, 19, |
|||
19, 19, 19, 19, 19, 19, 2, 2, 2, 2, 240, 201, 19, |
|||
19, 18, 18, 18, 18, 18, 18, 18, 18, 2, 2, 2, 2, |
|||
1, 1, 240, 203, 18, 18, 18, 18, 18, 18, 18, 18, 1, |
|||
1, 1, 1, 1, 1, 1, 1, 248, 204, 18, 18, 17, 17, |
|||
17, 17, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 248, |
|||
205, 17, 17, 17, 17, 1, 1, 1, 1, 1, 1, 1, 1, |
|||
1, 1, 1, 1, 248, 206, 17, 17, 1, 1, 1, 1, 1, |
|||
1, 1, 1, 1, 1, 1, 1, 1, 1, 248, 207, 1, 1, |
|||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
|||
1, |
|||
}, |
|||
fsggml.TensorTypeQ4_1: { |
|||
34, 64, 0, 0, 128, 128, 145, 145, 162, 162, 179, 179, 196, |
|||
196, 213, 213, 230, 230, 247, 247, 34, 64, 0, 80, 128, 128, |
|||
145, 145, 162, 162, 179, 179, 196, 196, 213, 213, 230, 230, 247, |
|||
247, 34, 64, 0, 84, 128, 128, 145, 145, 162, 162, 179, 179, |
|||
196, 196, 213, 213, 230, 230, 247, 247, 34, 64, 0, 86, 128, |
|||
128, 145, 145, 162, 162, 179, 179, 196, 196, 213, 213, 230, 230, |
|||
247, 247, 34, 64, 0, 88, 128, 128, 145, 145, 162, 162, 179, |
|||
179, 196, 196, 213, 213, 230, 230, 247, 247, 34, 64, 0, 89, |
|||
128, 128, 145, 145, 162, 162, 179, 179, 196, 196, 213, 213, 230, |
|||
230, 247, 247, 34, 64, 0, 90, 128, 128, 145, 145, 162, 162, |
|||
179, 179, 196, 196, 213, 213, 230, 230, 247, 247, 34, 64, 0, |
|||
91, 128, 128, 145, 145, 162, 162, 179, 179, 196, 196, 213, 213, |
|||
230, 230, 247, 247, |
|||
}, |
|||
fsggml.TensorTypeQ5_0: { |
|||
192, 191, 1, 0, 0, 0, 128, 127, 127, 110, 110, 93, 93, |
|||
76, 76, 59, 59, 42, 42, 25, 25, 8, 224, 195, 0, 0, |
|||
0, 0, 72, 72, 55, 55, 55, 55, 38, 38, 38, 38, 21, |
|||
21, 21, 21, 4, 4, 240, 197, 0, 0, 0, 0, 53, 37, |
|||
37, 37, 37, 36, 36, 20, 20, 20, 20, 19, 19, 3, 3, |
|||
3, 240, 199, 0, 0, 0, 0, 36, 36, 36, 36, 19, 19, |
|||
19, 19, 19, 19, 19, 19, 2, 2, 2, 2, 248, 200, 0, |
|||
0, 0, 0, 35, 19, 19, 19, 19, 19, 19, 18, 18, 18, |
|||
18, 2, 2, 2, 2, 2, 248, 201, 0, 0, 0, 0, 19, |
|||
19, 18, 18, 18, 18, 18, 18, 18, 18, 2, 2, 2, 2, |
|||
1, 1, 248, 202, 0, 0, 0, 0, 18, 18, 18, 18, 18, |
|||
18, 18, 18, 18, 2, 2, 1, 1, 1, 1, 1, 248, 203, |
|||
0, 0, 0, 0, 18, 18, 18, 18, 18, 18, 18, 18, 1, |
|||
1, 1, 1, 1, 1, 1, 1, |
|||
}, |
|||
fsggml.TensorTypeQ5_1: { |
|||
0, 60, 0, 0, 0, 0, 255, 255, 0, 17, 34, 51, 68, |
|||
85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255, 0, 60, |
|||
0, 80, 0, 0, 255, 255, 0, 17, 34, 51, 68, 85, 102, |
|||
119, 136, 153, 170, 187, 204, 221, 238, 255, 0, 60, 0, 84, |
|||
0, 0, 255, 255, 0, 17, 34, 51, 68, 85, 102, 119, 136, |
|||
153, 170, 187, 204, 221, 238, 255, 0, 60, 0, 86, 0, 0, |
|||
255, 255, 0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, |
|||
187, 204, 221, 238, 255, 0, 60, 0, 88, 0, 0, 255, 255, |
|||
0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, |
|||
221, 238, 255, 0, 60, 0, 89, 0, 0, 255, 255, 0, 17, |
|||
34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, |
|||
255, 0, 60, 0, 90, 0, 0, 255, 255, 0, 17, 34, 51, |
|||
68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255, 0, |
|||
60, 0, 91, 0, 0, 255, 255, 0, 17, 34, 51, 68, 85, |
|||
102, 119, 136, 153, 170, 187, 204, 221, 238, 255, |
|||
}, |
|||
fsggml.TensorTypeQ8_0: { |
|||
208, 51, 0, 4, 8, 12, 16, 20, 25, 29, 33, 37, 41, |
|||
45, 49, 53, 57, 61, 66, 70, 74, 78, 82, 86, 90, 94, |
|||
98, 102, 107, 111, 115, 119, 123, 127, 240, 55, 65, 67, 69, |
|||
71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95, |
|||
97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, |
|||
123, 125, 127, 252, 57, 86, 87, 88, 90, 91, 92, 94, 95, |
|||
96, 98, 99, 100, 102, 103, 104, 106, 107, 108, 110, 111, 112, |
|||
114, 115, 116, 118, 119, 120, 122, 123, 124, 126, 127, 0, 60, |
|||
96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, |
|||
109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, |
|||
122, 123, 124, 125, 126, 127, 2, 61, 102, 103, 104, 105, 105, |
|||
106, 107, 108, 109, 109, 110, 111, 112, 113, 113, 114, 115, 116, |
|||
117, 117, 118, 119, 120, 121, 121, 122, 123, 124, 125, 125, 126, |
|||
127, 4, 62, 106, 107, 108, 108, 109, 110, 110, 111, 112, 112, |
|||
113, 114, 114, 115, 116, 116, 117, 118, 118, 119, 120, 120, 121, |
|||
122, 122, 123, 124, 124, 125, 126, 126, 127, 6, 63, 109, 110, |
|||
110, 111, 112, 112, 113, 113, 114, 114, 115, 116, 116, 117, 117, |
|||
118, 118, 119, 120, 120, 121, 121, 122, 122, 123, 124, 124, 125, |
|||
125, 126, 126, 127, 4, 64, 112, 112, 113, 113, 114, 114, 115, |
|||
115, 116, 116, 117, 117, 118, 118, 119, 119, 120, 120, 121, 121, |
|||
122, 122, 123, 123, 124, 124, 125, 125, 126, 126, 127, 127, |
|||
}, |
|||
fsggml.TensorTypeBF16: { |
|||
0, 0, 128, 63, 0, 64, 64, 64, 128, 64, 160, 64, 192, |
|||
64, 224, 64, 0, 65, 16, 65, 32, 65, 48, 65, 64, 65, |
|||
80, 65, 96, 65, 112, 65, 128, 65, 136, 65, 144, 65, 152, |
|||
65, 160, 65, 168, 65, 176, 65, 184, 65, 192, 65, 200, 65, |
|||
208, 65, 216, 65, 224, 65, 232, 65, 240, 65, 248, 65, 0, |
|||
66, 4, 66, 8, 66, 12, 66, 16, 66, 20, 66, 24, 66, |
|||
28, 66, 32, 66, 36, 66, 40, 66, 44, 66, 48, 66, 52, |
|||
66, 56, 66, 60, 66, 64, 66, 68, 66, 72, 66, 76, 66, |
|||
80, 66, 84, 66, 88, 66, 92, 66, 96, 66, 100, 66, 104, |
|||
66, 108, 66, 112, 66, 116, 66, 120, 66, 124, 66, 128, 66, |
|||
130, 66, 132, 66, 134, 66, 136, 66, 138, 66, 140, 66, 142, |
|||
66, 144, 66, 146, 66, 148, 66, 150, 66, 152, 66, 154, 66, |
|||
156, 66, 158, 66, 160, 66, 162, 66, 164, 66, 166, 66, 168, |
|||
66, 170, 66, 172, 66, 174, 66, 176, 66, 178, 66, 180, 66, |
|||
182, 66, 184, 66, 186, 66, 188, 66, 190, 66, 192, 66, 194, |
|||
66, 196, 66, 198, 66, 200, 66, 202, 66, 204, 66, 206, 66, |
|||
208, 66, 210, 66, 212, 66, 214, 66, 216, 66, 218, 66, 220, |
|||
66, 222, 66, 224, 66, 226, 66, 228, 66, 230, 66, 232, 66, |
|||
234, 66, 236, 66, 238, 66, 240, 66, 242, 66, 244, 66, 246, |
|||
66, 248, 66, 250, 66, 252, 66, 254, 66, 0, 67, 1, 67, |
|||
2, 67, 3, 67, 4, 67, 5, 67, 6, 67, 7, 67, 8, |
|||
67, 9, 67, 10, 67, 11, 67, 12, 67, 13, 67, 14, 67, |
|||
15, 67, 16, 67, 17, 67, 18, 67, 19, 67, 20, 67, 21, |
|||
67, 22, 67, 23, 67, 24, 67, 25, 67, 26, 67, 27, 67, |
|||
28, 67, 29, 67, 30, 67, 31, 67, 32, 67, 33, 67, 34, |
|||
67, 35, 67, 36, 67, 37, 67, 38, 67, 39, 67, 40, 67, |
|||
41, 67, 42, 67, 43, 67, 44, 67, 45, 67, 46, 67, 47, |
|||
67, 48, 67, 49, 67, 50, 67, 51, 67, 52, 67, 53, 67, |
|||
54, 67, 55, 67, 56, 67, 57, 67, 58, 67, 59, 67, 60, |
|||
67, 61, 67, 62, 67, 63, 67, 64, 67, 65, 67, 66, 67, |
|||
67, 67, 68, 67, 69, 67, 70, 67, 71, 67, 72, 67, 73, |
|||
67, 74, 67, 75, 67, 76, 67, 77, 67, 78, 67, 79, 67, |
|||
80, 67, 81, 67, 82, 67, 83, 67, 84, 67, 85, 67, 86, |
|||
67, 87, 67, 88, 67, 89, 67, 90, 67, 91, 67, 92, 67, |
|||
93, 67, 94, 67, 95, 67, 96, 67, 97, 67, 98, 67, 99, |
|||
67, 100, 67, 101, 67, 102, 67, 103, 67, 104, 67, 105, 67, |
|||
106, 67, 107, 67, 108, 67, 109, 67, 110, 67, 111, 67, 112, |
|||
67, 113, 67, 114, 67, 115, 67, 116, 67, 117, 67, 118, 67, |
|||
119, 67, 120, 67, 121, 67, 122, 67, 123, 67, 124, 67, 125, |
|||
67, 126, 67, 127, 67, |
|||
}, |
|||
fsggml.TensorTypeF16: { |
|||
0, 0, 0, 60, 0, 64, 0, 66, 0, 68, 0, 69, 0, 70, 0, 71, 0, |
|||
72, 128, 72, 0, 73, 128, 73, 0, 74, 128, 74, 0, 75, 128, 75, |
|||
0, 76, 64, 76, 128, 76, 192, 76, 0, 77, 64, 77, 128, 77, 192, |
|||
77, 0, 78, 64, 78, 128, 78, 192, 78, 0, 79, 64, 79, 128, 79, |
|||
192, 79, 0, 80, 32, 80, 64, 80, 96, 80, 128, 80, 160, 80, |
|||
192, 80, 224, 80, 0, 81, 32, 81, 64, 81, 96, 81, 128, 81, |
|||
160, 81, 192, 81, 224, 81, 0, 82, 32, 82, 64, 82, 96, 82, |
|||
128, 82, 160, 82, 192, 82, 224, 82, 0, 83, 32, 83, 64, 83, |
|||
96, 83, 128, 83, 160, 83, 192, 83, 224, 83, 0, 84, 16, 84, |
|||
32, 84, 48, 84, 64, 84, 80, 84, 96, 84, 112, 84, 128, 84, |
|||
144, 84, 160, 84, 176, 84, 192, 84, 208, 84, 224, 84, 240, |
|||
84, 0, 85, 16, 85, 32, 85, 48, 85, 64, 85, 80, 85, 96, 85, |
|||
112, 85, 128, 85, 144, 85, 160, 85, 176, 85, 192, 85, 208, |
|||
85, 224, 85, 240, 85, 0, 86, 16, 86, 32, 86, 48, 86, 64, |
|||
86, 80, 86, 96, 86, 112, 86, 128, 86, 144, 86, 160, 86, |
|||
176, 86, 192, 86, 208, 86, 224, 86, 240, 86, 0, 87, 16, |
|||
87, 32, 87, 48, 87, 64, 87, 80, 87, 96, 87, 112, 87, 128, |
|||
87, 144, 87, 160, 87, 176, 87, 192, 87, 208, 87, 224, 87, |
|||
240, 87, 0, 88, 8, 88, 16, 88, 24, 88, 32, 88, 40, 88, |
|||
48, 88, 56, 88, 64, 88, 72, 88, 80, 88, 88, 88, 96, 88, |
|||
104, 88, 112, 88, 120, 88, 128, 88, 136, 88, 144, 88, 152, |
|||
88, 160, 88, 168, 88, 176, 88, 184, 88, 192, 88, 200, 88, |
|||
208, 88, 216, 88, 224, 88, 232, 88, 240, 88, 248, 88, 0, |
|||
89, 8, 89, 16, 89, 24, 89, 32, 89, 40, 89, 48, 89, 56, 89, |
|||
64, 89, 72, 89, 80, 89, 88, 89, 96, 89, 104, 89, 112, 89, |
|||
120, 89, 128, 89, 136, 89, 144, 89, 152, 89, 160, 89, 168, |
|||
89, 176, 89, 184, 89, 192, 89, 200, 89, 208, 89, 216, 89, |
|||
224, 89, 232, 89, 240, 89, 248, 89, 0, 90, 8, 90, 16, 90, |
|||
24, 90, 32, 90, 40, 90, 48, 90, 56, 90, 64, 90, 72, 90, 80, |
|||
90, 88, 90, 96, 90, 104, 90, 112, 90, 120, 90, 128, 90, |
|||
136, 90, 144, 90, 152, 90, 160, 90, 168, 90, 176, 90, 184, |
|||
90, 192, 90, 200, 90, 208, 90, 216, 90, 224, 90, 232, 90, |
|||
240, 90, 248, 90, 0, 91, 8, 91, 16, 91, 24, 91, 32, 91, 40, |
|||
91, 48, 91, 56, 91, 64, 91, 72, 91, 80, 91, 88, 91, 96, 91, |
|||
104, 91, 112, 91, 120, 91, 128, 91, 136, 91, 144, 91, 152, |
|||
91, 160, 91, 168, 91, 176, 91, 184, 91, 192, 91, 200, 91, |
|||
208, 91, 216, 91, 224, 91, 232, 91, 240, 91, 248, 91, |
|||
}, |
|||
fsggml.TensorTypeF32: { |
|||
0, 0, 0, 0, 0, 0, 128, 63, 0, 0, 0, 64, 0, 0, 64, 64, 0, 0, 128, |
|||
64, 0, 0, 160, 64, 0, 0, 192, 64, 0, 0, 224, 64, 0, 0, 0, 65, 0, |
|||
0, 16, 65, 0, 0, 32, 65, 0, 0, 48, 65, 0, 0, 64, 65, 0, 0, 80, 65, |
|||
0, 0, 96, 65, 0, 0, 112, 65, 0, 0, 128, 65, 0, 0, 136, 65, 0, 0, |
|||
144, 65, 0, 0, 152, 65, 0, 0, 160, 65, 0, 0, 168, 65, 0, 0, 176, |
|||
65, 0, 0, 184, 65, 0, 0, 192, 65, 0, 0, 200, 65, 0, 0, 208, 65, 0, |
|||
0, 216, 65, 0, 0, 224, 65, 0, 0, 232, 65, 0, 0, 240, 65, 0, 0, 248, |
|||
65, 0, 0, 0, 66, 0, 0, 4, 66, 0, 0, 8, 66, 0, 0, 12, 66, 0, 0, 16, |
|||
66, 0, 0, 20, 66, 0, 0, 24, 66, 0, 0, 28, 66, 0, 0, 32, 66, 0, 0, |
|||
36, 66, 0, 0, 40, 66, 0, 0, 44, 66, 0, 0, 48, 66, 0, 0, 52, 66, 0, |
|||
0, 56, 66, 0, 0, 60, 66, 0, 0, 64, 66, 0, 0, 68, 66, 0, 0, 72, 66, |
|||
0, 0, 76, 66, 0, 0, 80, 66, 0, 0, 84, 66, 0, 0, 88, 66, 0, 0, 92, 66, |
|||
0, 0, 96, 66, 0, 0, 100, 66, 0, 0, 104, 66, 0, 0, 108, 66, 0, 0, 112, |
|||
66, 0, 0, 116, 66, 0, 0, 120, 66, 0, 0, 124, 66, 0, 0, 128, 66, 0, 0, |
|||
130, 66, 0, 0, 132, 66, 0, 0, 134, 66, 0, 0, 136, 66, 0, 0, 138, 66, |
|||
0, 0, 140, 66, 0, 0, 142, 66, 0, 0, 144, 66, 0, 0, 146, 66, 0, 0, 148, |
|||
66, 0, 0, 150, 66, 0, 0, 152, 66, 0, 0, 154, 66, 0, 0, 156, 66, 0, 0, |
|||
158, 66, 0, 0, 160, 66, 0, 0, 162, 66, 0, 0, 164, 66, 0, 0, 166, 66, |
|||
0, 0, 168, 66, 0, 0, 170, 66, 0, 0, 172, 66, 0, 0, 174, 66, 0, 0, 176, |
|||
66, 0, 0, 178, 66, 0, 0, 180, 66, 0, 0, 182, 66, 0, 0, 184, 66, 0, 0, |
|||
186, 66, 0, 0, 188, 66, 0, 0, 190, 66, 0, 0, 192, 66, 0, 0, 194, 66, 0, |
|||
0, 196, 66, 0, 0, 198, 66, 0, 0, 200, 66, 0, 0, 202, 66, 0, 0, 204, 66, |
|||
0, 0, 206, 66, 0, 0, 208, 66, 0, 0, 210, 66, 0, 0, 212, 66, 0, 0, 214, 66, |
|||
0, 0, 216, 66, 0, 0, 218, 66, 0, 0, 220, 66, 0, 0, 222, 66, 0, 0, 224, 66, |
|||
0, 0, 226, 66, 0, 0, 228, 66, 0, 0, 230, 66, 0, 0, 232, 66, 0, 0, 234, 66, |
|||
0, 0, 236, 66, 0, 0, 238, 66, 0, 0, 240, 66, 0, 0, 242, 66, 0, 0, 244, 66, |
|||
0, 0, 246, 66, 0, 0, 248, 66, 0, 0, 250, 66, 0, 0, 252, 66, 0, 0, 254, 66, |
|||
0, 0, 0, 67, 0, 0, 1, 67, 0, 0, 2, 67, 0, 0, 3, 67, 0, 0, 4, 67, 0, 0, 5, 67, |
|||
0, 0, 6, 67, 0, 0, 7, 67, 0, 0, 8, 67, 0, 0, 9, 67, 0, 0, 10, 67, 0, 0, 11, |
|||
67, 0, 0, 12, 67, 0, 0, 13, 67, 0, 0, 14, 67, 0, 0, 15, 67, 0, 0, 16, 67, |
|||
0, 0, 17, 67, 0, 0, 18, 67, 0, 0, 19, 67, 0, 0, 20, 67, 0, 0, 21, 67, 0, 0, |
|||
22, 67, 0, 0, 23, 67, 0, 0, 24, 67, 0, 0, 25, 67, 0, 0, 26, 67, 0, 0, 27, |
|||
67, 0, 0, 28, 67, 0, 0, 29, 67, 0, 0, 30, 67, 0, 0, 31, 67, 0, 0, 32, 67, |
|||
0, 0, 33, 67, 0, 0, 34, 67, 0, 0, 35, 67, 0, 0, 36, 67, 0, 0, 37, 67, 0, 0, |
|||
38, 67, 0, 0, 39, 67, 0, 0, 40, 67, 0, 0, 41, 67, 0, 0, 42, 67, 0, 0, 43, 67, |
|||
0, 0, 44, 67, 0, 0, 45, 67, 0, 0, 46, 67, 0, 0, 47, 67, 0, 0, 48, 67, 0, 0, |
|||
49, 67, 0, 0, 50, 67, 0, 0, 51, 67, 0, 0, 52, 67, 0, 0, 53, 67, 0, 0, 54, 67, |
|||
0, 0, 55, 67, 0, 0, 56, 67, 0, 0, 57, 67, 0, 0, 58, 67, 0, 0, 59, 67, 0, 0, |
|||
60, 67, 0, 0, 61, 67, 0, 0, 62, 67, 0, 0, 63, 67, 0, 0, 64, 67, 0, 0, 65, 67, |
|||
0, 0, 66, 67, 0, 0, 67, 67, 0, 0, 68, 67, 0, 0, 69, 67, 0, 0, 70, 67, 0, 0, 71, |
|||
67, 0, 0, 72, 67, 0, 0, 73, 67, 0, 0, 74, 67, 0, 0, 75, 67, 0, 0, 76, 67, 0, |
|||
0, 77, 67, 0, 0, 78, 67, 0, 0, 79, 67, 0, 0, 80, 67, 0, 0, 81, 67, 0, 0, 82, |
|||
67, 0, 0, 83, 67, 0, 0, 84, 67, 0, 0, 85, 67, 0, 0, 86, 67, 0, 0, 87, 67, 0, |
|||
0, 88, 67, 0, 0, 89, 67, 0, 0, 90, 67, 0, 0, 91, 67, 0, 0, 92, 67, 0, 0, 93, |
|||
67, 0, 0, 94, 67, 0, 0, 95, 67, 0, 0, 96, 67, 0, 0, 97, 67, 0, 0, 98, 67, 0, |
|||
0, 99, 67, 0, 0, 100, 67, 0, 0, 101, 67, 0, 0, 102, 67, 0, 0, 103, 67, 0, 0, |
|||
104, 67, 0, 0, 105, 67, 0, 0, 106, 67, 0, 0, 107, 67, 0, 0, 108, 67, 0, 0, 109, |
|||
67, 0, 0, 110, 67, 0, 0, 111, 67, 0, 0, 112, 67, 0, 0, 113, 67, 0, 0, 114, 67, |
|||
0, 0, 115, 67, 0, 0, 116, 67, 0, 0, 117, 67, 0, 0, 118, 67, 0, 0, 119, 67, 0, |
|||
0, 120, 67, 0, 0, 121, 67, 0, 0, 122, 67, 0, 0, 123, 67, 0, 0, 124, 67, 0, 0, |
|||
125, 67, 0, 0, 126, 67, 0, 0, 127, 67, |
|||
}, |
|||
fsggml.TensorTypeQ4_K: { |
|||
52, 52, 0, 0, 136, 208, 216, 223, 0, 0, 0, 0, 8, 0, 8, 15, 128, |
|||
128, 129, 129, 146, 146, 147, 147, 164, 164, 165, 165, 166, 182, |
|||
183, 183, 184, 200, 201, 201, 202, 218, 218, 219, 219, 236, 236, |
|||
237, 237, 254, 254, 255, 202, 202, 202, 203, 203, 203, 219, 219, |
|||
219, 220, 220, 220, 220, 220, 236, 237, 237, 237, 237, 237, |
|||
237, 237, 238, 254, 254, 254, 254, 254, 255, 255, 255, 255, 220, |
|||
220, 220, 220, 221, 221, 221, 221, 221, 221, 221, 237, 237, 237, |
|||
238, 238, 238, 238, 238, 238, 238, 238, 238, 254, 254, 255, 255, |
|||
255, 255, 255, 255, 255, 237, 237, 237, 237, 237, 237, 237, 238, |
|||
238, 238, 238, 238, 238, 238, 238, 238, 254, 254, 254, 254, 254, |
|||
254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, |
|||
}, |
|||
fsggml.TensorTypeQ2_K: { |
|||
1, 2, 3, 3, 4, 5, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15, 184, 184, |
|||
184, 185, 249, 249, 249, 249, 249, 250, 250, 254, 254, 254, 254, |
|||
255, 253, 253, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, |
|||
254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, |
|||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, |
|||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 171, 69, 0, 0, |
|||
}, |
|||
fsggml.TensorTypeQ5_K: { |
|||
32, 48, 0, 0, 136, 208, 216, 223, 0, 0, 0, 0, 8, 0, 7, 15, 254, |
|||
254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, |
|||
254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, |
|||
255, 255, 255, 255, 255, 0, 1, 2, 19, 20, 37, 38, 55, 56, 73, 74, |
|||
91, 92, 109, 110, 127, 112, 128, 129, 146, 147, 164, 165, 182, 183, |
|||
200, 201, 218, 219, 236, 237, 254, 133, 133, 149, 150, 150, 150, |
|||
167, 167, 167, 168, 184, 184, 185, 185, 201, 202, 202, 202, 219, |
|||
219, 219, 219, 236, 236, 236, 237, 253, 253, 254, 254, 254, 255, |
|||
169, 169, 169, 169, 186, 186, 186, 186, 186, 187, 187, 203, 203, |
|||
203, 204, 204, 204, 220, 220, 221, 221, 221, 221, 237, 237, 238, |
|||
238, 238, 238, 254, 255, 255, 203, 203, 203, 204, 204, 204, 204, |
|||
204, 220, 220, 220, 221, 221, 221, 221, 221, 237, 237, 238, 238, |
|||
238, 238, 238, 238, 254, 255, 255, 255, 255, 255, 255, 255, |
|||
}, |
|||
fsggml.TensorTypeQ6_K: { |
|||
96, 110, 92, 90, 88, 70, 68, 50, 48, 46, 44, 42, 24, 22, 4, 2, 80, |
|||
95, 78, 77, 76, 59, 58, 57, 40, 39, 38, 21, 20, 19, 2, 1, 75, 75, |
|||
74, 57, 57, 56, 55, 39, 38, 37, 21, 20, 20, 19, 2, 2, 72, 55, 55, |
|||
54, 54, 37, 37, 36, 36, 19, 19, 18, 18, 1, 1, 0, 35, 35, 35, 35, |
|||
34, 18, 18, 18, 17, 17, 17, 1, 1, 0, 0, 0, 35, 35, 34, 34, 18, |
|||
18, 18, 17, 17, 17, 17, 1, 0, 0, 0, 0, 35, 35, 35, 19, 19, 18, 18, |
|||
18, 18, 18, 1, 1, 1, 1, 1, 1, 34, 34, 18, 18, 18, 18, 17, 17, 17, |
|||
17, 1, 1, 0, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, |
|||
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|||
0, 0, 0, 0, 0, 248, 240, 231, 224, 216, 208, 200, 192, 184, 176, |
|||
166, 160, 152, 144, 136, 128, 235, 43, |
|||
}, |
|||
fsggml.TensorTypeQ3_K: { |
|||
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20, 20, 23, 23, 7, 7, 6, 6, 6, 2, |
|||
1, 1, 1, 1, 0, 0, 22, 22, 6, 6, 5, 5, 5, 1, 1, 1, 1, 1, 0, 0, 0, |
|||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 238, 204, 170, 136, 102, 68, |
|||
34, 1, 5, 5, 5, 5, 189, 63, |
|||
}, |
|||
} |
|||
) |
|||
Loading…
Reference in new issue