mirror of https://gitee.com/namelin2022/ollama
committed by
GitHub
5 changed files with 149 additions and 0 deletions
@ -0,0 +1,113 @@ |
|||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
|||
From: Patrick Devine <patrick@infrahq.com> |
|||
Date: Fri, 14 Mar 2025 16:33:23 -0700 |
|||
Subject: [PATCH] gemma3 quantization |
|||
|
|||
---
|
|||
src/llama-arch.cpp | 19 +++++++++++++++++++ |
|||
src/llama-arch.h | 1 + |
|||
src/llama-model.cpp | 7 +++++++ |
|||
src/llama-quant.cpp | 9 +++++++++ |
|||
4 files changed, 36 insertions(+) |
|||
|
|||
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
|
|||
index b6f20286..b443fcd3 100644
|
|||
--- a/src/llama-arch.cpp
|
|||
+++ b/src/llama-arch.cpp
|
|||
@@ -37,6 +37,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|||
{ LLM_ARCH_MINICPM3, "minicpm3" }, |
|||
{ LLM_ARCH_GEMMA, "gemma" }, |
|||
{ LLM_ARCH_GEMMA2, "gemma2" }, |
|||
+ { LLM_ARCH_GEMMA3, "gemma3" },
|
|||
{ LLM_ARCH_STARCODER2, "starcoder2" }, |
|||
{ LLM_ARCH_MAMBA, "mamba" }, |
|||
{ LLM_ARCH_XVERSE, "xverse" }, |
|||
@@ -804,6 +805,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
|||
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, |
|||
}, |
|||
}, |
|||
+ {
|
|||
+ LLM_ARCH_GEMMA3,
|
|||
+ {
|
|||
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
|||
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
|||
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
|||
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
|||
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
|||
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
|||
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
|||
+ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
|
|||
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
|||
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
|||
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
|||
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
|||
+ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
|
|||
+ },
|
|||
+ },
|
|||
{ |
|||
LLM_ARCH_STARCODER2, |
|||
{ |
|||
diff --git a/src/llama-arch.h b/src/llama-arch.h
|
|||
index ec742224..aad92a5d 100644
|
|||
--- a/src/llama-arch.h
|
|||
+++ b/src/llama-arch.h
|
|||
@@ -41,6 +41,7 @@ enum llm_arch {
|
|||
LLM_ARCH_MINICPM3, |
|||
LLM_ARCH_GEMMA, |
|||
LLM_ARCH_GEMMA2, |
|||
+ LLM_ARCH_GEMMA3,
|
|||
LLM_ARCH_STARCODER2, |
|||
LLM_ARCH_MAMBA, |
|||
LLM_ARCH_XVERSE, |
|||
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
|
|||
index ab1a07d1..70183041 100644
|
|||
--- a/src/llama-model.cpp
|
|||
+++ b/src/llama-model.cpp
|
|||
@@ -878,6 +878,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
default: type = LLM_TYPE_UNKNOWN; |
|||
} |
|||
} break; |
|||
+ case LLM_ARCH_GEMMA3:
|
|||
+ {
|
|||
+ } break;
|
|||
case LLM_ARCH_STARCODER2: |
|||
{ |
|||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); |
|||
@@ -2537,6 +2540,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0); |
|||
} |
|||
} break; |
|||
+ case LLM_ARCH_GEMMA3:
|
|||
+ {
|
|||
+ } break;
|
|||
case LLM_ARCH_STARCODER2: |
|||
{ |
|||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); |
|||
@@ -4029,6 +4035,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
|
|||
case LLM_ARCH_PHIMOE: |
|||
case LLM_ARCH_GEMMA: |
|||
case LLM_ARCH_GEMMA2: |
|||
+ case LLM_ARCH_GEMMA3:
|
|||
case LLM_ARCH_STARCODER2: |
|||
case LLM_ARCH_OPENELM: |
|||
case LLM_ARCH_GPTNEOX: |
|||
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
|
|||
index 6eb1da08..d2f3a510 100644
|
|||
--- a/src/llama-quant.cpp
|
|||
+++ b/src/llama-quant.cpp
|
|||
@@ -737,6 +737,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
// This used to be a regex, but <regex> has an extreme cost to compile times. |
|||
bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? |
|||
|
|||
+ // don't quantize vision encoder (v.*) or multimodal projector (mm.*) tensors
|
|||
+ quantize &= name.find("v.blk.") == std::string::npos;
|
|||
+
|
|||
+ quantize &= name.find("mm.mm_input_projection.weight") == std::string::npos;
|
|||
+ quantize &= name.find("mm.mm_soft_emb_norm.weight") == std::string::npos;
|
|||
+ quantize &= name.find("v.patch_embedding.weight") == std::string::npos;
|
|||
+ quantize &= name.find("v.position_embedding.weight") == std::string::npos;
|
|||
+ quantize &= name.find("v.post_layernorm.weight") == std::string::npos;
|
|||
+
|
|||
// quantize only 2D and 3D tensors (experts) |
|||
quantize &= (ggml_n_dims(tensor) >= 2); |
|||
|
|||
Loading…
Reference in new issue