mirror of https://gitee.com/namelin2022/ollama
committed by GitHub
15 changed files with 150 additions and 422 deletions
@@ -1,14 +1,14 @@
 set(TARGET ollama_llama_server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
     SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
-target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT})
 if (WIN32)
     TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
@@ -1 +1 @@
-Subproject commit 7c26775adb579e92b59c82e8084c07a1d0f75e9c
+Subproject commit d7fd29fff16456ce9c3a23fd2d09a66256b05aff
@@ -0,0 +1,45 @@
diff --git a/src/llama.cpp b/src/llama.cpp
index 1fe2b9f7..a43312a7 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -13689,7 +13689,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
     const auto n_embd  = hparams.n_embd;
 
     // TODO: use a per-batch flag for logits presence instead
-    const bool has_logits = !cparams.embeddings;
+    const bool has_logits =  cparams.causal_attn;
     const bool has_embd   = lctx.is_encoding || (cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE));
 
     const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
@@ -13959,17 +13959,25 @@ static int llama_decode_internal(
             // no output
             res  = nullptr;
             embd = nullptr;
-        } else if (cparams.embeddings) {
-            res = nullptr; // do not extract logits for embedding case
-            embd = gf->nodes[gf->n_nodes - 1];
-            if (strcmp(embd->name, "result_embd_pooled") != 0) {
-                embd = gf->nodes[gf->n_nodes - 2];
+        }
+
+        if (cparams.embeddings) {
+            for (int i = gf->n_nodes - 1; i >= 0; --i) {
+                embd = gf->nodes[i];
+                if (strcmp(embd->name, "result_embd_pooled") == 0) {
+                    break;
+                }
             }
             GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
-        } else {
+        } else {
             embd = nullptr; // do not extract embeddings when not needed
             GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
         }
+
+        if (!cparams.causal_attn) {
+            res = nullptr; // do not extract logits when not needed
+        }
+
         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
         ggml_backend_sched_alloc_graph(lctx.sched, gf);
@@ -1,305 +0,0 @@
From 5cadb45f39d001ffbad95b690d6cf0abcb4a6d96 Mon Sep 17 00:00:00 2001
From: Ollama maintainers <hello@ollama.com>
Date: Wed, 26 Jun 2024 16:18:09 -0700
Subject: [PATCH] Architecture support

---
 llama.cpp | 194 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 193 insertions(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 61948751..3b4196f5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -217,6 +217,7 @@ enum llm_arch {
     LLM_ARCH_INTERNLM2,
     LLM_ARCH_MINICPM,
     LLM_ARCH_GEMMA,
+    LLM_ARCH_GEMMA2,
     LLM_ARCH_STARCODER2,
     LLM_ARCH_MAMBA,
     LLM_ARCH_XVERSE,
@@ -255,6 +256,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_INTERNLM2,       "internlm2"  },
     { LLM_ARCH_MINICPM,         "minicpm"    },
     { LLM_ARCH_GEMMA,           "gemma"      },
+    { LLM_ARCH_GEMMA2,          "gemma2"     },
     { LLM_ARCH_STARCODER2,      "starcoder2" },
     { LLM_ARCH_MAMBA,           "mamba"      },
     { LLM_ARCH_XVERSE,          "xverse"     },
@@ -464,10 +466,12 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_NORM,
     LLM_TENSOR_ATTN_NORM_2,
     LLM_TENSOR_ATTN_OUT_NORM,
+    LLM_TENSOR_ATTN_POST_NORM,
     LLM_TENSOR_ATTN_ROT_EMBD,
     LLM_TENSOR_FFN_GATE_INP,
     LLM_TENSOR_FFN_GATE_INP_SHEXP,
     LLM_TENSOR_FFN_NORM,
+    LLM_TENSOR_FFN_POST_NORM,
     LLM_TENSOR_FFN_GATE,
     LLM_TENSOR_FFN_DOWN,
     LLM_TENSOR_FFN_UP,
@@ -960,6 +964,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP,         "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_GEMMA2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,     "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,    "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,      "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,         "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,         "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,         "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,       "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_NORM,       "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,       "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,       "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,         "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_POST_NORM,  "blk.%d.post_ffw_norm" },
+        },
+    },
     {
         LLM_ARCH_STARCODER2,
         {
@@ -1941,6 +1963,8 @@ enum e_model {
     MODEL_8x22B,
     MODEL_16x12B,
     MODEL_10B_128x3_66B,
+    MODEL_9B,
+    MODEL_27B,
 };
 
 static const size_t kiB = 1024;
@@ -2114,6 +2138,7 @@ struct llama_layer {
     struct ggml_tensor * attn_out_norm_b;
     struct ggml_tensor * attn_q_a_norm;
    struct ggml_tensor * attn_kv_a_norm;
+    struct ggml_tensor * attn_post_norm;
 
     // attention
     struct ggml_tensor * wq;
@@ -2136,6 +2161,7 @@ struct llama_layer {
     // normalization
     struct ggml_tensor * ffn_norm;
     struct ggml_tensor * ffn_norm_b;
+    struct ggml_tensor * ffn_post_norm;
     struct ggml_tensor * layer_out_norm;
     struct ggml_tensor * layer_out_norm_b;
     struct ggml_tensor * ffn_norm_exps;
@@ -4529,6 +4555,16 @@ static void llm_load_hparams(
             }
         } break;
         case LLM_ARCH_GEMMA:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 18: model.type = e_model::MODEL_9B; break;
+                    case 28: model.type = e_model::MODEL_27B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_GEMMA2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
@@ -6305,6 +6341,40 @@ static bool llm_load_tensors(
                     layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
                 }
             } break;
+        case LLM_ARCH_GEMMA2:
+            {
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                // output
+                model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                model.output      = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+
+                const int64_t n_ff          = hparams.n_ff;
+                const int64_t n_embd_head_k = hparams.n_embd_head_k;
+                const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
+                const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();
+
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    ggml_context * ctx_layer = ctx_for_layer(i);
+                    ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
+                    layer.attn_post_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd});
+
+                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                    layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                    layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
+                    layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                    layer.ffn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd});
+                }
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -10614,6 +10684,123 @@ struct llm_build_context {
         return gf;
     }
 
+    struct ggml_cgraph * build_gemma2() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head_k = hparams.n_embd_head_k;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+        cb(inpL, "inp_scaled", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
+                        n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Qcur, "Qcur", il);
+
+                Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
+                cb(Qcur, "Qcur_scaled", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
+                        n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].attn_post_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_post_norm", il);
+
+            struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+            cb(sa_out, "sa_out", il);
+
+            cur = llm_build_norm(ctx0, sa_out, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up, NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].ffn_post_norm, NULL,
+                    LLM_NORM_RMS, cb, -1);
+            cb(cur, "ffn_post_norm", -1);
+
+            cur = ggml_add(ctx0, cur, sa_out);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_starcoder2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
@@ -11847,6 +12034,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_gemma();
             } break;
+        case LLM_ARCH_GEMMA2:
+            {
+                result = llm.build_gemma2();
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 result = llm.build_starcoder2();
@@ -16671,6 +16862,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_PHI2:
         case LLM_ARCH_PHI3:
        case LLM_ARCH_GEMMA:
+        case LLM_ARCH_GEMMA2:
         case LLM_ARCH_STARCODER2:
         case LLM_ARCH_GPTNEOX:
             return LLAMA_ROPE_TYPE_NEOX;
@@ -18551,7 +18743,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<s>assistant\n";
         }
-    } else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) {
+    } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl.find("<start_of_turn>") != std::string::npos) {
         // google/gemma-7b-it
         std::string system_prompt = "";
         for (auto message : chat) {
--
2.45.2