From 7baf4fd1349db8434e954de90e381fc527009ed1 Mon Sep 17 00:00:00 2001
From: Shunta Saito
Date: Fri, 18 Jul 2025 17:05:02 +0900
Subject: [PATCH 1/7] Fix dimensions for expand

---
 src/llama-model.cpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index b88f4ebc5c02f..59b6d7a21ceeb 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -15908,6 +15908,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
 
         // pre_mixer_norm
         cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "pre_mixer_norm", il);
 
         // check if this layer is Mamba or Attention
         bool is_mamba_layer = hparams.is_recurrent(il);
@@ -16103,8 +16104,8 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
         ggml_build_forward_expand(gf,
                 ggml_cpy(ctx0, last_conv,
                     ggml_view_1d(ctx0, conv_states_all,
-                        (d_conv - 1)*(d_inner)*(n_seqs),
-                        kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all))));
+                        (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
+                        kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
 
         // 1D convolution
         x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
@@ -16158,6 +16159,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
 
             // Custom operator to optimize the parallel associative scan
             // as described in the Annex D of the Mamba paper.
             // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
+            cb(ids, "mamba_ssm_scan_ids", il);
             return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
         };
@@ -16167,9 +16169,8 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
         // store last states
         ggml_build_forward_expand(gf,
                 ggml_cpy(ctx0,
-                    ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]*x->ne[3]),
-                    ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs,
-                        kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
+                    ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, ggml_nelements(x)*x->nb[0]),
+                    ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
 
         ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_heads, n_seq_tokens, n_seqs, head_dim * ggml_element_size(x), head_dim * n_heads * ggml_element_size(x), head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0);
         cb(y, "mamba_y_view", il);
@@ -16177,6 +16178,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
 
         // Add D parameter and apply gating with z
         // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs}
         ggml_tensor * D = ggml_reshape_2d(ctx0, model.layers[il].ssm_d, 1, n_heads);
+        cb(D, "mamba_D", il);
         y = ggml_add(ctx0, y, ggml_mul(ctx0, x, D));
         cb(y, "mamba_y_add_d", il);

From e39bc0926039f8aa7eda21ee22dbdf923ea33faa Mon Sep 17 00:00:00 2001
From: Shunta Saito
Date: Sat, 26 Jul 2025 17:19:35 +0900
Subject: [PATCH 2/7] Change dimensions to copy states to cache

---
 src/llama-model.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 59b6d7a21ceeb..8d574bcc01ba4 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -16104,8 +16104,9 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
         ggml_build_forward_expand(gf,
                 ggml_cpy(ctx0, last_conv,
                     ggml_view_1d(ctx0, conv_states_all,
-                        (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
-                        kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
+                        (d_conv - 1)*d_inner*(n_seqs),
+                        kv_head*(d_conv - 1)*d_inner*ggml_element_size(conv_states_all))));
+        cb(conv_states_all, "mamba_conv1d_state", il);
 
         // 1D convolution
         x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
@@ -16159,7 +16160,6 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
 
             // Custom operator to optimize the parallel associative scan
             // as described in the Annex D of the Mamba paper.
             // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
-            cb(ids, "mamba_ssm_scan_ids", il);
             return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
         };
@@ -16169,8 +16169,9 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
         // store last states
         ggml_build_forward_expand(gf,
                 ggml_cpy(ctx0,
-                    ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, ggml_nelements(x)*x->nb[0]),
-                    ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
+                    ggml_view_1d(ctx0, y_ssm, n_heads*head_dim*d_state*n_seqs, ggml_nelements(x)*x->nb[0]),
+                    ggml_view_1d(ctx0, ssm_states_all, n_heads*head_dim*d_state*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
+        cb(ssm_states_all, "mamba_ssm_states", il);
 
         ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_heads, n_seq_tokens, n_seqs, head_dim * ggml_element_size(x), head_dim * n_heads * ggml_element_size(x), head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0);
         cb(y, "mamba_y_view", il);

From bd4d2e1cd7ca8a1815a30f5799408c3fdeb031eb Mon Sep 17 00:00:00 2001
From: Shunta Saito
Date: Sat, 26 Jul 2025 23:54:15 +0900
Subject: [PATCH 3/7] Fix the default value for plamo2 conversion

---
 convert_hf_to_gguf.py                    | 4 ++--
 examples/eval-callback/eval-callback.cpp | 2 +-
 src/llama-model.cpp                      | 9 +++++----
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index d9185c8060028..9a0e670577a52 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -3788,7 +3788,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_block_count(block_count)
         self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 32))
         self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
-        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1000000.0))
+        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 10000))
 
         # Mamba parameters
         self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64))
@@ -3799,7 +3799,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_ssm_group_count(0)
 
         # MLP feed forward parameters (for attention layers)
-        self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 16384))
+        self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 13312))
         self.gguf_writer.add_file_type(self.ftype)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
index 4afd80eb454ad..2e7f5a98c1964 100644
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -122,7 +122,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
 
     if (!ggml_is_quantized(t->type)) {
         uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
-        ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
+        ggml_print_tensor(data, t->type, t->ne, t->nb, 32);
     }
 
     return true;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 8d574bcc01ba4..53e31f2df277a 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -16019,6 +16019,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                     );
+            cb(Qcur, "Qcur_rope", il);
 
             Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
             cb(Kcur, "Kcur_normed", il);
@@ -16104,8 +16105,8 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
         ggml_build_forward_expand(gf,
                 ggml_cpy(ctx0, last_conv,
                     ggml_view_1d(ctx0, conv_states_all,
-                        (d_conv - 1)*d_inner*(n_seqs),
-                        kv_head*(d_conv - 1)*d_inner*ggml_element_size(conv_states_all))));
+                        (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
+                        kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
         cb(conv_states_all, "mamba_conv1d_state", il);
 
         // 1D convolution
@@ -16169,8 +16170,8 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
         // store last states
         ggml_build_forward_expand(gf,
                 ggml_cpy(ctx0,
-                    ggml_view_1d(ctx0, y_ssm, n_heads*head_dim*d_state*n_seqs, ggml_nelements(x)*x->nb[0]),
-                    ggml_view_1d(ctx0, ssm_states_all, n_heads*head_dim*d_state*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
+                    ggml_view_1d(ctx0, y_ssm, n_heads*head_dim*d_state*n_seqs, n_heads*head_dim*n_seq_tokens*n_seqs*ggml_element_size(y_ssm)),
+                    ggml_view_1d(ctx0, ssm_states_all, n_heads*head_dim*d_state*n_seqs, kv_head*n_seqs*n_heads*head_dim*d_state*ggml_element_size(ssm_states_all))));
         cb(ssm_states_all, "mamba_ssm_states", il);
 
         ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_heads, n_seq_tokens, n_seqs, head_dim * ggml_element_size(x), head_dim * n_heads * ggml_element_size(x), head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0);

From c475203c516cdfaf06c219a976c34a3949cdbc96 Mon Sep 17 00:00:00 2001
From: Shunta Saito
Date: Sun, 27 Jul 2025 02:38:19 +0900
Subject: [PATCH 4/7] Fix scale given to build_attn

---
 examples/eval-callback/eval-callback.cpp |  2 +-
 src/llama-model.cpp                      | 11 +++-------
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
index 2e7f5a98c1964..4afd80eb454ad 100644
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -122,7 +122,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
 
     if (!ggml_is_quantized(t->type)) {
         uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
-        ggml_print_tensor(data, t->type, t->ne, t->nb, 32);
+        ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
     }
 
     return true;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 53e31f2df277a..8b5394fc69c2d 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -15908,7 +15908,6 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
 
         // pre_mixer_norm
         cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
-        cb(cur, "pre_mixer_norm", il);
 
         // check if this layer is Mamba or Attention
         bool is_mamba_layer = hparams.is_recurrent(il);
@@ -15989,7 +15988,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
         {
             // PLaMo-2 uses combined QKV tensor
             ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
-            cb(qkv, "qkv", il);
+            cb(qkv, "wqkv", il);
 
             // split QKV tensor into Q, K, V
             const int64_t n_embd_head_q = hparams.n_embd_head_k;
@@ -16013,27 +16012,24 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
 
             Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
             cb(Qcur, "Qcur_normed", il);
-
             Qcur = ggml_rope_ext(
                     ctx0, Qcur, inp_pos, nullptr,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                     );
-            cb(Qcur, "Qcur_rope", il);
 
             Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
             cb(Kcur, "Kcur_normed", il);
-
             Kcur = ggml_rope_ext(
                     ctx0, Kcur, inp_pos, nullptr,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                     );
 
-            cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f, il);
+            cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
         }
 
-        cb(cur, "attn_out", il);
+        cb(cur, "attn_output", il);
 
         return cur;
     }
@@ -16180,7 +16176,6 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
 
         // Add D parameter and apply gating with z
         // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs}
         ggml_tensor * D = ggml_reshape_2d(ctx0, model.layers[il].ssm_d, 1, n_heads);
-        cb(D, "mamba_D", il);
         y = ggml_add(ctx0, y, ggml_mul(ctx0, x, D));
         cb(y, "mamba_y_add_d", il);

From 75f0a0d7300528d072383553be60113b91b38589 Mon Sep 17 00:00:00 2001
From: Shunta Saito
Date: Sun, 27 Jul 2025 10:26:46 +0900
Subject: [PATCH 5/7] Update src/llama-model.cpp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Sigbjørn Skjæret
---
 src/llama-model.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 8b5394fc69c2d..0c52f96a2e7c4 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -16012,6 +16012,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
 
             Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
             cb(Qcur, "Qcur_normed", il);
+
             Qcur = ggml_rope_ext(
                     ctx0, Qcur, inp_pos, nullptr,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,

From 429639d06366c5358332b81ab6bab40ada286af2 Mon Sep 17 00:00:00 2001
From: Shunta Saito
Date: Sun, 27 Jul 2025 10:26:54 +0900
Subject: [PATCH 6/7] Update src/llama-model.cpp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Sigbjørn Skjæret
---
 src/llama-model.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 0c52f96a2e7c4..893efb65ffe40 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -16021,6 +16021,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
 
             Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
             cb(Kcur, "Kcur_normed", il);
+
             Kcur = ggml_rope_ext(
                     ctx0, Kcur, inp_pos, nullptr,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,

From 60a705de55750118e51d88bc5bc234f9ca3a3d71 Mon Sep 17 00:00:00 2001
From: Shunta Saito
Date: Sun, 27 Jul 2025 10:27:33 +0900
Subject: [PATCH 7/7] Update src/llama-model.cpp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Sigbjørn Skjæret
---
 src/llama-model.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 893efb65ffe40..c9cc4318be0b0 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -16031,7 +16031,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
             cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
         }
 
-        cb(cur, "attn_output", il);
+        cb(cur, "attn_out", il);
 
         return cur;
     }
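The state-cache size arithmetic that patches 1-3 converge on can be checked in isolation. The sketch below is illustrative only and is not part of the patches: the dimension values (`d_conv`, `n_group`, `n_heads`, `head_dim`, `n_seqs`) are placeholders, and it assumes `d_inner = n_heads * head_dim` with a Mamba-2-style layout where the short convolution runs over x concatenated with B and C, which is why the conv row width is `d_inner + 2*n_group*d_state` rather than `d_inner`.

```cpp
// Illustrative sketch only -- placeholder dimensions, not the real PLaMo-2 hyperparameters.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t d_conv   = 4;   // conv kernel width (assumed)
    const int64_t d_state  = 64;  // SSM state size (matches the converter default above)
    const int64_t n_group  = 1;   // group count for B/C (assumed)
    const int64_t n_heads  = 32;  // placeholder
    const int64_t head_dim = 128; // placeholder
    const int64_t d_inner  = n_heads * head_dim; // assumed relation
    const int64_t n_seqs   = 1;

    // Rolling conv state per copy: the last (d_conv - 1) columns of the conv input,
    // which here is assumed to be x concatenated with B and C.
    const int64_t conv_elts = (d_conv - 1)*(d_inner + 2*n_group*d_state)*n_seqs;

    // Recurrent SSM state per copy: one d_state-long vector per inner channel.
    const int64_t ssm_elts  = n_heads*head_dim*d_state*n_seqs;

    std::printf("elements copied into conv_states_all: %lld\n", (long long) conv_elts);
    std::printf("elements copied into ssm_states_all:  %lld\n", (long long) ssm_elts);
    return 0;
}
```

These element counts are what the final versions of the `ggml_view_1d` calls into `conv_states_all` and `ssm_states_all` select before each `ggml_cpy`.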