From 7baf4fd1349db8434e954de90e381fc527009ed1 Mon Sep 17 00:00:00 2001
From: Shunta Saito
Date: Fri, 18 Jul 2025 17:05:02 +0900
Subject: [PATCH 1/7] Fix dimensions for expand

---
 src/llama-model.cpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index b88f4ebc5c02f..59b6d7a21ceeb 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -15908,6 +15908,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
 
         // pre_mixer_norm
         cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "pre_mixer_norm", il);
 
         // check if this layer is Mamba or Attention
         bool is_mamba_layer = hparams.is_recurrent(il);
@@ -16103,8 +16104,8 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
         ggml_build_forward_expand(gf,
                 ggml_cpy(ctx0, last_conv,
                     ggml_view_1d(ctx0, conv_states_all,
-                        (d_conv - 1)*(d_inner)*(n_seqs),
-                        kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all))));
+                        (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
+                        kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
 
         // 1D convolution
         x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
@@ -16158,6 +16159,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
 
             // Custom operator to optimize the parallel associative scan
             // as described in the Annex D of the Mamba paper.
             // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
+            cb(ids, "mamba_ssm_scan_ids", il);
             return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
         };
@@ -16167,9 +16169,8 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
         // store last states
         ggml_build_forward_expand(gf,
                 ggml_cpy(ctx0,
-                    ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]*x->ne[3]),
-                    ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs,
-                        kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
+                    ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, ggml_nelements(x)*x->nb[0]),
+                    ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
 
         ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_heads, n_seq_tokens, n_seqs, head_dim * ggml_element_size(x), head_dim * n_heads * ggml_element_size(x), head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0);
         cb(y, "mamba_y_view", il);
@@ -16177,6 +16178,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
 
         // Add D parameter and apply gating with z
         // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs}
         ggml_tensor * D = ggml_reshape_2d(ctx0, model.layers[il].ssm_d, 1, n_heads);
+        cb(D, "mamba_D", il);
         y = ggml_add(ctx0, y, ggml_mul(ctx0, x, D));
         cb(y, "mamba_y_add_d", il);

From e39bc0926039f8aa7eda21ee22dbdf923ea33faa Mon Sep 17 00:00:00 2001
From: Shunta Saito
Date: Sat, 26 Jul 2025 17:19:35 +0900
Subject: [PATCH 2/7] Change dimensions to copy states to cache

---
 src/llama-model.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 59b6d7a21ceeb..8d574bcc01ba4 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -16104,8 +16104,9 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
         ggml_build_forward_expand(gf,
                 ggml_cpy(ctx0, last_conv,
                     ggml_view_1d(ctx0, conv_states_all,
-                        (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
-                        kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
+                        (d_conv - 1)*d_inner*(n_seqs),
+                        kv_head*(d_conv - 1)*d_inner*ggml_element_size(conv_states_all))));
+        cb(conv_states_all, "mamba_conv1d_state", il);
 
         // 1D convolution
         x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
@@ -16159,7 +16160,6 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
 
             // Custom operator to optimize the parallel associative scan
             // as described in the Annex D of the Mamba paper.
             // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
-            cb(ids, "mamba_ssm_scan_ids", il);
             return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
         };
@@ -16169,8 +16169,9 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
         // store last states
         ggml_build_forward_expand(gf,
                 ggml_cpy(ctx0,
-                    ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, ggml_nelements(x)*x->nb[0]),
-                    ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
+                    ggml_view_1d(ctx0, y_ssm, n_heads*head_dim*d_state*n_seqs, ggml_nelements(x)*x->nb[0]),
+                    ggml_view_1d(ctx0, ssm_states_all, n_heads*head_dim*d_state*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
+        cb(ssm_states_all, "mamba_ssm_states", il);
 
         ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_heads, n_seq_tokens, n_seqs, head_dim * ggml_element_size(x), head_dim * n_heads * ggml_element_size(x), head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0);
         cb(y, "mamba_y_view", il);

From bd4d2e1cd7ca8a1815a30f5799408c3fdeb031eb Mon Sep 17 00:00:00 2001
From: Shunta Saito
Date: Sat, 26 Jul 2025 23:54:15 +0900
Subject: [PATCH 3/7] Fix the default value for plamo2 conversion

---
 convert_hf_to_gguf.py                    | 4 ++--
 examples/eval-callback/eval-callback.cpp | 2 +-
 src/llama-model.cpp                      | 9 +++++----
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index d9185c8060028..9a0e670577a52 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -3788,7 +3788,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_block_count(block_count)
         self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 32))
         self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
-        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1000000.0))
+        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 10000))
 
         # Mamba parameters
         self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64))
@@ -3799,7 +3799,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_ssm_group_count(0)
 
         # MLP feed forward parameters (for attention layers)
-        self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 16384))
+        self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 13312))
         self.gguf_writer.add_file_type(self.ftype)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
index 4afd80eb454ad..2e7f5a98c1964 100644
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -122,7 +122,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
 
     if (!ggml_is_quantized(t->type)) {
         uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
-        ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
+        ggml_print_tensor(data, t->type, t->ne, t->nb, 32);
     }
 
     return true;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 8d574bcc01ba4..53e31f2df277a 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -16019,6 +16019,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                     );
+            cb(Qcur, "Qcur_rope", il);
 
             Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
             cb(Kcur, "Kcur_normed", il);
@@ -16104,8 +16105,8 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
         ggml_build_forward_expand(gf,
                 ggml_cpy(ctx0, last_conv,
                     ggml_view_1d(ctx0, conv_states_all,
-                        (d_conv - 1)*d_inner*(n_seqs),
-                        kv_head*(d_conv - 1)*d_inner*ggml_element_size(conv_states_all))));
+                        (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
+                        kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
         cb(conv_states_all, "mamba_conv1d_state", il);
 
         // 1D convolution
@@ -16169,8 +16170,8 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
         // store last states
         ggml_build_forward_expand(gf,
                 ggml_cpy(ctx0,
-                    ggml_view_1d(ctx0, y_ssm, n_heads*head_dim*d_state*n_seqs, ggml_nelements(x)*x->nb[0]),
-                    ggml_view_1d(ctx0, ssm_states_all, n_heads*head_dim*d_state*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
+                    ggml_view_1d(ctx0, y_ssm, n_heads*head_dim*d_state*n_seqs, n_heads*head_dim*n_seq_tokens*n_seqs*ggml_element_size(y_ssm)),
+                    ggml_view_1d(ctx0, ssm_states_all, n_heads*head_dim*d_state*n_seqs, kv_head*n_seqs*n_heads*head_dim*d_state*ggml_element_size(ssm_states_all))));
         cb(ssm_states_all, "mamba_ssm_states", il);
 
         ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_heads, n_seq_tokens, n_seqs, head_dim * ggml_element_size(x), head_dim * n_heads * ggml_element_size(x), head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0);

From c475203c516cdfaf06c219a976c34a3949cdbc96 Mon Sep 17 00:00:00 2001
From: Shunta Saito
Date: Sun, 27 Jul 2025 02:38:19 +0900
Subject: [PATCH 4/7] Fix scale given to build_attn

---
 examples/eval-callback/eval-callback.cpp |  2 +-
 src/llama-model.cpp                      | 11 +++-------
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
index 2e7f5a98c1964..4afd80eb454ad 100644
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -122,7 +122,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
 
     if (!ggml_is_quantized(t->type)) {
         uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
-        ggml_print_tensor(data, t->type, t->ne, t->nb, 32);
+        ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
     }
 
     return true;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 53e31f2df277a..8b5394fc69c2d 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -15908,7 +15908,6 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
 
         // pre_mixer_norm
         cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
-        cb(cur, "pre_mixer_norm", il);
 
         // check if this layer is Mamba or Attention
         bool is_mamba_layer = hparams.is_recurrent(il);
@@ -15989,7 +15988,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
         {
             // PLaMo-2 uses combined QKV tensor
             ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
-            cb(qkv, "qkv", il);
+            cb(qkv, "wqkv", il);
 
             // split QKV tensor into Q, K, V
             const int64_t n_embd_head_q = hparams.n_embd_head_k;
@@ -16013,27 +16012,24 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
 
             Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
             cb(Qcur, "Qcur_normed", il);
-
             Qcur = ggml_rope_ext(
                     ctx0, Qcur, inp_pos, nullptr,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                     );
-            cb(Qcur, "Qcur_rope", il);
 
             Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
             cb(Kcur, "Kcur_normed", il);
-
             Kcur = ggml_rope_ext(
                     ctx0, Kcur, inp_pos, nullptr,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                     );
 
-            cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f, il);
+            cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
         }
 
-        cb(cur, "attn_out", il);
+        cb(cur, "attn_output", il);
 
         return cur;
     }
@@ -16180,7 +16176,6 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
 
         // Add D parameter and apply gating with z
         // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs}
         ggml_tensor * D = ggml_reshape_2d(ctx0, model.layers[il].ssm_d, 1, n_heads);
-        cb(D, "mamba_D", il);
         y = ggml_add(ctx0, y, ggml_mul(ctx0, x, D));
         cb(y, "mamba_y_add_d", il);

From 75f0a0d7300528d072383553be60113b91b38589 Mon Sep 17 00:00:00 2001
From: Shunta Saito
Date: Sun, 27 Jul 2025 10:26:46 +0900
Subject: [PATCH 5/7] Update src/llama-model.cpp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Sigbjørn Skjæret
---
 src/llama-model.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 8b5394fc69c2d..0c52f96a2e7c4 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -16012,6 +16012,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
 
             Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
             cb(Qcur, "Qcur_normed", il);
+
             Qcur = ggml_rope_ext(
                     ctx0, Qcur, inp_pos, nullptr,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,

From 429639d06366c5358332b81ab6bab40ada286af2 Mon Sep 17 00:00:00 2001
From: Shunta Saito
Date: Sun, 27 Jul 2025 10:26:54 +0900
Subject: [PATCH 6/7] Update src/llama-model.cpp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Sigbjørn Skjæret
---
 src/llama-model.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 0c52f96a2e7c4..893efb65ffe40 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -16021,6 +16021,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
 
             Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
             cb(Kcur, "Kcur_normed", il);
+
             Kcur = ggml_rope_ext(
                     ctx0, Kcur, inp_pos, nullptr,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,

From 60a705de55750118e51d88bc5bc234f9ca3a3d71 Mon Sep 17 00:00:00 2001
From: Shunta Saito
Date: Sun, 27 Jul 2025 10:27:33 +0900
Subject: [PATCH 7/7] Update src/llama-model.cpp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Sigbjørn Skjæret
---
 src/llama-model.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 893efb65ffe40..c9cc4318be0b0 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -16031,7 +16031,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
             cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
         }
 
-        cb(cur, "attn_output", il);
+        cb(cur, "attn_out", il);
 
         return cur;
     }
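The state-cache size arithmetic that patches 1-3 converge on can be checked in isolation. The sketch below is illustrative only and is not part of the patches: the dimension values (`d_conv`, `n_group`, `n_heads`, `head_dim`, `n_seqs`) are placeholders, and it assumes `d_inner = n_heads * head_dim` with a Mamba-2-style layout where the short convolution runs over x concatenated with B and C, which is why the conv row width is `d_inner + 2*n_group*d_state` rather than `d_inner`.

```cpp
// Illustrative sketch only -- placeholder dimensions, not the real PLaMo-2 hyperparameters.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t d_conv   = 4;   // conv kernel width (assumed)
    const int64_t d_state  = 64;  // SSM state size (matches the converter default above)
    const int64_t n_group  = 1;   // group count for B/C (assumed)
    const int64_t n_heads  = 32;  // placeholder
    const int64_t head_dim = 128; // placeholder
    const int64_t d_inner  = n_heads * head_dim; // assumed relation
    const int64_t n_seqs   = 1;

    // Rolling conv state per copy: the last (d_conv - 1) columns of the conv input,
    // which here is assumed to be x concatenated with B and C.
    const int64_t conv_elts = (d_conv - 1)*(d_inner + 2*n_group*d_state)*n_seqs;

    // Recurrent SSM state per copy: one d_state-long vector per inner channel.
    const int64_t ssm_elts  = n_heads*head_dim*d_state*n_seqs;

    std::printf("elements copied into conv_states_all: %lld\n", (long long) conv_elts);
    std::printf("elements copied into ssm_states_all:  %lld\n", (long long) ssm_elts);
    return 0;
}
```

These element counts are what the final versions of the `ggml_view_1d` calls into `conv_states_all` and `ssm_states_all` select before each `ggml_cpy`.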