diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index d9185c8060028..9a0e670577a52 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -3788,7 +3788,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_block_count(block_count)
         self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 32))
         self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
-        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1000000.0))
+        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 10000))
 
         # Mamba parameters
         self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64))
@@ -3799,7 +3799,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_ssm_group_count(0)
 
         # MLP feed forward parameters (for attention layers)
-        self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 16384))
+        self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 13312))
         self.gguf_writer.add_file_type(self.ftype)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index b88f4ebc5c02f..c9cc4318be0b0 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -15988,7 +15988,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
         {
             // PLaMo-2 uses combined QKV tensor
             ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
-            cb(qkv, "qkv", il);
+            cb(qkv, "wqkv", il);
 
             // split QKV tensor into Q, K, V
             const int64_t n_embd_head_q = hparams.n_embd_head_k;
@@ -16028,7 +16028,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
                     ext_factor, attn_factor, beta_fast, beta_slow
                     );
 
-            cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f, il);
+            cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
         }
 
         cb(cur, "attn_out", il);
@@ -16103,8 +16103,9 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
         ggml_build_forward_expand(gf,
                 ggml_cpy(ctx0, last_conv,
                     ggml_view_1d(ctx0, conv_states_all,
-                        (d_conv - 1)*(d_inner)*(n_seqs),
-                        kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all))));
+                        (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
+                        kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
+        cb(conv_states_all, "mamba_conv1d_state", il);
 
         // 1D convolution
         x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
@@ -16167,9 +16168,9 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
         // store last states
         ggml_build_forward_expand(gf,
             ggml_cpy(ctx0,
-                ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]*x->ne[3]),
-                ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs,
-                    kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
+                ggml_view_1d(ctx0, y_ssm, n_heads*head_dim*d_state*n_seqs, n_heads*head_dim*n_seq_tokens*n_seqs*ggml_element_size(y_ssm)),
+                ggml_view_1d(ctx0, ssm_states_all, n_heads*head_dim*d_state*n_seqs, kv_head*n_seqs*n_heads*head_dim*d_state*ggml_element_size(ssm_states_all))));
+        cb(ssm_states_all, "mamba_ssm_states", il);
 
         ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_heads, n_seq_tokens, n_seqs, head_dim * ggml_element_size(x), head_dim * n_heads * ggml_element_size(x), head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0);
         cb(y, "mamba_y_view", il);