@@ -211,6 +211,8 @@ struct clip_layer {
     ggml_tensor * q_b = nullptr;
     ggml_tensor * v_w = nullptr;
     ggml_tensor * v_b = nullptr;
+    ggml_tensor * qkv_w = nullptr;
+    ggml_tensor * qkv_b = nullptr;
 
     ggml_tensor * o_w = nullptr;
     ggml_tensor * o_b = nullptr;
@@ -1576,18 +1578,65 @@ struct clip_graph {
         ggml_tensor * inp = build_inp();
         inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
 
-        // build ViT transformer
-        ggml_tensor * cur = build_vit(
-                inp, n_pos,
-                NORM_TYPE_NORMAL,
-                hparams.ffn_op,
-                model.position_embeddings,
-                nullptr);
+        inp = ggml_add(ctx0, inp, model.position_embeddings);
+        cb(inp, "inp_pos", -1);
+
+        ggml_tensor * inpL = inp;
+
+        for (int il = 0; il < n_layer; il++) {
+            auto & layer = model.layers[il];
+            ggml_tensor * cur = inpL;
+
+            cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
+
+            cur = ggml_add(ctx0, cur, layer.qkv_b);
+
+            ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_pos,
+                cur->nb[1], 0));
+            ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_pos,
+                cur->nb[1], n_embd * sizeof(float)));
+            ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_pos,
+                cur->nb[1], 2 * n_embd * sizeof(float)));
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(layer.o_w, layer.o_b,
+                Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+            cb(cur, "attn_out", il);
+
+            cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
+            cb(cur, "attn_post_norm", il);
+
+            cur = ggml_add(ctx0, cur, inpL);
+            inpL = cur;
+
+            cur = build_ffn(cur,
+                layer.ff_up_w, layer.ff_up_b,
+                layer.ff_gate_w, layer.ff_gate_b,
+                layer.ff_down_w, layer.ff_down_b,
+                hparams.ffn_op, il);
+
+            cb(cur, "ffn_out", il);
+
+            cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
+            cb(cur, "ffn_post_norm", il);
+
+            cur = ggml_add(ctx0, cur, inpL);
+            cb(cur, "layer_out", il);
+            inpL = cur;
+
+        }
 
         // remove CLS token (like build_llama4 does)
-        cur = ggml_view_2d(ctx0, cur,
+        ggml_tensor * cur = ggml_view_2d(ctx0, inpL,
                 n_embd, n_patches,
-                ggml_row_size(cur->type, n_embd), 0);
+                ggml_row_size(inpL->type, n_embd), 0);
 
         // Multiply with mm_model_proj
         cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
@@ -1665,14 +1714,9 @@ struct clip_graph {
             auto & layer = model.layers[il];
             ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
 
-            // Check if this is COGVLM projector type for post-norm layernorm order
-            const bool is_cogvlm = ctx->proj_type() == PROJECTOR_TYPE_COGVLM;
-
-            // layernorm1 (only for non-COGVLM)
-            if (!is_cogvlm) {
-                cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
-                cb(cur, "layer_inp_normed", il);
-            }
+            // layernorm1
+            cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
+            cb(cur, "layer_inp_normed", il);
 
             // self-attention
             {
@@ -1726,24 +1770,16 @@ struct clip_graph {
                 cb(cur, "attn_out_scaled", il);
             }
 
-            // Apply layernorm AFTER attention for COGVLM (post-norm)
-            if (is_cogvlm) {
-                cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
-                cb(cur, "attn_post_norm", il);
-            }
-
             // re-add the layer input, e.g., residual
             cur = ggml_add(ctx0, cur, inpL);
 
             inpL = cur; // inpL = residual, cur = hidden_states
 
             cb(cur, "ffn_inp", il);
 
-            // layernorm2 (only for non-COGVLM)
-            if (!is_cogvlm) {
-                cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
-                cb(cur, "ffn_inp_normed", il);
-            }
+            // layernorm2
+            cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
+            cb(cur, "ffn_inp_normed", il);
 
             // ffn
             cur = build_ffn(cur,
@@ -1759,12 +1795,6 @@ struct clip_graph {
                 cb(cur, "ffn_out_scaled", il);
             }
 
-            // Apply layernorm AFTER MLP for COGVLM (post-norm)
-            if (is_cogvlm) {
-                cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
-                cb(cur, "ffn_post_norm", il);
-            }
-
             // residual 2
             cur = ggml_add(ctx0, inpL, cur);
             cb(cur, "layer_out", il);
@@ -2466,10 +2496,11 @@ struct clip_model_loader {
         model.layers.resize(hparams.n_layer);
         for (int il = 0; il < hparams.n_layer; ++il) {
             auto & layer = model.layers[il];
-            layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"));
-            layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"));
-            layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"));
+            layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"), false);
+            layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"), false);
+            layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"), false);
             layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "weight"));
+            layer.qkv_w = get_tensor(string_format(TN_ATTN_QKV, prefix, il, "weight"), false);
             layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, prefix, il, "weight"), false);
             layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, prefix, il, "weight"), false);
             layer.ln_1_w = get_tensor(string_format(TN_LN_1, prefix, il, "weight"), false);
@@ -2481,6 +2512,7 @@ struct clip_model_loader {
             layer.q_b = get_tensor(string_format(TN_ATTN_Q, prefix, il, "bias"), false);
             layer.v_b = get_tensor(string_format(TN_ATTN_V, prefix, il, "bias"), false);
             layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "bias"), false);
+            layer.qkv_b = get_tensor(string_format(TN_ATTN_QKV, prefix, il, "bias"), false);
             layer.ln_1_b = get_tensor(string_format(TN_LN_1, prefix, il, "bias"), false);
             layer.ln_2_b = get_tensor(string_format(TN_LN_2, prefix, il, "bias"), false);
 