@@ -212,6 +212,7 @@ struct clip_layer {
    ggml_tensor * v_w = nullptr;
    ggml_tensor * v_b = nullptr;
    ggml_tensor * qkv_w = nullptr;
+   ggml_tensor * qkv_b = nullptr;

    ggml_tensor * o_w = nullptr;
    ggml_tensor * o_b = nullptr;
@@ -1552,7 +1553,7 @@ struct clip_graph {
        } else if (ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL) {
            // projector
            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-           cur = ggml_gelu_erf(ctx0, cur);
+           cur = ggml_gelu_erf(ctx0, cur);
            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);

        } else {
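For reference on the activation used by the Voxtral projector above: `ggml_gelu_erf` applies the exact, erf-based GELU, in contrast to `ggml_gelu`, which uses the tanh approximation. A minimal scalar sketch (illustrative only, not part of this change):

```cpp
#include <cmath>

// Scalar reference for the activation applied between the two projector matmuls:
//   gelu_erf(x) = 0.5 * x * (1 + erf(x / sqrt(2)))
static float gelu_erf_ref(float x) {
    return 0.5f * x * (1.0f + std::erf(x / std::sqrt(2.0f)));
}
```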
@@ -1577,18 +1578,109 @@ struct clip_graph {
        ggml_tensor * inp = build_inp();
        inp = ggml_concat(ctx0, inp, model.class_embedding, 1);

-       // build ViT transformer
-       ggml_tensor * cur = build_vit(
-           inp, n_pos,
-           NORM_TYPE_NORMAL,
-           hparams.ffn_op,
-           model.position_embeddings,
-           nullptr);
+       // Add position embeddings
+       inp = ggml_add(ctx0, inp, model.position_embeddings);
+       cb(inp, "pos_embed", -1);
+
+       ggml_tensor * inpL = inp;
+
+       // pre-layernorm
+       if (model.pre_ln_w) {
+           inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1);
+           cb(inpL, "pre_ln", -1);
+       }
+
+       // loop over layers
+       for (int il = 0; il < n_layer; il++) {
+           auto & layer = model.layers[il];
+           ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
+
+           // Note: cogvlm applies layernorm after attention, not before
+           // So we skip the layernorm1 here
+
+           // self-attention
+           {
+               // Use combined qkv_w and qkv_b instead of separate Q, K, V tensors
+               ggml_tensor * qkv = ggml_mul_mat(ctx0, layer.qkv_w, cur);
+               if (layer.qkv_b) {
+                   qkv = ggml_add(ctx0, qkv, layer.qkv_b);
+               }
+
+               // Split qkv into Q, K, V along the first dimension
+               // qkv shape: [3 * n_embd, n_pos] -> strided views of [n_embd, n_pos] each, made contiguous below
+               ggml_tensor * Qcur = ggml_view_2d(ctx0, qkv, n_embd, n_pos,
+                   ggml_row_size(qkv->type, 3 * n_embd), 0);
+               ggml_tensor * Kcur = ggml_view_2d(ctx0, qkv, n_embd, n_pos,
+                   ggml_row_size(qkv->type, 3 * n_embd), n_embd * ggml_element_size(qkv));
+               ggml_tensor * Vcur = ggml_view_2d(ctx0, qkv, n_embd, n_pos,
+                   ggml_row_size(qkv->type, 3 * n_embd), 2 * n_embd * ggml_element_size(qkv));
+
+               Qcur = ggml_cont_3d(ctx0, Qcur, d_head, n_head, n_pos);
+               Kcur = ggml_cont_3d(ctx0, Kcur, d_head, n_head, n_pos);
+               Vcur = ggml_cont_3d(ctx0, Vcur, d_head, n_head, n_pos);
+
+               cb(Qcur, "Qcur", il);
+               cb(Kcur, "Kcur", il);
+               cb(Vcur, "Vcur", il);
+
+               cur = build_attn(layer.o_w, layer.o_b,
+                   Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+               cb(cur, "attn_out", il);
+           }
+
+           if (layer.ls_1_w) {
+               cur = ggml_mul(ctx0, cur, layer.ls_1_w);
+               cb(cur, "attn_out_scaled", il);
+           }
+
+           // Apply layernorm after attention for cogvlm
+           cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
+           cb(cur, "attn_post_norm", il);
+
+           // re-add the layer input, e.g., residual
+           cur = ggml_add(ctx0, cur, inpL);
+
+           inpL = cur; // inpL = residual, cur = hidden_states
+
+           cb(cur, "ffn_inp", il);
+
+           // Note: cogvlm applies layernorm after mlp, not before
+           // So we skip the layernorm2 here
+
+           // ffn
+           cur = build_ffn(cur,
+               layer.ff_up_w, layer.ff_up_b,
+               layer.ff_gate_w, layer.ff_gate_b,
+               layer.ff_down_w, layer.ff_down_b,
+               hparams.ffn_op, il);
+
+           cb(cur, "ffn_out", il);
+
+           if (layer.ls_2_w) {
+               cur = ggml_mul(ctx0, cur, layer.ls_2_w);
+               cb(cur, "ffn_out_scaled", il);
+           }
+
+           // Apply layernorm after mlp for cogvlm
+           cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
+           cb(cur, "ffn_post_norm", il);
+
+           // residual 2
+           cur = ggml_add(ctx0, inpL, cur);
+           cb(cur, "layer_out", il);
+
+           inpL = cur;
+       }
+
+       // post-layernorm
+       if (model.post_ln_w) {
+           inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1);
+       }

        // remove CLS token (like build_llama4 does)
-       cur = ggml_view_2d(ctx0, cur,
+       cur = ggml_view_2d(ctx0, inpL,
            n_embd, n_patches,
-           ggml_row_size(cur->type, n_embd), 0);
+           ggml_row_size(inpL->type, n_embd), 0);

        // Multiply with mm_model_proj
        cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
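A note on the CogVLM block above: the layer is post-norm, i.e. each sublayer computes roughly x = x + LN(ls * Attn(x)) followed by x = x + LN(ls * FFN(x)), which is why the usual pre-attention/pre-FFN layernorms are skipped. The fused-QKV split could also be written as strided 3D views, avoiding the contiguous copies. The sketch below is illustrative only, reuses the names from the code above (ctx0, qkv, n_embd, d_head, n_head, n_pos), and assumes build_attn accepts non-contiguous views:

```cpp
// Sketch only: view each slice of the fused QKV tensor ([3*n_embd, n_pos])
// directly as [d_head, n_head, n_pos] using explicit strides.
ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, d_head, n_head, n_pos,
        ggml_row_size(qkv->type, d_head),      // nb1: stride between heads
        qkv->nb[1],                            // nb2: stride between positions (full 3*n_embd row)
        0);                                    // Q starts at the beginning of each row
ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, d_head, n_head, n_pos,
        ggml_row_size(qkv->type, d_head),
        qkv->nb[1],
        1 * n_embd * ggml_element_size(qkv));  // K starts n_embd elements into the row
ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, d_head, n_head, n_pos,
        ggml_row_size(qkv->type, d_head),
        qkv->nb[1],
        2 * n_embd * ggml_element_size(qkv));  // V starts 2*n_embd elements into the row
```

This mirrors how other fused-QKV paths in llama.cpp slice a combined projection output; whether it can replace the contiguous copies here depends on what the downstream attention implementation requires.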
@@ -1671,10 +1763,8 @@ struct clip_graph {
            ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states

            // layernorm1
-           if (ctx->proj_type() != PROJECTOR_TYPE_COGVLM) {
-               cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
-               cb(cur, "layer_inp_normed", il);
-           }
+           cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
+           cb(cur, "layer_inp_normed", il);

            // self-attention
            {
@@ -1728,11 +1818,7 @@ struct clip_graph {
                cb(cur, "attn_out_scaled", il);
            }

-           // Apply layernorm after attention for cogvlm
-           if (ctx->proj_type() == PROJECTOR_TYPE_COGVLM) {
-               cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
-               cb(cur, "attn_post_norm", il);
-           }
+

            // re-add the layer input, e.g., residual
            cur = ggml_add(ctx0, cur, inpL);
@@ -1742,10 +1828,8 @@ struct clip_graph {
            cb(cur, "ffn_inp", il);

            // layernorm2
-           if (ctx->proj_type() != PROJECTOR_TYPE_COGVLM) {
-               cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
-               cb(cur, "ffn_inp_normed", il);
-           }
+           cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
+           cb(cur, "ffn_inp_normed", il);

            // ffn
            cur = build_ffn(cur,
@@ -1761,11 +1845,7 @@ struct clip_graph {
                cb(cur, "ffn_out_scaled", il);
            }

-           // Apply layernorm after mlp for cogvlm
-           if (ctx->proj_type() == PROJECTOR_TYPE_COGVLM) {
-               cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
-               cb(cur, "ffn_post_norm", il);
-           }
+

            // residual 2
            cur = ggml_add(ctx0, inpL, cur);
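The four hunks above remove the CogVLM conditionals from the shared ViT builder: build_vit is now unconditionally pre-norm, while CogVLM's post-norm ordering lives in its dedicated block earlier in the file. The contrast, as comment-style pseudocode (illustrative, with x standing for the hidden state):

```cpp
// shared build_vit layer (pre-norm):
//   x = x + ls_1 * Attn(LN_1(x));
//   x = x + ls_2 * FFN (LN_2(x));
//
// CogVLM vision layer (post-norm, dedicated path):
//   x = x + LN_1(ls_1 * Attn(x));
//   x = x + LN_2(ls_2 * FFN (x));
```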