@@ -1553,7 +1553,7 @@ struct clip_graph {
    } else if (ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL) {
        // projector
        cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-       cur = ggml_gelu_erf(ctx0, cur);
+       cur = ggml_gelu_erf(ctx0, cur);
        cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);

    } else {
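For reference only (not part of the patch): judging by its name, ggml_gelu_erf applies the exact, erf-based GELU rather than the common tanh approximation. A minimal scalar sketch of the two formulas, assuming only <math.h>; in the projector above this activation sits between the two mm_* matmuls.

#include <math.h>

// exact GELU: 0.5 * x * (1 + erf(x / sqrt(2)))
static float gelu_erf_ref(float x) {
    return 0.5f * x * (1.0f + erff(x / sqrtf(2.0f)));
}

// widely used tanh approximation of GELU, for comparison
static float gelu_tanh_ref(float x) {
    return 0.5f * x * (1.0f + tanhf(0.7978845608f * (x + 0.044715f * x * x * x)));
}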
@@ -1575,25 +1575,17 @@ struct clip_graph {
    const int n_pos = n_patches + 1; // +1 for [CLS]

    // build input and concatenate class embedding
-   ggml_tensor * inp = build_inp();
-   inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
+   ggml_tensor * cur = build_inp();
+   cur = ggml_concat(ctx0, cur, model.class_embedding, 1);

    // Add position embeddings
-   inp = ggml_add(ctx0, inp, model.position_embeddings);
-   cb(inp, "pos_embed", -1);
-
-   ggml_tensor * inpL = inp;
-
-   // pre-layernorm
-   if (model.pre_ln_w) {
-       inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1);
-       cb(inpL, "pre_ln", -1);
-   }
+   cur = ggml_add(ctx0, cur, model.position_embeddings);
+   cb(cur, "pos_embed", -1);

    // loop over layers
    for (int il = 0; il < n_layer; il++) {
        auto & layer = model.layers[il];
-       ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
+       ggml_tensor * inpL = cur; // inpL = residual, cur = hidden_states

        // Note: cogvlm applies layernorm after attention, not before
        // So we skip the layernorm1 here
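The refactor in this hunk keeps a single running tensor: cur carries the hidden state across layers, and inpL is re-captured at the top of each iteration purely as the residual input; the unused pre-layernorm branch is dropped. A hypothetical standalone sketch of that post-norm residual pattern, using a placeholder weight w instead of the real attention/FFN blocks:

#include "ggml.h"

// Sketch only, not the PR's code: snapshot the residual, transform, normalize
// after the transform (post-norm, as in the cogvlm path), then add back.
static ggml_tensor * toy_layer(ggml_context * ctx0, ggml_tensor * cur, ggml_tensor * w) {
    ggml_tensor * inpL = cur;             // residual snapshot at the top of the layer
    cur = ggml_mul_mat(ctx0, w, cur);     // stand-in for attention / FFN
    cur = ggml_norm(ctx0, cur, 1e-6f);    // post-norm
    cur = ggml_add(ctx0, cur, inpL);      // add the residual back
    return cur;                           // becomes the next layer's input
}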
@@ -1608,12 +1600,15 @@ struct clip_graph {
1608
1600
1609
1601
// Split qkv into Q, K, V along the first dimension
1610
1602
// qkv shape: [3 * n_embd, n_pos] -> split into [n_embd, n_pos] each
1611
- ggml_tensor * Qcur = ggml_view_2d (ctx0, qkv, n_embd, n_pos,
1612
- ggml_row_size (qkv->type , n_embd), 0 );
1613
- ggml_tensor * Kcur = ggml_view_2d (ctx0, qkv, n_embd, n_pos,
1614
- ggml_row_size (qkv->type , n_embd), n_embd * ggml_element_size (qkv));
1615
- ggml_tensor * Vcur = ggml_view_2d (ctx0, qkv, n_embd, n_pos,
1616
- ggml_row_size (qkv->type , n_embd), 2 * n_embd * ggml_element_size (qkv));
1603
+ ggml_tensor * Qcur = ggml_view_2d (ctx0, qkv, n_embd, n_pos,
1604
+ qkv->nb [1 ], 0 );
1605
+ ggml_tensor * Kcur = ggml_view_2d (ctx0, qkv, n_embd, n_pos,
1606
+ qkv->nb [1 ], n_embd * ggml_element_size (qkv));
1607
+ ggml_tensor * Vcur = ggml_view_2d (ctx0, qkv, n_embd, n_pos,
1608
+ qkv->nb [1 ], 2 * n_embd * ggml_element_size (qkv));
1609
+ Qcur = ggml_cont (ctx0, Qcur);
1610
+ Kcur = ggml_cont (ctx0, Kcur);
1611
+ Vcur = ggml_cont (ctx0, Vcur);
1617
1612
1618
1613
Qcur = ggml_reshape_3d (ctx0, Qcur, d_head, n_head, n_pos);
1619
1614
Kcur = ggml_reshape_3d (ctx0, Kcur, d_head, n_head, n_pos);
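The stride fix is the substantive part of this hunk: qkv has shape [3 * n_embd, n_pos], so the stride from one position to the next is qkv->nb[1] (a full fused row), and passing ggml_row_size(qkv->type, n_embd) would step through the buffer at the wrong pitch and mix Q, K and V slices across positions. Because the corrected views are strided, they are made contiguous with ggml_cont before ggml_reshape_3d, which requires a contiguous source. A self-contained illustration (not the PR's code), assuming a fused f32 tensor:

#include "ggml.h"

static void split_qkv_example(ggml_context * ctx0, ggml_tensor * qkv,
                              int64_t n_embd, int64_t n_pos) {
    // Each row of `qkv` is 3*n_embd elements wide, so the per-position stride
    // of every view must be the fused row stride qkv->nb[1].
    ggml_tensor * Q = ggml_view_2d(ctx0, qkv, n_embd, n_pos, qkv->nb[1], 0);
    ggml_tensor * K = ggml_view_2d(ctx0, qkv, n_embd, n_pos, qkv->nb[1],
                                   n_embd * ggml_element_size(qkv));
    ggml_tensor * V = ggml_view_2d(ctx0, qkv, n_embd, n_pos, qkv->nb[1],
                                   2 * n_embd * ggml_element_size(qkv));

    // The views skip over the other two slices in every row, so they are not
    // contiguous; materialize them before any reshape.
    Q = ggml_cont(ctx0, Q);
    K = ggml_cont(ctx0, K);
    V = ggml_cont(ctx0, V);
    (void) Q; (void) K; (void) V;
}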
@@ -1628,11 +1623,6 @@ struct clip_graph {
            cb(cur, "attn_out", il);
        }

-       if (layer.ls_1_w) {
-           cur = ggml_mul(ctx0, cur, layer.ls_1_w);
-           cb(cur, "attn_out_scaled", il);
-       }
-
        // Apply layernorm after attention for cogvlm
        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
        cb(cur, "attn_post_norm", il);
@@ -1656,31 +1646,19 @@ struct clip_graph {

        cb(cur, "ffn_out", il);

-       if (layer.ls_2_w) {
-           cur = ggml_mul(ctx0, cur, layer.ls_2_w);
-           cb(cur, "ffn_out_scaled", il);
-       }
-
        // Apply layernorm after mlp for cogvlm
        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
        cb(cur, "ffn_post_norm", il);

        // residual 2
        cur = ggml_add(ctx0, inpL, cur);
        cb(cur, "layer_out", il);
-
-       inpL = cur;
-   }
-
-   // post-layernorm
-   if (model.post_ln_w) {
-       inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1);
    }

    // remove CLS token (like build_llama4 does)
-   cur = ggml_view_2d(ctx0, inpL,
+   cur = ggml_view_2d(ctx0, cur,
        n_embd, n_patches,
-       ggml_row_size(inpL->type, n_embd), 0);
+       ggml_row_size(cur->type, n_embd), 0);

    // Multiply with mm_model_proj
    cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
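Since the class embedding was appended after the patch tokens (ggml_concat(..., 1) earlier), keeping the first n_patches positions of the final hidden state is enough to drop it, and ggml_row_size(cur->type, n_embd) is the natural stride here because each position is exactly n_embd elements wide. A standalone sketch of that trimming step (not the PR's code), assuming a contiguous [n_embd, n_patches + 1] tensor:

#include "ggml.h"

// Drop a trailing [CLS] token that was concatenated after the patch embeddings.
static ggml_tensor * drop_trailing_cls(ggml_context * ctx0, ggml_tensor * embd,
                                       int64_t n_embd, int64_t n_patches) {
    // Keep the first n_patches token vectors; the view's row stride equals the
    // source row size, so (for a contiguous source) the result covers a
    // contiguous prefix of the buffer.
    return ggml_view_2d(ctx0, embd, n_embd, n_patches,
                        ggml_row_size(embd->type, n_embd), 0);
}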
@@ -1689,7 +1667,6 @@ struct clip_graph {
    cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);

    // Apply GELU
-   // TODO: Not 100% sure about gelu and silu configuration
    cur = ggml_gelu_inplace(ctx0, cur);

    // Branch 1: multiply with mm_h_to_4h_w
@@ -2548,9 +2525,9 @@ struct clip_model_loader {
        model.layers.resize(hparams.n_layer);
        for (int il = 0; il < hparams.n_layer; ++il) {
            auto & layer = model.layers[il];
-           layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"));
-           layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"));
-           layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"));
+           layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"), false);
+           layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"), false);
+           layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"), false);
            layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "weight"));
            layer.qkv_w = get_tensor(string_format(TN_ATTN_QKV, prefix, il, "weight"), false);
            layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, prefix, il, "weight"), false);
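By analogy with the existing qkv_w line, the added false argument presumably marks the split q/k/v weights as optional, so checkpoints that only ship a fused QKV projection can still load. A hypothetical sketch (not the PR's code, types and helper invented for illustration) of how graph code could then branch on which weights are present, assuming missing optional tensors come back as nullptr:

#include "ggml.h"

struct attn_weights {
    ggml_tensor * qkv_w = nullptr;  // fused projection, optional
    ggml_tensor * q_w   = nullptr;  // split projections, optional
    ggml_tensor * k_w   = nullptr;
    ggml_tensor * v_w   = nullptr;
};

static ggml_tensor * project_q(ggml_context * ctx0, const attn_weights & w,
                               ggml_tensor * cur, int64_t n_embd, int64_t n_pos) {
    if (w.qkv_w) {
        // fused path: one matmul, then a strided view for the Q slice
        ggml_tensor * qkv = ggml_mul_mat(ctx0, w.qkv_w, cur);
        return ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd, n_pos, qkv->nb[1], 0));
    }
    // split path: dedicated Q weight
    return ggml_mul_mat(ctx0, w.q_w, cur);
}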