Commit 40b07b2

Change to combined QKV in CLIP for CogVLM
1 parent a1c5697 commit 40b07b2

File tree

1 file changed: +108 -28 lines
tools/mtmd/clip.cpp

Lines changed: 108 additions & 28 deletions
@@ -212,6 +212,7 @@ struct clip_layer {
     ggml_tensor * v_w = nullptr;
     ggml_tensor * v_b = nullptr;
     ggml_tensor * qkv_w = nullptr;
+    ggml_tensor * qkv_b = nullptr;
 
     ggml_tensor * o_w = nullptr;
     ggml_tensor * o_b = nullptr;
@@ -1552,7 +1553,7 @@ struct clip_graph {
     } else if (ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL) {
         // projector
         cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-        cur = ggml_gelu_erf(ctx0, cur);
+        cur = ggml_gelu_erf(ctx0, cur);
         cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
 
     } else {
@@ -1577,18 +1578,109 @@ struct clip_graph {
     ggml_tensor * inp = build_inp();
     inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
 
-    // build ViT transformer
-    ggml_tensor * cur = build_vit(
-                            inp, n_pos,
-                            NORM_TYPE_NORMAL,
-                            hparams.ffn_op,
-                            model.position_embeddings,
-                            nullptr);
+    // Add position embeddings
+    inp = ggml_add(ctx0, inp, model.position_embeddings);
+    cb(inp, "pos_embed", -1);
+
+    ggml_tensor * inpL = inp;
+
+    // pre-layernorm
+    if (model.pre_ln_w) {
+        inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1);
+        cb(inpL, "pre_ln", -1);
+    }
+
+    // loop over layers
+    for (int il = 0; il < n_layer; il++) {
+        auto & layer = model.layers[il];
+        ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
+
+        // Note: cogvlm applies layernorm after attention, not before
+        // So we skip the layernorm1 here
+
+        // self-attention
+        {
+            // Use combined qkv_w and qkv_b instead of separate Q, K, V tensors
+            ggml_tensor * qkv = ggml_mul_mat(ctx0, layer.qkv_w, cur);
+            if (layer.qkv_b) {
+                qkv = ggml_add(ctx0, qkv, layer.qkv_b);
+            }
+
+            // Split qkv into Q, K, V along the first dimension
+            // qkv shape: [3 * n_embd, n_pos] -> split into [n_embd, n_pos] each
+            ggml_tensor * Qcur = ggml_view_2d(ctx0, qkv, n_embd, n_pos,
+                ggml_row_size(qkv->type, n_embd), 0);
+            ggml_tensor * Kcur = ggml_view_2d(ctx0, qkv, n_embd, n_pos,
+                ggml_row_size(qkv->type, n_embd), n_embd * ggml_element_size(qkv));
+            ggml_tensor * Vcur = ggml_view_2d(ctx0, qkv, n_embd, n_pos,
+                ggml_row_size(qkv->type, n_embd), 2 * n_embd * ggml_element_size(qkv));
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(layer.o_w, layer.o_b,
+                Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+            cb(cur, "attn_out", il);
+        }
+
+        if (layer.ls_1_w) {
+            cur = ggml_mul(ctx0, cur, layer.ls_1_w);
+            cb(cur, "attn_out_scaled", il);
+        }
+
+        // Apply layernorm after attention for cogvlm
+        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
+        cb(cur, "attn_post_norm", il);
+
+        // re-add the layer input, e.g., residual
+        cur = ggml_add(ctx0, cur, inpL);
+
+        inpL = cur; // inpL = residual, cur = hidden_states
+
+        cb(cur, "ffn_inp", il);
+
+        // Note: cogvlm applies layernorm after mlp, not before
+        // So we skip the layernorm2 here
+
+        // ffn
+        cur = build_ffn(cur,
+            layer.ff_up_w, layer.ff_up_b,
+            layer.ff_gate_w, layer.ff_gate_b,
+            layer.ff_down_w, layer.ff_down_b,
+            hparams.ffn_op, il);
+
+        cb(cur, "ffn_out", il);
+
+        if (layer.ls_2_w) {
+            cur = ggml_mul(ctx0, cur, layer.ls_2_w);
+            cb(cur, "ffn_out_scaled", il);
+        }
+
+        // Apply layernorm after mlp for cogvlm
+        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
+        cb(cur, "ffn_post_norm", il);
+
+        // residual 2
+        cur = ggml_add(ctx0, inpL, cur);
+        cb(cur, "layer_out", il);
+
+        inpL = cur;
+    }
+
+    // post-layernorm
+    if (model.post_ln_w) {
+        inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1);
+    }
 
     // remove CLS token (like build_llama4 does)
-    cur = ggml_view_2d(ctx0, cur,
+    cur = ggml_view_2d(ctx0, inpL,
         n_embd, n_patches,
-        ggml_row_size(cur->type, n_embd), 0);
+        ggml_row_size(inpL->type, n_embd), 0);
 
     // Multiply with mm_model_proj
     cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
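To make the fused projection concrete, here is a minimal standalone sketch (plain C++, no ggml) of the idea behind the combined QKV matmul above: when qkv_w stores Wq, Wk and Wv stacked along the output (row) dimension, a single matmul followed by slices at offsets 0, n_embd and 2 * n_embd reproduces the three separate projections, which is what the ggml_view_2d offsets in this hunk rely on. The helper matvec and all values below are illustrative only and are not part of clip.cpp.

// Standalone illustration (plain C++, no ggml): slicing the output of one fused
// QKV matmul is equivalent to three separate Q/K/V projections when qkv_w is the
// row-wise stack [Wq; Wk; Wv]. Names like matvec are illustrative only.
#include <cassert>
#include <cstdio>
#include <vector>

// y = W * x, where W is (rows x cols) in row-major order and x has `cols` entries.
static std::vector<float> matvec(const std::vector<float> & W, int rows, int cols,
                                 const std::vector<float> & x) {
    std::vector<float> y(rows, 0.0f);
    for (int r = 0; r < rows; ++r) {
        for (int c = 0; c < cols; ++c) {
            y[r] += W[r * cols + c] * x[c];
        }
    }
    return y;
}

int main() {
    const int n_embd = 4;

    // separate projections, each (n_embd x n_embd)
    std::vector<float> Wq(n_embd * n_embd), Wk(n_embd * n_embd), Wv(n_embd * n_embd);
    for (int i = 0; i < n_embd * n_embd; ++i) {
        Wq[i] =  0.01f * i;
        Wk[i] =  0.02f * i + 1.0f;
        Wv[i] = -0.03f * i;
    }

    // fused weight: Wq, Wk, Wv stacked along the output (row) dimension
    std::vector<float> Wqkv;
    Wqkv.insert(Wqkv.end(), Wq.begin(), Wq.end());
    Wqkv.insert(Wqkv.end(), Wk.begin(), Wk.end());
    Wqkv.insert(Wqkv.end(), Wv.begin(), Wv.end());

    const std::vector<float> x = {0.5f, -1.0f, 2.0f, 0.25f}; // one token embedding

    // one fused matmul, then slices at element offsets 0, n_embd, 2 * n_embd
    // (the same offsets the ggml_view_2d calls in the diff use)
    const std::vector<float> qkv = matvec(Wqkv, 3 * n_embd, n_embd, x);
    const std::vector<float> q(qkv.begin(),              qkv.begin() +     n_embd);
    const std::vector<float> k(qkv.begin() +     n_embd, qkv.begin() + 2 * n_embd);
    const std::vector<float> v(qkv.begin() + 2 * n_embd, qkv.end());

    // reference: three separate matmuls give the same slices
    assert(q == matvec(Wq, n_embd, n_embd, x));
    assert(k == matvec(Wk, n_embd, n_embd, x));
    assert(v == matvec(Wv, n_embd, n_embd, x));
    printf("fused QKV slices match the separate Q/K/V projections\n");
    return 0;
}

Compiling and running the sketch prints a confirmation once the assertions pass.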
@@ -1671,10 +1763,8 @@ struct clip_graph {
         ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
 
         // layernorm1
-        if (ctx->proj_type() != PROJECTOR_TYPE_COGVLM) {
-            cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
-            cb(cur, "layer_inp_normed", il);
-        }
+        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
+        cb(cur, "layer_inp_normed", il);
 
         // self-attention
         {
@@ -1728,11 +1818,7 @@ struct clip_graph {
             cb(cur, "attn_out_scaled", il);
         }
 
-        // Apply layernorm after attention for cogvlm
-        if (ctx->proj_type() == PROJECTOR_TYPE_COGVLM) {
-            cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
-            cb(cur, "attn_post_norm", il);
-        }
+
 
         // re-add the layer input, e.g., residual
         cur = ggml_add(ctx0, cur, inpL);
@@ -1742,10 +1828,8 @@ struct clip_graph {
         cb(cur, "ffn_inp", il);
 
         // layernorm2
-        if (ctx->proj_type() != PROJECTOR_TYPE_COGVLM) {
-            cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
-            cb(cur, "ffn_inp_normed", il);
-        }
+        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
+        cb(cur, "ffn_inp_normed", il);
 
         // ffn
         cur = build_ffn(cur,
@@ -1761,11 +1845,7 @@ struct clip_graph {
            cb(cur, "ffn_out_scaled", il);
         }
 
-        // Apply layernorm after mlp for cogvlm
-        if (ctx->proj_type() == PROJECTOR_TYPE_COGVLM) {
-            cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
-            cb(cur, "ffn_post_norm", il);
-        }
+
 
         // residual 2
         cur = ggml_add(ctx0, inpL, cur);
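The remaining hunks drop the PROJECTOR_TYPE_COGVLM special cases from the shared build_vit loop, which stays pre-norm, now that the CogVLM graph above builds its own post-norm layers. For readers comparing the two builders, here is a small illustrative C++ sketch (not clip.cpp code) of the ordering difference described by the comments in the diff; norm, attn and ffn are empty stand-ins for build_norm, build_attn and build_ffn, and the layer-scale multiplications are omitted.

// Illustrative sketch (not clip.cpp code) of the two block orderings: build_vit
// keeps pre-norm (normalize, then attention/FFN, then residual), while the CogVLM
// builder applies the norm after attention/FFN, right before the residual add.
// norm/attn/ffn are empty stand-ins for build_norm/build_attn/build_ffn.
#include <cstdio>
#include <vector>

using vec = std::vector<float>;

static vec norm(const vec & x) { return x; } // layernorm would go here
static vec attn(const vec & x) { return x; } // self-attention would go here
static vec ffn (const vec & x) { return x; } // feed-forward would go here

static vec add(const vec & a, const vec & b) {
    vec y(a.size());
    for (size_t i = 0; i < a.size(); ++i) {
        y[i] = a[i] + b[i];
    }
    return y;
}

// pre-norm block, as in the generic build_vit loop
static vec block_pre_norm(vec x) {
    x = add(x, attn(norm(x))); // layernorm1 -> attention -> residual
    x = add(x, ffn (norm(x))); // layernorm2 -> ffn       -> residual
    return x;
}

// post-norm block, as in the CogVLM builder added by this commit
static vec block_post_norm(vec x) {
    x = add(x, norm(attn(x))); // attention -> layernorm1 -> residual
    x = add(x, norm(ffn (x))); // ffn       -> layernorm2 -> residual
    return x;
}

int main() {
    vec x = {1.0f, 2.0f, 3.0f};
    x = block_post_norm(block_pre_norm(x));
    printf("ran one pre-norm and one post-norm block over %zu values\n", x.size());
    return 0;
}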
