@@ -938,6 +938,100 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
938
938
return moe_out;
939
939
}
940
940
941
+ ggml_tensor * llm_graph_context::build_moe_ffn_from_probs (
942
+ ggml_tensor * cur,
943
+ ggml_tensor * probs,
944
+ ggml_tensor * up_exps,
945
+ ggml_tensor * gate_exps,
946
+ ggml_tensor * down_exps,
947
+ ggml_tensor * exp_probs_b,
948
+ int64_t n_expert,
949
+ int64_t n_expert_used,
950
+ llama_expert_gating_func_type gating_op,
951
+ int il) const {
952
+ const int64_t n_embd = cur->ne [0 ];
953
+ const int64_t n_tokens = cur->ne [1 ];
954
+
955
+ // add experts selection bias - introduced in DeepSeek V3
956
+ // leave probs unbiased as it's later used to get expert weights
957
+ ggml_tensor * selection_probs = probs;
958
+ if (exp_probs_b != nullptr ) {
959
+ selection_probs = ggml_add (ctx0, probs, exp_probs_b);
960
+ cb (selection_probs, " ffn_moe_probs_biased" , il);
961
+ }
962
+
963
+ // select experts
964
+ ggml_tensor * selected_experts = ggml_top_k (ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
965
+ cb (selected_experts->src [0 ], " ffn_moe_argsort" , il);
966
+ cb (selected_experts, " ffn_moe_topk" , il);
967
+
968
+ ggml_tensor * weights = ggml_get_rows (ctx0,
969
+ ggml_reshape_3d (ctx0, probs, 1 , n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
970
+ cb (weights, " ffn_moe_weights" , il);
971
+
972
+ weights = ggml_reshape_2d (ctx0, weights, n_expert_used, n_tokens);
973
+ if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX) {
974
+ weights = ggml_soft_max (ctx0, weights);
975
+ } else {
976
+ weights = ggml_sigmoid (ctx0, weights);
977
+ ggml_tensor * weights_sum = ggml_sum_rows (ctx0, weights); // [1, n_tokens]
978
+ cb (weights_sum, " ffn_moe_weights_sum" , il);
979
+
980
+ weights = ggml_div (ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
981
+ cb (weights, " ffn_moe_weights_norm" , il);
982
+ }
983
+
984
+ weights = ggml_reshape_3d (ctx0, weights, 1 , n_expert_used, n_tokens);
985
+
986
+ cur = ggml_reshape_3d (ctx0, cur, n_embd, 1 , n_tokens);
987
+
988
+ ggml_tensor * up = build_lora_mm_id (up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
989
+ cb (up, " ffn_moe_up" , il);
990
+
991
+ ggml_tensor * experts = nullptr ;
992
+ cur = build_lora_mm_id (gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
993
+ cb (cur, " ffn_moe_gate" , il);
994
+
995
+ cur = ggml_reglu_split (ctx0, cur, up);
996
+ cb (cur, " ffn_moe_reglu" , il);
997
+
998
+ experts = build_lora_mm_id (down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
999
+ cb (experts, " ffn_moe_down" , il);
1000
+
1001
+ experts = ggml_mul (ctx0, experts, weights);
1002
+ cb (cur, " ffn_moe_weighted" , il);
1003
+
1004
+ ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };
1005
+
1006
+ assert (n_expert_used > 0 );
1007
+
1008
+ // order the views before the adds
1009
+ for (uint32_t i = 0 ; i < hparams.n_expert_used ; ++i) {
1010
+ cur_experts[i] = ggml_view_2d (ctx0, experts, n_embd, n_tokens, experts->nb [2 ], i*experts->nb [1 ]);
1011
+
1012
+ ggml_build_forward_expand (gf, cur_experts[i]);
1013
+ }
1014
+
1015
+ // aggregate experts
1016
+ // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
1017
+ // to avoid potentially a large number of add nodes during warmup
1018
+ // ref: https://github.com/ggml-org/llama.cpp/pull/14753
1019
+ ggml_tensor * moe_out = cur_experts[0 ];
1020
+
1021
+ for (uint32_t i = 1 ; i < hparams.n_expert_used ; ++i) {
1022
+ moe_out = ggml_add (ctx0, moe_out, cur_experts[i]);
1023
+ }
1024
+
1025
+ if (n_expert_used == 1 ) {
1026
+ // avoid returning a non-contiguous tensor
1027
+ moe_out = ggml_cont (ctx0, moe_out);
1028
+ }
1029
+
1030
+ cb (moe_out, " ffn_moe_out" , il);
1031
+
1032
+ return moe_out;
1033
+ }
1034
+
941
1035
// input embeddings with optional lora
942
1036
ggml_tensor * llm_graph_context::build_inp_embd (ggml_tensor * tok_embd) const {
943
1037
const int64_t n_embd = hparams.n_embd ;
0 commit comments