
Commit 74281ed

[feat] Refactor trtllmgen MOE and add Bf16 trtllmgen moe (#2014)
## 📌 Description

- Refactor `trtllm_fused_moe_kernel_launcher.cu` to use a class structure for code cleanliness and readability
- Add BF16 MOE, initial PR (#1859) from @aleozlx and @nekorobov
- Add BF16 MOE autotune

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).

## Summary by CodeRabbit

- **New Features**
  - BF16 Mixture-of-Experts (MoE) pathway added with autotuning and public API access.
- **Improvements**
  - Unified BF16/FP8/FP4/FP16 pathways with clearer dtype-compatibility checks and corrected operator return semantics.
  - Routing selection now respects token size and input packing, and diagnostics produce more descriptive error messages.
- **Tests**
  - Expanded BF16 test coverage across routing modes, weight layouts, and token sizes.
- **Chores**
  - Updated artifact metadata and checksums.

Signed-off-by: jiahanc <[email protected]>
1 parent ba011d1 · commit 74281ed

File tree: 7 files changed, +1932 −1143 lines


csrc/trtllm_batched_gemm_runner.cu

Lines changed: 10 additions & 8 deletions
```diff
@@ -116,14 +116,16 @@ TrtllmGenBatchedGemmRunner::TrtllmGenBatchedGemmRunner(
     }
   }
 
-  FLASHINFER_CHECK(
-      !mPassingConfigIndices.empty(),
-      "No kernel found for the given options: mDtypeA: %s, mDtypeB: %s, mDtypeC: %s, "
-      "mUseDeepSeekFp8: %d, "
-      "mTransposeMmaOutput: %d, mRouteAct: %d, mFusedAct: %d, mIsStaticBatch: %d, mTileSize: %d",
-      tg::dtypeToString(mOptions.dtypeA).c_str(), tg::dtypeToString(mOptions.dtypeB).c_str(),
-      tg::dtypeToString(mOptions.dtypeC).c_str(), mOptions.deepSeekFp8, mOptions.transposeMmaOutput,
-      mOptions.routeAct, mOptions.fusedAct, mOptions.staticBatch, mOptions.tileSize);
+  std::ostringstream error_msg;
+  error_msg << "No kernel found for the given options: "
+            << "mDtypeA: " << tg::dtypeToString(mOptions.dtypeA)
+            << ", mDtypeB: " << tg::dtypeToString(mOptions.dtypeB)
+            << ", mDtypeC: " << tg::dtypeToString(mOptions.dtypeC)
+            << ", mUseDeepSeekFp8: " << mOptions.deepSeekFp8
+            << ", mTransposeMmaOutput: " << mOptions.transposeMmaOutput
+            << ", mRouteAct: " << mOptions.routeAct << ", mFusedAct: " << mOptions.fusedAct
+            << ", mIsStaticBatch: " << mOptions.staticBatch << ", mTileSize: " << mOptions.tileSize;
+  FLASHINFER_CHECK(!mPassingConfigIndices.empty(), error_msg.str());
 }
 
 size_t TrtllmGenBatchedGemmRunner::getWorkspaceSizeInBytes(
```

csrc/trtllm_fused_moe_kernel_launcher.cu

Lines changed: 1418 additions & 1106 deletions
Large diffs are not rendered by default.

csrc/trtllm_fused_moe_routing_renormalize.cu

Lines changed: 2 additions & 2 deletions
```diff
@@ -435,8 +435,8 @@ void run(Data const& data, void* stream) {
       << "Routing kernel expects #experts " << data.mNumExperts << " to be a multiple of 4.";
 
   // FIXME: routingIndicesBlockKernel breaks the vllm + gpt-oss DeepEP
-  // bool const useSingleBlock = data.mNumTokens <= BlockKernelMaxNumTokens;
-  bool const useSingleBlock = false;
+  bool const useSingleBlock =
+      data.mNumTokens <= BlockKernelMaxNumTokens && data.mPtrTopKPacked == nullptr;
 
   bool const useSingleCluster =
       data.mNumTokens <= ((data.mPtrScores != nullptr || data.mPtrTopKIds != nullptr)
```
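This hunk re-enables the single-block routing path, but gates it on both the token count and the absence of packed top-k input (the configuration the FIXME notes broke vllm + gpt-oss DeepEP). A minimal Python sketch of that predicate, with hypothetical parameter names standing in for the C++ fields (`data.mNumTokens`, `data.mPtrTopKPacked`, `BlockKernelMaxNumTokens`):

```python
def use_single_block_kernel(num_tokens: int,
                            topk_packed_supplied: bool,
                            block_kernel_max_num_tokens: int) -> bool:
    """Hypothetical mirror of the C++ predicate above.

    The single-block routing kernel is selected only when every token fits
    within one block AND the caller did not supply packed top-k input
    (i.e. mPtrTopKPacked == nullptr in the C++ code).
    """
    return num_tokens <= block_kernel_max_num_tokens and not topk_packed_supplied
```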

flashinfer/artifacts.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -89,7 +89,7 @@ class ArtifactPath:
 
     TRTLLM_GEN_FMHA: str = "463def7494c9fc6792b5aa5b5beef34025e247ac/fmha/trtllm-gen/"
     TRTLLM_GEN_BMM: str = (
-        "23daeee32b60bde7947ce1ee7a58d4ab701f134b/batched_gemm-0d28130-add42d1"
+        "c108f5cc46420e11805467898186533fb48d6a6f/batched_gemm-0d28130-7b26988"
     )
     TRTLLM_GEN_GEMM: str = (
         "1fddc48b7b48af33914d040051b3e2ee9ba4701e/gemm-145d1b1-9b113e3"
@@ -105,7 +105,7 @@ class MetaInfoHash:
         "2b8a485f2af84768bc769e678eb6014a8181ad95a7ea9e699de5efca4b18ec6a"
     )
     TRTLLM_GEN_BMM: str = (
-        "6cfade1395f9648aba5dcf2c329114619e175c0f238882555178f98c8f5c1968"
+        "26c51b75921be90235d193675facdea5d8341c4c52c73bd0a7c8e787c0388beb"
     )
     TRTLLM_GEN_GEMM: str = (
         "bd5c3227bec4f8d7a7d3a27fd7628e010d99a5c42651d0a6b97e146803e63340"
@@ -123,7 +123,7 @@ class CheckSumHash:
         "639c534614e9fdf5a9cfa91f7ea8f53989613019c0e1f8b755f461e1fcc7546f"
     )
     TRTLLM_GEN_BMM: str = (
-        "46ccf0492e3ed10135c2861a4f4ef9bb45846610f9a9d2ccaf2d5bf01d2006fd"
+        "85a4516b7ab25b1a6495398ae934a00e30ccd6662b9ec27be1330d7bba5e1ddf"
     )
     DEEPGEMM: str = "1a2a166839042dbd2a57f48051c82cd1ad032815927c753db269a4ed10d0ffbf"
     TRTLLM_GEN_GEMM: str = (
```
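The 64-hex-character values above are consistent with SHA-256 digests. Below is a hedged sketch of how a downloaded artifact could be checked against such a pinned digest; `verify_artifact` and the file name are illustrative assumptions, not FlashInfer's actual loader code:

```python
import hashlib


def verify_artifact(path: str, expected_sha256: str) -> None:
    """Hypothetical helper: stream the file through SHA-256 and compare
    the digest to the pinned checksum (assumed to be SHA-256)."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # 1 MiB chunks
            h.update(chunk)
    if h.hexdigest() != expected_sha256:
        raise ValueError(f"checksum mismatch for {path}")


# Example with the new TRTLLM_GEN_BMM checksum from this commit
# ("batched_gemm.bin" is a made-up file name):
# verify_artifact(
#     "batched_gemm.bin",
#     "85a4516b7ab25b1a6495398ae934a00e30ccd6662b9ec27be1330d7bba5e1ddf",
# )
```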

flashinfer/fused_moe/__init__.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -29,6 +29,7 @@
     trtllm_fp4_block_scale_routed_moe,
     trtllm_fp8_block_scale_moe,
     trtllm_fp8_per_tensor_scale_moe,
+    trtllm_bf16_moe,
 )
 
 __all__ = [
@@ -40,8 +41,11 @@
     "gen_cutlass_fused_moe_sm120_module",
     "gen_cutlass_fused_moe_sm100_module",
     "gen_cutlass_fused_moe_sm90_module",
+    "gen_trtllm_gen_fused_moe_sm100_module",
     "reorder_rows_for_gated_act_gemm",
+    "trtllm_bf16_moe",
     "trtllm_fp4_block_scale_moe",
+    "trtllm_fp4_block_scale_routed_moe",
     "trtllm_fp8_block_scale_moe",
     "trtllm_fp8_per_tensor_scale_moe",
 ]
```
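With these exports in place, the new BF16 entry point becomes importable from the public package. Only the import below is grounded in this diff; the commit does not show `trtllm_bf16_moe`'s signature, so no call is sketched:

```python
# Grounded in this hunk: trtllm_bf16_moe is now a public re-export.
from flashinfer.fused_moe import trtllm_bf16_moe

# Per the PR description, the BF16 pathway participates in autotuning; the
# exact parameters (routing logits, expert weights, activations, ...) are
# not visible in this commit, so consult the library docs before calling.
```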
