Commit e4d7f46

update trtllm-gen to fix several issues
Signed-off-by: Perkz Zheng <[email protected]>
1 parent: f566d49

File tree

3 files changed: +46 -23 lines changed


flashinfer/artifacts.py

Lines changed: 3 additions & 3 deletions
@@ -87,7 +87,7 @@ class ArtifactPath:

     When compiling new cubins for backend directories, update the corresponding path.
     """

-    TRTLLM_GEN_FMHA: str = "463def7494c9fc6792b5aa5b5beef34025e247ac/fmha/trtllm-gen/"
+    TRTLLM_GEN_FMHA: str = "b793e1b2cf7c419f070372ba55bbe53ca6fb9016/fmha/trtllm-gen/"
     TRTLLM_GEN_BMM: str = (
         "23daeee32b60bde7947ce1ee7a58d4ab701f134b/batched_gemm-0d28130-add42d1"
     )
@@ -102,7 +102,7 @@ class ArtifactPath:
 class MetaInfoHash:
     DEEPGEMM: str = "f161e031826adb8c4f0d31ddbd2ed77e4909e4e43cdfc9728918162a62fcccfb"
     TRTLLM_GEN_FMHA: str = (
-        "2b8a485f2af84768bc769e678eb6014a8181ad95a7ea9e699de5efca4b18ec6a"
+        "bf45e2c21de9fbf5209bec3975b5ffe24b1d7a2e00aa40c548c992281864009f"
     )
     TRTLLM_GEN_BMM: str = (
         "6cfade1395f9648aba5dcf2c329114619e175c0f238882555178f98c8f5c1968"
@@ -123,7 +123,7 @@ class CheckSumHash:
         "639c534614e9fdf5a9cfa91f7ea8f53989613019c0e1f8b755f461e1fcc7546f"
     )
     TRTLLM_GEN_BMM: str = (
-        "46ccf0492e3ed10135c2861a4f4ef9bb45846610f9a9d2ccaf2d5bf01d2006fd"
+        "1ebace613389a4f2e10b14315da5d522642c5dcaae23f01213d56c59068f148b"
     )
     DEEPGEMM: str = "1a2a166839042dbd2a57f48051c82cd1ad032815927c753db269a4ed10d0ffbf"
     TRTLLM_GEN_GEMM: str = (

include/flashinfer/trtllm/fmha/fmhaKernels.cuh

Lines changed: 23 additions & 20 deletions
@@ -96,14 +96,15 @@ class TllmGenFmhaKernel {
   inline uint64_t hashID(int qkvLayout, int maskType, int kernelType, int scheduler,
                          int multiCtasKvMode, int headDimPerCtaV, int headDimQk, int headDimV,
                          int tileSizeKv, int numTokensPerPage, int maxNumHeadsQPerKvInCta,
-                         bool reuseSmemKForV, bool uses2CtaMma) const {
+                         bool reuseSmemKForV, bool uses2CtaMma, bool sparseMla) const {
     FLASHINFER_CHECK((headDimPerCtaV >= 32) && (headDimQk >= 32) && (headDimV >= 32) &&
-                         (headDimPerCtaV <= 2048) && (headDimQk <= 2048) && (headDimV <= 2048) &&
-                         (numTokensPerPage <= 128),
-                     "Expect (32 <= headDim <= 2048) && (numTokensPerPage <= 128), "
-                     "got headDimPerCtaV=%d, headDimQk=%d, "
-                     "headDimV=%d, numTokensPerPage=%d",
-                     headDimPerCtaV, headDimQk, headDimV, numTokensPerPage);
+                         (headDimPerCtaV <= 1024) && (headDimQk <= 1024) && (headDimV <= 1024),
+                     "Expect (32 <= headDim <= 1024), got headDimPerCtaV=%d, headDimQk=%d, "
+                     "headDimV=%d",
+                     headDimPerCtaV, headDimQk, headDimV);
+    // The numTokensPerPage must be power of 2.
+    FLASHINFER_CHECK((numTokensPerPage & (numTokensPerPage - 1)) == 0,
+                     "The numTokensPerPage must be power of 2.");
     FLASHINFER_CHECK(maxNumHeadsQPerKvInCta <= 128,
                      "The maxNumHeadsQPerKvInCta <= 128 is required.");
     FLASHINFER_CHECK(tileSizeKv == 64 || tileSizeKv == 128, "The tileSizeKv must be 64 or 128.");
@@ -113,25 +114,26 @@ class TllmGenFmhaKernel {
     // Bit 8 - 11: kernelType.
     // Bit 12 - 15: tileScheduler.
     // Bit 16 - 17: multiCtasKvMode.
-    // Bit 18 - 24: (headDimPerCtaV >> 5).
-    // Bit 25 - 31: (headDimQk >> 5).
-    // Bit 32 - 38: (headDimV >> 5).
-    // Bit 39 - 40: (tileSizeKv >> 6).
-    // Bit 41 - 48: numTokensPerPage.
+    // Bit 18 - 25: (headDimPerCtaV >> 3).
+    // Bit 26 - 33: (headDimQk >> 3).
+    // Bit 34 - 41: (headDimV >> 3).
+    // Bit 42 - 43: (tileSizeKv >> 6).
+    // Bit 44 - 48: (log2(numTokensPerPage)).
     // Bit 49 - 56: maxNumHeadsQPerKvInCta.
     // Bit 57 - 57: reuseSmemKForV.
     // Bit 58 - 58: uses2CtaMma.
+    // Bit 59 - 59: sparseMla.
     return (static_cast<uint64_t>(qkvLayout) << 0) | (static_cast<uint64_t>(maskType) << 4) |
            (static_cast<uint64_t>(kernelType) << 8) | (static_cast<uint64_t>(scheduler) << 12) |
            (static_cast<uint64_t>(multiCtasKvMode) << 16) |
-           (static_cast<uint64_t>(headDimPerCtaV >> 5) << 18) |
-           (static_cast<uint64_t>(headDimQk >> 5) << 25) |
-           (static_cast<uint64_t>(headDimV >> 5) << 32) |
-           (static_cast<uint64_t>(tileSizeKv >> 6) << 39) |
-           (static_cast<uint64_t>(numTokensPerPage) << 41) |
+           (static_cast<uint64_t>(headDimPerCtaV >> 3) << 18) |
+           (static_cast<uint64_t>(headDimQk >> 3) << 26) |
+           (static_cast<uint64_t>(headDimV >> 3) << 34) |
+           (static_cast<uint64_t>(tileSizeKv >> 6) << 42) |
+           (static_cast<uint64_t>(log2(numTokensPerPage)) << 44) |
            (static_cast<uint64_t>(maxNumHeadsQPerKvInCta) << 49) |
            (static_cast<uint64_t>(reuseSmemKForV) << 57) |
-           (static_cast<uint64_t>(uses2CtaMma) << 58);
+           (static_cast<uint64_t>(uses2CtaMma) << 58) | (static_cast<uint64_t>(sparseMla) << 59);
   }

   uint64_t hashID(KernelMeta const& kernelMeta) const {
@@ -140,7 +142,7 @@ class TllmGenFmhaKernel {
                   kernelMeta.mHeadDimPerCtaV, kernelMeta.mHeadDimQk, kernelMeta.mHeadDimV,
                   kernelMeta.mTileSizeKv, kernelMeta.mNumTokensPerPage,
                   kernelMeta.mMaxNumHeadsQPerKvInCta, kernelMeta.mReuseSmemKForV,
-                  kernelMeta.m2CtaMma);
+                  kernelMeta.m2CtaMma, kernelMeta.mSparseMla);
   }

   std::pair<bool, std::string> checkIfKernelExist(RunnerParams const& params) const {
@@ -552,7 +554,8 @@ class TllmGenFmhaKernel {
                   static_cast<int>(selectKernelParams.mMultiCtasKvMode),
                   selectKernelParams.mHeadDimPerCtaV, params.mHeadDimQk, params.mHeadDimV,
                   selectKernelParams.mTileSizeKv, numTokensPerPage, maxNumHeadsQPerKvInCta,
-                  selectKernelParams.mReuseSmemKForV, selectKernelParams.mUses2CtaMma),
+                  selectKernelParams.mReuseSmemKForV, selectKernelParams.mUses2CtaMma,
+                  /* sparseMla */ false),
               info);
   }
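
Aside: the reworked hash layout can be sanity-checked offline. The headDim cap drops from 2048 to 1024 so that (headDim >> 3) still fits an 8-bit field, the raw numTokensPerPage value is replaced by its 5-bit log2 (hence the new power-of-2 check), and the freed bit 59 takes the new sparseMla flag. Below is a minimal standalone C++ sketch, not part of the commit, that packs the same fields at the new positions and asserts each value fits its range; the packField helper is hypothetical, and __builtin_clzll is a GCC/Clang builtin.

#include <cassert>
#include <cstdint>

// Hypothetical helper: place `value` in bits [lowBit, highBit] after
// checking that it fits, so neighboring fields cannot collide.
static uint64_t packField(uint64_t value, int lowBit, int highBit) {
  int const width = highBit - lowBit + 1;
  assert(value < (uint64_t{1} << width));
  return value << lowBit;
}

int main() {
  // Extremes permitted by the new FLASHINFER_CHECKs above.
  int const headDimPerCtaV = 1024, headDimQk = 1024, headDimV = 1024;
  int const tileSizeKv = 128, numTokensPerPage = 64;
  // log2 of a power of 2 via count-leading-zeros (GCC/Clang builtin).
  int const numTokensPerPageLog2 = 63 - __builtin_clzll((uint64_t)numTokensPerPage);

  // (1024 >> 3) == 128 fits the 8-bit fields at bits 18-25 / 26-33 / 34-41;
  // log2(numTokensPerPage) <= 31 fits the 5-bit field at bits 44-48.
  uint64_t const h = packField(headDimPerCtaV >> 3, 18, 25) |
                     packField(headDimQk >> 3, 26, 33) |
                     packField(headDimV >> 3, 34, 41) |
                     packField(tileSizeKv >> 6, 42, 43) |
                     packField(numTokensPerPageLog2, 44, 48);
  (void)h;  // Bits 49-59 (maxNumHeadsQPerKvInCta, flags) pack the same way.
  return 0;
}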

include/flashinfer/trtllm/fmha/kernelParams.h

Lines changed: 20 additions & 0 deletions
@@ -104,6 +104,8 @@ struct KernelParams {
   // The sequence lengths for K/V. Required by pagedKv kernels to avoid unnecessary computation
   // based on (ptrCumSeqLensKv[batchIdx + 1] - ptrCumSeqLensKv[batchIdx]).
   int32_t const* ptrSeqLensKv;
+  // The reserved memory buffer.
+  int32_t* ptrReservedMem;
   // The softmax stats buffer.
   float2* ptrSoftmaxStats;

@@ -139,6 +141,8 @@ struct KernelParams {
   int64_t mNumHiddenEltsO;
   // The total number of pages in the paged-kv memory pool.
   int32_t mNumPagesInMemPool;
+  // The number of tokens per page (used if dynamic numTokensPerPage is enabled).
+  int32_t mNumTokensPerPageLog2;
   // The output scale for FP8 quantization.
   float mOutputScale;
   // The scaling factor for softmax (multiplied by log2 to use faster exp2).
@@ -147,11 +151,15 @@ struct KernelParams {
   float mScaleSfKv;
   // The SF scale for O.
   float mScaleSfO;
+  // The reserved parameter.
+  float mReservedParam;
   // The start token index in SF tensor. Used for FP4 SF offset calculation in generation phase
   // kernel when inflight batching is enabled in TRT-LLM.
   int32_t mStartTokenIdxSfO;
   // The sum of sequence lengths for Q and K/V.
   int32_t mSumOfSeqLensQ, mSumOfSeqLensKv;
+  // The sparseMla topK value.
+  int32_t mSparseMlaTopK;
   // The flag to use block sparse attention.
   bool mUseBlockSparseAttention;

@@ -537,6 +545,8 @@ struct KernelParams {
                   int32_t maxNumCtasQ, int32_t maxNumCtasKv) {
     // Create the return struct.
     KernelParams params;
+    // Memset the kernel parameters to 0.
+    memset(&params, 0, sizeof(KernelParams));

     // Get the device pointers for TMA descriptors.
     auto [qPtr, kPtr, vPtr] = getDevicePtrs(options, get_size_in_bytes(kernelMeta.mDataTypeKv));
@@ -681,6 +691,16 @@ struct KernelParams {
       // Default 0 means that chunked attention is disabled.
       params.mChunkedAttentionSizeLog2 = 0;
     }
+
+    // Compute the log of numTokensPerPage
+    int32_t numTokensPerPageLog2{-1};
+    if (isPagedKv(options.mQkvLayout)) {
+      FLASHINFER_CHECK((options.mNumTokensPerPage & (options.mNumTokensPerPage - 1)) == 0,
+                       "NumTokensPerPage must be power of 2");
+      numTokensPerPageLog2 = (int)log2f((float)options.mNumTokensPerPage);
+    }
+    params.mNumTokensPerPageLog2 = numTokensPerPageLog2;
+
     params.mMaxSeqLenQ = options.mMaxSeqLenQ;
     params.mMaxSeqLenKv = options.mMaxSeqLenKv;
     params.mMaxNumCtasQ = maxNumCtasQ;
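
Aside: the new mNumTokensPerPageLog2 field, together with the power-of-2 check, suggests the kernel can address the paged-KV pool with shifts and masks rather than integer division. A hedged sketch of that addressing pattern follows; the names are illustrative, not taken from the kernel source.

#include <cstdint>

// Illustrative paged-KV view; on device only the log2 page size is needed.
struct PagedKvView {
  int32_t numTokensPerPageLog2;  // e.g. 6 for 64-token pages; -1 if not paged.
};

// Map a flat KV token index to (pageIdx, tokenInPage) without divides:
// with a power-of-2 page size, / and % become >> and &.
inline void locateToken(PagedKvView const& view, int32_t tokenIdx,
                        int32_t& pageIdx, int32_t& tokenInPage) {
  pageIdx = tokenIdx >> view.numTokensPerPageLog2;
  tokenInPage = tokenIdx & ((1 << view.numTokensPerPageLog2) - 1);
}

The memset at the top of the factory also ensures that fields a caller never touches, including the new reserved slots (ptrReservedMem, mReservedParam) and mSparseMlaTopK, read as zero in the kernels rather than as stack garbage.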
