tile-ai
diff --git a/‎_sources/autoapi/tilelang/intrinsics/index.rst.txt‎
Lines changed: 2 additions & 0 deletions b/‎_sources/autoapi/tilelang/intrinsics/index.rst.txt‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎_sources/autoapi/tilelang/intrinsics/mma_sm70_layout/index.rst.txt‎
Lines changed: 40 additions & 0 deletions b/‎_sources/autoapi/tilelang/intrinsics/mma_sm70_layout/index.rst.txt‎
Lines changed: 40 additions & 0 deletions
diff --git a/‎_sources/autoapi/tilelang/intrinsics/mma_sm70_macro_generator/index.rst.txt‎
Lines changed: 187 additions & 0 deletions b/‎_sources/autoapi/tilelang/intrinsics/mma_sm70_macro_generator/index.rst.txt‎
Lines changed: 187 additions & 0 deletions
diff --git a/‎_sources/autoapi/tilelang/language/builtin/index.rst.txt‎
Lines changed: 55 additions & 0 deletions b/‎_sources/autoapi/tilelang/language/builtin/index.rst.txt‎
Lines changed: 55 additions & 0 deletions
diff --git a/‎_sources/autoapi/tilelang/layout/swizzle/index.rst.txt‎
Lines changed: 3 additions & 0 deletions b/‎_sources/autoapi/tilelang/layout/swizzle/index.rst.txt‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎_sources/autoapi/tilelang/tileop/gemm/gemm_mma_sm70/index.rst.txt‎
Lines changed: 40 additions & 0 deletions b/‎_sources/autoapi/tilelang/tileop/gemm/gemm_mma_sm70/index.rst.txt‎
Lines changed: 40 additions & 0 deletions
diff --git a/‎_sources/autoapi/tilelang/tileop/gemm/index.rst.txt‎
Lines changed: 1 addition & 0 deletions b/‎_sources/autoapi/tilelang/tileop/gemm/index.rst.txt‎
Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,8 @@ Submodules
    /autoapi/tilelang/intrinsics/mfma_macro_generator/index
    /autoapi/tilelang/intrinsics/mma_layout/index
    /autoapi/tilelang/intrinsics/mma_macro_generator/index
+   /autoapi/tilelang/intrinsics/mma_sm70_layout/index
+   /autoapi/tilelang/intrinsics/mma_sm70_macro_generator/index
    /autoapi/tilelang/intrinsics/tcgen05_macro_generator/index
    /autoapi/tilelang/intrinsics/utils/index
    /autoapi/tilelang/intrinsics/wgmma_macro_generator/index
 
@@ -0,0 +1,40 @@
+tilelang.intrinsics.mma_sm70_layout
+===================================
+
+.. py:module:: tilelang.intrinsics.mma_sm70_layout
+
+
+Functions
+---------
+
+.. autoapisummary::
+
+   tilelang.intrinsics.mma_sm70_layout.shared_16x4_to_mma_a_32x4_layout
+   tilelang.intrinsics.mma_sm70_layout.shared_4x16_to_mma_b_32x4_layout
+   tilelang.intrinsics.mma_sm70_layout.shared_16x4_to_mma_b_32x4_layout_trans
+   tilelang.intrinsics.mma_sm70_layout.mma_32x8_to_shared_16x16_layout_fp32
+   tilelang.intrinsics.mma_sm70_layout.mma_32x8_to_shared_16x16_layout_fp16
+   tilelang.intrinsics.mma_sm70_layout.mma_load_a_32x4_to_shared_16x4_layout
+   tilelang.intrinsics.mma_sm70_layout.mma_load_b_32x4_to_shared_16x4_layout_trans
+   tilelang.intrinsics.mma_sm70_layout.mma_load_b_32x4_to_shared_4x16_layout
+
+
+Module Contents
+---------------
+
+.. py:function:: shared_16x4_to_mma_a_32x4_layout(row, col, rep)
+
+.. py:function:: shared_4x16_to_mma_b_32x4_layout(row, col, rep)
+
+.. py:function:: shared_16x4_to_mma_b_32x4_layout_trans(row, col, rep)
+
+.. py:function:: mma_32x8_to_shared_16x16_layout_fp32(thread_id, local_id)
+
+.. py:function:: mma_32x8_to_shared_16x16_layout_fp16(thread_id, local_id)
+
+.. py:function:: mma_load_a_32x4_to_shared_16x4_layout(thread_id, local_id)
+
+.. py:function:: mma_load_b_32x4_to_shared_16x4_layout_trans(thread_id, local_id)
+
+.. py:function:: mma_load_b_32x4_to_shared_4x16_layout(thread_id, local_id)
+
@@ -0,0 +1,187 @@
+tilelang.intrinsics.mma_sm70_macro_generator
+============================================
+
+.. py:module:: tilelang.intrinsics.mma_sm70_macro_generator
+
+
+Attributes
+----------
+
+.. autoapisummary::
+
+   tilelang.intrinsics.mma_sm70_macro_generator.lift
+
+
+Classes
+-------
+
+.. autoapisummary::
+
+   tilelang.intrinsics.mma_sm70_macro_generator.TensorCoreIntrinEmitter
+
+
+Module Contents
+---------------
+
+.. py:data:: lift
+
+.. py:class:: TensorCoreIntrinEmitter(a_dtype = 'float16', b_dtype = 'float16', accum_dtype = 'float16', a_transposed = False, b_transposed = False, block_row_warps = 2, block_col_warps = 2, warp_row_tiles = 8, warp_col_tiles = 8, chunk = 16, reduce_k = 1, num_elems_per_byte = 1, is_m_first = False, thread_var = None)
+
+   To eliminate Python syntax within TIR Macro.
+
+
+   .. py:attribute:: M_DIM
+      :value: 16
+
+
+
+   .. py:attribute:: n_dim
+      :value: 16
+
+
+
+   .. py:attribute:: WARP_SIZE
+      :value: 32
+
+
+
+   .. py:attribute:: HALF_WARP_SIZE
+      :value: 16
+
+
+
+   .. py:attribute:: dtype_abbrv
+
+
+   .. py:attribute:: is_m_first
+      :value: False
+
+
+
+   .. py:attribute:: a_dtype
+      :value: 'float16'
+
+
+
+   .. py:attribute:: b_dtype
+      :value: 'float16'
+
+
+
+   .. py:attribute:: accum_dtype
+      :value: 'float16'
+
+
+
+   .. py:attribute:: a_transposed
+      :value: False
+
+
+
+   .. py:attribute:: b_transposed
+      :value: False
+
+
+
+   .. py:attribute:: block_row_warps
+      :value: 2
+
+
+
+   .. py:attribute:: block_col_warps
+      :value: 2
+
+
+
+   .. py:attribute:: warp_row_tiles
+      :value: 8
+
+
+
+   .. py:attribute:: warp_col_tiles
+      :value: 8
+
+
+
+   .. py:attribute:: chunk
+      :value: 16
+
+
+
+   .. py:attribute:: reduce_k
+      :value: 1
+
+
+
+   .. py:attribute:: threads
+      :value: 128
+
+
+
+   .. py:attribute:: num_elems_per_byte
+      :value: 1
+
+
+
+   .. py:attribute:: thread_var
+      :value: None
+
+
+
+   .. py:method:: get_thread_binding()
+
+
+   .. py:method:: get_store_index_map(inverse = False)
+
+
+   .. py:method:: extract_thread_binding(thread_id, is_m_first = None)
+
+      is_m_first: True if the thread binding is in the form of (tx, warp_n, warp_m),
+      which represents [warp_size, block_row_warps (split n), block_col_warps (split m)].
+      Otherwise, it is in the form of [warp_size, block_col_warps (split m), block_row_warps (split n)].
+
+
+
+   .. py:method:: ldmatrix_a(A_local_buf, A_shared_buf, ki, rk = 0)
+
+
+   .. py:method:: ldmatrix_b(B_local_buf, B_shared_buf, ki, rk = 0)
+
+
+   .. py:method:: mma(A_local_buf, B_local_buf, C_local_buf, k_inner = 0)
+
+
+   .. py:method:: make_mma_load_layout(local_buf, matrix = 'A')
+
+      Create a layout function for loading an MMA operand (matrix A or B) into a
+      fragment buffer. The layout maps fragment indices to threads and local indices.
+
+      :param local_buf: The local buffer representing a fragment of a matrix.
+      :type local_buf: tir.Buffer
+      :param matrix: Which operand the layout describes; defaults to ``'A'``.
+      :type matrix: str
+
+      :returns: A fragment object that describes how threads and indices
+                in `local_buf` are laid out.
+      :rtype: T.Fragment
+
+      :raises AssertionError: If `local_buf` is not detected to be a fragment buffer.
+
+
+
+   .. py:method:: make_mma_store_layout(local_buf)
+
+      Create a layout function for storing MMA results into a fragment buffer.
+      This layout is used in conjunction with `inverse_mma_store_layout` to
+      map fragment indices to threads and local indices.
+
+      :param local_buf: The local buffer representing a fragment of a matrix.
+      :type local_buf: tir.Buffer
+
+      :returns: A fragment object that describes how threads and indices
+                in `local_buf` are laid out.
+      :rtype: T.Fragment
+
+      :raises AssertionError: If `local_buf` is not detected to be a fragment buffer.
+
+
+
@@ -55,6 +55,7 @@ Functions
    tilelang.language.builtin.loop_break
    tilelang.language.builtin.cp_async_barrier_noinc
    tilelang.language.builtin.tcgen05_mma_arrive
+   tilelang.language.builtin.ptx_mma_sm70
 
 
 Module Contents
@@ -520,3 +521,57 @@ Module Contents
    :type mbar_ptr: PrimExpr
 
 
+.. py:function:: ptx_mma_sm70(shape, A_layout, B_layout, A_dtype, B_dtype, C_dtype, multiplicand_a, a_index, multiplicand_b, b_index, accumulator, c_index)
+
+   TVM intrinsic for PTX tensor core MMA instructions on SM70 (Volta).
+
+   This intrinsic provides SM70-specific MMA operations that support the m16n16k4 shape
+   with FP16 inputs and FP16/FP32 accumulation.
+
+   :param shape: The shape of mma fragment (e.g., "m16n16k4").
+   :type shape: str
+   :param A_layout: The layout of multiplicand fragment A ("row" or "col").
+   :type A_layout: str
+   :param B_layout: The layout of multiplicand fragment B ("row" or "col").
+   :type B_layout: str
+   :param A_dtype: The data type of multiplicand fragment A (typically "fp16").
+   :type A_dtype: str
+   :param B_dtype: The data type of multiplicand fragment B (typically "fp16").
+   :type B_dtype: str
+   :param C_dtype: The data type of accumulator fragment C ("fp16" or "fp32").
+   :type C_dtype: str
+   :param multiplicand_a: The multiplicand fragment A variable.
+   :type multiplicand_a: Var
+   :param a_index: The index of multiplicand fragment A.
+   :type a_index: Expr
+   :param multiplicand_b: The multiplicand fragment B variable.
+   :type multiplicand_b: Var
+   :param b_index: The index of multiplicand fragment B.
+   :type b_index: Expr
+   :param accumulator: The accumulator fragment C variable.
+   :type accumulator: Var
+   :param c_index: The index of accumulator fragment C.
+   :type c_index: Expr
+
+   :returns: **call** -- The call expression.
+   :rtype: PrimExpr
+
+   .. rubric:: Examples
+
+   >>> T.ptx_mma_sm70(
+   ...     "m16n16k4",
+   ...     "row",
+   ...     "col",
+   ...     "fp16",
+   ...     "fp16",
+   ...     "fp16",
+   ...     A_local.data,
+   ...     0,
+   ...     B_local.data,
+   ...     0,
+   ...     C_local.data,
+   ...     0,
+   ... )
+
+
@@ -15,6 +15,7 @@ Functions
 .. autoapisummary::
 
    tilelang.layout.swizzle.make_swizzled_layout
+   tilelang.layout.swizzle.make_volta_swizzled_layout
    tilelang.layout.swizzle.make_wgmma_swizzled_layout
    tilelang.layout.swizzle.make_tcgen05mma_swizzled_layout
    tilelang.layout.swizzle.make_full_bank_swizzled_layout
@@ -28,6 +29,8 @@ Module Contents
 
 .. py:function:: make_swizzled_layout(buffer, k_major = True, allow_pad = True)
 
+.. py:function:: make_volta_swizzled_layout(buffer, is_a = True, k_inner = True)
+
 .. py:function:: make_wgmma_swizzled_layout(buffer, continuity = None, k_major = True)
 
 .. py:function:: make_tcgen05mma_swizzled_layout(buffer, continuity = None, k_major = True)
 
@@ -0,0 +1,40 @@
+tilelang.tileop.gemm.gemm_mma_sm70
+==================================
+
+.. py:module:: tilelang.tileop.gemm.gemm_mma_sm70
+
+
+Classes
+-------
+
+.. autoapisummary::
+
+   tilelang.tileop.gemm.gemm_mma_sm70.GemmMMASm70
+
+
+Module Contents
+---------------
+
+.. py:class:: GemmMMASm70
+
+   Bases: :py:obj:`tilelang.tileop.gemm.gemm_base.GemmBase`
+
+
+   .. py:method:: infer_layout(target, thread_nums)
+
+
+   .. py:method:: lower(layout_map, target, thread_nums, thread_var)
+
+
+   .. py:method:: is_gemm_ss()
+
+
+   .. py:method:: is_gemm_sr()
+
+
+   .. py:method:: is_gemm_rs()
+
+
+   .. py:method:: is_gemm_rr()
+
+
@@ -13,6 +13,7 @@ Submodules
    /autoapi/tilelang/tileop/gemm/gemm_base/index
    /autoapi/tilelang/tileop/gemm/gemm_mfma/index
    /autoapi/tilelang/tileop/gemm/gemm_mma/index
+   /autoapi/tilelang/tileop/gemm/gemm_mma_sm70/index
    /autoapi/tilelang/tileop/gemm/gemm_tcgen05/index
    /autoapi/tilelang/tileop/gemm/gemm_wgmma/index