@@ -26,7 +26,7 @@ Module Contents
2626
2727.. py :data :: lift
2828
29- .. py :class :: MatrixCoreIntrinEmitter(a_dtype = ' float16' , b_dtype = ' float16' , accum_dtype = ' float16' , a_transposed = False , b_transposed = False , block_row_warps = 2 , block_col_warps = 2 , warp_row_tiles = 8 , warp_col_tiles = 8 , chunk = 16 , reduce_k = 1 , num_elems_per_byte = 1 , k_pack = None , is_m_first = False , b_preshuffle = False )
29+ .. py :class :: MatrixCoreIntrinEmitter(a_dtype = ' float16' , b_dtype = ' float16' , accum_dtype = ' float16' , a_transposed = False , b_transposed = False , block_row_warps = 2 , block_col_warps = 2 , warp_row_tiles = 8 , warp_col_tiles = 8 , chunk = 16 , reduce_k = 1 , num_elems_per_byte = 1 , k_pack = None , is_m_first = False , b_preshuffle = False , thread_var = None )
3030
3131 To eliminate Python syntax within TIR Macro.
3232
@@ -134,9 +134,20 @@ Module Contents
134134
135135
136136
137+ .. py :attribute :: thread_var
138+ :value: None
139+
140+
141+
137142 .. py :method :: get_ldmatrix_index_map(is_b = False )
138143
139144
145+ .. py :method :: get_store_index_map(inverse = False )
146+
147+
148+ .. py :method :: get_thread_binding()
149+
150+
140151 .. py :method :: extract_thread_binding(thread_id, is_m_first = None )
141152
142153 is_m_first: True if the thread binding is in the form of (tx, warp_n, warp_m)
@@ -151,12 +162,42 @@ Module Contents
151162 .. py :method :: ldmatrix_b(B_local_buf, B_shared_buf, ki, rk = 0 )
152163
153164
154- .. py :method :: mfma(A_local_buf, B_local_buf, C_local_buf)
165+ .. py :method :: mfma(A_local_buf, B_local_buf, C_local_buf, k_inner = 0 )
155166
156167
157168 .. py :method :: stmatrix(C_local_buf, C_buf, pid_m = None , pid_n = None )
158169
159170
171+ .. py :method :: make_mfma_load_layout(local_buf, matrix = ' A' )
172+
173+ Create a layout function for loading an MFMA input operand (selected by `matrix`, default ``'A'``) into a fragment buffer.
174+
175+ :param local_buf: The local buffer representing a fragment of a matrix.
176+ :type local_buf: tir.Buffer
177+
178+ :returns: A fragment object that describes how threads and indices
179+ in `local_buf` are laid out.
180+ :rtype: T.Fragment
181+
182+ :raises AssertionError: If `local_buf` is not detected to be a fragment buffer.
183+
184+
185+
186+ .. py :method :: make_mfma_store_layout(local_buf)
187+
188+ Create a layout function for storing MFMA results into a fragment buffer.
189+
190+ :param local_buf: The local buffer representing a fragment of a matrix.
191+ :type local_buf: tir.Buffer
192+
193+ :returns: A fragment object that describes how threads and indices
194+ in `local_buf` are laid out.
195+ :rtype: T.Fragment
196+
197+ :raises AssertionError: If `local_buf` is not detected to be a fragment buffer.
198+
199+
200+
160201.. py :class :: MatrixCorePreshuffleIntrinEmitter(a_dtype = ' float16' , b_dtype = ' float16' , accum_dtype = ' float16' , a_transposed = False , b_transposed = False , block_row_warps = 2 , block_col_warps = 2 , warp_row_tiles = 8 , warp_col_tiles = 8 , chunk = 16 , reduce_k = 1 , num_elems_per_byte = 1 , k_pack = None , is_m_first = False , a_preshuffle = False , b_preshuffle = False )
161202
162203 Bases: :py:obj:`MatrixCoreIntrinEmitter`
0 commit comments