@@ -1605,3 +1605,109 @@ def kernel(ptr):
16051605 }
16061606}
16071607""" )
1608+
1609+
@gluon.jit
def buffer_load_store_kernel(x, y):
    # Round-trip a 64x64 f32 tile: buffer_load from x, then buffer_store to y,
    # through both the cdna3 and cdna4 AMD builtin namespaces. The companion
    # test pins the exact IR this traces to, so statement order matters here.
    layout: ttgl.constexpr = ttgl.BlockedLayout(size_per_thread=[1, 1], threads_per_warp=[1, 64], warps_per_cta=[4, 1],
                                                order=[1, 0])

    # Linear offsets 0..4095 reshaped to the 2D tile, then converted into the
    # blocked layout the buffer ops operate on.
    offsets = ttgl.arange(0, 64 * 64).reshape(64, 64)
    offsets = ttgl.convert_layout(offsets, layout=layout)
    # All-true mask plus a constant `other`, so every lane loads and the
    # fallback value is well-defined.
    mask = ttgl.full((64, 64), 1, tl.int1, layout=layout)
    other = ttgl.full((64, 64), 1.0, tl.float32, layout=layout)
    a = ttgl.amd.cdna3.buffer_load(ptr=x, offsets=offsets, mask=mask, other=other, cache='.ca')
    ttgl.amd.cdna3.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.ca')

    # Same round-trip through the cdna4 entry points.
    a = ttgl.amd.cdna4.buffer_load(ptr=x, offsets=offsets, mask=mask, other=other, cache='.ca')
    ttgl.amd.cdna4.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.ca')
1624+
1625+
@pytest.mark.parametrize("target", [HIP_TARGET_CDNA3, HIP_TARGET_CDNA4])
def test_buffer_load_store(target):
    """Parse the buffer load/store kernel for each CDNA target and pin its IR."""
    src = MockTensor(ttgl.float32)
    dst = MockTensor(ttgl.float32)
    mod = run_parser(buffer_load_store_kernel, *make_args(src, dst), target=target)
    ir = anonymize_ir(mod.str_nodebug())

    expecttest.assert_expected_inline(
        ir, """\
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 64 : i32} {
  tt.func public @buffer_load_store_kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
    %0 = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32, #gluon.auto_encoding>
    %1 = tt.reshape %0 : tensor<4096xi32, #gluon.auto_encoding> -> tensor<64x64xi32, #gluon.auto_encoding>
    %2 = ttg.convert_layout %1 : tensor<64x64xi32, #gluon.auto_encoding> -> tensor<64x64xi32, #blocked>
    %true = arith.constant true
    %cst = arith.constant dense<true> : tensor<64x64xi1, #blocked>
    %cst_0 = arith.constant 1.000000e+00 : f32
    %cst_1 = arith.constant dense<1.000000e+00> : tensor<64x64xf32, #blocked>
    %3 = amdgpu.buffer_load %arg0[%2], %cst, %cst_1 cacheModifier = ca : tensor<64x64xf32, #blocked>
    amdgpu.buffer_store %3, %arg1[%2], %cst cacheModifier = ca : tensor<64x64xf32, #blocked>
    %4 = amdgpu.buffer_load %arg0[%2], %cst, %cst_1 cacheModifier = ca : tensor<64x64xf32, #blocked>
    amdgpu.buffer_store %4, %arg1[%2], %cst cacheModifier = ca : tensor<64x64xf32, #blocked>
    tt.return
  }
}
""")
1652+
1653+
@gluon.jit
def buffer_load_store_with_broadcast_kernel(x, y):
    # Exercise cdna3 buffer_load/buffer_store with operands that need implicit
    # broadcasting: a (64, 1) mask, a (1, 64) mask, and a scalar `other`. The
    # companion test pins the traced IR, so statement order matters here.
    layout: ttgl.constexpr = ttgl.BlockedLayout(size_per_thread=[1, 1], threads_per_warp=[1, 64], warps_per_cta=[4, 1],
                                                order=[1, 0])

    # Linear offsets 0..4095 reshaped to the 2D tile, in the blocked layout.
    offsets = ttgl.arange(0, 64 * 64).reshape(64, 64)
    offsets = ttgl.convert_layout(offsets, layout=layout)
    other = ttgl.full((64, 64), 1.0, tl.float32, layout=layout)

    # Mask broadcast along the trailing dim: (64, 1) -> (64, 64).
    mask = ttgl.full((64, 1), 1, tl.int1, layout=layout)
    a = ttgl.amd.cdna3.buffer_load(ptr=x, offsets=offsets, mask=mask, other=other, cache='.ca')
    ttgl.amd.cdna3.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.ca')

    # Mask broadcast along the leading dim: (1, 64) -> (64, 64).
    mask = ttgl.full((1, 64), 1, tl.int1, layout=layout)
    a = ttgl.amd.cdna3.buffer_load(ptr=x, offsets=offsets, mask=mask, other=other, cache='.ca')
    ttgl.amd.cdna3.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.ca')

    # Scalar `other`: splatted to the full tile shape by the builtin.
    other = 1.0
    a = ttgl.amd.cdna3.buffer_load(ptr=x, offsets=offsets, mask=mask, other=other, cache='.ca')
    ttgl.amd.cdna3.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.ca')
1674+
1675+
@pytest.mark.parametrize("target", [HIP_TARGET_CDNA3, HIP_TARGET_CDNA4])
def test_buffer_load_store_with_broadcast(target):
    """Parse the broadcast-mask/other kernel for each CDNA target and pin its IR."""
    src = MockTensor(ttgl.float32)
    dst = MockTensor(ttgl.float32)
    mod = run_parser(buffer_load_store_with_broadcast_kernel, *make_args(src, dst), target=target)
    ir = anonymize_ir(mod.str_nodebug())

    expecttest.assert_expected_inline(
        ir, """\
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 64 : i32} {
  tt.func public @buffer_load_store_with_broadcast_kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
    %0 = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32, #gluon.auto_encoding>
    %1 = tt.reshape %0 : tensor<4096xi32, #gluon.auto_encoding> -> tensor<64x64xi32, #gluon.auto_encoding>
    %2 = ttg.convert_layout %1 : tensor<64x64xi32, #gluon.auto_encoding> -> tensor<64x64xi32, #blocked>
    %cst = arith.constant 1.000000e+00 : f32
    %cst_0 = arith.constant dense<1.000000e+00> : tensor<64x64xf32, #blocked>
    %true = arith.constant true
    %cst_1 = arith.constant dense<true> : tensor<64x1xi1, #blocked>
    %3 = tt.broadcast %cst_1 : tensor<64x1xi1, #blocked> -> tensor<64x64xi1, #blocked>
    %4 = amdgpu.buffer_load %arg0[%2], %3, %cst_0 cacheModifier = ca : tensor<64x64xf32, #blocked>
    %5 = tt.broadcast %cst_1 : tensor<64x1xi1, #blocked> -> tensor<64x64xi1, #blocked>
    amdgpu.buffer_store %4, %arg1[%2], %5 cacheModifier = ca : tensor<64x64xf32, #blocked>
    %true_2 = arith.constant true
    %cst_3 = arith.constant dense<true> : tensor<1x64xi1, #blocked>
    %6 = tt.broadcast %cst_3 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked>
    %7 = amdgpu.buffer_load %arg0[%2], %6, %cst_0 cacheModifier = ca : tensor<64x64xf32, #blocked>
    %8 = tt.broadcast %cst_3 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked>
    amdgpu.buffer_store %7, %arg1[%2], %8 cacheModifier = ca : tensor<64x64xf32, #blocked>
    %cst_4 = arith.constant 1.000000e+00 : f32
    %9 = tt.broadcast %cst_3 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked>
    %cst_5 = arith.constant dense<1.000000e+00> : tensor<64x64xf32, #blocked>
    %10 = amdgpu.buffer_load %arg0[%2], %9, %cst_5 cacheModifier = ca : tensor<64x64xf32, #blocked>
    %11 = tt.broadcast %cst_3 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked>
    amdgpu.buffer_store %10, %arg1[%2], %11 cacheModifier = ca : tensor<64x64xf32, #blocked>
    tt.return
  }
}
""")
0 commit comments