From 83f1de8ee1be3c7427ccf505ddeb60bc421d8ca8 Mon Sep 17 00:00:00 2001 From: Anamika Chatterjee Date: Mon, 29 Sep 2025 08:10:27 +0300 Subject: [PATCH 01/15] Test commit --- examples/00_bmg_gemm/00_bmg_gemm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/00_bmg_gemm/00_bmg_gemm.cpp b/examples/00_bmg_gemm/00_bmg_gemm.cpp index 7e9291227e..93e1651e03 100644 --- a/examples/00_bmg_gemm/00_bmg_gemm.cpp +++ b/examples/00_bmg_gemm/00_bmg_gemm.cpp @@ -367,7 +367,7 @@ int main(int argc, const char** argv) Layout, Stride<_4, _1, _0>>>::TiledMMA; // For Intel BMG, PipelineStages defines how many k-blocks ahead to prefetch from A and B. - constexpr int PipelineStages = 2; + constexpr int PipelineStages = 3; using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelXeXMX16; using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeXMX16; From 0b184f0809ffc6094c63202592db4ee3741c29ef Mon Sep 17 00:00:00 2001 From: Anamika Chatterjee Date: Mon, 29 Sep 2025 09:12:28 +0300 Subject: [PATCH 02/15] Enable new mma and copy atoms --- examples/00_bmg_gemm/00_bmg_gemm.cpp | 71 ++++++++++++++---- include/cutlass/gemm/collective/xe_mma.hpp | 85 +++++++++++----------- include/cutlass/gemm/kernel/xe_gemm.hpp | 8 +- 3 files changed, 103 insertions(+), 61 deletions(-) diff --git a/examples/00_bmg_gemm/00_bmg_gemm.cpp b/examples/00_bmg_gemm/00_bmg_gemm.cpp index 93e1651e03..d50e9349ac 100644 --- a/examples/00_bmg_gemm/00_bmg_gemm.cpp +++ b/examples/00_bmg_gemm/00_bmg_gemm.cpp @@ -81,6 +81,58 @@ using namespace cute; +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Helper template to check if a type is complete +template +struct is_complete : std::false_type {}; + +template +struct is_complete : std::true_type {}; + +template +static constexpr bool is_complete_v = is_complete::value; + + +template +auto +choose_mma_op() +{ + if constexpr (is_complete_v>) + return XE_DPAS_TT<8, TC, TA, TB>{}; + else if constexpr (is_same_v, cute::bfloat16_t>) + return XE_DPAS_TT<8, float, cute::bfloat16_t>{}; + else /* Use f16 by default as upconversion sequences are typically faster */ + return XE_DPAS_TT<8, float, cute::half_t>{}; +} + +// Helper function to choose tiled MMA based on tensor properties +template +auto +choose_tiled_mma() +{ + + auto op = choose_mma_op(); + + constexpr bool byte = (cute::max(sizeof_bits_v, sizeof_bits_v) <= 8); + + constexpr bool is_A_transposed = std::is_same_v; + constexpr bool is_B_transposed = std::is_same_v; + constexpr bool use_1x_dpas_per_k = is_A_transposed || (byte && is_B_transposed); + + + using _K = conditional_t, C>; + + using WGTile = Shape<_256, _256, _K>; // 256x256 WG tile size + using SGLayout = Layout, Stride<_4, _1, _0>>; // 8x4 SG tiling, n-major + + using MMA = typename TiledMMAHelper, Layout, SGLayout>::TiledMMA; + + return MMA{}; +} + + /////////////////////////////////////////////////////////////////////////////////////////////////// // Command line options parsing @@ -350,24 +402,11 @@ int main(int argc, const char** argv) using GmemTiledCopyB = XE_2D_U16x32x32_LD_V; // Workgroup-level tile - using TileShape = Shape<_256, _256, _32>; - - // A TiledMMA struct defines a tiling of an MMA atom over M, N and K, combining both additional - // hardware (sub-groups for Intel BMG) and iterations by each sub-group. - // - // The TiledMMAHelper struct defines a specific TiledMMA for a given MMA atom - // (XE_8x16x16_F32BF16BF16F32_TT), TileShape (<256, 256, 32>) and sub-group layout (8x4x1). 
The
-  // TiledMMA constructed using TiledMMAHelper has the property that each sub-group operates on a
-  // single contiguous chunk of the work-group TileShape. For this configuration, this implies that
-  // each sub-group operates on a contiguous 32x64x32 chunk (4x4x2 iterations). See
-  // 0t_mma_atom.md#TiledMMAs for more info. Sub-groups are arranged row-major (stride 4,1,0) for
-  // performance reasons.
-  using TiledMma =                    // M=8,N=16,K=16, D=f32,A=bf16,B=bf16,C=f32
-      typename TiledMMAHelper, Layout,
-      Layout, Stride<_4, _1, _0>>>::TiledMMA;
+  using TiledMma = decltype(choose_tiled_mma());
+  using TileShape = decltype(TiledMma{}.tile_mnk());
 
   // For Intel BMG, PipelineStages defines how many k-blocks ahead to prefetch from A and B.
-  constexpr int PipelineStages = 3;
+  constexpr int PipelineStages = 2;
   using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelXeXMX16;
   using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeXMX16;
 
diff --git a/include/cutlass/gemm/collective/xe_mma.hpp b/include/cutlass/gemm/collective/xe_mma.hpp
index 7d75825a3e..9357491aa2 100644
--- a/include/cutlass/gemm/collective/xe_mma.hpp
+++ b/include/cutlass/gemm/collective/xe_mma.hpp
@@ -100,9 +100,6 @@ struct CollectiveMma, TileShape_, Element
   static constexpr auto Num_SGs = ATOM_N * ATOM_M * ATOM_K;
   static constexpr uint32_t MaxThreadsPerBlock = size(TiledMma{});
 
-  using Copy_A = typename Copy_Traits::template DefaultTiledCopy;
-  using Copy_B = typename Copy_Traits::template DefaultTiledCopy;
-
   // Host side kernel arguments
   struct Arguments {
     ElementA const* ptr_A;
@@ -112,8 +109,11 @@ struct CollectiveMma, TileShape_, Element
   };
 
   struct Params {
-    Copy_A tiled_copy_a;
-    Copy_B tiled_copy_b;
+    ElementA const* ptr_A;
+    StrideA dA;
+    ElementB const* ptr_B;
+    StrideB dB;
+    int M, N, K, L;
   };
 
   //
@@ -129,12 +129,11 @@ struct CollectiveMma, TileShape_, Element
 
     auto [M,N,K,L] = problem_shape;
 
-    auto mA_mkl = make_tensor(make_gmem_ptr(args.ptr_A), make_layout(make_shape(M, K, L), args.dA));
-    auto mB_nkl = make_tensor(make_gmem_ptr(args.ptr_B), make_layout(make_shape(N, K, L), args.dB));
-    Copy_A tiled_copy_a{Copy_A{}.with(mA_mkl)};
-    Copy_B tiled_copy_b{Copy_B{}.with(mB_nkl)};
-
-    return Params{tiled_copy_a, tiled_copy_b};
+    return Params{args.ptr_A,
+                  args.dA,
+                  args.ptr_B,
+                  args.dB,
+                  M, N, K, L};
   }
 
   template 
@@ -177,8 +176,15 @@ struct CollectiveMma, TileShape_, Element
     static_assert(is_rmem::value, "D tensor must be rmem resident.");
     static_assert(is_rmem::value, "C tensor must be rmem resident.");
 
-    auto thr_copy_A = mainloop.tiled_copy_a.get_slice(thread_idx);
-    auto thr_copy_B = mainloop.tiled_copy_b.get_slice(thread_idx);
+    auto mA_mkl = make_tensor(make_gmem_ptr(mainloop.ptr_A),
+        make_layout(make_shape(mainloop.M, mainloop.K, mainloop.L), cute::take<0,2>(mainloop.dA)));
+    auto mB_nkl = make_tensor(make_gmem_ptr(mainloop.ptr_B),
+        make_layout(make_shape(mainloop.N, mainloop.K, mainloop.L), cute::take<0,2>(mainloop.dB)));
+    auto copy_a = make_block_2d_copy_A(TiledMma{}, mA_mkl);
+    auto copy_b = make_block_2d_copy_B(TiledMma{}, mB_nkl);
+
+    auto thr_copy_a = copy_a.get_slice(thread_idx);
+    auto thr_copy_b = copy_b.get_slice(thread_idx);
 
     // Instantiate the MMA object and get thread slice
     TiledMma tiled_mma;
@@ -188,27 +194,25 @@ struct CollectiveMma, TileShape_, Element
     auto first_thread_in_sg_idx = sg.get_group_linear_id() * DispatchPolicy::SubgroupSize;
     auto thr_mma = tiled_mma.get_slice(first_thread_in_sg_idx);
 
-    // Partition global counting tensors for MMA
-    Tensor tCgA = thr_mma.partition_A(gA);
-    Tensor tCgB = thr_mma.partition_B(gB);
+    /* Register fragments for MMA */
+    auto tCrA = thr_mma.partition_sg_fragment_A(gA(_,_,0));
+    auto tCrB = thr_mma.partition_sg_fragment_B(gB(_,_,0));
 
-    Tensor tCrA = make_tensor(make_fragment_layout(mainloop.tiled_copy_a, tCgA(_,_,_,0).shape()));
-    Tensor tCrB = make_tensor(make_fragment_layout(mainloop.tiled_copy_b, tCgB(_,_,_,0).shape()));
+    /* Register fragments for copies */
+    auto tArA = thr_copy_a.partition_sg_fragment_D(gA(_,_,0));
+    auto tBrB = thr_copy_b.partition_sg_fragment_D(gB(_,_,0));
 
-    // Retile registers for copies
-    Tensor tArA = thr_copy_A.retile_D(tCrA);
-    Tensor tBrB = thr_copy_B.retile_D(tCrB);
-
-    // Retile global counting tensors for copies
-    Tensor tAgA = thr_copy_A.retile_S(tCgA);
-    Tensor tBgB = thr_copy_B.retile_S(tCgB);
-
-    auto tiled_prefetch_a = cute::prefetch_selector,Int>, Num_SGs>(mainloop.tiled_copy_a);
-    auto tiled_prefetch_b = cute::prefetch_selector,Int>, Num_SGs>(mainloop.tiled_copy_b);
-    auto thr_prefetch_A = tiled_prefetch_a.get_slice(thread_idx);
-    auto thr_prefetch_B = tiled_prefetch_b.get_slice(thread_idx);
+    /* Partition global tensor (proxies) for copies */
+    Tensor tAgA = thr_copy_a.partition_S(gA);
+    Tensor tBgB = thr_copy_b.partition_S(gB);
 
-    // Partition global tile for prefetch
+    /* Create prefetch TiledCopy instances */
+    auto prefetch_a = make_block_2d_prefetch(copy_a);
+    auto prefetch_b = make_block_2d_prefetch(copy_b);
+    auto thr_prefetch_A = prefetch_a.get_slice(thread_idx);
+    auto thr_prefetch_B = prefetch_b.get_slice(thread_idx);
+
+    /* Partition global tensor (proxies) for prefetch */
     auto pAgA = thr_prefetch_A.partition_S(gA);
     auto pBgB = thr_prefetch_B.partition_S(gB);
 
@@ -216,20 +220,18 @@ struct CollectiveMma, TileShape_, Element
 #define PRINT(x) print(#x ": "); print(x); print("\n");
   if (cute::thread(LOG_THREAD, LOG_GROUP)) {
     print("======================= A: \n");
-    PRINT(tCgA);
     PRINT(tAgA);
 
     PRINT(tCrA);
     PRINT(tArA);
-    PRINT(mainloop.tiled_copy_a);
+    PRINT(copy_a);
 
     print("======================= B: \n");
-    PRINT(tCgB);
     PRINT(tBgB);
 
     PRINT(tCrB);
     PRINT(tBrB);
-    PRINT(mainloop.tiled_copy_b);
+    PRINT(copy_b);
   }
 #undef PRINT
 #endif
@@ -243,19 +245,19 @@ struct CollectiveMma, TileShape_, Element
 
     CUTLASS_PRAGMA_UNROLL
     for (; prefetch_k < DispatchPolicy::Stages; prefetch_k++) {
-      prefetch(tiled_prefetch_a, pAgA(_, _, _, prefetch_k));
-      prefetch(tiled_prefetch_b, pBgB(_, _, _, prefetch_k));
+      prefetch(prefetch_a, pAgA(_, _, _, prefetch_k));
+      prefetch(prefetch_b, pBgB(_, _, _, prefetch_k));
     }
 
     for (int k_tile = k_start_idx; k_tile < k_tile_count + k_start_idx; k_tile++, prefetch_k++) {
       barrier_arrive(barrier_scope);
       // Copy gmem to rmem for the first k_tile
-      copy(mainloop.tiled_copy_a, tAgA(_,_,_,k_tile), tArA);
-      copy(mainloop.tiled_copy_b, tBgB(_,_,_,k_tile), tBrB);
+      copy(copy_a, tAgA(_,_,_,k_tile), tArA);
+      copy(copy_b, tBgB(_,_,_,k_tile), tBrB);
 
       if (prefetch_k < k_tile_count) {
-        prefetch(tiled_prefetch_a, pAgA(_, _, _, prefetch_k));
-        prefetch(tiled_prefetch_b, pBgB(_, _, _, prefetch_k));
+        prefetch(prefetch_a, pAgA(_, _, _, prefetch_k));
+        prefetch(prefetch_b, pBgB(_, _, _, prefetch_k));
       }
 
       cute::gemm(tiled_mma, tCrA, tCrB, accum);
@@ -267,3 +269,4 @@ struct CollectiveMma, TileShape_, Element
 } // namespace cutlass::gemm::collective
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/include/cutlass/gemm/kernel/xe_gemm.hpp b/include/cutlass/gemm/kernel/xe_gemm.hpp
index 1b1bebcc18..43e9d033e1 100644
--- 
a/include/cutlass/gemm/kernel/xe_gemm.hpp +++ b/include/cutlass/gemm/kernel/xe_gemm.hpp @@ -238,11 +238,11 @@ class GemmUniversal< constexpr auto workgroup_shape = WorkgroupTileShape{}; // (SUB_M,SUB_N,SUB_K) constexpr auto subgroup_shape = SubgroupTileShape{}; - Tensor mA_mkl = cute::get_xe_tensor(make_shape(M,K,L)); //(m,k,l) - Tensor mB_nkl = cute::get_xe_tensor(make_shape(N,K,L)); //(n,k,l) + Tensor cA = make_identity_tensor(make_shape(M,K,L)); // (M,K,L) + Tensor cB = make_identity_tensor(make_shape(M,K,L)); // (N,K,L) - Tensor gA = local_tile(mA_mkl, select<0,2>(blk_shape), make_coord(m_coord,_,l_coord)); - Tensor gB = local_tile(mB_nkl, select<1,2>(blk_shape), make_coord(n_coord,_,l_coord)); + Tensor gA = local_tile(cA, select<0,2>(blk_shape), make_coord(m_coord,_,l_coord)); + Tensor gB = local_tile(cB, select<1,2>(blk_shape), make_coord(n_coord,_,l_coord)); // Allocate the tiled_mma and the accumulators for the (M,N) subgroup_shape TiledMma tiled_mma; From ef1bafad27692d64bd94ff6bbc0bfc34b0063650 Mon Sep 17 00:00:00 2001 From: Anamika Chatterjee Date: Tue, 30 Sep 2025 15:39:13 +0300 Subject: [PATCH 03/15] adding legacy code back for collectivemma and gemmuniversal --- .vscode/settings.json | 24 + examples/00_bmg_gemm/00_bmg_gemm.cpp | 4 +- examples/00_bmg_gemm/00_bmg_gemm_legacy.cpp | 429 ++++++++++++++++++ examples/00_bmg_gemm/CMakeLists.txt | 9 + .../gemm/collective/collective_mma.hpp | 1 + .../gemm/collective/collective_mma_decl.hpp | 20 + include/cutlass/gemm/collective/xe_mma.hpp | 4 +- .../cutlass/gemm/collective/xe_mma_legacy.hpp | 269 +++++++++++ .../cutlass/gemm/kernel/gemm_universal.hpp | 1 + .../cutlass/gemm/kernel/gemm_universal_decl.h | 9 + include/cutlass/gemm/kernel/xe_gemm.hpp | 2 +- .../cutlass/gemm/kernel/xe_gemm_legacy.hpp | 284 ++++++++++++ 12 files changed, 1051 insertions(+), 5 deletions(-) create mode 100644 .vscode/settings.json create mode 100644 examples/00_bmg_gemm/00_bmg_gemm_legacy.cpp create mode 100644 include/cutlass/gemm/collective/xe_mma_legacy.hpp create mode 100644 include/cutlass/gemm/kernel/xe_gemm_legacy.hpp diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000000..43b3531021 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,24 @@ +{ + "files.associations": { + "compare": "cpp", + "string": "cpp", + "unordered_map": "cpp", + "vector": "cpp", + "exception": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + "string_view": "cpp", + "random": "cpp", + "initializer_list": "cpp", + "istream": "cpp", + "new": "cpp", + "ostream": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "streambuf": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "typeinfo": "cpp" + } +} \ No newline at end of file diff --git a/examples/00_bmg_gemm/00_bmg_gemm.cpp b/examples/00_bmg_gemm/00_bmg_gemm.cpp index d50e9349ac..312dcd23d0 100644 --- a/examples/00_bmg_gemm/00_bmg_gemm.cpp +++ b/examples/00_bmg_gemm/00_bmg_gemm.cpp @@ -437,7 +437,7 @@ int main(int argc, const char** argv) void, void>; // GEMM Mainloop - iteration over blocks in K dimension - using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< + using CollectiveMainloop = cutlass::gemm::collective::CollectiveMmaNew< GEMMDispatchPolicy, TileShape, ElementInputA, @@ -450,7 +450,7 @@ int main(int argc, const char** argv) >; // Define the whole kernel (mainloop and epilogue) - using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + using GemmKernel = cutlass::gemm::kernel::GemmUniversalNew< Shape, // Defer global problem shape 
definition to runtime CollectiveMainloop, CollectiveEpilogue diff --git a/examples/00_bmg_gemm/00_bmg_gemm_legacy.cpp b/examples/00_bmg_gemm/00_bmg_gemm_legacy.cpp new file mode 100644 index 0000000000..7e9291227e --- /dev/null +++ b/examples/00_bmg_gemm/00_bmg_gemm_legacy.cpp @@ -0,0 +1,429 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief CUTLASS Intel BMG Gemm Example. + + This example constructs and executes a simple CUTLASS GEMM kernel on Intel BMG hardware, and + verifies its correctness with a reference implementation + (cutlass::reference::device::GemmComplex). The example also provides a performance measurement + for the GEMM in TFLOPS. + + This example makes use of BMGs subgroup cooperative 2d-block copy operations and DPAS instructions. + + The shapes of the A and B matrix are defined at runtime by `options.m`, `.n` and `.k`, and the + batch size is defined by `options.l`. The tile shape, which defines how much work is executed by + a single work-group, is defined at compile time by: + ``` + using TileShape = Shape<_256, _256, _32>; + ``` + That is, each work-group processes a tile of M=256, N=256, and iterates over `options.k` in + blocks of K=32. + + Performance of GEMM on BMG is heavily dependent on prefetching the A and B matrices. That is, + executing Intel specific prefetch instructions for future iterations to ensure that the required + blocks of A and B are resident in cache before they are needed. 
+ + To build & run this example (from your build dir): + + $ ninja 00_bmg_gemm + $ ./examples/sycl/00_bmg_gemm/00_bmg_gemm + + Call with `--help` for information about available options +*/ + +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/epilogue/collective/xe_epilogue.hpp" +#include "cutlass/epilogue/fusion/xe_callbacks.hpp" +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/collective/collective_mma.hpp" +#include "cutlass/util/GPU_Clock.hpp" + +#include +#include + +#include "cutlass/util/command_line.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/reference/device/gemm_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "sycl_common.hpp" +#include "helper.h" + +using namespace cute; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + bool error; + + int m, n, k, l, iterations; + float alpha, beta; + + Options(): + help(false), + error(false), + m(5120), n(4096), k(4096), l(1), iterations(20), + alpha(1.f), beta(0.f) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m, 5120); + cmd.get_cmd_line_argument("n", n, 4096); + cmd.get_cmd_line_argument("k", k, 4096); + cmd.get_cmd_line_argument("l", l, 1); + cmd.get_cmd_line_argument("alpha", alpha, 1.f); + cmd.get_cmd_line_argument("beta", beta, 0.f); + cmd.get_cmd_line_argument("iterations", iterations, 100); + } + + /// Prints the usage statement. 
+ std::ostream & print_usage(std::ostream &out) const { + + out << "BMG GEMM Example\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= Sets the L extent (batch count) of the GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --iterations= Iterations\n\n"; + + return out; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + class Gemm +> +struct ExampleRunner { + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = typename Gemm::GemmKernel::StrideD; + + using LayoutA = typename Gemm::LayoutA; + using LayoutB = typename Gemm::LayoutB; + using LayoutC = typename Gemm::LayoutC; + using LayoutD = typename Gemm::LayoutD; + + using ElementA = typename Gemm::ElementA; + using ElementB = typename Gemm::ElementB; + using ElementAcc = typename Gemm::ElementAccumulator; + + using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; + using ElementC = typename Gemm::ElementC; + using ElementOutput = typename CollectiveEpilogue::ElementOutput; + using ElementCompute = typename CollectiveEpilogue::ElementCompute; + using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator; + + using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; + + // + // Data members + // + + /// Initialization + StrideA stride_A; + StrideB stride_B; + StrideC stride_C; + StrideD stride_D; + uint64_t seed = 0; + + cutlass::DeviceAllocation block_A; + cutlass::DeviceAllocation block_B; + cutlass::DeviceAllocation block_C; + cutlass::DeviceAllocation block_D; + cutlass::DeviceAllocation block_ref_D; // Reference GEMM result for verification + + // + // Methods + // + + bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { + auto [M, N, K, L] = problem_size; + + cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K})); + cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N})); + cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N})); + cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); + + cutlass::reference::device::GemmComplex( + {M, N, K}, + alpha, + ref_A, + cutlass::ComplexTransform::kNone, + ref_B, + cutlass::ComplexTransform::kNone, + beta, + ref_C, + ref_D, + ElementAccumulator(0), + L, // batch_count + M * K, // batch_stride_A + K * N, // batch_stride_B + M * N, // batch_stride_C + M * N // batch_stride_D + ); + + // CUTLASS on SYCL uses the compatibility library compat for e.g. 
default in-order queue + compat::wait(); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + bool passed = cutlass::reference::device::BlockCompareEqual( + block_ref_D.get(), block_D.get(), block_D.size()); + + return passed; + } + + /// Initialize operands to be used in the GEMM and reference GEMM + void initialize(const ProblemShapeType& problem_size) { + auto problem_shape_MNKL = cute::append<4>(problem_size, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + // Complete the stride by combining static layout info (StrideA) with runtime size info (M,K,L) + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); + stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); + + block_A.reset(static_cast(M) * K * L); + block_B.reset(static_cast(K) * N * L); + block_C.reset(static_cast(M) * N * L); + block_D.reset(static_cast(M) * N * L); + block_ref_D.reset(static_cast(M) * N * L); + + initialize_block(block_A, seed + 2023); + initialize_block(block_B, seed + 2022); + initialize_block(block_C, seed + 2021); + } + + cutlass::Status run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { + ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; + + initialize(problem_size); + + typename Gemm::GemmKernel::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + {block_A.get(), stride_A, block_B.get(), stride_B}, + {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, + hw_info + }; + + Gemm gemm_op; + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + if (gemm_op.can_implement(arguments) != cutlass::Status::kSuccess){ + std::cout << "Invalid Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; + std::exit(1); + } + + CUTLASS_CHECK(gemm_op.initialize(arguments, workspace.get())); + + // Run the GEMM + CUTLASS_CHECK(gemm_op.run()); + + compat::wait(); + + // Verify that the result is correct + bool passed = verify(problem_size, options.alpha, options.beta); + std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl; + + if(!passed) return cutlass::Status::kErrorInternal; + + if (options.iterations > 0) { + GPU_Clock timer; + timer.start(); + for (int i = 0; i < options.iterations; ++i) { + gemm_op.run(); + } + compat::wait(); + + float cute_time = timer.seconds() / options.iterations; + double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; + std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; + printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000); + } + + return cutlass::Status::kSuccess; + } + +}; + +int main(int argc, const char** argv) +{ + // + // Parse options + // + + Options options; + + options.parse(argc, argv); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." << std::endl; + return -1; + } + + // + // Run examples + // + + // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. 
This + // information is used by the underlying kernel. + cutlass::KernelHardwareInfo hw_info; + + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + + bool passed; + + // The code section below describes datatype for input, output matrices and computation between + // elements in input matrices. + using ElementAccumulator = float; // <- data type of accumulator + using ElementComputeEpilogue = float; // <- data type of epilogue operations + using ElementInputA = bfloat16_t; // <- data type of elements in input matrix A + using ElementInputB = bfloat16_t; // <- data type of elements in input matrix B + using ElementOutput = float; // <- data type of elements in output matrix D + + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + using LayoutC = cutlass::layout::RowMajor; + using LayoutD = cutlass::layout::RowMajor; + + // The 2D block copy operations used for the A and B matrices + using GmemTiledCopyA = XE_2D_U16x32x32_LD_N; + using GmemTiledCopyB = XE_2D_U16x32x32_LD_V; + + // Workgroup-level tile + using TileShape = Shape<_256, _256, _32>; + + // A TiledMMA struct defines a tiling of an MMA atom over M, N and K, combining both additional + // hardware (sub-groups for Intel BMG) and iterations by each sub-group. + // + // The TiledMMAHelper struct defines a specific TiledMMA for a given MMA atom + // (XE_8x16x16_F32BF16BF16F32_TT), TileShape (<256, 256, 32>) and sub-group layout (8x4x1). The + // TiledMMA constructed using TiledMMAHelper has the property that each sub-group operates on a + // single contiguous chunk of the work-group TileShape. For this configuration, this implies that + // each sub-group operates on a contiguous 32x64x32 chunk (4x4x2 iterations). See + // 0t_mma_atom.md#TiledMMAs for more info. Sub-groups are arranged row-major (stride 4,1,0) for + // performance reasons. + using TiledMma = // M=8,N=16,K=16, D=f32,A=bf16,B=bf16,C=f32 + typename TiledMMAHelper, Layout, + Layout, Stride<_4, _1, _0>>>::TiledMMA; + + // For Intel BMG, PipelineStages defines how many k-blocks ahead to prefetch from A and B. + constexpr int PipelineStages = 2; + using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelXeXMX16; + using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeXMX16; + + // This is the 'default' epilogue operation (Linear Combination) which performs everything in: + // (D = alpha * (A*B) + beta * C) + // aside from the (A*B), which is handled by the GEMM. See 05_bmg_gemm_with_epilogues for more + // complex epilogue examples. + using EpilogueOp = cutlass::epilogue::fusion::LinearCombination; + + // FusionCallbacks ties the EpilogueOp to an implementation (based on the dispatch + // policy/architecture) and defines the epilogue arguments. 
+ using FusionCallBacks = cutlass::epilogue::fusion::FusionCallbacks; + // GEMM Epilogue - loads & stores C/D matrices, performs epilogue operations & load/stores any + // auxiliary data required + using CollectiveEpilogue = cutlass::epilogue::collective::CollectiveEpilogue< + EpilogueDispatchPolicy, + TileShape, + ElementAccumulator, + cutlass::gemm::TagToStrideC_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation + ElementOutput, + cutlass::gemm::TagToStrideC_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation + FusionCallBacks, + XE_2D_U32x8x16_LD_N, // The copy atom used to load matrix C + void, void, + XE_2D_U32x8x16_ST_N, // The copy atom used to store matrix D + void, void>; + + // GEMM Mainloop - iteration over blocks in K dimension + using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< + GEMMDispatchPolicy, + TileShape, + ElementInputA, + cutlass::gemm::TagToStrideA_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation + ElementInputB, + cutlass::gemm::TagToStrideB_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation + TiledMma, + GmemTiledCopyA, void, void, cute::identity, // A + GmemTiledCopyB, void, void, cute::identity // B + >; + + // Define the whole kernel (mainloop and epilogue) + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, // Defer global problem shape definition to runtime + CollectiveMainloop, + CollectiveEpilogue + >; + + // The GemmUniversalAdapter wraps the defined GEMM kernel and handles the launch, and e.g. + // persistent scratch memory if required. + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + ExampleRunner runner; + + CUTLASS_CHECK(runner.run(options, hw_info)); + + return 0; +} diff --git a/examples/00_bmg_gemm/CMakeLists.txt b/examples/00_bmg_gemm/CMakeLists.txt index 5bfc4a5e29..6564ad47a9 100644 --- a/examples/00_bmg_gemm/CMakeLists.txt +++ b/examples/00_bmg_gemm/CMakeLists.txt @@ -40,6 +40,15 @@ cutlass_example_add_executable( TEST_SMALL_SHAPE ) +cutlass_example_add_executable( + 00_bmg_gemm_legacy + 00_bmg_gemm_legacy.cpp + TEST_COMMAND_OPTIONS + TEST_BATCHES + TEST_LARGE + TEST_SMALL_SHAPE +) + set(TEST_SMALL_SHAPE_PADDABLE --m=1 --n=1 --k=2 --l=2) cutlass_example_add_executable( 00_bmg_gemm_padded diff --git a/include/cutlass/gemm/collective/collective_mma.hpp b/include/cutlass/gemm/collective/collective_mma.hpp index b54776ece5..3c02ca1efa 100644 --- a/include/cutlass/gemm/collective/collective_mma.hpp +++ b/include/cutlass/gemm/collective/collective_mma.hpp @@ -77,6 +77,7 @@ #endif // !defined(__CUDACC_RTC__) #if defined(SYCL_INTEL_TARGET) +#include "cutlass/gemm/collective/xe_mma_legacy.hpp" #include "cutlass/gemm/collective/xe_mma.hpp" #include "cutlass/gemm/collective/xe_array_mma.hpp" #include "cutlass/gemm/collective/xe_array_mma_fp8.hpp" diff --git a/include/cutlass/gemm/collective/collective_mma_decl.hpp b/include/cutlass/gemm/collective/collective_mma_decl.hpp index a2faa1ff28..a8d2572a71 100644 --- a/include/cutlass/gemm/collective/collective_mma_decl.hpp +++ b/include/cutlass/gemm/collective/collective_mma_decl.hpp @@ -58,6 +58,26 @@ struct CollectiveMma { static_assert(cutlass::detail::dependent_false, "Could not find a mainloop specialization."); }; +template < + class DispatchPolicy, + class TileShape, + class ElementA, + class StrideA, + class ElementB, + class StrideB, + class TiledMma, + class GmemTiledCopyA, + class SmemLayoutAtomA, + class SmemCopyAtomA, + class TransformA, + class GmemTiledCopyB, + class SmemLayoutAtomB, + class SmemCopyAtomB, + class TransformB 
+> +struct CollectiveMmaNew { + static_assert(cutlass::detail::dependent_false, "Could not find a mainloop specialization."); +}; ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace cutlass::gemm::collective diff --git a/include/cutlass/gemm/collective/xe_mma.hpp b/include/cutlass/gemm/collective/xe_mma.hpp index 9357491aa2..668a088939 100644 --- a/include/cutlass/gemm/collective/xe_mma.hpp +++ b/include/cutlass/gemm/collective/xe_mma.hpp @@ -47,7 +47,7 @@ using namespace cute; template -struct CollectiveMma, TileShape_, ElementA_, StrideA_, ElementB_, StrideB_, TiledMma_, +struct CollectiveMmaNew, TileShape_, ElementA_, StrideA_, ElementB_, StrideB_, TiledMma_, GmemTiledCopyA_, SmemLayoutAtomA_, SmemCopyAtomA_, TransformA_, GmemTiledCopyB_, SmemLayoutAtomB_, SmemCopyAtomB_, TransformB_> { // @@ -120,7 +120,7 @@ struct CollectiveMma, TileShape_, Element // Methods // - CollectiveMma() = default; + CollectiveMmaNew() = default; template static constexpr Params diff --git a/include/cutlass/gemm/collective/xe_mma_legacy.hpp b/include/cutlass/gemm/collective/xe_mma_legacy.hpp new file mode 100644 index 0000000000..ac7fe83884 --- /dev/null +++ b/include/cutlass/gemm/collective/xe_mma_legacy.hpp @@ -0,0 +1,269 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/dispatch_policy.hpp" + +#include "cute/algorithm/functional.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cute/algorithm/gemm.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::collective { +using namespace cute; +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct CollectiveMma, TileShape_, ElementA_, StrideA_, ElementB_, StrideB_, TiledMma_, + GmemTiledCopyA_, SmemLayoutAtomA_, SmemCopyAtomA_, TransformA_, GmemTiledCopyB_, SmemLayoutAtomB_, + SmemCopyAtomB_, TransformB_> { + // + // Type Aliases + // + using DispatchPolicy = MainloopIntelXeXMX16; + using WorkgroupTileShape = TileShape_; + using ElementA = ElementA_; + using StrideA = StrideA_; + using ElementB = ElementB_; + using StrideB = StrideB_; + using TiledMma = TiledMma_; + using ElementAccumulator = typename TiledMma::ValTypeC; + using GmemTiledCopyA = GmemTiledCopyA_; + using GmemTiledCopyB = GmemTiledCopyB_; + using SmemLayoutAtomA = SmemLayoutAtomA_; + using SmemLayoutAtomB = SmemLayoutAtomB_; + using SmemCopyAtomA = SmemCopyAtomA_; + using SmemCopyAtomB = SmemCopyAtomB_; + using TransformA = TransformA_; + using TransformB = TransformB_; + using ArchTag = typename DispatchPolicy::ArchTag; + + static_assert(platform::is_same::value, "MainloopIntelXeXMX16 requires that A and B have same type."); + static_assert(std::is_same_v, "Transformation for A is not currently supported on Intel PVC"); + static_assert(std::is_same_v, "Transformation for B is not currently supported on Intel PVC"); + + static constexpr int SubgroupSize = DispatchPolicy::SubgroupSize; + + using MmaAtomShape = typename TiledMma::AtomShape_MNK; + + static constexpr int BLK_M = get<0>(WorkgroupTileShape{}); + static constexpr int BLK_N = get<1>(WorkgroupTileShape{}); + static constexpr int BLK_K = get<2>(WorkgroupTileShape{}); + + static constexpr int ATOM_M = get<1>(typename TiledMma::ThrLayoutVMNK{}.shape()); + static constexpr int ATOM_N = get<2>(typename TiledMma::ThrLayoutVMNK{}.shape()); + static constexpr int ATOM_K = get<3>(typename TiledMma::ThrLayoutVMNK{}.shape()); + + static_assert(BLK_M % TiledMma{}.template tile_size_mnk<0>() == 0, "TiledMma permutation size must match block size."); + static_assert(BLK_N % TiledMma{}.template tile_size_mnk<1>() == 0, "TiledMma permutation size must match block size."); + static_assert(BLK_K % TiledMma{}.template tile_size_mnk<2>() == 0, "TiledMma permutation size must match block size."); + + static constexpr int SG_M = ceil_div(BLK_M, ATOM_M); + static constexpr int SG_N = ceil_div(BLK_N, ATOM_N); + static constexpr int SG_K = ceil_div(BLK_K, ATOM_K); + using SubgroupTileShape = Shape, C, C>; + + // 32 + static constexpr auto Num_SGs = ATOM_N * ATOM_M * ATOM_K; + static constexpr uint32_t MaxThreadsPerBlock = size(TiledMma{}); + + using Copy_A = typename Copy_Traits::template DefaultTiledCopy; + using Copy_B = typename Copy_Traits::template DefaultTiledCopy; + + // Host side kernel arguments + struct Arguments { + ElementA const* ptr_A; + StrideA dA; + ElementB const* ptr_B; + StrideB dB; + }; + + struct Params { + Copy_A tiled_copy_a; + Copy_B tiled_copy_b; + }; + + // + // Methods + // + + CollectiveMma() = default; + + template + static constexpr Params + 
to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + (void) workspace; + + auto [M,N,K,L] = problem_shape; + + auto mA_mkl = make_tensor(make_gmem_ptr(args.ptr_A), make_layout(make_shape(M, K, L), args.dA)); + auto mB_nkl = make_tensor(make_gmem_ptr(args.ptr_B), make_layout(make_shape(N, K, L), args.dB)); + Copy_A tiled_copy_a{Copy_A{}.with(mA_mkl)}; + Copy_B tiled_copy_b{Copy_B{}.with(mB_nkl)}; + + return Params{tiled_copy_a, tiled_copy_b}; + } + + template + static bool + can_implement( + ProblemShape problem_shapes, + Arguments const& args) { + constexpr int copy_alignment_bits = 128; + constexpr int batch_alignment_bits = 512; + auto problem_shape_MNKL = append<4>(problem_shapes, 1); + auto [M,N,K,L] = problem_shape_MNKL; + + bool implementable = true; + + constexpr int min_aligned_elements_A = copy_alignment_bits / sizeof_bits::value; + implementable &= cutlass::detail::check_alignment(cute::make_shape(M,K,L), args.dA); + constexpr int min_aligned_elements_B = copy_alignment_bits / sizeof_bits::value; + implementable &= cutlass::detail::check_alignment(cute::make_shape(N,K,L), args.dB); + + if (L > 1) { + constexpr int min_batch_aligned_elements_A = batch_alignment_bits / sizeof_bits::value; + implementable &= get<2>(args.dA) % min_batch_aligned_elements_A == 0; + constexpr int min_batch_aligned_elements_B = batch_alignment_bits / sizeof_bits::value; + implementable &= get<2>(args.dB) % min_batch_aligned_elements_B == 0; + } + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for XE 2D copy.\n"); + } + + return implementable; + } + + /// Perform a subgroup-scoped matrix multiply-accumulate + template + CUTLASS_DEVICE void operator()(FrgTensorD &accum, TensorA gA, TensorB gB, FrgTensorC const &src_accum, + KTileIterator k_tile_iter, int k_tile_count, BlkCoord const &blk_coord, int const &K_start, int thread_idx, + Params const &mainloop) { + (void)blk_coord; + static_assert(is_rmem::value, "D tensor must be rmem resident."); + static_assert(is_rmem::value, "C tensor must be rmem resident."); + + auto thr_copy_A = mainloop.tiled_copy_a.get_slice(thread_idx); + auto thr_copy_B = mainloop.tiled_copy_b.get_slice(thread_idx); + + // Instantiate the MMA object and get thread slice + TiledMma tiled_mma; + // TODO(Codeplay): see if we can make this nicer + // To make all work items in a subgroup have the same global tensors pass in the index of work item 0 in each subgroup + auto sg = compat::get_nd_item<1>().get_sub_group(); + auto first_thread_in_sg_idx = sg.get_group_linear_id() * DispatchPolicy::SubgroupSize; + auto thr_mma = tiled_mma.get_slice(first_thread_in_sg_idx); + + // Partition global counting tensors for MMA + Tensor tCgA = thr_mma.partition_A(gA); + Tensor tCgB = thr_mma.partition_B(gB); + + Tensor tCrA = make_tensor(make_fragment_layout(mainloop.tiled_copy_a, tCgA(_,_,_,0).shape())); + Tensor tCrB = make_tensor(make_fragment_layout(mainloop.tiled_copy_b, tCgB(_,_,_,0).shape())); + + // Retile registers for copies + Tensor tArA = thr_copy_A.retile_D(tCrA); + Tensor tBrB = thr_copy_B.retile_D(tCrB); + + // Retile global counting tensors for copies + Tensor tAgA = thr_copy_A.retile_S(tCgA); + Tensor tBgB = thr_copy_B.retile_S(tCgB); + + auto tiled_prefetch_a = cute::prefetch_selector,Int>, Num_SGs>(mainloop.tiled_copy_a); + auto tiled_prefetch_b = cute::prefetch_selector,Int>, Num_SGs>(mainloop.tiled_copy_b); + auto thr_prefetch_A = 
tiled_prefetch_a.get_slice(thread_idx); + auto thr_prefetch_B = tiled_prefetch_b.get_slice(thread_idx); + + // Partition global tile for prefetch + auto pAgA = thr_prefetch_A.partition_S(gA); + auto pBgB = thr_prefetch_B.partition_S(gB); + +#if CUTLASS_ENABLE_DEBUG_PRINTS +#define PRINT(x) print(#x ": "); print(x); print("\n"); + if (cute::thread(LOG_THREAD, LOG_GROUP)) { + print("======================= A: \n"); + PRINT(tCgA); + PRINT(tAgA); + + PRINT(tCrA); + PRINT(tArA); + PRINT(mainloop.tiled_copy_a); + + print("======================= B: \n"); + PRINT(tCgB); + PRINT(tBgB); + + PRINT(tCrB); + PRINT(tBrB); + PRINT(mainloop.tiled_copy_b); + } +#undef PRINT +#endif + + // + // Mainloop + // + const auto k_start_idx = crd2idx((*k_tile_iter), make_shape(K_start)); + constexpr int barrier_scope = 2; + int prefetch_k = k_start_idx; + + CUTLASS_PRAGMA_UNROLL + for (; prefetch_k < DispatchPolicy::Stages; prefetch_k++) { + prefetch(tiled_prefetch_a, pAgA(_, _, _, prefetch_k)); + prefetch(tiled_prefetch_b, pBgB(_, _, _, prefetch_k)); + } + + for (int k_tile = k_start_idx; k_tile < k_tile_count + k_start_idx; k_tile++, prefetch_k++) { + barrier_arrive(barrier_scope); + // Copy gmem to rmem for the first k_tile + copy(mainloop.tiled_copy_a, tAgA(_,_,_,k_tile), tArA); + copy(mainloop.tiled_copy_b, tBgB(_,_,_,k_tile), tBrB); + + if (prefetch_k < k_tile_count) { + prefetch(tiled_prefetch_a, pAgA(_, _, _, prefetch_k)); + prefetch(tiled_prefetch_b, pBgB(_, _, _, prefetch_k)); + } + + cute::gemm(tiled_mma, tCrA, tCrB, accum); + barrier_wait(barrier_scope); + } + } +}; + +} // namespace cutlass::gemm::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// \ No newline at end of file diff --git a/include/cutlass/gemm/kernel/gemm_universal.hpp b/include/cutlass/gemm/kernel/gemm_universal.hpp index 69137d2114..3f6fa9696a 100644 --- a/include/cutlass/gemm/kernel/gemm_universal.hpp +++ b/include/cutlass/gemm/kernel/gemm_universal.hpp @@ -78,6 +78,7 @@ struct IsCutlass3ArrayKernel class GemmUniversal; +template < + class ProblemShapeOrThreadblockMma_, // (m, n, k) or (m, n, k, l) + class CollectiveMainloopOrEpilogue_, + class CollectiveEpilogueOrThreadblockSwizzle_, + class TileScheduler_ = void, + class Enable = void +> +class GemmUniversalNew; + } // namespace cutlass::gemm::kernel diff --git a/include/cutlass/gemm/kernel/xe_gemm.hpp b/include/cutlass/gemm/kernel/xe_gemm.hpp index 43e9d033e1..63d4467cd4 100644 --- a/include/cutlass/gemm/kernel/xe_gemm.hpp +++ b/include/cutlass/gemm/kernel/xe_gemm.hpp @@ -47,7 +47,7 @@ template < class CollectiveEpilogue_, class TileScheduler_ > -class GemmUniversal< +class GemmUniversalNew< ProblemShape_, CollectiveMainloop_, CollectiveEpilogue_, diff --git a/include/cutlass/gemm/kernel/xe_gemm_legacy.hpp b/include/cutlass/gemm/kernel/xe_gemm_legacy.hpp new file mode 100644 index 0000000000..04e6ecfc99 --- /dev/null +++ b/include/cutlass/gemm/kernel/xe_gemm_legacy.hpp @@ -0,0 +1,284 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/kernel_hardware_info.hpp" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/dispatch_policy.hpp" + +#include "cute/tensor.hpp" + +namespace cutlass::gemm::kernel { + +/////////////////////////////////////////////////////////////////////////////// + +template < + class ProblemShape_, + class CollectiveMainloop_, + class CollectiveEpilogue_, + class TileScheduler_ +> +class GemmUniversal< + ProblemShape_, + CollectiveMainloop_, + CollectiveEpilogue_, + TileScheduler_, + cute::enable_if_t>> +{ +public: + // + // Type Aliases + // + using ProblemShape = ProblemShape_; + + static_assert(rank(ProblemShape{}) == 3 or rank(ProblemShape{}) == 4, + "ProblemShape{} should be or "); + + // Mainloop derived types + using CollectiveMainloop = CollectiveMainloop_; + using TileShape = typename CollectiveMainloop::WorkgroupTileShape; + using WorkgroupTileShape = TileShape; + using TiledMma = typename CollectiveMainloop::TiledMma; + using ArchTag = typename CollectiveMainloop::ArchTag; + using ElementA = typename CollectiveMainloop::ElementA; + using StrideA = typename CollectiveMainloop::StrideA; + using ElementB = typename CollectiveMainloop::ElementB; + using StrideB = typename CollectiveMainloop::StrideB; + using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy; + using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator; + using MainloopArguments = typename CollectiveMainloop::Arguments; + using ClusterShape = typename DispatchPolicy::ClusterShape; + using MainloopParams = typename CollectiveMainloop::Params; + + static_assert(cute::is_void_v or cute::is_same_v, + "Intel Xe does not support specializing the tile scheduler."); + using TileSchedulerTag = TileScheduler_; + using TileScheduler = typename detail::TileSchedulerSelector< + TileScheduler_, ArchTag, WorkgroupTileShape, + cute::Shape, cute::Int<1>, cute::Int<1>>>::Scheduler; + using TileSchedulerArguments = typename TileScheduler::Arguments; + using TileSchedulerParams = typename TileScheduler::Params; + + // Epilogue derived types + using CollectiveEpilogue = CollectiveEpilogue_; + using ElementC = typename CollectiveEpilogue::ElementC; + using 
StrideC = typename CollectiveEpilogue::StrideC; + using ElementD = typename CollectiveEpilogue::ElementD; + using StrideD = typename CollectiveEpilogue::StrideD; + using EpilogueArguments = typename CollectiveEpilogue::Arguments; + using EpilogueParams = typename CollectiveEpilogue::Params; + static_assert(cute::is_same_v, + "Mainloop and epilogue do not agree on accumulator value type."); + + // MSVC requires the cast to fix a warning-as-error. + static constexpr int SharedStorageSize = 0; + + static constexpr int SubgroupSize = CollectiveMainloop::SubgroupSize; // sub_group size + static constexpr uint32_t MaxThreadsPerBlock = CollectiveMainloop::MaxThreadsPerBlock; + using MmaAtomShape = typename CollectiveMainloop::MmaAtomShape; + using SubgroupTileShape = typename CollectiveMainloop::SubgroupTileShape; + + // Kernel level shared memory storage + struct SharedStorage { + using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage; + EpilogueTensorStorage epilogue; + }; + + // Device side arguments + struct Arguments { + GemmUniversalMode mode{}; + ProblemShape problem_shape{}; + MainloopArguments mainloop{}; + EpilogueArguments epilogue{}; + KernelHardwareInfo hw_info{}; + TileSchedulerArguments scheduler{}; + }; + + // Kernel entry point API + struct Params { + GemmUniversalMode mode{}; + ProblemShape problem_shape{}; + MainloopParams mainloop{}; + EpilogueParams epilogue{}; + KernelHardwareInfo hw_info{}; + TileSchedulerParams scheduler{}; + }; + + // + // Methods + // + + // Convert to underlying arguments. In this case, a simple copy for the aliased type. + static + Params + to_underlying_arguments(Arguments const& args, void* workspace) { + (void) workspace; + auto problem_shape_MNKL = append<4>(args.problem_shape, 1); + + auto mainloop_args = CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, workspace); + TileSchedulerParams scheduler = TileScheduler::to_underlying_arguments( + problem_shape_MNKL, TileShape{}, ClusterShape{}, args.hw_info, args.scheduler, &workspace); + return { + args.mode, + args.problem_shape, + mainloop_args, + CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, workspace), + args.hw_info, + scheduler + }; + } + + static bool + can_implement(Arguments const& args) { + bool implementable = true; + + implementable = implementable && (args.mode == GemmUniversalMode::kGemm || + (args.mode == GemmUniversalMode::kBatched && rank(ProblemShape{}) == 4)); + + implementable &= TileScheduler::can_implement(args.scheduler); + + implementable &= CollectiveMainloop::can_implement(args.problem_shape, args.mainloop); + implementable &= CollectiveEpilogue::can_implement(args.problem_shape, args.epilogue); + + return implementable; + } + + static int + get_workspace_size(Arguments const& args) { + return 0; + } + + static + cutlass::Status + initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr, + CudaHostAdapter* cuda_adapter = nullptr) { + return Status::kSuccess; + } + + static dim3 + get_grid_shape(Params const& params) { + dim3 grid = TileScheduler::get_tiled_cta_shape_mnl(params.problem_shape, TileShape{}, ClusterShape{}); + if(params.scheduler.raster_order_ == TileScheduler::RasterOrder::AlongN) { + return {grid.y, grid.x, grid.z}; + } else { + return {grid.x, grid.y, grid.z}; + } + } + + static dim3 + get_block_shape() { + return dim3(MaxThreadsPerBlock, 1, 1); + } + + CUTLASS_DEVICE + void + operator()(Params const& params, char* smem_buf) { + 
SharedStorage& shared_storage = *reinterpret_cast(smem_buf); + // Preconditions + CUTE_STATIC_ASSERT(is_static::value); + + // Separate out problem shape for convenience + // Optionally append 1s until problem shape is rank-4 in case its is only rank-3 (MNK) + auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{}); + auto M = get<0>(problem_shape_MNKL); + auto N = get<1>(problem_shape_MNKL); + auto K = get<2>(problem_shape_MNKL); + auto L = get<3>(problem_shape_MNKL); + + // Preconditions + static_assert(cute::rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>."); + static_assert(cute::rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>."); + static_assert(cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>."); + static_assert(cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>."); + + // Get the appropriate blocks for this sub_group -- potential for sub_group locality + int thread_idx = int(ThreadIdxX()); + auto blk_shape = TileShape{}; + int m_coord, n_coord, l_coord; + if (params.scheduler.raster_order_ == TileScheduler::RasterOrder::AlongN) { + m_coord = BlockIdxY(); + n_coord = BlockIdxX(); + l_coord = BlockIdxZ(); + } else { + m_coord = BlockIdxX(); + n_coord = BlockIdxY(); + l_coord = BlockIdxZ(); + } + + auto blk_coord_mnkl = make_coord(m_coord, n_coord, _, l_coord); + constexpr auto workgroup_shape = WorkgroupTileShape{}; // (SUB_M,SUB_N,SUB_K) + constexpr auto subgroup_shape = SubgroupTileShape{}; + + Tensor mA_mkl = cute::get_xe_tensor(make_shape(M,K,L)); //(m,k,l) + Tensor mB_nkl = cute::get_xe_tensor(make_shape(N,K,L)); //(n,k,l) + + Tensor gA = local_tile(mA_mkl, select<0,2>(blk_shape), make_coord(m_coord,_,l_coord)); + Tensor gB = local_tile(mB_nkl, select<1,2>(blk_shape), make_coord(n_coord,_,l_coord)); + + // Allocate the tiled_mma and the accumulators for the (M,N) subgroup_shape + TiledMma tiled_mma; + + Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape)); + clear(accumulators); + + auto k_tile_iter = cute::make_coord_iterator(idx2crd(0, make_shape(K)), make_shape(K)); + int k_tile_count = ceil_div(K, get<2>(workgroup_shape)); + + // Perform the collective scoped MMA + CollectiveMainloop collective_mma; + collective_mma( + accumulators, + gA, + gB, + accumulators, + k_tile_iter, k_tile_count, + blk_coord_mnkl, // TODO(codeplay): Remove this once unneeded in xe_mma_mixed_input.hpp + K, + thread_idx, + params.mainloop + ); + + CollectiveEpilogue epilogue{params.epilogue, shared_storage.epilogue}; + epilogue( + problem_shape_MNKL, + subgroup_shape, // TODO(codeplay): Inconsistency here w/ blk_coord_mnkl + blk_coord_mnkl, + accumulators, + tiled_mma, + thread_idx + ); + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::kernel \ No newline at end of file From f210ba363a1208cc031b305976e5d5c9930126df Mon Sep 17 00:00:00 2001 From: Anamika Chatterjee Date: Tue, 30 Sep 2025 15:42:09 +0300 Subject: [PATCH 04/15] delete unwanted file --- .vscode/settings.json | 24 ------------------------ 1 file changed, 24 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 43b3531021..0000000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - 
"files.associations": { - "compare": "cpp", - "string": "cpp", - "unordered_map": "cpp", - "vector": "cpp", - "exception": "cpp", - "memory": "cpp", - "memory_resource": "cpp", - "string_view": "cpp", - "random": "cpp", - "initializer_list": "cpp", - "istream": "cpp", - "new": "cpp", - "ostream": "cpp", - "sstream": "cpp", - "stdexcept": "cpp", - "streambuf": "cpp", - "system_error": "cpp", - "tuple": "cpp", - "type_traits": "cpp", - "typeinfo": "cpp" - } -} \ No newline at end of file From 5f5a8b789523d94557d98aca076afd2bfd024a63 Mon Sep 17 00:00:00 2001 From: Anamika Chatterjee Date: Wed, 1 Oct 2025 12:22:31 +0300 Subject: [PATCH 05/15] Changes added based on feedback --- examples/00_bmg_gemm/00_bmg_gemm.cpp | 64 ++----------------- .../gemm/collective/collective_mma_decl.hpp | 20 ------ include/cutlass/gemm/collective/xe_mma.hpp | 8 +-- include/cutlass/gemm/dispatch_policy.hpp | 14 ++++ .../cutlass/gemm/kernel/gemm_universal.hpp | 1 - .../cutlass/gemm/kernel/gemm_universal_decl.h | 9 --- include/cutlass/gemm/kernel/xe_gemm.hpp | 4 +- 7 files changed, 26 insertions(+), 94 deletions(-) diff --git a/examples/00_bmg_gemm/00_bmg_gemm.cpp b/examples/00_bmg_gemm/00_bmg_gemm.cpp index 312dcd23d0..ada885eb4f 100644 --- a/examples/00_bmg_gemm/00_bmg_gemm.cpp +++ b/examples/00_bmg_gemm/00_bmg_gemm.cpp @@ -81,58 +81,6 @@ using namespace cute; -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Helper template to check if a type is complete -template -struct is_complete : std::false_type {}; - -template -struct is_complete : std::true_type {}; - -template -static constexpr bool is_complete_v = is_complete::value; - - -template -auto -choose_mma_op() -{ - if constexpr (is_complete_v>) - return XE_DPAS_TT<8, TC, TA, TB>{}; - else if constexpr (is_same_v, cute::bfloat16_t>) - return XE_DPAS_TT<8, float, cute::bfloat16_t>{}; - else /* Use f16 by default as upconversion sequences are typically faster */ - return XE_DPAS_TT<8, float, cute::half_t>{}; -} - -// Helper function to choose tiled MMA based on tensor properties -template -auto -choose_tiled_mma() -{ - - auto op = choose_mma_op(); - - constexpr bool byte = (cute::max(sizeof_bits_v, sizeof_bits_v) <= 8); - - constexpr bool is_A_transposed = std::is_same_v; - constexpr bool is_B_transposed = std::is_same_v; - constexpr bool use_1x_dpas_per_k = is_A_transposed || (byte && is_B_transposed); - - - using _K = conditional_t, C>; - - using WGTile = Shape<_256, _256, _K>; // 256x256 WG tile size - using SGLayout = Layout, Stride<_4, _1, _0>>; // 8x4 SG tiling, n-major - - using MMA = typename TiledMMAHelper, Layout, SGLayout>::TiledMMA; - - return MMA{}; -} - - /////////////////////////////////////////////////////////////////////////////////////////////////// // Command line options parsing @@ -401,13 +349,13 @@ int main(int argc, const char** argv) using GmemTiledCopyA = XE_2D_U16x32x32_LD_N; using GmemTiledCopyB = XE_2D_U16x32x32_LD_V; - // Workgroup-level tile - using TiledMma = decltype(choose_tiled_mma()); - using TileShape = decltype(TiledMma{}.tile_mnk()); + // New MMA atom XE_DPAS_TT using workgroup-level tile shape of 256×256×32 + using TileShape = Shape<_256, _256, _32>; + using TiledMma = typename TiledMMAHelper>, Layout, Layout, Stride<_4, _1, _0>>>::TiledMMA; // For Intel BMG, PipelineStages defines how many k-blocks ahead to prefetch from A and B. 
constexpr int PipelineStages = 2; - using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelXeXMX16; + using GEMMDispatchPolicy = cutlass::gemm::MainloopXeL1Staged; using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeXMX16; // This is the 'default' epilogue operation (Linear Combination) which performs everything in: @@ -437,7 +385,7 @@ int main(int argc, const char** argv) void, void>; // GEMM Mainloop - iteration over blocks in K dimension - using CollectiveMainloop = cutlass::gemm::collective::CollectiveMmaNew< + using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< GEMMDispatchPolicy, TileShape, ElementInputA, @@ -450,7 +398,7 @@ int main(int argc, const char** argv) >; // Define the whole kernel (mainloop and epilogue) - using GemmKernel = cutlass::gemm::kernel::GemmUniversalNew< + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< Shape, // Defer global problem shape definition to runtime CollectiveMainloop, CollectiveEpilogue diff --git a/include/cutlass/gemm/collective/collective_mma_decl.hpp b/include/cutlass/gemm/collective/collective_mma_decl.hpp index a8d2572a71..a2faa1ff28 100644 --- a/include/cutlass/gemm/collective/collective_mma_decl.hpp +++ b/include/cutlass/gemm/collective/collective_mma_decl.hpp @@ -58,26 +58,6 @@ struct CollectiveMma { static_assert(cutlass::detail::dependent_false, "Could not find a mainloop specialization."); }; -template < - class DispatchPolicy, - class TileShape, - class ElementA, - class StrideA, - class ElementB, - class StrideB, - class TiledMma, - class GmemTiledCopyA, - class SmemLayoutAtomA, - class SmemCopyAtomA, - class TransformA, - class GmemTiledCopyB, - class SmemLayoutAtomB, - class SmemCopyAtomB, - class TransformB -> -struct CollectiveMmaNew { - static_assert(cutlass::detail::dependent_false, "Could not find a mainloop specialization."); -}; ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace cutlass::gemm::collective diff --git a/include/cutlass/gemm/collective/xe_mma.hpp b/include/cutlass/gemm/collective/xe_mma.hpp index 668a088939..8df8945a7e 100644 --- a/include/cutlass/gemm/collective/xe_mma.hpp +++ b/include/cutlass/gemm/collective/xe_mma.hpp @@ -47,13 +47,13 @@ using namespace cute; template -struct CollectiveMmaNew, TileShape_, ElementA_, StrideA_, ElementB_, StrideB_, TiledMma_, +struct CollectiveMma, TileShape_, ElementA_, StrideA_, ElementB_, StrideB_, TiledMma_, GmemTiledCopyA_, SmemLayoutAtomA_, SmemCopyAtomA_, TransformA_, GmemTiledCopyB_, SmemLayoutAtomB_, SmemCopyAtomB_, TransformB_> { // // Type Aliases // - using DispatchPolicy = MainloopIntelXeXMX16; + using DispatchPolicy = MainloopXeL1Staged; using WorkgroupTileShape = TileShape_; using ElementA = ElementA_; using StrideA = StrideA_; @@ -71,7 +71,7 @@ struct CollectiveMmaNew, TileShape_, Elem using TransformB = TransformB_; using ArchTag = typename DispatchPolicy::ArchTag; - static_assert(platform::is_same::value, "MainloopIntelXeXMX16 requires that A and B have same type."); + static_assert(platform::is_same::value, "MainloopXeL1Staged requires that A and B have same type."); static_assert(std::is_same_v, "Transformation for A is not currently supported on Intel PVC"); static_assert(std::is_same_v, "Transformation for B is not currently supported on Intel PVC"); @@ -120,7 +120,7 @@ struct CollectiveMmaNew, TileShape_, Elem // Methods // - CollectiveMmaNew() = default; + CollectiveMma() = default; template static constexpr Params diff --git 
a/include/cutlass/gemm/dispatch_policy.hpp b/include/cutlass/gemm/dispatch_policy.hpp
index b742dfd76f..423e14d10f 100644
--- a/include/cutlass/gemm/dispatch_policy.hpp
+++ b/include/cutlass/gemm/dispatch_policy.hpp
@@ -1247,6 +1247,20 @@ struct MainloopDeviceAgnostic {
   using Schedule = KernelMultistage;
 };
 #endif
+
+#if defined(CUTLASS_ENABLE_SYCL)
+// Note: This dispatch policy is specifically added for CollectiveMma to support
+// the integration of new MMA atoms (XE_DPAS_TT) and copy atoms for the Intel XE architecture
+template
+struct MainloopXeL1Staged {
+  constexpr static int Stages = Stages_;
+  constexpr static int SubgroupSize = 16;
+  using ArchTag = arch::IntelXe;
+  using Schedule = KernelSchedule;
+  using ClusterShape = Shape<_1,_1,_1>;
+};
+#endif
+
 // n-buffer in smem, pipelined with Blackwell UMMA and TMA, Warp specialized dynamic schedule
 template<
   int LoadABPipelineStageCount_,
diff --git a/include/cutlass/gemm/kernel/gemm_universal.hpp b/include/cutlass/gemm/kernel/gemm_universal.hpp
index 3f6fa9696a..69137d2114 100644
--- a/include/cutlass/gemm/kernel/gemm_universal.hpp
+++ b/include/cutlass/gemm/kernel/gemm_universal.hpp
@@ -78,7 +78,6 @@ struct IsCutlass3ArrayKernel
 class GemmUniversal;
-template <
-  class ProblemShapeOrThreadblockMma_, // (m, n, k) or (m, n, k, l)
-  class CollectiveMainloopOrEpilogue_,
-  class CollectiveEpilogueOrThreadblockSwizzle_,
-  class TileScheduler_ = void,
-  class Enable = void
->
-class GemmUniversalNew;
-
 } // namespace cutlass::gemm::kernel
diff --git a/include/cutlass/gemm/kernel/xe_gemm.hpp b/include/cutlass/gemm/kernel/xe_gemm.hpp
index 63d4467cd4..7a13d0fc1f 100644
--- a/include/cutlass/gemm/kernel/xe_gemm.hpp
+++ b/include/cutlass/gemm/kernel/xe_gemm.hpp
@@ -47,7 +47,7 @@ template <
   class CollectiveEpilogue_,
   class TileScheduler_
 >
-class GemmUniversalNew<
+class GemmUniversal<
   ProblemShape_,
   CollectiveMainloop_,
   CollectiveEpilogue_,
@@ -239,7 +239,7 @@ class GemmUniversalNew<
     constexpr auto subgroup_shape = SubgroupTileShape{};
 
     Tensor cA = make_identity_tensor(make_shape(M,K,L));   // (M,K,L)
-    Tensor cB = make_identity_tensor(make_shape(M,K,L));   // (N,K,L)
+    Tensor cB = make_identity_tensor(make_shape(N,K,L));   // (N,K,L)
 
     Tensor gA = local_tile(cA, select<0,2>(blk_shape), make_coord(m_coord,_,l_coord));
     Tensor gB = local_tile(cB, select<1,2>(blk_shape), make_coord(n_coord,_,l_coord));

From c55ac28007df91d94b2037c25e642578d0ef5610 Mon Sep 17 00:00:00 2001
From: Anamika Chatterjee
Date: Wed, 1 Oct 2025 12:29:23 +0300
Subject: [PATCH 06/15] Remove xe_gemm_legacy as it is no longer used

---
 .../cutlass/gemm/kernel/xe_gemm_legacy.hpp | 284 ------------------
 1 file changed, 284 deletions(-)
 delete mode 100644 include/cutlass/gemm/kernel/xe_gemm_legacy.hpp

diff --git a/include/cutlass/gemm/kernel/xe_gemm_legacy.hpp b/include/cutlass/gemm/kernel/xe_gemm_legacy.hpp
deleted file mode 100644
index 04e6ecfc99..0000000000
--- a/include/cutlass/gemm/kernel/xe_gemm_legacy.hpp
+++ /dev/null
@@ -1,284 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2.
Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -#pragma once - -#include "cutlass/cutlass.h" -#include "cutlass/kernel_hardware_info.hpp" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/dispatch_policy.hpp" - -#include "cute/tensor.hpp" - -namespace cutlass::gemm::kernel { - -/////////////////////////////////////////////////////////////////////////////// - -template < - class ProblemShape_, - class CollectiveMainloop_, - class CollectiveEpilogue_, - class TileScheduler_ -> -class GemmUniversal< - ProblemShape_, - CollectiveMainloop_, - CollectiveEpilogue_, - TileScheduler_, - cute::enable_if_t>> -{ -public: - // - // Type Aliases - // - using ProblemShape = ProblemShape_; - - static_assert(rank(ProblemShape{}) == 3 or rank(ProblemShape{}) == 4, - "ProblemShape{} should be or "); - - // Mainloop derived types - using CollectiveMainloop = CollectiveMainloop_; - using TileShape = typename CollectiveMainloop::WorkgroupTileShape; - using WorkgroupTileShape = TileShape; - using TiledMma = typename CollectiveMainloop::TiledMma; - using ArchTag = typename CollectiveMainloop::ArchTag; - using ElementA = typename CollectiveMainloop::ElementA; - using StrideA = typename CollectiveMainloop::StrideA; - using ElementB = typename CollectiveMainloop::ElementB; - using StrideB = typename CollectiveMainloop::StrideB; - using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy; - using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator; - using MainloopArguments = typename CollectiveMainloop::Arguments; - using ClusterShape = typename DispatchPolicy::ClusterShape; - using MainloopParams = typename CollectiveMainloop::Params; - - static_assert(cute::is_void_v or cute::is_same_v, - "Intel Xe does not support specializing the tile scheduler."); - using TileSchedulerTag = TileScheduler_; - using TileScheduler = typename detail::TileSchedulerSelector< - TileScheduler_, ArchTag, WorkgroupTileShape, - cute::Shape, cute::Int<1>, cute::Int<1>>>::Scheduler; - using TileSchedulerArguments = typename TileScheduler::Arguments; - using TileSchedulerParams = typename TileScheduler::Params; - - // Epilogue derived types - using CollectiveEpilogue = CollectiveEpilogue_; - using ElementC = typename CollectiveEpilogue::ElementC; - using 
StrideC = typename CollectiveEpilogue::StrideC; - using ElementD = typename CollectiveEpilogue::ElementD; - using StrideD = typename CollectiveEpilogue::StrideD; - using EpilogueArguments = typename CollectiveEpilogue::Arguments; - using EpilogueParams = typename CollectiveEpilogue::Params; - static_assert(cute::is_same_v, - "Mainloop and epilogue do not agree on accumulator value type."); - - // MSVC requires the cast to fix a warning-as-error. - static constexpr int SharedStorageSize = 0; - - static constexpr int SubgroupSize = CollectiveMainloop::SubgroupSize; // sub_group size - static constexpr uint32_t MaxThreadsPerBlock = CollectiveMainloop::MaxThreadsPerBlock; - using MmaAtomShape = typename CollectiveMainloop::MmaAtomShape; - using SubgroupTileShape = typename CollectiveMainloop::SubgroupTileShape; - - // Kernel level shared memory storage - struct SharedStorage { - using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage; - EpilogueTensorStorage epilogue; - }; - - // Device side arguments - struct Arguments { - GemmUniversalMode mode{}; - ProblemShape problem_shape{}; - MainloopArguments mainloop{}; - EpilogueArguments epilogue{}; - KernelHardwareInfo hw_info{}; - TileSchedulerArguments scheduler{}; - }; - - // Kernel entry point API - struct Params { - GemmUniversalMode mode{}; - ProblemShape problem_shape{}; - MainloopParams mainloop{}; - EpilogueParams epilogue{}; - KernelHardwareInfo hw_info{}; - TileSchedulerParams scheduler{}; - }; - - // - // Methods - // - - // Convert to underlying arguments. In this case, a simple copy for the aliased type. - static - Params - to_underlying_arguments(Arguments const& args, void* workspace) { - (void) workspace; - auto problem_shape_MNKL = append<4>(args.problem_shape, 1); - - auto mainloop_args = CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, workspace); - TileSchedulerParams scheduler = TileScheduler::to_underlying_arguments( - problem_shape_MNKL, TileShape{}, ClusterShape{}, args.hw_info, args.scheduler, &workspace); - return { - args.mode, - args.problem_shape, - mainloop_args, - CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, workspace), - args.hw_info, - scheduler - }; - } - - static bool - can_implement(Arguments const& args) { - bool implementable = true; - - implementable = implementable && (args.mode == GemmUniversalMode::kGemm || - (args.mode == GemmUniversalMode::kBatched && rank(ProblemShape{}) == 4)); - - implementable &= TileScheduler::can_implement(args.scheduler); - - implementable &= CollectiveMainloop::can_implement(args.problem_shape, args.mainloop); - implementable &= CollectiveEpilogue::can_implement(args.problem_shape, args.epilogue); - - return implementable; - } - - static int - get_workspace_size(Arguments const& args) { - return 0; - } - - static - cutlass::Status - initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr, - CudaHostAdapter* cuda_adapter = nullptr) { - return Status::kSuccess; - } - - static dim3 - get_grid_shape(Params const& params) { - dim3 grid = TileScheduler::get_tiled_cta_shape_mnl(params.problem_shape, TileShape{}, ClusterShape{}); - if(params.scheduler.raster_order_ == TileScheduler::RasterOrder::AlongN) { - return {grid.y, grid.x, grid.z}; - } else { - return {grid.x, grid.y, grid.z}; - } - } - - static dim3 - get_block_shape() { - return dim3(MaxThreadsPerBlock, 1, 1); - } - - CUTLASS_DEVICE - void - operator()(Params const& params, char* smem_buf) { - 
SharedStorage& shared_storage = *reinterpret_cast(smem_buf); - // Preconditions - CUTE_STATIC_ASSERT(is_static::value); - - // Separate out problem shape for convenience - // Optionally append 1s until problem shape is rank-4 in case its is only rank-3 (MNK) - auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{}); - auto M = get<0>(problem_shape_MNKL); - auto N = get<1>(problem_shape_MNKL); - auto K = get<2>(problem_shape_MNKL); - auto L = get<3>(problem_shape_MNKL); - - // Preconditions - static_assert(cute::rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>."); - static_assert(cute::rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>."); - static_assert(cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>."); - static_assert(cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>."); - - // Get the appropriate blocks for this sub_group -- potential for sub_group locality - int thread_idx = int(ThreadIdxX()); - auto blk_shape = TileShape{}; - int m_coord, n_coord, l_coord; - if (params.scheduler.raster_order_ == TileScheduler::RasterOrder::AlongN) { - m_coord = BlockIdxY(); - n_coord = BlockIdxX(); - l_coord = BlockIdxZ(); - } else { - m_coord = BlockIdxX(); - n_coord = BlockIdxY(); - l_coord = BlockIdxZ(); - } - - auto blk_coord_mnkl = make_coord(m_coord, n_coord, _, l_coord); - constexpr auto workgroup_shape = WorkgroupTileShape{}; // (SUB_M,SUB_N,SUB_K) - constexpr auto subgroup_shape = SubgroupTileShape{}; - - Tensor mA_mkl = cute::get_xe_tensor(make_shape(M,K,L)); //(m,k,l) - Tensor mB_nkl = cute::get_xe_tensor(make_shape(N,K,L)); //(n,k,l) - - Tensor gA = local_tile(mA_mkl, select<0,2>(blk_shape), make_coord(m_coord,_,l_coord)); - Tensor gB = local_tile(mB_nkl, select<1,2>(blk_shape), make_coord(n_coord,_,l_coord)); - - // Allocate the tiled_mma and the accumulators for the (M,N) subgroup_shape - TiledMma tiled_mma; - - Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape)); - clear(accumulators); - - auto k_tile_iter = cute::make_coord_iterator(idx2crd(0, make_shape(K)), make_shape(K)); - int k_tile_count = ceil_div(K, get<2>(workgroup_shape)); - - // Perform the collective scoped MMA - CollectiveMainloop collective_mma; - collective_mma( - accumulators, - gA, - gB, - accumulators, - k_tile_iter, k_tile_count, - blk_coord_mnkl, // TODO(codeplay): Remove this once unneeded in xe_mma_mixed_input.hpp - K, - thread_idx, - params.mainloop - ); - - CollectiveEpilogue epilogue{params.epilogue, shared_storage.epilogue}; - epilogue( - problem_shape_MNKL, - subgroup_shape, // TODO(codeplay): Inconsistency here w/ blk_coord_mnkl - blk_coord_mnkl, - accumulators, - tiled_mma, - thread_idx - ); - } -}; - -/////////////////////////////////////////////////////////////////////////////// - -} // namespace cutlass::gemm::kernel \ No newline at end of file From 946b46cccd2eac3c3259a9364e2be9e37ab7817e Mon Sep 17 00:00:00 2001 From: Anamika Chatterjee Date: Fri, 3 Oct 2025 17:16:11 +0300 Subject: [PATCH 07/15] Changes added based on feedback --- examples/00_bmg_gemm/00_bmg_gemm.cpp | 30 +- examples/00_bmg_gemm/00_bmg_gemm_legacy.cpp | 429 -------------------- examples/00_bmg_gemm/CMakeLists.txt | 9 - include/cutlass/gemm/collective/xe_mma.hpp | 64 ++- 4 files changed, 70 insertions(+), 462 deletions(-) delete mode 100644 
examples/00_bmg_gemm/00_bmg_gemm_legacy.cpp diff --git a/examples/00_bmg_gemm/00_bmg_gemm.cpp b/examples/00_bmg_gemm/00_bmg_gemm.cpp index ada885eb4f..cf64eb3113 100644 --- a/examples/00_bmg_gemm/00_bmg_gemm.cpp +++ b/examples/00_bmg_gemm/00_bmg_gemm.cpp @@ -345,12 +345,19 @@ int main(int argc, const char** argv) using LayoutC = cutlass::layout::RowMajor; using LayoutD = cutlass::layout::RowMajor; - // The 2D block copy operations used for the A and B matrices - using GmemTiledCopyA = XE_2D_U16x32x32_LD_N; - using GmemTiledCopyB = XE_2D_U16x32x32_LD_V; - - // New MMA atom XE_DPAS_TT using workgroup-level tile shape of 256×256×32 - using TileShape = Shape<_256, _256, _32>; + // Workgroup-level tile + using TileShape = Shape<_256, _256, _32>; + + // A TiledMMA struct defines a tiling of an MMA atom over M, N and K, combining both additional + // hardware (sub-groups for Intel BMG) and iterations by each sub-group. + // + // The TiledMMAHelper struct defines a specific TiledMMA for a given MMA atom. This example uses + // the XE_DPAS_TT<8, float, cute::bfloat16_t> atom, which represents an 8x16x16 DPAS operation with float32 accumulation and bfloat16 inputs, TileShape (<256, 256, 32>) and sub-group layout (8x4x1). + // The TiledMMA constructed using TiledMMAHelper has the property that each sub-group operates on a + // single contiguous chunk of the work-group TileShape. For this configuration, this implies that + // each sub-group operates on a contiguous 32x64x32 chunk (4x4x2 iterations). See + // 0t_mma_atom.md#TiledMMAs for more info. Sub-groups are arranged row-major (stride 4,1,0) for + // performance reasons. using TiledMma = typename TiledMMAHelper>, Layout, Layout, Stride<_4, _1, _0>>>::TiledMMA; // For Intel BMG, PipelineStages defines how many k-blocks ahead to prefetch from A and B. @@ -385,6 +392,13 @@ int main(int argc, const char** argv) void, void>; // GEMM Mainloop - iteration over blocks in K dimension + // + // Copy operations for A and B matrices: + // - Use 'void' (as shown below) to automatically select new 2D block copy operations + // - To use legacy copy operations, replace 'void' with specific copy atoms, e.g.: + // using GmemTiledCopyA = XE_2D_U16x32x32_LD_N; + // using GmemTiledCopyB = XE_2D_U16x32x32_LD_V; + // Then replace the first 'void' with GmemTiledCopyA and fifth 'void' with GmemTiledCopyB using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< GEMMDispatchPolicy, TileShape, @@ -393,8 +407,8 @@ int main(int argc, const char** argv) ElementInputB, cutlass::gemm::TagToStrideB_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation TiledMma, - GmemTiledCopyA, void, void, cute::identity, // A - GmemTiledCopyB, void, void, cute::identity // B + void, void, void, cute::identity, // A + void, void, void, cute::identity // B >; // Define the whole kernel (mainloop and epilogue) diff --git a/examples/00_bmg_gemm/00_bmg_gemm_legacy.cpp b/examples/00_bmg_gemm/00_bmg_gemm_legacy.cpp deleted file mode 100644 index 7e9291227e..0000000000 --- a/examples/00_bmg_gemm/00_bmg_gemm_legacy.cpp +++ /dev/null @@ -1,429 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. - * Copyright (C) 2025 Intel Corporation, All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. 
Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief CUTLASS Intel BMG Gemm Example. - - This example constructs and executes a simple CUTLASS GEMM kernel on Intel BMG hardware, and - verifies its correctness with a reference implementation - (cutlass::reference::device::GemmComplex). The example also provides a performance measurement - for the GEMM in TFLOPS. - - This example makes use of BMGs subgroup cooperative 2d-block copy operations and DPAS instructions. - - The shapes of the A and B matrix are defined at runtime by `options.m`, `.n` and `.k`, and the - batch size is defined by `options.l`. The tile shape, which defines how much work is executed by - a single work-group, is defined at compile time by: - ``` - using TileShape = Shape<_256, _256, _32>; - ``` - That is, each work-group processes a tile of M=256, N=256, and iterates over `options.k` in - blocks of K=32. - - Performance of GEMM on BMG is heavily dependent on prefetching the A and B matrices. That is, - executing Intel specific prefetch instructions for future iterations to ensure that the required - blocks of A and B are resident in cache before they are needed. 
- - To build & run this example (from your build dir): - - $ ninja 00_bmg_gemm - $ ./examples/sycl/00_bmg_gemm/00_bmg_gemm - - Call with `--help` for information about available options -*/ - -#include "cutlass/epilogue/collective/default_epilogue.hpp" -#include "cutlass/epilogue/collective/xe_epilogue.hpp" -#include "cutlass/epilogue/fusion/xe_callbacks.hpp" -#include "cutlass/gemm/device/gemm_universal.h" -#include "cutlass/gemm/device/gemm_universal_adapter.h" -#include "cutlass/gemm/collective/collective_mma.hpp" -#include "cutlass/util/GPU_Clock.hpp" - -#include -#include - -#include "cutlass/util/command_line.h" -#include "cutlass/util/device_memory.h" -#include "cutlass/util/packed_stride.hpp" -#include "cutlass/util/reference/device/gemm_complex.h" -#include "cutlass/util/reference/device/tensor_compare.h" -#include "sycl_common.hpp" -#include "helper.h" - -using namespace cute; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Command line options parsing -struct Options { - - bool help; - bool error; - - int m, n, k, l, iterations; - float alpha, beta; - - Options(): - help(false), - error(false), - m(5120), n(4096), k(4096), l(1), iterations(20), - alpha(1.f), beta(0.f) - { } - - // Parses the command line - void parse(int argc, char const **args) { - cutlass::CommandLine cmd(argc, args); - - if (cmd.check_cmd_line_flag("help")) { - help = true; - return; - } - - cmd.get_cmd_line_argument("m", m, 5120); - cmd.get_cmd_line_argument("n", n, 4096); - cmd.get_cmd_line_argument("k", k, 4096); - cmd.get_cmd_line_argument("l", l, 1); - cmd.get_cmd_line_argument("alpha", alpha, 1.f); - cmd.get_cmd_line_argument("beta", beta, 0.f); - cmd.get_cmd_line_argument("iterations", iterations, 100); - } - - /// Prints the usage statement. 
- std::ostream & print_usage(std::ostream &out) const { - - out << "BMG GEMM Example\n\n" - << "Options:\n\n" - << " --help If specified, displays this usage statement\n\n" - << " --m= Sets the M extent of the GEMM\n" - << " --n= Sets the N extent of the GEMM\n" - << " --k= Sets the K extent of the GEMM\n" - << " --l= Sets the L extent (batch count) of the GEMM\n" - << " --alpha= Epilogue scalar alpha\n" - << " --beta= Epilogue scalar beta\n\n" - << " --iterations= Iterations\n\n"; - - return out; - } -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -template < - class Gemm -> -struct ExampleRunner { - - using StrideA = typename Gemm::GemmKernel::StrideA; - using StrideB = typename Gemm::GemmKernel::StrideB; - using StrideC = typename Gemm::GemmKernel::StrideC; - using StrideD = typename Gemm::GemmKernel::StrideD; - - using LayoutA = typename Gemm::LayoutA; - using LayoutB = typename Gemm::LayoutB; - using LayoutC = typename Gemm::LayoutC; - using LayoutD = typename Gemm::LayoutD; - - using ElementA = typename Gemm::ElementA; - using ElementB = typename Gemm::ElementB; - using ElementAcc = typename Gemm::ElementAccumulator; - - using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; - using ElementC = typename Gemm::ElementC; - using ElementOutput = typename CollectiveEpilogue::ElementOutput; - using ElementCompute = typename CollectiveEpilogue::ElementCompute; - using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator; - - using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; - - // - // Data members - // - - /// Initialization - StrideA stride_A; - StrideB stride_B; - StrideC stride_C; - StrideD stride_D; - uint64_t seed = 0; - - cutlass::DeviceAllocation block_A; - cutlass::DeviceAllocation block_B; - cutlass::DeviceAllocation block_C; - cutlass::DeviceAllocation block_D; - cutlass::DeviceAllocation block_ref_D; // Reference GEMM result for verification - - // - // Methods - // - - bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { - auto [M, N, K, L] = problem_size; - - cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K})); - cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N})); - cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N})); - cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); - - cutlass::reference::device::GemmComplex( - {M, N, K}, - alpha, - ref_A, - cutlass::ComplexTransform::kNone, - ref_B, - cutlass::ComplexTransform::kNone, - beta, - ref_C, - ref_D, - ElementAccumulator(0), - L, // batch_count - M * K, // batch_stride_A - K * N, // batch_stride_B - M * N, // batch_stride_C - M * N // batch_stride_D - ); - - // CUTLASS on SYCL uses the compatibility library compat for e.g. 
default in-order queue - compat::wait(); - - // Check if output from CUTLASS kernel and reference kernel are equal or not - bool passed = cutlass::reference::device::BlockCompareEqual( - block_ref_D.get(), block_D.get(), block_D.size()); - - return passed; - } - - /// Initialize operands to be used in the GEMM and reference GEMM - void initialize(const ProblemShapeType& problem_size) { - auto problem_shape_MNKL = cute::append<4>(problem_size, 1); - auto [M, N, K, L] = problem_shape_MNKL; - - // Complete the stride by combining static layout info (StrideA) with runtime size info (M,K,L) - stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); - stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); - stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); - stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - - block_A.reset(static_cast(M) * K * L); - block_B.reset(static_cast(K) * N * L); - block_C.reset(static_cast(M) * N * L); - block_D.reset(static_cast(M) * N * L); - block_ref_D.reset(static_cast(M) * N * L); - - initialize_block(block_A, seed + 2023); - initialize_block(block_B, seed + 2022); - initialize_block(block_C, seed + 2021); - } - - cutlass::Status run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { - ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; - - initialize(problem_size); - - typename Gemm::GemmKernel::Arguments arguments{ - cutlass::gemm::GemmUniversalMode::kGemm, - problem_size, - {block_A.get(), stride_A, block_B.get(), stride_B}, - {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, - hw_info - }; - - Gemm gemm_op; - - size_t workspace_size = Gemm::get_workspace_size(arguments); - cutlass::device_memory::allocation workspace(workspace_size); - - if (gemm_op.can_implement(arguments) != cutlass::Status::kSuccess){ - std::cout << "Invalid Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; - std::exit(1); - } - - CUTLASS_CHECK(gemm_op.initialize(arguments, workspace.get())); - - // Run the GEMM - CUTLASS_CHECK(gemm_op.run()); - - compat::wait(); - - // Verify that the result is correct - bool passed = verify(problem_size, options.alpha, options.beta); - std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl; - - if(!passed) return cutlass::Status::kErrorInternal; - - if (options.iterations > 0) { - GPU_Clock timer; - timer.start(); - for (int i = 0; i < options.iterations; ++i) { - gemm_op.run(); - } - compat::wait(); - - float cute_time = timer.seconds() / options.iterations; - double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; - std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; - printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000); - } - - return cutlass::Status::kSuccess; - } - -}; - -int main(int argc, const char** argv) -{ - // - // Parse options - // - - Options options; - - options.parse(argc, argv); - - if (options.help) { - options.print_usage(std::cout) << std::endl; - return 0; - } - - if (options.error) { - std::cerr << "Aborting execution." << std::endl; - return -1; - } - - // - // Run examples - // - - // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. 
This - // information is used by the underlying kernel. - cutlass::KernelHardwareInfo hw_info; - - // Change device_id to another value if you are running on a machine with multiple GPUs and wish - // to use a GPU other than that with device ID 0. - hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); - - bool passed; - - // The code section below describes datatype for input, output matrices and computation between - // elements in input matrices. - using ElementAccumulator = float; // <- data type of accumulator - using ElementComputeEpilogue = float; // <- data type of epilogue operations - using ElementInputA = bfloat16_t; // <- data type of elements in input matrix A - using ElementInputB = bfloat16_t; // <- data type of elements in input matrix B - using ElementOutput = float; // <- data type of elements in output matrix D - - using LayoutA = cutlass::layout::RowMajor; - using LayoutB = cutlass::layout::RowMajor; - using LayoutC = cutlass::layout::RowMajor; - using LayoutD = cutlass::layout::RowMajor; - - // The 2D block copy operations used for the A and B matrices - using GmemTiledCopyA = XE_2D_U16x32x32_LD_N; - using GmemTiledCopyB = XE_2D_U16x32x32_LD_V; - - // Workgroup-level tile - using TileShape = Shape<_256, _256, _32>; - - // A TiledMMA struct defines a tiling of an MMA atom over M, N and K, combining both additional - // hardware (sub-groups for Intel BMG) and iterations by each sub-group. - // - // The TiledMMAHelper struct defines a specific TiledMMA for a given MMA atom - // (XE_8x16x16_F32BF16BF16F32_TT), TileShape (<256, 256, 32>) and sub-group layout (8x4x1). The - // TiledMMA constructed using TiledMMAHelper has the property that each sub-group operates on a - // single contiguous chunk of the work-group TileShape. For this configuration, this implies that - // each sub-group operates on a contiguous 32x64x32 chunk (4x4x2 iterations). See - // 0t_mma_atom.md#TiledMMAs for more info. Sub-groups are arranged row-major (stride 4,1,0) for - // performance reasons. - using TiledMma = // M=8,N=16,K=16, D=f32,A=bf16,B=bf16,C=f32 - typename TiledMMAHelper, Layout, - Layout, Stride<_4, _1, _0>>>::TiledMMA; - - // For Intel BMG, PipelineStages defines how many k-blocks ahead to prefetch from A and B. - constexpr int PipelineStages = 2; - using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelXeXMX16; - using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeXMX16; - - // This is the 'default' epilogue operation (Linear Combination) which performs everything in: - // (D = alpha * (A*B) + beta * C) - // aside from the (A*B), which is handled by the GEMM. See 05_bmg_gemm_with_epilogues for more - // complex epilogue examples. - using EpilogueOp = cutlass::epilogue::fusion::LinearCombination; - - // FusionCallbacks ties the EpilogueOp to an implementation (based on the dispatch - // policy/architecture) and defines the epilogue arguments. 
- using FusionCallBacks = cutlass::epilogue::fusion::FusionCallbacks; - // GEMM Epilogue - loads & stores C/D matrices, performs epilogue operations & load/stores any - // auxiliary data required - using CollectiveEpilogue = cutlass::epilogue::collective::CollectiveEpilogue< - EpilogueDispatchPolicy, - TileShape, - ElementAccumulator, - cutlass::gemm::TagToStrideC_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation - ElementOutput, - cutlass::gemm::TagToStrideC_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation - FusionCallBacks, - XE_2D_U32x8x16_LD_N, // The copy atom used to load matrix C - void, void, - XE_2D_U32x8x16_ST_N, // The copy atom used to store matrix D - void, void>; - - // GEMM Mainloop - iteration over blocks in K dimension - using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< - GEMMDispatchPolicy, - TileShape, - ElementInputA, - cutlass::gemm::TagToStrideA_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation - ElementInputB, - cutlass::gemm::TagToStrideB_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation - TiledMma, - GmemTiledCopyA, void, void, cute::identity, // A - GmemTiledCopyB, void, void, cute::identity // B - >; - - // Define the whole kernel (mainloop and epilogue) - using GemmKernel = cutlass::gemm::kernel::GemmUniversal< - Shape, // Defer global problem shape definition to runtime - CollectiveMainloop, - CollectiveEpilogue - >; - - // The GemmUniversalAdapter wraps the defined GEMM kernel and handles the launch, and e.g. - // persistent scratch memory if required. - using Gemm = cutlass::gemm::device::GemmUniversalAdapter; - - ExampleRunner runner; - - CUTLASS_CHECK(runner.run(options, hw_info)); - - return 0; -} diff --git a/examples/00_bmg_gemm/CMakeLists.txt b/examples/00_bmg_gemm/CMakeLists.txt index 6564ad47a9..5bfc4a5e29 100644 --- a/examples/00_bmg_gemm/CMakeLists.txt +++ b/examples/00_bmg_gemm/CMakeLists.txt @@ -40,15 +40,6 @@ cutlass_example_add_executable( TEST_SMALL_SHAPE ) -cutlass_example_add_executable( - 00_bmg_gemm_legacy - 00_bmg_gemm_legacy.cpp - TEST_COMMAND_OPTIONS - TEST_BATCHES - TEST_LARGE - TEST_SMALL_SHAPE -) - set(TEST_SMALL_SHAPE_PADDABLE --m=1 --n=1 --k=2 --l=2) cutlass_example_add_executable( 00_bmg_gemm_padded diff --git a/include/cutlass/gemm/collective/xe_mma.hpp b/include/cutlass/gemm/collective/xe_mma.hpp index 8df8945a7e..9a55d5bc52 100644 --- a/include/cutlass/gemm/collective/xe_mma.hpp +++ b/include/cutlass/gemm/collective/xe_mma.hpp @@ -176,23 +176,42 @@ struct CollectiveMma, TileShape_, ElementA_ static_assert(is_rmem::value, "D tensor must be rmem resident."); static_assert(is_rmem::value, "C tensor must be rmem resident."); - auto mA_mkl = make_tensor(make_gmem_ptr(mainloop.ptr_A), - make_layout(make_shape(mainloop.M, mainloop.K, mainloop.L), cute::take<0,2>(mainloop.dA))); - auto mB_nkl = make_tensor(make_gmem_ptr(mainloop.ptr_B), - make_layout(make_shape(mainloop.N, mainloop.K, mainloop.L), cute::take<0,2>(mainloop.dB))); - auto copy_a = make_block_2d_copy_A(TiledMma{}, mA_mkl); - auto copy_b = make_block_2d_copy_B(TiledMma{}, mB_nkl); + auto copy_a = [&]() { + if constexpr (!std::is_void_v) { + // User provided copy operation - use full stride + auto mA_mkl = make_tensor(make_gmem_ptr(mainloop.ptr_A), + make_layout(make_shape(mainloop.M, mainloop.K, mainloop.L), mainloop.dA)); + using Copy_A = typename Copy_Traits::template DefaultTiledCopy; + return Copy_A{}.with(mA_mkl); + } else { + // Use new 2D copy operations with 2D stride + auto mA_mkl = 
make_tensor(make_gmem_ptr(mainloop.ptr_A), + make_layout(make_shape(mainloop.M, mainloop.K, mainloop.L), cute::take<0,2>(mainloop.dA))); + return make_block_2d_copy_A(TiledMma{}, mA_mkl); + } + }(); + + auto copy_b = [&]() { + if constexpr (!std::is_void_v) { + // User provided copy operation - use full stride + auto mB_nkl = make_tensor(make_gmem_ptr(mainloop.ptr_B), + make_layout(make_shape(mainloop.N, mainloop.K, mainloop.L), mainloop.dB)); + using Copy_B = typename Copy_Traits::template DefaultTiledCopy; + return Copy_B{}.with(mB_nkl); + } else { + // Use new 2D copy operations with 2D stride + auto mB_nkl = make_tensor(make_gmem_ptr(mainloop.ptr_B), + make_layout(make_shape(mainloop.N, mainloop.K, mainloop.L), cute::take<0,2>(mainloop.dB))); + return make_block_2d_copy_B(TiledMma{}, mB_nkl); + } + }(); auto thr_copy_a = copy_a.get_slice(thread_idx); auto thr_copy_b = copy_b.get_slice(thread_idx); // Instantiate the MMA object and get thread slice TiledMma tiled_mma; - // TODO(Codeplay): see if we can make this nicer - // To make all work items in a subgroup have the same global tensors pass in the index of work item 0 in each subgroup - auto sg = compat::get_nd_item<1>().get_sub_group(); - auto first_thread_in_sg_idx = sg.get_group_linear_id() * DispatchPolicy::SubgroupSize; - auto thr_mma = tiled_mma.get_slice(first_thread_in_sg_idx); + auto thr_mma = tiled_mma.get_slice(thread_idx); /* Register fragments for MMA */ auto tCrA = thr_mma.partition_sg_fragment_A(gA(_,_,0)); @@ -206,11 +225,24 @@ struct CollectiveMma, TileShape_, ElementA_ Tensor tAgA = thr_copy_a.partition_S(gA); Tensor tBgB = thr_copy_b.partition_S(gB); - /* Create prefetch TiledCopy instances */ - auto prefetch_a = make_block_2d_prefetch(copy_a); - auto prefetch_b = make_block_2d_prefetch(copy_b); - auto thr_prefetch_A = prefetch_a.get_slice(thread_idx); - auto thr_prefetch_B = prefetch_b.get_slice(thread_idx); + /* Create prefetch TiledCopy instances - different for legacy vs new copy operations */ + auto [prefetch_a, prefetch_b, thr_prefetch_A, thr_prefetch_B] = [&]() { + if constexpr (!std::is_void_v && !std::is_void_v) { + // Legacy copy operations - use prefetch_selector + auto tiled_prefetch_a = cute::prefetch_selector,Int>, Num_SGs>(copy_a); + auto tiled_prefetch_b = cute::prefetch_selector,Int>, Num_SGs>(copy_b); + auto thr_prefetch_A = tiled_prefetch_a.get_slice(thread_idx); + auto thr_prefetch_B = tiled_prefetch_b.get_slice(thread_idx); + return std::make_tuple(tiled_prefetch_a, tiled_prefetch_b, thr_prefetch_A, thr_prefetch_B); + } else { + // New 2D copy operations - use make_block_2d_prefetch + auto prefetch_a = make_block_2d_prefetch(copy_a); + auto prefetch_b = make_block_2d_prefetch(copy_b); + auto thr_prefetch_A = prefetch_a.get_slice(thread_idx); + auto thr_prefetch_B = prefetch_b.get_slice(thread_idx); + return std::make_tuple(prefetch_a, prefetch_b, thr_prefetch_A, thr_prefetch_B); + } + }(); /* Partition global tensor (proxies) for prefetch */ auto pAgA = thr_prefetch_A.partition_S(gA); From c97f011cab591b5f48d31bd9517ba45125f988a1 Mon Sep 17 00:00:00 2001 From: Anamika Chatterjee Date: Sat, 4 Oct 2025 17:52:43 +0300 Subject: [PATCH 08/15] Applied review comments --- examples/00_bmg_gemm/00_bmg_gemm.cpp | 15 ++++++--------- .../cutlass/gemm/collective/collective_mma.hpp | 2 +- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/examples/00_bmg_gemm/00_bmg_gemm.cpp b/examples/00_bmg_gemm/00_bmg_gemm.cpp index cf64eb3113..0f73fc525c 100644 --- a/examples/00_bmg_gemm/00_bmg_gemm.cpp +++ 
b/examples/00_bmg_gemm/00_bmg_gemm.cpp @@ -345,6 +345,10 @@ int main(int argc, const char** argv) using LayoutC = cutlass::layout::RowMajor; using LayoutD = cutlass::layout::RowMajor; + // [New Copy Atom] Automatically select 2D block copy operations used for the A and B matrices + using GmemTiledCopyA = void; // For older version of copy atom, use XE_2D_U16x32x32_LD_N + using GmemTiledCopyB = void; // For older version of copy atom, use XE_2D_U16x32x32_LD_V + // Workgroup-level tile using TileShape = Shape<_256, _256, _32>; @@ -392,13 +396,6 @@ int main(int argc, const char** argv) void, void>; // GEMM Mainloop - iteration over blocks in K dimension - // - // Copy operations for A and B matrices: - // - Use 'void' (as shown below) to automatically select new 2D block copy operations - // - To use legacy copy operations, replace 'void' with specific copy atoms, e.g.: - // using GmemTiledCopyA = XE_2D_U16x32x32_LD_N; - // using GmemTiledCopyB = XE_2D_U16x32x32_LD_V; - // Then replace the first 'void' with GmemTiledCopyA and fifth 'void' with GmemTiledCopyB using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< GEMMDispatchPolicy, TileShape, @@ -407,8 +404,8 @@ int main(int argc, const char** argv) ElementInputB, cutlass::gemm::TagToStrideB_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation TiledMma, - void, void, void, cute::identity, // A - void, void, void, cute::identity // B + GmemTiledCopyA, void, void, cute::identity, // A + GmemTiledCopyB, void, void, cute::identity // B >; // Define the whole kernel (mainloop and epilogue) diff --git a/include/cutlass/gemm/collective/collective_mma.hpp b/include/cutlass/gemm/collective/collective_mma.hpp index 3c02ca1efa..bb5e2017d8 100644 --- a/include/cutlass/gemm/collective/collective_mma.hpp +++ b/include/cutlass/gemm/collective/collective_mma.hpp @@ -77,8 +77,8 @@ #endif // !defined(__CUDACC_RTC__) #if defined(SYCL_INTEL_TARGET) -#include "cutlass/gemm/collective/xe_mma_legacy.hpp" #include "cutlass/gemm/collective/xe_mma.hpp" +#include "cutlass/gemm/collective/xe_mma_legacy.hpp" #include "cutlass/gemm/collective/xe_array_mma.hpp" #include "cutlass/gemm/collective/xe_array_mma_fp8.hpp" #include "cutlass/gemm/collective/xe_mma_mixed_input.hpp" From 9691e608e813f4875dda25d3454031907a4e4780 Mon Sep 17 00:00:00 2001 From: Anamika Chatterjee Date: Mon, 6 Oct 2025 12:42:30 +0300 Subject: [PATCH 09/15] Add compile-time checks to enforce new XE copy atoms in block 2D functions --- examples/00_bmg_gemm/00_bmg_gemm.cpp | 11 +++-- include/cute/atom/copy_traits_xe_2d.hpp | 36 +++++++++++++++ include/cutlass/gemm/collective/xe_mma.hpp | 54 ++++++++-------------- 3 files changed, 64 insertions(+), 37 deletions(-) diff --git a/examples/00_bmg_gemm/00_bmg_gemm.cpp b/examples/00_bmg_gemm/00_bmg_gemm.cpp index 0f73fc525c..0f7c665820 100644 --- a/examples/00_bmg_gemm/00_bmg_gemm.cpp +++ b/examples/00_bmg_gemm/00_bmg_gemm.cpp @@ -345,9 +345,13 @@ int main(int argc, const char** argv) using LayoutC = cutlass::layout::RowMajor; using LayoutD = cutlass::layout::RowMajor; - // [New Copy Atom] Automatically select 2D block copy operations used for the A and B matrices - using GmemTiledCopyA = void; // For older version of copy atom, use XE_2D_U16x32x32_LD_N - using GmemTiledCopyB = void; // For older version of copy atom, use XE_2D_U16x32x32_LD_V + // [New Copy Atom] When left unspecified (void), make_block_2d_copy_* automatically selects + // appropriate 2D block copy operations for matrices A and B. 
Alternatively, you can + // explicitly specify new copy atom operations such as XE_LOAD_2D, XE_LOAD_2D_VNNI + // (applicable only to matrix B), or XE_LOAD_2D_TRANSPOSE. + // Refer https://github.com/intel/sycl-tla/blob/petercad/rearchitecture/media/docs/cpp/xe_rearchitecture.md + using GmemTiledCopyA = void; //XE_LOAD_2D<16, 32, 32>; + using GmemTiledCopyB = void; //XE_LOAD_2D_VNNI<16, 32, 32>; // Workgroup-level tile using TileShape = Shape<_256, _256, _32>; @@ -366,6 +370,7 @@ int main(int argc, const char** argv) // For Intel BMG, PipelineStages defines how many k-blocks ahead to prefetch from A and B. constexpr int PipelineStages = 2; + // For older version of copy/mma atom, use cutlass::gemm::MainloopIntelXeXMX16 as dispatch policy using GEMMDispatchPolicy = cutlass::gemm::MainloopXeL1Staged; using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeXMX16; diff --git a/include/cute/atom/copy_traits_xe_2d.hpp b/include/cute/atom/copy_traits_xe_2d.hpp index 2df8ae0a38..d8f45207fc 100644 --- a/include/cute/atom/copy_traits_xe_2d.hpp +++ b/include/cute/atom/copy_traits_xe_2d.hpp @@ -756,6 +756,31 @@ make_block_2d_copy_X(CopyOp const& op, // Copy operation return make_block_2d_copy(op, gstride, x_mode, y_mode, atom_shape, sv_layout_t); } +// Helper trait to detect new XE copy ops +template +struct is_new_xe_atom : cute::false_type {}; + +// Helper trait specifically for XE_LOAD_2D_VNNI (for copy B) +template +struct is_new_xe_atom_vnni : cute::false_type {}; + +// Helper trait specifically for XE_STORE_2D (for copy C) +template +struct is_new_xe_atom_store : cute::false_type {}; + +// Check if T is an instantiation of XE_LOAD_2D +template +struct is_new_xe_atom> : cute::true_type {}; + +// Check if T is an instantiation of XE_LOAD_2D_TRANSPOSE +template +struct is_new_xe_atom> : cute::true_type {}; + +// Check if T is an instantiation of XE_LOAD_2D_VNNI +template +struct is_new_xe_atom_vnni> : cute::true_type {}; + + // MMA-focused TiledCopy creation functions. template CUTE_HOST_DEVICE @@ -774,6 +799,12 @@ make_block_2d_copy_A(CopyOp const& op, // Copy operation TiledMMA const& mma, // TiledMMA instance Tensor const& gmem) // Global tensor { + // This will pass for new atoms like XE_LOAD_2D<16, 32, 32> + // and fail for old atoms like XE_2D_U16x32x32_LD_N + static_assert(is_new_xe_atom::value, + "Old XE atom ops not compatible with make_block_2d_copy_A. " + "Please use the new templated atoms: XE_LOAD_2D or XE_LOAD_2D_TRANSPOSE. " + "Examples: XE_2D_U16x32x32_LD_N -> XE_LOAD_2D<16, 32, 32>, XE_2D_U16x32x32_LD_V -> XE_LOAD_2D_TRANSPOSE<16, 32, 32>"); using ValType = typename GEngine::value_type; return make_block_2d_copy_A(op, mma, gmem.stride()).with(gmem); } @@ -846,6 +877,11 @@ make_block_2d_copy_B(CopyOp const& op, // Copy operation TiledMMA const& mma, // TiledMMA instance Tensor const& gmem) // Global tensor { + // Only accept XE_LOAD_2D_VNNI for copy B + static_assert(is_new_xe_atom_vnni::value, + "Old XE atom ops not compatible with make_block_2d_copy_B. " + "Please use the new templated atom: XE_LOAD_2D_VNNI. 
" + "Examples: XE_2D_U16x32x32_LD_V -> XE_LOAD_2D_VNNI<16, 32, 32, 32>"); using ValType = typename GEngine::value_type; return make_block_2d_copy_B(op, mma, gmem.stride()).with(gmem); } diff --git a/include/cutlass/gemm/collective/xe_mma.hpp b/include/cutlass/gemm/collective/xe_mma.hpp index 9a55d5bc52..64f23fd93f 100644 --- a/include/cutlass/gemm/collective/xe_mma.hpp +++ b/include/cutlass/gemm/collective/xe_mma.hpp @@ -176,32 +176,26 @@ struct CollectiveMma, TileShape_, ElementA_ static_assert(is_rmem::value, "D tensor must be rmem resident."); static_assert(is_rmem::value, "C tensor must be rmem resident."); + auto mA_mkl = make_tensor(make_gmem_ptr(mainloop.ptr_A), + make_layout(make_shape(mainloop.M, mainloop.K, mainloop.L), cute::take<0,2>(mainloop.dA))); + auto mB_nkl = make_tensor(make_gmem_ptr(mainloop.ptr_B), + make_layout(make_shape(mainloop.N, mainloop.K, mainloop.L), cute::take<0,2>(mainloop.dB))); auto copy_a = [&]() { if constexpr (!std::is_void_v) { - // User provided copy operation - use full stride - auto mA_mkl = make_tensor(make_gmem_ptr(mainloop.ptr_A), - make_layout(make_shape(mainloop.M, mainloop.K, mainloop.L), mainloop.dA)); - using Copy_A = typename Copy_Traits::template DefaultTiledCopy; - return Copy_A{}.with(mA_mkl); + // User provided copy operation + return make_block_2d_copy_A(GmemTiledCopyA{}, TiledMma{}, mA_mkl); } else { - // Use new 2D copy operations with 2D stride - auto mA_mkl = make_tensor(make_gmem_ptr(mainloop.ptr_A), - make_layout(make_shape(mainloop.M, mainloop.K, mainloop.L), cute::take<0,2>(mainloop.dA))); + // make_block_2d_copy_A automatically selects copy operation return make_block_2d_copy_A(TiledMma{}, mA_mkl); } }(); auto copy_b = [&]() { if constexpr (!std::is_void_v) { - // User provided copy operation - use full stride - auto mB_nkl = make_tensor(make_gmem_ptr(mainloop.ptr_B), - make_layout(make_shape(mainloop.N, mainloop.K, mainloop.L), mainloop.dB)); - using Copy_B = typename Copy_Traits::template DefaultTiledCopy; - return Copy_B{}.with(mB_nkl); + // User provided copy operation + return make_block_2d_copy_B(GmemTiledCopyB{}, TiledMma{}, mB_nkl); } else { - // Use new 2D copy operations with 2D stride - auto mB_nkl = make_tensor(make_gmem_ptr(mainloop.ptr_B), - make_layout(make_shape(mainloop.N, mainloop.K, mainloop.L), cute::take<0,2>(mainloop.dB))); + // make_block_2d_copy_B automatically selects copy operation return make_block_2d_copy_B(TiledMma{}, mB_nkl); } }(); @@ -225,24 +219,12 @@ struct CollectiveMma, TileShape_, ElementA_ Tensor tAgA = thr_copy_a.partition_S(gA); Tensor tBgB = thr_copy_b.partition_S(gB); - /* Create prefetch TiledCopy instances - different for legacy vs new copy operations */ - auto [prefetch_a, prefetch_b, thr_prefetch_A, thr_prefetch_B] = [&]() { - if constexpr (!std::is_void_v && !std::is_void_v) { - // Legacy copy operations - use prefetch_selector - auto tiled_prefetch_a = cute::prefetch_selector,Int>, Num_SGs>(copy_a); - auto tiled_prefetch_b = cute::prefetch_selector,Int>, Num_SGs>(copy_b); - auto thr_prefetch_A = tiled_prefetch_a.get_slice(thread_idx); - auto thr_prefetch_B = tiled_prefetch_b.get_slice(thread_idx); - return std::make_tuple(tiled_prefetch_a, tiled_prefetch_b, thr_prefetch_A, thr_prefetch_B); - } else { - // New 2D copy operations - use make_block_2d_prefetch - auto prefetch_a = make_block_2d_prefetch(copy_a); - auto prefetch_b = make_block_2d_prefetch(copy_b); - auto thr_prefetch_A = prefetch_a.get_slice(thread_idx); - auto thr_prefetch_B = prefetch_b.get_slice(thread_idx); - 
return std::make_tuple(prefetch_a, prefetch_b, thr_prefetch_A, thr_prefetch_B); - } - }(); + /* Create prefetch TiledCopy instances */ + auto prefetch_a = make_block_2d_prefetch(copy_a); + auto prefetch_b = make_block_2d_prefetch(copy_b); + + auto thr_prefetch_A = prefetch_a.get_slice(thread_idx); + auto thr_prefetch_B = prefetch_b.get_slice(thread_idx); /* Partition global tensor (proxies) for prefetch */ auto pAgA = thr_prefetch_A.partition_S(gA); @@ -292,6 +274,10 @@ struct CollectiveMma, TileShape_, ElementA_ prefetch(prefetch_b, pBgB(_, _, _, prefetch_k)); } + /* Shuffle data from copy fragments to MMA fragments */ + reorder(tArA, tCrA); + reorder(tBrB, tCrB); + cute::gemm(tiled_mma, tCrA, tCrB, accum); barrier_wait(barrier_scope); } From 93b076a479dbf1a0cf6d292c17fe18bc9471d199 Mon Sep 17 00:00:00 2001 From: Anamika Chatterjee Date: Mon, 6 Oct 2025 13:27:57 +0300 Subject: [PATCH 10/15] Modified static assert message --- include/cute/atom/copy_traits_xe_2d.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/cute/atom/copy_traits_xe_2d.hpp b/include/cute/atom/copy_traits_xe_2d.hpp index d8f45207fc..f1658c7f2e 100644 --- a/include/cute/atom/copy_traits_xe_2d.hpp +++ b/include/cute/atom/copy_traits_xe_2d.hpp @@ -802,7 +802,7 @@ make_block_2d_copy_A(CopyOp const& op, // Copy operation // This will pass for new atoms like XE_LOAD_2D<16, 32, 32> // and fail for old atoms like XE_2D_U16x32x32_LD_N static_assert(is_new_xe_atom::value, - "Old XE atom ops not compatible with make_block_2d_copy_A. " + "Old XE copy atom ops not compatible with make_block_2d_copy_A. " "Please use the new templated atoms: XE_LOAD_2D or XE_LOAD_2D_TRANSPOSE. " "Examples: XE_2D_U16x32x32_LD_N -> XE_LOAD_2D<16, 32, 32>, XE_2D_U16x32x32_LD_V -> XE_LOAD_2D_TRANSPOSE<16, 32, 32>"); using ValType = typename GEngine::value_type; @@ -879,7 +879,7 @@ make_block_2d_copy_B(CopyOp const& op, // Copy operation { // Only accept XE_LOAD_2D_VNNI for copy B static_assert(is_new_xe_atom_vnni::value, - "Old XE atom ops not compatible with make_block_2d_copy_B. " + "Old XE copy atom ops not compatible with make_block_2d_copy_B. " "Please use the new templated atom: XE_LOAD_2D_VNNI. " "Examples: XE_2D_U16x32x32_LD_V -> XE_LOAD_2D_VNNI<16, 32, 32, 32>"); using ValType = typename GEngine::value_type; From a6f068c58fc0041d4c0353f017ec5a2f1cd85901 Mon Sep 17 00:00:00 2001 From: Anamika Chatterjee Date: Mon, 6 Oct 2025 13:28:54 +0300 Subject: [PATCH 11/15] Modified static assert message --- include/cute/atom/copy_traits_xe_2d.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/cute/atom/copy_traits_xe_2d.hpp b/include/cute/atom/copy_traits_xe_2d.hpp index f1658c7f2e..e6cbe664c8 100644 --- a/include/cute/atom/copy_traits_xe_2d.hpp +++ b/include/cute/atom/copy_traits_xe_2d.hpp @@ -802,7 +802,7 @@ make_block_2d_copy_A(CopyOp const& op, // Copy operation // This will pass for new atoms like XE_LOAD_2D<16, 32, 32> // and fail for old atoms like XE_2D_U16x32x32_LD_N static_assert(is_new_xe_atom::value, - "Old XE copy atom ops not compatible with make_block_2d_copy_A. " + "Legacy XE copy atom ops not compatible with make_block_2d_copy_A. " "Please use the new templated atoms: XE_LOAD_2D or XE_LOAD_2D_TRANSPOSE. 
" "Examples: XE_2D_U16x32x32_LD_N -> XE_LOAD_2D<16, 32, 32>, XE_2D_U16x32x32_LD_V -> XE_LOAD_2D_TRANSPOSE<16, 32, 32>"); using ValType = typename GEngine::value_type; @@ -879,7 +879,7 @@ make_block_2d_copy_B(CopyOp const& op, // Copy operation { // Only accept XE_LOAD_2D_VNNI for copy B static_assert(is_new_xe_atom_vnni::value, - "Old XE copy atom ops not compatible with make_block_2d_copy_B. " + "Legacy XE copy atom ops not compatible with make_block_2d_copy_B. " "Please use the new templated atom: XE_LOAD_2D_VNNI. " "Examples: XE_2D_U16x32x32_LD_V -> XE_LOAD_2D_VNNI<16, 32, 32, 32>"); using ValType = typename GEngine::value_type; From e1e64f7316ef342e01a34be2ebbe60de42575a61 Mon Sep 17 00:00:00 2001 From: Anamika Chatterjee Date: Wed, 8 Oct 2025 14:11:30 +0300 Subject: [PATCH 12/15] Move legacy example to legacy folder, pass 2D strides to make_block_2d_copy_*, and move tensor/copy initialization to host-side params in to_underlying_arguments --- examples/00_bmg_gemm/legacy/00_bmg_gemm.cpp | 429 ++++++++++++++++++++ examples/00_bmg_gemm/legacy/CMakeLists.txt | 41 ++ examples/CMakeLists.txt | 1 + include/cutlass/gemm/collective/xe_mma.hpp | 97 +++-- 4 files changed, 534 insertions(+), 34 deletions(-) create mode 100644 examples/00_bmg_gemm/legacy/00_bmg_gemm.cpp create mode 100644 examples/00_bmg_gemm/legacy/CMakeLists.txt diff --git a/examples/00_bmg_gemm/legacy/00_bmg_gemm.cpp b/examples/00_bmg_gemm/legacy/00_bmg_gemm.cpp new file mode 100644 index 0000000000..91139cf0d6 --- /dev/null +++ b/examples/00_bmg_gemm/legacy/00_bmg_gemm.cpp @@ -0,0 +1,429 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief CUTLASS Intel BMG Gemm Example. 
+ + This example constructs and executes a simple CUTLASS GEMM kernel on Intel BMG hardware, and + verifies its correctness with a reference implementation + (cutlass::reference::device::GemmComplex). The example also provides a performance measurement + for the GEMM in TFLOPS. + + This example makes use of BMGs subgroup cooperative 2d-block copy operations and DPAS instructions. + + The shapes of the A and B matrix are defined at runtime by `options.m`, `.n` and `.k`, and the + batch size is defined by `options.l`. The tile shape, which defines how much work is executed by + a single work-group, is defined at compile time by: + ``` + using TileShape = Shape<_256, _256, _32>; + ``` + That is, each work-group processes a tile of M=256, N=256, and iterates over `options.k` in + blocks of K=32. + + Performance of GEMM on BMG is heavily dependent on prefetching the A and B matrices. That is, + executing Intel specific prefetch instructions for future iterations to ensure that the required + blocks of A and B are resident in cache before they are needed. + + To build & run this example (from your build dir): + + $ ninja 00_bmg_gemm + $ ./examples/sycl/00_bmg_gemm/00_bmg_gemm + + Call with `--help` for information about available options +*/ + +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/epilogue/collective/xe_epilogue.hpp" +#include "cutlass/epilogue/fusion/xe_callbacks.hpp" +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/collective/collective_mma.hpp" +#include "cutlass/util/GPU_Clock.hpp" + +#include +#include + +#include "cutlass/util/command_line.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/reference/device/gemm_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "sycl_common.hpp" +#include "helper.h" + +using namespace cute; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + bool error; + + int m, n, k, l, iterations; + float alpha, beta; + + Options(): + help(false), + error(false), + m(5120), n(4096), k(4096), l(1), iterations(20), + alpha(1.f), beta(0.f) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m, 5120); + cmd.get_cmd_line_argument("n", n, 4096); + cmd.get_cmd_line_argument("k", k, 4096); + cmd.get_cmd_line_argument("l", l, 1); + cmd.get_cmd_line_argument("alpha", alpha, 1.f); + cmd.get_cmd_line_argument("beta", beta, 0.f); + cmd.get_cmd_line_argument("iterations", iterations, 100); + } + + /// Prints the usage statement. 
+ std::ostream & print_usage(std::ostream &out) const { + + out << "BMG GEMM Example\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= Sets the L extent (batch count) of the GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --iterations= Iterations\n\n"; + + return out; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + class Gemm +> +struct ExampleRunner { + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = typename Gemm::GemmKernel::StrideD; + + using LayoutA = typename Gemm::LayoutA; + using LayoutB = typename Gemm::LayoutB; + using LayoutC = typename Gemm::LayoutC; + using LayoutD = typename Gemm::LayoutD; + + using ElementA = typename Gemm::ElementA; + using ElementB = typename Gemm::ElementB; + using ElementAcc = typename Gemm::ElementAccumulator; + + using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; + using ElementC = typename Gemm::ElementC; + using ElementOutput = typename CollectiveEpilogue::ElementOutput; + using ElementCompute = typename CollectiveEpilogue::ElementCompute; + using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator; + + using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; + + // + // Data members + // + + /// Initialization + StrideA stride_A; + StrideB stride_B; + StrideC stride_C; + StrideD stride_D; + uint64_t seed = 0; + + cutlass::DeviceAllocation block_A; + cutlass::DeviceAllocation block_B; + cutlass::DeviceAllocation block_C; + cutlass::DeviceAllocation block_D; + cutlass::DeviceAllocation block_ref_D; // Reference GEMM result for verification + + // + // Methods + // + + bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { + auto [M, N, K, L] = problem_size; + + cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K})); + cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N})); + cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N})); + cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); + + cutlass::reference::device::GemmComplex( + {M, N, K}, + alpha, + ref_A, + cutlass::ComplexTransform::kNone, + ref_B, + cutlass::ComplexTransform::kNone, + beta, + ref_C, + ref_D, + ElementAccumulator(0), + L, // batch_count + M * K, // batch_stride_A + K * N, // batch_stride_B + M * N, // batch_stride_C + M * N // batch_stride_D + ); + + // CUTLASS on SYCL uses the compatibility library compat for e.g. 
default in-order queue + compat::wait(); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + bool passed = cutlass::reference::device::BlockCompareEqual( + block_ref_D.get(), block_D.get(), block_D.size()); + + return passed; + } + + /// Initialize operands to be used in the GEMM and reference GEMM + void initialize(const ProblemShapeType& problem_size) { + auto problem_shape_MNKL = cute::append<4>(problem_size, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + // Complete the stride by combining static layout info (StrideA) with runtime size info (M,K,L) + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); + stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); + + block_A.reset(static_cast(M) * K * L); + block_B.reset(static_cast(K) * N * L); + block_C.reset(static_cast(M) * N * L); + block_D.reset(static_cast(M) * N * L); + block_ref_D.reset(static_cast(M) * N * L); + + initialize_block(block_A, seed + 2023); + initialize_block(block_B, seed + 2022); + initialize_block(block_C, seed + 2021); + } + + cutlass::Status run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { + ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; + + initialize(problem_size); + + typename Gemm::GemmKernel::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + {block_A.get(), stride_A, block_B.get(), stride_B}, + {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, + hw_info + }; + + Gemm gemm_op; + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + if (gemm_op.can_implement(arguments) != cutlass::Status::kSuccess){ + std::cout << "Invalid Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; + std::exit(1); + } + + CUTLASS_CHECK(gemm_op.initialize(arguments, workspace.get())); + + // Run the GEMM + CUTLASS_CHECK(gemm_op.run()); + + compat::wait(); + + // Verify that the result is correct + bool passed = verify(problem_size, options.alpha, options.beta); + std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl; + + if(!passed) return cutlass::Status::kErrorInternal; + + if (options.iterations > 0) { + GPU_Clock timer; + timer.start(); + for (int i = 0; i < options.iterations; ++i) { + gemm_op.run(); + } + compat::wait(); + + float cute_time = timer.seconds() / options.iterations; + double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; + std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; + printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000); + } + + return cutlass::Status::kSuccess; + } + +}; + +int main(int argc, const char** argv) +{ + // + // Parse options + // + + Options options; + + options.parse(argc, argv); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." << std::endl; + return -1; + } + + // + // Run examples + // + + // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. 
This + // information is used by the underlying kernel. + cutlass::KernelHardwareInfo hw_info; + + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + + bool passed; + + // The code section below describes datatype for input, output matrices and computation between + // elements in input matrices. + using ElementAccumulator = float; // <- data type of accumulator + using ElementComputeEpilogue = float; // <- data type of epilogue operations + using ElementInputA = bfloat16_t; // <- data type of elements in input matrix A + using ElementInputB = bfloat16_t; // <- data type of elements in input matrix B + using ElementOutput = float; // <- data type of elements in output matrix D + + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + using LayoutC = cutlass::layout::RowMajor; + using LayoutD = cutlass::layout::RowMajor; + + // The 2D block copy operations used for the A and B matrices + using GmemTiledCopyA = XE_2D_U16x32x32_LD_N; + using GmemTiledCopyB = XE_2D_U16x32x32_LD_V; + + // Workgroup-level tile + using TileShape = Shape<_256, _256, _32>; + + // A TiledMMA struct defines a tiling of an MMA atom over M, N and K, combining both additional + // hardware (sub-groups for Intel BMG) and iterations by each sub-group. + // + // The TiledMMAHelper struct defines a specific TiledMMA for a given MMA atom + // (XE_8x16x16_F32BF16BF16F32_TT), TileShape (<256, 256, 32>) and sub-group layout (8x4x1). The + // TiledMMA constructed using TiledMMAHelper has the property that each sub-group operates on a + // single contiguous chunk of the work-group TileShape. For this configuration, this implies that + // each sub-group operates on a contiguous 32x64x32 chunk (4x4x2 iterations). See + // 0t_mma_atom.md#TiledMMAs for more info. Sub-groups are arranged row-major (stride 4,1,0) for + // performance reasons. + using TiledMma = // M=8,N=16,K=16, D=f32,A=bf16,B=bf16,C=f32 + typename TiledMMAHelper, Layout, + Layout, Stride<_4, _1, _0>>>::TiledMMA; + + // For Intel BMG, PipelineStages defines how many k-blocks ahead to prefetch from A and B. + constexpr int PipelineStages = 2; + using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelXeXMX16; + using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeXMX16; + + // This is the 'default' epilogue operation (Linear Combination) which performs everything in: + // (D = alpha * (A*B) + beta * C) + // aside from the (A*B), which is handled by the GEMM. See 05_bmg_gemm_with_epilogues for more + // complex epilogue examples. + using EpilogueOp = cutlass::epilogue::fusion::LinearCombination; + + // FusionCallbacks ties the EpilogueOp to an implementation (based on the dispatch + // policy/architecture) and defines the epilogue arguments. 
+ using FusionCallBacks = cutlass::epilogue::fusion::FusionCallbacks; + // GEMM Epilogue - loads & stores C/D matrices, performs epilogue operations & load/stores any + // auxiliary data required + using CollectiveEpilogue = cutlass::epilogue::collective::CollectiveEpilogue< + EpilogueDispatchPolicy, + TileShape, + ElementAccumulator, + cutlass::gemm::TagToStrideC_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation + ElementOutput, + cutlass::gemm::TagToStrideC_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation + FusionCallBacks, + XE_2D_U32x8x16_LD_N, // The copy atom used to load matrix C + void, void, + XE_2D_U32x8x16_ST_N, // The copy atom used to store matrix D + void, void>; + + // GEMM Mainloop - iteration over blocks in K dimension + using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< + GEMMDispatchPolicy, + TileShape, + ElementInputA, + cutlass::gemm::TagToStrideA_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation + ElementInputB, + cutlass::gemm::TagToStrideB_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation + TiledMma, + GmemTiledCopyA, void, void, cute::identity, // A + GmemTiledCopyB, void, void, cute::identity // B + >; + + // Define the whole kernel (mainloop and epilogue) + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, // Defer global problem shape definition to runtime + CollectiveMainloop, + CollectiveEpilogue + >; + + // The GemmUniversalAdapter wraps the defined GEMM kernel and handles the launch, and e.g. + // persistent scratch memory if required. + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + ExampleRunner runner; + + CUTLASS_CHECK(runner.run(options, hw_info)); + + return 0; +} \ No newline at end of file diff --git a/examples/00_bmg_gemm/legacy/CMakeLists.txt b/examples/00_bmg_gemm/legacy/CMakeLists.txt new file mode 100644 index 0000000000..1d40199221 --- /dev/null +++ b/examples/00_bmg_gemm/legacy/CMakeLists.txt @@ -0,0 +1,41 @@ +# Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +set(TEST_BATCHES --l=2) +set(TEST_LARGE "--l=513 --m=8 --n=16384 --k=512") # B matrix capacity > uint32_max +set(TEST_SMALL_SHAPE --m=4 --n=8 --k=8 --l=2) + + +cutlass_example_add_executable( + 00_bmg_gemm_legacy + 00_bmg_gemm.cpp + TEST_COMMAND_OPTIONS + TEST_BATCHES + TEST_LARGE + TEST_SMALL_SHAPE +) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d141f5b7de..38c6b34b75 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -101,6 +101,7 @@ if(CUTLASS_ENABLE_SYCL) message(STATUS "Building examples for Intel GPU targets") foreach(EXAMPLE 00_bmg_gemm + 00_bmg_gemm/legacy 01_bmg_gemm_with_collective_builder 02_bmg_gemm_mixed_dtype 03_bmg_gemm_streamk diff --git a/include/cutlass/gemm/collective/xe_mma.hpp b/include/cutlass/gemm/collective/xe_mma.hpp index 64f23fd93f..bbd6add94e 100644 --- a/include/cutlass/gemm/collective/xe_mma.hpp +++ b/include/cutlass/gemm/collective/xe_mma.hpp @@ -100,6 +100,37 @@ struct CollectiveMma, TileShape_, ElementA_ static constexpr auto Num_SGs = ATOM_N * ATOM_M * ATOM_K; static constexpr uint32_t MaxThreadsPerBlock = size(TiledMma{}); + // Helper struct to deduce CopyOpA type + template + struct CopyOpAHelper { + static auto get() { + auto tmp = make_tensor(make_gmem_ptr(static_cast(nullptr)), + make_layout(make_shape(Int{}, Int{}, Int<1>{}), SA{})); + if constexpr (!std::is_void_v) { + return make_block_2d_copy_A(GTCA{}, TM{}, tmp(_,_,0)); + } else { + return make_block_2d_copy_A(TM{}, tmp(_,_,0)); + } + } + }; + + // Helper struct to deduce CopyOpB type + template + struct CopyOpBHelper { + static auto get() { + auto tmp = make_tensor(make_gmem_ptr(static_cast(nullptr)), + make_layout(make_shape(Int{}, Int{}, Int<1>{}), SB{})); + if constexpr (!std::is_void_v) { + return make_block_2d_copy_B(GTCB{}, TM{}, tmp(_,_,0)); + } else { + return make_block_2d_copy_B(TM{}, tmp(_,_,0)); + } + } + }; + + using CopyOpA = decltype(CopyOpAHelper::get()); + using CopyOpB = decltype(CopyOpBHelper::get()); + // Host side kernel arguments struct Arguments { ElementA const* ptr_A; @@ -113,7 +144,8 @@ struct CollectiveMma, TileShape_, ElementA_ StrideA dA; ElementB const* ptr_B; StrideB dB; - int M, N, K, L; + CopyOpA copy_a; + CopyOpB copy_b; }; // @@ -129,11 +161,32 @@ struct CollectiveMma, TileShape_, ElementA_ auto [M,N,K,L] = problem_shape; + auto mA_mkl = make_tensor(make_gmem_ptr(args.ptr_A), + make_layout(make_shape(M, K, L), args.dA)); + auto mB_nkl = make_tensor(make_gmem_ptr(args.ptr_B), + make_layout(make_shape(N, K, L), args.dB)); + + CopyOpA copy_a = [&]() { + if constexpr (!std::is_void_v) { + return make_block_2d_copy_A(GmemTiledCopyA{}, TiledMma{}, mA_mkl(_,_,0)); + } else { + return make_block_2d_copy_A(TiledMma{}, mA_mkl(_,_,0)); + } + }(); + + CopyOpB copy_b = [&]() { + if constexpr (!std::is_void_v) { + return make_block_2d_copy_B(GmemTiledCopyB{}, TiledMma{}, mB_nkl(_,_,0)); + } else { + return make_block_2d_copy_B(TiledMma{}, mB_nkl(_,_,0)); + } + }(); + return Params{args.ptr_A, args.dA, 
args.ptr_B, args.dB, - M, N, K, L}; + copy_a, copy_b}; } template @@ -176,32 +229,8 @@ struct CollectiveMma, TileShape_, ElementA_ static_assert(is_rmem::value, "D tensor must be rmem resident."); static_assert(is_rmem::value, "C tensor must be rmem resident."); - auto mA_mkl = make_tensor(make_gmem_ptr(mainloop.ptr_A), - make_layout(make_shape(mainloop.M, mainloop.K, mainloop.L), cute::take<0,2>(mainloop.dA))); - auto mB_nkl = make_tensor(make_gmem_ptr(mainloop.ptr_B), - make_layout(make_shape(mainloop.N, mainloop.K, mainloop.L), cute::take<0,2>(mainloop.dB))); - auto copy_a = [&]() { - if constexpr (!std::is_void_v) { - // User provided copy operation - return make_block_2d_copy_A(GmemTiledCopyA{}, TiledMma{}, mA_mkl); - } else { - // make_block_2d_copy_A automatically selects copy operation - return make_block_2d_copy_A(TiledMma{}, mA_mkl); - } - }(); - - auto copy_b = [&]() { - if constexpr (!std::is_void_v) { - // User provided copy operation - return make_block_2d_copy_B(GmemTiledCopyB{}, TiledMma{}, mB_nkl); - } else { - // make_block_2d_copy_B automatically selects copy operation - return make_block_2d_copy_B(TiledMma{}, mB_nkl); - } - }(); - - auto thr_copy_a = copy_a.get_slice(thread_idx); - auto thr_copy_b = copy_b.get_slice(thread_idx); + auto thr_copy_a = mainloop.copy_a.get_slice(thread_idx); + auto thr_copy_b = mainloop.copy_b.get_slice(thread_idx); // Instantiate the MMA object and get thread slice TiledMma tiled_mma; @@ -220,8 +249,8 @@ struct CollectiveMma, TileShape_, ElementA_ Tensor tBgB = thr_copy_b.partition_S(gB); /* Create prefetch TiledCopy instances */ - auto prefetch_a = make_block_2d_prefetch(copy_a); - auto prefetch_b = make_block_2d_prefetch(copy_b); + auto prefetch_a = make_block_2d_prefetch(mainloop.copy_a); + auto prefetch_b = make_block_2d_prefetch(mainloop.copy_b); auto thr_prefetch_A = prefetch_a.get_slice(thread_idx); auto thr_prefetch_B = prefetch_b.get_slice(thread_idx); @@ -238,14 +267,14 @@ struct CollectiveMma, TileShape_, ElementA_ PRINT(tCrA); PRINT(tArA); - PRINT(copy_a); + PRINT(mainloop.copy_a); print("======================= B: \n"); PRINT(tBgB); PRINT(tCrB); PRINT(tBrB); - PRINT(copy_b); + PRINT(mainloop.copy_b); } #undef PRINT #endif @@ -266,8 +295,8 @@ struct CollectiveMma, TileShape_, ElementA_ for (int k_tile = k_start_idx; k_tile < k_tile_count + k_start_idx; k_tile++, prefetch_k++) { barrier_arrive(barrier_scope); // Copy gmem to rmem for the first k_tile - copy(copy_a, tAgA(_,_,_,k_tile), tArA); - copy(copy_b, tBgB(_,_,_,k_tile), tBrB); + copy(mainloop.copy_a, tAgA(_,_,_,k_tile), tArA); + copy(mainloop.copy_b, tBgB(_,_,_,k_tile), tBrB); if (prefetch_k < k_tile_count) { prefetch(prefetch_a, pAgA(_, _, _, prefetch_k)); From ea67069e04beca4db21d8df3388dd66ae5ba2567 Mon Sep 17 00:00:00 2001 From: Anamika Chatterjee Date: Fri, 10 Oct 2025 11:32:35 +0300 Subject: [PATCH 13/15] Applied reviwer comment --- examples/00_bmg_gemm/00_bmg_gemm.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/00_bmg_gemm/00_bmg_gemm.cpp b/examples/00_bmg_gemm/00_bmg_gemm.cpp index 0f7c665820..874fce9c75 100644 --- a/examples/00_bmg_gemm/00_bmg_gemm.cpp +++ b/examples/00_bmg_gemm/00_bmg_gemm.cpp @@ -277,8 +277,6 @@ struct ExampleRunner { bool passed = verify(problem_size, options.alpha, options.beta); std::cout << "Disposition: " << (passed ? 
"Passed" : "Failed") << std::endl; - if(!passed) return cutlass::Status::kErrorInternal; - if (options.iterations > 0) { GPU_Clock timer; timer.start(); @@ -345,7 +343,7 @@ int main(int argc, const char** argv) using LayoutC = cutlass::layout::RowMajor; using LayoutD = cutlass::layout::RowMajor; - // [New Copy Atom] When left unspecified (void), make_block_2d_copy_* automatically selects + // [New Copy Atom] When left unspecified (void), MainloopXeL1Staged automatically selects // appropriate 2D block copy operations for matrices A and B. Alternatively, you can // explicitly specify new copy atom operations such as XE_LOAD_2D, XE_LOAD_2D_VNNI // (applicable only to matrix B), or XE_LOAD_2D_TRANSPOSE. From e9878b9c164a4be800e324457074ebc9bf2c1f9e Mon Sep 17 00:00:00 2001 From: Anamika Chatterjee Date: Fri, 10 Oct 2025 11:37:45 +0300 Subject: [PATCH 14/15] This is an empty commit From fbb7bb5c58a52bf2ffacd2473f97945b0c8a199b Mon Sep 17 00:00:00 2001 From: Anamika Chatterjee Date: Fri, 10 Oct 2025 16:59:05 +0530 Subject: [PATCH 15/15] Preventing exceptions on older IGC versions --- examples/00_bmg_gemm/00_bmg_gemm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/00_bmg_gemm/00_bmg_gemm.cpp b/examples/00_bmg_gemm/00_bmg_gemm.cpp index 874fce9c75..508c2e5063 100644 --- a/examples/00_bmg_gemm/00_bmg_gemm.cpp +++ b/examples/00_bmg_gemm/00_bmg_gemm.cpp @@ -277,7 +277,7 @@ struct ExampleRunner { bool passed = verify(problem_size, options.alpha, options.beta); std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl; - if (options.iterations > 0) { + if (passed && options.iterations > 0) { GPU_Clock timer; timer.start(); for (int i = 0; i < options.iterations; ++i) {