From 83f1de8ee1be3c7427ccf505ddeb60bc421d8ca8 Mon Sep 17 00:00:00 2001 From: Anamika Chatterjee Date: Mon, 29 Sep 2025 08:10:27 +0300 Subject: [PATCH 01/15] Test commit --- examples/00_bmg_gemm/00_bmg_gemm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/00_bmg_gemm/00_bmg_gemm.cpp b/examples/00_bmg_gemm/00_bmg_gemm.cpp index 7e9291227e..93e1651e03 100644 --- a/examples/00_bmg_gemm/00_bmg_gemm.cpp +++ b/examples/00_bmg_gemm/00_bmg_gemm.cpp @@ -367,7 +367,7 @@ int main(int argc, const char** argv) Layout, Stride<_4, _1, _0>>>::TiledMMA; // For Intel BMG, PipelineStages defines how many k-blocks ahead to prefetch from A and B. - constexpr int PipelineStages = 2; + constexpr int PipelineStages = 3; using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelXeXMX16; using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeXMX16; From 0b184f0809ffc6094c63202592db4ee3741c29ef Mon Sep 17 00:00:00 2001 From: Anamika Chatterjee Date: Mon, 29 Sep 2025 09:12:28 +0300 Subject: [PATCH 02/15] Enable new mma and copy atoms --- examples/00_bmg_gemm/00_bmg_gemm.cpp | 71 ++++++++++++++---- include/cutlass/gemm/collective/xe_mma.hpp | 85 +++++++++++----------- include/cutlass/gemm/kernel/xe_gemm.hpp | 8 +- 3 files changed, 103 insertions(+), 61 deletions(-) diff --git a/examples/00_bmg_gemm/00_bmg_gemm.cpp b/examples/00_bmg_gemm/00_bmg_gemm.cpp index 93e1651e03..d50e9349ac 100644 --- a/examples/00_bmg_gemm/00_bmg_gemm.cpp +++ b/examples/00_bmg_gemm/00_bmg_gemm.cpp @@ -81,6 +81,58 @@ using namespace cute; +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Helper template to check if a type is complete +template +struct is_complete : std::false_type {}; + +template +struct is_complete : std::true_type {}; + +template +static constexpr bool is_complete_v = is_complete::value; + + +template +auto +choose_mma_op() +{ + if constexpr (is_complete_v>) + return XE_DPAS_TT<8, TC, TA, TB>{}; + else if constexpr (is_same_v, cute::bfloat16_t>) + return XE_DPAS_TT<8, float, cute::bfloat16_t>{}; + else /* Use f16 by default as upconversion sequences are typically faster */ + return XE_DPAS_TT<8, float, cute::half_t>{}; +} + +// Helper function to choose tiled MMA based on tensor properties +template +auto +choose_tiled_mma() +{ + + auto op = choose_mma_op(); + + constexpr bool byte = (cute::max(sizeof_bits_v, sizeof_bits_v) <= 8); + + constexpr bool is_A_transposed = std::is_same_v; + constexpr bool is_B_transposed = std::is_same_v; + constexpr bool use_1x_dpas_per_k = is_A_transposed || (byte && is_B_transposed); + + + using _K = conditional_t, C>; + + using WGTile = Shape<_256, _256, _K>; // 256x256 WG tile size + using SGLayout = Layout, Stride<_4, _1, _0>>; // 8x4 SG tiling, n-major + + using MMA = typename TiledMMAHelper, Layout, SGLayout>::TiledMMA; + + return MMA{}; +} + + /////////////////////////////////////////////////////////////////////////////////////////////////// // Command line options parsing @@ -350,24 +402,11 @@ int main(int argc, const char** argv) using GmemTiledCopyB = XE_2D_U16x32x32_LD_V; // Workgroup-level tile - using TileShape = Shape<_256, _256, _32>; - - // A TiledMMA struct defines a tiling of an MMA atom over M, N and K, combining both additional - // hardware (sub-groups for Intel BMG) and iterations by each sub-group. - // - // The TiledMMAHelper struct defines a specific TiledMMA for a given MMA atom - // (XE_8x16x16_F32BF16BF16F32_TT), TileShape (<256, 256, 32>) and sub-group layout (8x4x1). 
The
-  // TiledMMA constructed using TiledMMAHelper has the property that each sub-group operates on a
-  // single contiguous chunk of the work-group TileShape. For this configuration, this implies that
-  // each sub-group operates on a contiguous 32x64x32 chunk (4x4x2 iterations). See
-  // 0t_mma_atom.md#TiledMMAs for more info. Sub-groups are arranged row-major (stride 4,1,0) for
-  // performance reasons.
-  using TiledMma =                    // M=8,N=16,K=16, D=f32,A=bf16,B=bf16,C=f32
-      typename TiledMMAHelper, Layout,
-      Layout, Stride<_4, _1, _0>>>::TiledMMA;
+  using TiledMma = decltype(choose_tiled_mma());
+  using TileShape = decltype(TiledMma{}.tile_mnk());
 
   // For Intel BMG, PipelineStages defines how many k-blocks ahead to prefetch from A and B.
-  constexpr int PipelineStages = 3;
+  constexpr int PipelineStages = 2;
   using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelXeXMX16;
   using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeXMX16;
 
diff --git a/include/cutlass/gemm/collective/xe_mma.hpp b/include/cutlass/gemm/collective/xe_mma.hpp
index 7d75825a3e..9357491aa2 100644
--- a/include/cutlass/gemm/collective/xe_mma.hpp
+++ b/include/cutlass/gemm/collective/xe_mma.hpp
@@ -100,9 +100,6 @@ struct CollectiveMma, TileShape_, Element
   static constexpr auto Num_SGs = ATOM_N * ATOM_M * ATOM_K;
   static constexpr uint32_t MaxThreadsPerBlock = size(TiledMma{});
 
-  using Copy_A = typename Copy_Traits::template DefaultTiledCopy;
-  using Copy_B = typename Copy_Traits::template DefaultTiledCopy;
-
   // Host side kernel arguments
   struct Arguments {
     ElementA const* ptr_A;
@@ -112,8 +109,11 @@ struct CollectiveMma, TileShape_, Element
   };
 
   struct Params {
-    Copy_A tiled_copy_a;
-    Copy_B tiled_copy_b;
+    ElementA const* ptr_A;
+    StrideA dA;
+    ElementB const* ptr_B;
+    StrideB dB;
+    int M, N, K, L;
   };
 
   //
@@ -129,12 +129,11 @@ struct CollectiveMma, TileShape_, Element
 
     auto [M,N,K,L] = problem_shape;
 
-    auto mA_mkl = make_tensor(make_gmem_ptr(args.ptr_A), make_layout(make_shape(M, K, L), args.dA));
-    auto mB_nkl = make_tensor(make_gmem_ptr(args.ptr_B), make_layout(make_shape(N, K, L), args.dB));
-    Copy_A tiled_copy_a{Copy_A{}.with(mA_mkl)};
-    Copy_B tiled_copy_b{Copy_B{}.with(mB_nkl)};
-
-    return Params{tiled_copy_a, tiled_copy_b};
+    return Params{args.ptr_A,
+                  args.dA,
+                  args.ptr_B,
+                  args.dB,
+                  M, N, K, L};
   }
 
   template 
@@ -177,8 +176,15 @@ struct CollectiveMma, TileShape_, Element
     static_assert(is_rmem::value, "D tensor must be rmem resident.");
     static_assert(is_rmem::value, "C tensor must be rmem resident.");
 
-    auto thr_copy_A = mainloop.tiled_copy_a.get_slice(thread_idx);
-    auto thr_copy_B = mainloop.tiled_copy_b.get_slice(thread_idx);
+    auto mA_mkl = make_tensor(make_gmem_ptr(mainloop.ptr_A),
+        make_layout(make_shape(mainloop.M, mainloop.K, mainloop.L), cute::take<0,2>(mainloop.dA)));
+    auto mB_nkl = make_tensor(make_gmem_ptr(mainloop.ptr_B),
+        make_layout(make_shape(mainloop.N, mainloop.K, mainloop.L), cute::take<0,2>(mainloop.dB)));
+    auto copy_a = make_block_2d_copy_A(TiledMma{}, mA_mkl);
+    auto copy_b = make_block_2d_copy_B(TiledMma{}, mB_nkl);
+
+    auto thr_copy_a = copy_a.get_slice(thread_idx);
+    auto thr_copy_b = copy_b.get_slice(thread_idx);
 
     // Instantiate the MMA object and get thread slice
     TiledMma tiled_mma;
@@ -188,27 +194,25 @@ struct CollectiveMma, TileShape_, Element
     auto first_thread_in_sg_idx = sg.get_group_linear_id() * DispatchPolicy::SubgroupSize;
     auto thr_mma = tiled_mma.get_slice(first_thread_in_sg_idx);
 
-    // Partition global counting tensors for MMA
-    Tensor tCgA = thr_mma.partition_A(gA);
-    Tensor tCgB = thr_mma.partition_B(gB);
+    /* Register fragments for MMA */
+    auto tCrA = thr_mma.partition_sg_fragment_A(gA(_,_,0));
+    auto tCrB = thr_mma.partition_sg_fragment_B(gB(_,_,0));
 
-    Tensor tCrA = make_tensor(make_fragment_layout(mainloop.tiled_copy_a, tCgA(_,_,_,0).shape()));
-    Tensor tCrB = make_tensor(make_fragment_layout(mainloop.tiled_copy_b, tCgB(_,_,_,0).shape()));
+    /* Register fragments for copies */
+    auto tArA = thr_copy_a.partition_sg_fragment_D(gA(_,_,0));
+    auto tBrB = thr_copy_b.partition_sg_fragment_D(gB(_,_,0));
 
-    // Retile registers for copies
-    Tensor tArA = thr_copy_A.retile_D(tCrA);
-    Tensor tBrB = thr_copy_B.retile_D(tCrB);
-
-    // Retile global counting tensors for copies
-    Tensor tAgA = thr_copy_A.retile_S(tCgA);
-    Tensor tBgB = thr_copy_B.retile_S(tCgB);
-
-    auto tiled_prefetch_a = cute::prefetch_selector,Int>, Num_SGs>(mainloop.tiled_copy_a);
-    auto tiled_prefetch_b = cute::prefetch_selector,Int>, Num_SGs>(mainloop.tiled_copy_b);
-    auto thr_prefetch_A = tiled_prefetch_a.get_slice(thread_idx);
-    auto thr_prefetch_B = tiled_prefetch_b.get_slice(thread_idx);
+    /* Partition global tensor (proxies) for copies */
+    Tensor tAgA = thr_copy_a.partition_S(gA);
+    Tensor tBgB = thr_copy_b.partition_S(gB);
 
-    // Partition global tile for prefetch
+    /* Create prefetch TiledCopy instances */
+    auto prefetch_a = make_block_2d_prefetch(copy_a);
+    auto prefetch_b = make_block_2d_prefetch(copy_b);
+    auto thr_prefetch_A = prefetch_a.get_slice(thread_idx);
+    auto thr_prefetch_B = prefetch_b.get_slice(thread_idx);
+
+    /* Partition global tensor (proxies) for prefetch */
     auto pAgA = thr_prefetch_A.partition_S(gA);
     auto pBgB = thr_prefetch_B.partition_S(gB);
 
@@ -216,20 +220,18 @@ struct CollectiveMma, TileShape_, Element
 #define PRINT(x) print(#x ": "); print(x); print("\n");
   if (cute::thread(LOG_THREAD, LOG_GROUP)) {
     print("======================= A: \n");
-    PRINT(tCgA);
     PRINT(tAgA);
 
     PRINT(tCrA);
     PRINT(tArA);
-    PRINT(mainloop.tiled_copy_a);
+    PRINT(copy_a);
 
     print("======================= B: \n");
-    PRINT(tCgB);
     PRINT(tBgB);
 
     PRINT(tCrB);
     PRINT(tBrB);
-    PRINT(mainloop.tiled_copy_b);
+    PRINT(copy_b);
   }
 #undef PRINT
 #endif
@@ -243,19 +245,19 @@ struct CollectiveMma, TileShape_, Element
 
     CUTLASS_PRAGMA_UNROLL
     for (; prefetch_k < DispatchPolicy::Stages; prefetch_k++) {
-      prefetch(tiled_prefetch_a, pAgA(_, _, _, prefetch_k));
-      prefetch(tiled_prefetch_b, pBgB(_, _, _, prefetch_k));
+      prefetch(prefetch_a, pAgA(_, _, _, prefetch_k));
+      prefetch(prefetch_b, pBgB(_, _, _, prefetch_k));
     }
 
     for (int k_tile = k_start_idx; k_tile < k_tile_count + k_start_idx; k_tile++, prefetch_k++) {
       barrier_arrive(barrier_scope);
       // Copy gmem to rmem for the first k_tile
-      copy(mainloop.tiled_copy_a, tAgA(_,_,_,k_tile), tArA);
-      copy(mainloop.tiled_copy_b, tBgB(_,_,_,k_tile), tBrB);
+      copy(copy_a, tAgA(_,_,_,k_tile), tArA);
+      copy(copy_b, tBgB(_,_,_,k_tile), tBrB);
 
       if (prefetch_k < k_tile_count) {
-        prefetch(tiled_prefetch_a, pAgA(_, _, _, prefetch_k));
-        prefetch(tiled_prefetch_b, pBgB(_, _, _, prefetch_k));
+        prefetch(prefetch_a, pAgA(_, _, _, prefetch_k));
+        prefetch(prefetch_b, pBgB(_, _, _, prefetch_k));
       }
 
       cute::gemm(tiled_mma, tCrA, tCrB, accum);
@@ -267,3 +269,4 @@ struct CollectiveMma, TileShape_, Element
 } // namespace cutlass::gemm::collective
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/include/cutlass/gemm/kernel/xe_gemm.hpp b/include/cutlass/gemm/kernel/xe_gemm.hpp
index 1b1bebcc18..43e9d033e1 100644
--- 
a/include/cutlass/gemm/kernel/xe_gemm.hpp +++ b/include/cutlass/gemm/kernel/xe_gemm.hpp @@ -238,11 +238,11 @@ class GemmUniversal< constexpr auto workgroup_shape = WorkgroupTileShape{}; // (SUB_M,SUB_N,SUB_K) constexpr auto subgroup_shape = SubgroupTileShape{}; - Tensor mA_mkl = cute::get_xe_tensor(make_shape(M,K,L)); //(m,k,l) - Tensor mB_nkl = cute::get_xe_tensor(make_shape(N,K,L)); //(n,k,l) + Tensor cA = make_identity_tensor(make_shape(M,K,L)); // (M,K,L) + Tensor cB = make_identity_tensor(make_shape(M,K,L)); // (N,K,L) - Tensor gA = local_tile(mA_mkl, select<0,2>(blk_shape), make_coord(m_coord,_,l_coord)); - Tensor gB = local_tile(mB_nkl, select<1,2>(blk_shape), make_coord(n_coord,_,l_coord)); + Tensor gA = local_tile(cA, select<0,2>(blk_shape), make_coord(m_coord,_,l_coord)); + Tensor gB = local_tile(cB, select<1,2>(blk_shape), make_coord(n_coord,_,l_coord)); // Allocate the tiled_mma and the accumulators for the (M,N) subgroup_shape TiledMma tiled_mma; From ef1bafad27692d64bd94ff6bbc0bfc34b0063650 Mon Sep 17 00:00:00 2001 From: Anamika Chatterjee Date: Tue, 30 Sep 2025 15:39:13 +0300 Subject: [PATCH 03/15] adding legacy code back for collectivemma and gemmuniversal --- .vscode/settings.json | 24 + examples/00_bmg_gemm/00_bmg_gemm.cpp | 4 +- examples/00_bmg_gemm/00_bmg_gemm_legacy.cpp | 429 ++++++++++++++++++ examples/00_bmg_gemm/CMakeLists.txt | 9 + .../gemm/collective/collective_mma.hpp | 1 + .../gemm/collective/collective_mma_decl.hpp | 20 + include/cutlass/gemm/collective/xe_mma.hpp | 4 +- .../cutlass/gemm/collective/xe_mma_legacy.hpp | 269 +++++++++++ .../cutlass/gemm/kernel/gemm_universal.hpp | 1 + .../cutlass/gemm/kernel/gemm_universal_decl.h | 9 + include/cutlass/gemm/kernel/xe_gemm.hpp | 2 +- .../cutlass/gemm/kernel/xe_gemm_legacy.hpp | 284 ++++++++++++ 12 files changed, 1051 insertions(+), 5 deletions(-) create mode 100644 .vscode/settings.json create mode 100644 examples/00_bmg_gemm/00_bmg_gemm_legacy.cpp create mode 100644 include/cutlass/gemm/collective/xe_mma_legacy.hpp create mode 100644 include/cutlass/gemm/kernel/xe_gemm_legacy.hpp diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000000..43b3531021 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,24 @@ +{ + "files.associations": { + "compare": "cpp", + "string": "cpp", + "unordered_map": "cpp", + "vector": "cpp", + "exception": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + "string_view": "cpp", + "random": "cpp", + "initializer_list": "cpp", + "istream": "cpp", + "new": "cpp", + "ostream": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "streambuf": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "typeinfo": "cpp" + } +} \ No newline at end of file diff --git a/examples/00_bmg_gemm/00_bmg_gemm.cpp b/examples/00_bmg_gemm/00_bmg_gemm.cpp index d50e9349ac..312dcd23d0 100644 --- a/examples/00_bmg_gemm/00_bmg_gemm.cpp +++ b/examples/00_bmg_gemm/00_bmg_gemm.cpp @@ -437,7 +437,7 @@ int main(int argc, const char** argv) void, void>; // GEMM Mainloop - iteration over blocks in K dimension - using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< + using CollectiveMainloop = cutlass::gemm::collective::CollectiveMmaNew< GEMMDispatchPolicy, TileShape, ElementInputA, @@ -450,7 +450,7 @@ int main(int argc, const char** argv) >; // Define the whole kernel (mainloop and epilogue) - using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + using GemmKernel = cutlass::gemm::kernel::GemmUniversalNew< Shape, // Defer global problem shape 
definition to runtime CollectiveMainloop, CollectiveEpilogue diff --git a/examples/00_bmg_gemm/00_bmg_gemm_legacy.cpp b/examples/00_bmg_gemm/00_bmg_gemm_legacy.cpp new file mode 100644 index 0000000000..7e9291227e --- /dev/null +++ b/examples/00_bmg_gemm/00_bmg_gemm_legacy.cpp @@ -0,0 +1,429 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief CUTLASS Intel BMG Gemm Example. + + This example constructs and executes a simple CUTLASS GEMM kernel on Intel BMG hardware, and + verifies its correctness with a reference implementation + (cutlass::reference::device::GemmComplex). The example also provides a performance measurement + for the GEMM in TFLOPS. + + This example makes use of BMGs subgroup cooperative 2d-block copy operations and DPAS instructions. + + The shapes of the A and B matrix are defined at runtime by `options.m`, `.n` and `.k`, and the + batch size is defined by `options.l`. The tile shape, which defines how much work is executed by + a single work-group, is defined at compile time by: + ``` + using TileShape = Shape<_256, _256, _32>; + ``` + That is, each work-group processes a tile of M=256, N=256, and iterates over `options.k` in + blocks of K=32. + + Performance of GEMM on BMG is heavily dependent on prefetching the A and B matrices. That is, + executing Intel specific prefetch instructions for future iterations to ensure that the required + blocks of A and B are resident in cache before they are needed. 
+ + To build & run this example (from your build dir): + + $ ninja 00_bmg_gemm + $ ./examples/sycl/00_bmg_gemm/00_bmg_gemm + + Call with `--help` for information about available options +*/ + +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/epilogue/collective/xe_epilogue.hpp" +#include "cutlass/epilogue/fusion/xe_callbacks.hpp" +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/collective/collective_mma.hpp" +#include "cutlass/util/GPU_Clock.hpp" + +#include +#include + +#include "cutlass/util/command_line.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/reference/device/gemm_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "sycl_common.hpp" +#include "helper.h" + +using namespace cute; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + bool error; + + int m, n, k, l, iterations; + float alpha, beta; + + Options(): + help(false), + error(false), + m(5120), n(4096), k(4096), l(1), iterations(20), + alpha(1.f), beta(0.f) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m, 5120); + cmd.get_cmd_line_argument("n", n, 4096); + cmd.get_cmd_line_argument("k", k, 4096); + cmd.get_cmd_line_argument("l", l, 1); + cmd.get_cmd_line_argument("alpha", alpha, 1.f); + cmd.get_cmd_line_argument("beta", beta, 0.f); + cmd.get_cmd_line_argument("iterations", iterations, 100); + } + + /// Prints the usage statement. 
+ std::ostream & print_usage(std::ostream &out) const { + + out << "BMG GEMM Example\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= Sets the L extent (batch count) of the GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --iterations= Iterations\n\n"; + + return out; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + class Gemm +> +struct ExampleRunner { + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = typename Gemm::GemmKernel::StrideD; + + using LayoutA = typename Gemm::LayoutA; + using LayoutB = typename Gemm::LayoutB; + using LayoutC = typename Gemm::LayoutC; + using LayoutD = typename Gemm::LayoutD; + + using ElementA = typename Gemm::ElementA; + using ElementB = typename Gemm::ElementB; + using ElementAcc = typename Gemm::ElementAccumulator; + + using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; + using ElementC = typename Gemm::ElementC; + using ElementOutput = typename CollectiveEpilogue::ElementOutput; + using ElementCompute = typename CollectiveEpilogue::ElementCompute; + using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator; + + using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; + + // + // Data members + // + + /// Initialization + StrideA stride_A; + StrideB stride_B; + StrideC stride_C; + StrideD stride_D; + uint64_t seed = 0; + + cutlass::DeviceAllocation block_A; + cutlass::DeviceAllocation block_B; + cutlass::DeviceAllocation block_C; + cutlass::DeviceAllocation block_D; + cutlass::DeviceAllocation block_ref_D; // Reference GEMM result for verification + + // + // Methods + // + + bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { + auto [M, N, K, L] = problem_size; + + cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K})); + cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N})); + cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N})); + cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); + + cutlass::reference::device::GemmComplex( + {M, N, K}, + alpha, + ref_A, + cutlass::ComplexTransform::kNone, + ref_B, + cutlass::ComplexTransform::kNone, + beta, + ref_C, + ref_D, + ElementAccumulator(0), + L, // batch_count + M * K, // batch_stride_A + K * N, // batch_stride_B + M * N, // batch_stride_C + M * N // batch_stride_D + ); + + // CUTLASS on SYCL uses the compatibility library compat for e.g. 
default in-order queue + compat::wait(); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + bool passed = cutlass::reference::device::BlockCompareEqual( + block_ref_D.get(), block_D.get(), block_D.size()); + + return passed; + } + + /// Initialize operands to be used in the GEMM and reference GEMM + void initialize(const ProblemShapeType& problem_size) { + auto problem_shape_MNKL = cute::append<4>(problem_size, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + // Complete the stride by combining static layout info (StrideA) with runtime size info (M,K,L) + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); + stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); + + block_A.reset(static_cast(M) * K * L); + block_B.reset(static_cast(K) * N * L); + block_C.reset(static_cast(M) * N * L); + block_D.reset(static_cast(M) * N * L); + block_ref_D.reset(static_cast(M) * N * L); + + initialize_block(block_A, seed + 2023); + initialize_block(block_B, seed + 2022); + initialize_block(block_C, seed + 2021); + } + + cutlass::Status run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { + ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; + + initialize(problem_size); + + typename Gemm::GemmKernel::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + {block_A.get(), stride_A, block_B.get(), stride_B}, + {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, + hw_info + }; + + Gemm gemm_op; + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + if (gemm_op.can_implement(arguments) != cutlass::Status::kSuccess){ + std::cout << "Invalid Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; + std::exit(1); + } + + CUTLASS_CHECK(gemm_op.initialize(arguments, workspace.get())); + + // Run the GEMM + CUTLASS_CHECK(gemm_op.run()); + + compat::wait(); + + // Verify that the result is correct + bool passed = verify(problem_size, options.alpha, options.beta); + std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl; + + if(!passed) return cutlass::Status::kErrorInternal; + + if (options.iterations > 0) { + GPU_Clock timer; + timer.start(); + for (int i = 0; i < options.iterations; ++i) { + gemm_op.run(); + } + compat::wait(); + + float cute_time = timer.seconds() / options.iterations; + double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; + std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; + printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000); + } + + return cutlass::Status::kSuccess; + } + +}; + +int main(int argc, const char** argv) +{ + // + // Parse options + // + + Options options; + + options.parse(argc, argv); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." << std::endl; + return -1; + } + + // + // Run examples + // + + // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. 
This + // information is used by the underlying kernel. + cutlass::KernelHardwareInfo hw_info; + + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + + bool passed; + + // The code section below describes datatype for input, output matrices and computation between + // elements in input matrices. + using ElementAccumulator = float; // <- data type of accumulator + using ElementComputeEpilogue = float; // <- data type of epilogue operations + using ElementInputA = bfloat16_t; // <- data type of elements in input matrix A + using ElementInputB = bfloat16_t; // <- data type of elements in input matrix B + using ElementOutput = float; // <- data type of elements in output matrix D + + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + using LayoutC = cutlass::layout::RowMajor; + using LayoutD = cutlass::layout::RowMajor; + + // The 2D block copy operations used for the A and B matrices + using GmemTiledCopyA = XE_2D_U16x32x32_LD_N; + using GmemTiledCopyB = XE_2D_U16x32x32_LD_V; + + // Workgroup-level tile + using TileShape = Shape<_256, _256, _32>; + + // A TiledMMA struct defines a tiling of an MMA atom over M, N and K, combining both additional + // hardware (sub-groups for Intel BMG) and iterations by each sub-group. + // + // The TiledMMAHelper struct defines a specific TiledMMA for a given MMA atom + // (XE_8x16x16_F32BF16BF16F32_TT), TileShape (<256, 256, 32>) and sub-group layout (8x4x1). The + // TiledMMA constructed using TiledMMAHelper has the property that each sub-group operates on a + // single contiguous chunk of the work-group TileShape. For this configuration, this implies that + // each sub-group operates on a contiguous 32x64x32 chunk (4x4x2 iterations). See + // 0t_mma_atom.md#TiledMMAs for more info. Sub-groups are arranged row-major (stride 4,1,0) for + // performance reasons. + using TiledMma = // M=8,N=16,K=16, D=f32,A=bf16,B=bf16,C=f32 + typename TiledMMAHelper, Layout, + Layout, Stride<_4, _1, _0>>>::TiledMMA; + + // For Intel BMG, PipelineStages defines how many k-blocks ahead to prefetch from A and B. + constexpr int PipelineStages = 2; + using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelXeXMX16; + using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeXMX16; + + // This is the 'default' epilogue operation (Linear Combination) which performs everything in: + // (D = alpha * (A*B) + beta * C) + // aside from the (A*B), which is handled by the GEMM. See 05_bmg_gemm_with_epilogues for more + // complex epilogue examples. + using EpilogueOp = cutlass::epilogue::fusion::LinearCombination; + + // FusionCallbacks ties the EpilogueOp to an implementation (based on the dispatch + // policy/architecture) and defines the epilogue arguments. 
+ using FusionCallBacks = cutlass::epilogue::fusion::FusionCallbacks; + // GEMM Epilogue - loads & stores C/D matrices, performs epilogue operations & load/stores any + // auxiliary data required + using CollectiveEpilogue = cutlass::epilogue::collective::CollectiveEpilogue< + EpilogueDispatchPolicy, + TileShape, + ElementAccumulator, + cutlass::gemm::TagToStrideC_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation + ElementOutput, + cutlass::gemm::TagToStrideC_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation + FusionCallBacks, + XE_2D_U32x8x16_LD_N, // The copy atom used to load matrix C + void, void, + XE_2D_U32x8x16_ST_N, // The copy atom used to store matrix D + void, void>; + + // GEMM Mainloop - iteration over blocks in K dimension + using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< + GEMMDispatchPolicy, + TileShape, + ElementInputA, + cutlass::gemm::TagToStrideA_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation + ElementInputB, + cutlass::gemm::TagToStrideB_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation + TiledMma, + GmemTiledCopyA, void, void, cute::identity, // A + GmemTiledCopyB, void, void, cute::identity // B + >; + + // Define the whole kernel (mainloop and epilogue) + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, // Defer global problem shape definition to runtime + CollectiveMainloop, + CollectiveEpilogue + >; + + // The GemmUniversalAdapter wraps the defined GEMM kernel and handles the launch, and e.g. + // persistent scratch memory if required. + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + ExampleRunner runner; + + CUTLASS_CHECK(runner.run(options, hw_info)); + + return 0; +} diff --git a/examples/00_bmg_gemm/CMakeLists.txt b/examples/00_bmg_gemm/CMakeLists.txt index 5bfc4a5e29..6564ad47a9 100644 --- a/examples/00_bmg_gemm/CMakeLists.txt +++ b/examples/00_bmg_gemm/CMakeLists.txt @@ -40,6 +40,15 @@ cutlass_example_add_executable( TEST_SMALL_SHAPE ) +cutlass_example_add_executable( + 00_bmg_gemm_legacy + 00_bmg_gemm_legacy.cpp + TEST_COMMAND_OPTIONS + TEST_BATCHES + TEST_LARGE + TEST_SMALL_SHAPE +) + set(TEST_SMALL_SHAPE_PADDABLE --m=1 --n=1 --k=2 --l=2) cutlass_example_add_executable( 00_bmg_gemm_padded diff --git a/include/cutlass/gemm/collective/collective_mma.hpp b/include/cutlass/gemm/collective/collective_mma.hpp index b54776ece5..3c02ca1efa 100644 --- a/include/cutlass/gemm/collective/collective_mma.hpp +++ b/include/cutlass/gemm/collective/collective_mma.hpp @@ -77,6 +77,7 @@ #endif // !defined(__CUDACC_RTC__) #if defined(SYCL_INTEL_TARGET) +#include "cutlass/gemm/collective/xe_mma_legacy.hpp" #include "cutlass/gemm/collective/xe_mma.hpp" #include "cutlass/gemm/collective/xe_array_mma.hpp" #include "cutlass/gemm/collective/xe_array_mma_fp8.hpp" diff --git a/include/cutlass/gemm/collective/collective_mma_decl.hpp b/include/cutlass/gemm/collective/collective_mma_decl.hpp index a2faa1ff28..a8d2572a71 100644 --- a/include/cutlass/gemm/collective/collective_mma_decl.hpp +++ b/include/cutlass/gemm/collective/collective_mma_decl.hpp @@ -58,6 +58,26 @@ struct CollectiveMma { static_assert(cutlass::detail::dependent_false, "Could not find a mainloop specialization."); }; +template < + class DispatchPolicy, + class TileShape, + class ElementA, + class StrideA, + class ElementB, + class StrideB, + class TiledMma, + class GmemTiledCopyA, + class SmemLayoutAtomA, + class SmemCopyAtomA, + class TransformA, + class GmemTiledCopyB, + class SmemLayoutAtomB, + class SmemCopyAtomB, + class TransformB 
+> +struct CollectiveMmaNew { + static_assert(cutlass::detail::dependent_false, "Could not find a mainloop specialization."); +}; ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace cutlass::gemm::collective diff --git a/include/cutlass/gemm/collective/xe_mma.hpp b/include/cutlass/gemm/collective/xe_mma.hpp index 9357491aa2..668a088939 100644 --- a/include/cutlass/gemm/collective/xe_mma.hpp +++ b/include/cutlass/gemm/collective/xe_mma.hpp @@ -47,7 +47,7 @@ using namespace cute; template -struct CollectiveMma, TileShape_, ElementA_, StrideA_, ElementB_, StrideB_, TiledMma_, +struct CollectiveMmaNew, TileShape_, ElementA_, StrideA_, ElementB_, StrideB_, TiledMma_, GmemTiledCopyA_, SmemLayoutAtomA_, SmemCopyAtomA_, TransformA_, GmemTiledCopyB_, SmemLayoutAtomB_, SmemCopyAtomB_, TransformB_> { // @@ -120,7 +120,7 @@ struct CollectiveMma, TileShape_, Element // Methods // - CollectiveMma() = default; + CollectiveMmaNew() = default; template static constexpr Params diff --git a/include/cutlass/gemm/collective/xe_mma_legacy.hpp b/include/cutlass/gemm/collective/xe_mma_legacy.hpp new file mode 100644 index 0000000000..ac7fe83884 --- /dev/null +++ b/include/cutlass/gemm/collective/xe_mma_legacy.hpp @@ -0,0 +1,269 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/dispatch_policy.hpp" + +#include "cute/algorithm/functional.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cute/algorithm/gemm.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::collective { +using namespace cute; +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct CollectiveMma, TileShape_, ElementA_, StrideA_, ElementB_, StrideB_, TiledMma_, + GmemTiledCopyA_, SmemLayoutAtomA_, SmemCopyAtomA_, TransformA_, GmemTiledCopyB_, SmemLayoutAtomB_, + SmemCopyAtomB_, TransformB_> { + // + // Type Aliases + // + using DispatchPolicy = MainloopIntelXeXMX16; + using WorkgroupTileShape = TileShape_; + using ElementA = ElementA_; + using StrideA = StrideA_; + using ElementB = ElementB_; + using StrideB = StrideB_; + using TiledMma = TiledMma_; + using ElementAccumulator = typename TiledMma::ValTypeC; + using GmemTiledCopyA = GmemTiledCopyA_; + using GmemTiledCopyB = GmemTiledCopyB_; + using SmemLayoutAtomA = SmemLayoutAtomA_; + using SmemLayoutAtomB = SmemLayoutAtomB_; + using SmemCopyAtomA = SmemCopyAtomA_; + using SmemCopyAtomB = SmemCopyAtomB_; + using TransformA = TransformA_; + using TransformB = TransformB_; + using ArchTag = typename DispatchPolicy::ArchTag; + + static_assert(platform::is_same::value, "MainloopIntelXeXMX16 requires that A and B have same type."); + static_assert(std::is_same_v, "Transformation for A is not currently supported on Intel PVC"); + static_assert(std::is_same_v, "Transformation for B is not currently supported on Intel PVC"); + + static constexpr int SubgroupSize = DispatchPolicy::SubgroupSize; + + using MmaAtomShape = typename TiledMma::AtomShape_MNK; + + static constexpr int BLK_M = get<0>(WorkgroupTileShape{}); + static constexpr int BLK_N = get<1>(WorkgroupTileShape{}); + static constexpr int BLK_K = get<2>(WorkgroupTileShape{}); + + static constexpr int ATOM_M = get<1>(typename TiledMma::ThrLayoutVMNK{}.shape()); + static constexpr int ATOM_N = get<2>(typename TiledMma::ThrLayoutVMNK{}.shape()); + static constexpr int ATOM_K = get<3>(typename TiledMma::ThrLayoutVMNK{}.shape()); + + static_assert(BLK_M % TiledMma{}.template tile_size_mnk<0>() == 0, "TiledMma permutation size must match block size."); + static_assert(BLK_N % TiledMma{}.template tile_size_mnk<1>() == 0, "TiledMma permutation size must match block size."); + static_assert(BLK_K % TiledMma{}.template tile_size_mnk<2>() == 0, "TiledMma permutation size must match block size."); + + static constexpr int SG_M = ceil_div(BLK_M, ATOM_M); + static constexpr int SG_N = ceil_div(BLK_N, ATOM_N); + static constexpr int SG_K = ceil_div(BLK_K, ATOM_K); + using SubgroupTileShape = Shape, C, C>; + + // 32 + static constexpr auto Num_SGs = ATOM_N * ATOM_M * ATOM_K; + static constexpr uint32_t MaxThreadsPerBlock = size(TiledMma{}); + + using Copy_A = typename Copy_Traits::template DefaultTiledCopy; + using Copy_B = typename Copy_Traits::template DefaultTiledCopy; + + // Host side kernel arguments + struct Arguments { + ElementA const* ptr_A; + StrideA dA; + ElementB const* ptr_B; + StrideB dB; + }; + + struct Params { + Copy_A tiled_copy_a; + Copy_B tiled_copy_b; + }; + + // + // Methods + // + + CollectiveMma() = default; + + template + static constexpr Params + 
to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + (void) workspace; + + auto [M,N,K,L] = problem_shape; + + auto mA_mkl = make_tensor(make_gmem_ptr(args.ptr_A), make_layout(make_shape(M, K, L), args.dA)); + auto mB_nkl = make_tensor(make_gmem_ptr(args.ptr_B), make_layout(make_shape(N, K, L), args.dB)); + Copy_A tiled_copy_a{Copy_A{}.with(mA_mkl)}; + Copy_B tiled_copy_b{Copy_B{}.with(mB_nkl)}; + + return Params{tiled_copy_a, tiled_copy_b}; + } + + template + static bool + can_implement( + ProblemShape problem_shapes, + Arguments const& args) { + constexpr int copy_alignment_bits = 128; + constexpr int batch_alignment_bits = 512; + auto problem_shape_MNKL = append<4>(problem_shapes, 1); + auto [M,N,K,L] = problem_shape_MNKL; + + bool implementable = true; + + constexpr int min_aligned_elements_A = copy_alignment_bits / sizeof_bits::value; + implementable &= cutlass::detail::check_alignment(cute::make_shape(M,K,L), args.dA); + constexpr int min_aligned_elements_B = copy_alignment_bits / sizeof_bits::value; + implementable &= cutlass::detail::check_alignment(cute::make_shape(N,K,L), args.dB); + + if (L > 1) { + constexpr int min_batch_aligned_elements_A = batch_alignment_bits / sizeof_bits::value; + implementable &= get<2>(args.dA) % min_batch_aligned_elements_A == 0; + constexpr int min_batch_aligned_elements_B = batch_alignment_bits / sizeof_bits::value; + implementable &= get<2>(args.dB) % min_batch_aligned_elements_B == 0; + } + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for XE 2D copy.\n"); + } + + return implementable; + } + + /// Perform a subgroup-scoped matrix multiply-accumulate + template + CUTLASS_DEVICE void operator()(FrgTensorD &accum, TensorA gA, TensorB gB, FrgTensorC const &src_accum, + KTileIterator k_tile_iter, int k_tile_count, BlkCoord const &blk_coord, int const &K_start, int thread_idx, + Params const &mainloop) { + (void)blk_coord; + static_assert(is_rmem::value, "D tensor must be rmem resident."); + static_assert(is_rmem::value, "C tensor must be rmem resident."); + + auto thr_copy_A = mainloop.tiled_copy_a.get_slice(thread_idx); + auto thr_copy_B = mainloop.tiled_copy_b.get_slice(thread_idx); + + // Instantiate the MMA object and get thread slice + TiledMma tiled_mma; + // TODO(Codeplay): see if we can make this nicer + // To make all work items in a subgroup have the same global tensors pass in the index of work item 0 in each subgroup + auto sg = compat::get_nd_item<1>().get_sub_group(); + auto first_thread_in_sg_idx = sg.get_group_linear_id() * DispatchPolicy::SubgroupSize; + auto thr_mma = tiled_mma.get_slice(first_thread_in_sg_idx); + + // Partition global counting tensors for MMA + Tensor tCgA = thr_mma.partition_A(gA); + Tensor tCgB = thr_mma.partition_B(gB); + + Tensor tCrA = make_tensor(make_fragment_layout(mainloop.tiled_copy_a, tCgA(_,_,_,0).shape())); + Tensor tCrB = make_tensor(make_fragment_layout(mainloop.tiled_copy_b, tCgB(_,_,_,0).shape())); + + // Retile registers for copies + Tensor tArA = thr_copy_A.retile_D(tCrA); + Tensor tBrB = thr_copy_B.retile_D(tCrB); + + // Retile global counting tensors for copies + Tensor tAgA = thr_copy_A.retile_S(tCgA); + Tensor tBgB = thr_copy_B.retile_S(tCgB); + + auto tiled_prefetch_a = cute::prefetch_selector,Int>, Num_SGs>(mainloop.tiled_copy_a); + auto tiled_prefetch_b = cute::prefetch_selector,Int>, Num_SGs>(mainloop.tiled_copy_b); + auto thr_prefetch_A = 
tiled_prefetch_a.get_slice(thread_idx); + auto thr_prefetch_B = tiled_prefetch_b.get_slice(thread_idx); + + // Partition global tile for prefetch + auto pAgA = thr_prefetch_A.partition_S(gA); + auto pBgB = thr_prefetch_B.partition_S(gB); + +#if CUTLASS_ENABLE_DEBUG_PRINTS +#define PRINT(x) print(#x ": "); print(x); print("\n"); + if (cute::thread(LOG_THREAD, LOG_GROUP)) { + print("======================= A: \n"); + PRINT(tCgA); + PRINT(tAgA); + + PRINT(tCrA); + PRINT(tArA); + PRINT(mainloop.tiled_copy_a); + + print("======================= B: \n"); + PRINT(tCgB); + PRINT(tBgB); + + PRINT(tCrB); + PRINT(tBrB); + PRINT(mainloop.tiled_copy_b); + } +#undef PRINT +#endif + + // + // Mainloop + // + const auto k_start_idx = crd2idx((*k_tile_iter), make_shape(K_start)); + constexpr int barrier_scope = 2; + int prefetch_k = k_start_idx; + + CUTLASS_PRAGMA_UNROLL + for (; prefetch_k < DispatchPolicy::Stages; prefetch_k++) { + prefetch(tiled_prefetch_a, pAgA(_, _, _, prefetch_k)); + prefetch(tiled_prefetch_b, pBgB(_, _, _, prefetch_k)); + } + + for (int k_tile = k_start_idx; k_tile < k_tile_count + k_start_idx; k_tile++, prefetch_k++) { + barrier_arrive(barrier_scope); + // Copy gmem to rmem for the first k_tile + copy(mainloop.tiled_copy_a, tAgA(_,_,_,k_tile), tArA); + copy(mainloop.tiled_copy_b, tBgB(_,_,_,k_tile), tBrB); + + if (prefetch_k < k_tile_count) { + prefetch(tiled_prefetch_a, pAgA(_, _, _, prefetch_k)); + prefetch(tiled_prefetch_b, pBgB(_, _, _, prefetch_k)); + } + + cute::gemm(tiled_mma, tCrA, tCrB, accum); + barrier_wait(barrier_scope); + } + } +}; + +} // namespace cutlass::gemm::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// \ No newline at end of file diff --git a/include/cutlass/gemm/kernel/gemm_universal.hpp b/include/cutlass/gemm/kernel/gemm_universal.hpp index 69137d2114..3f6fa9696a 100644 --- a/include/cutlass/gemm/kernel/gemm_universal.hpp +++ b/include/cutlass/gemm/kernel/gemm_universal.hpp @@ -78,6 +78,7 @@ struct IsCutlass3ArrayKernel class GemmUniversal; +template < + class ProblemShapeOrThreadblockMma_, // (m, n, k) or (m, n, k, l) + class CollectiveMainloopOrEpilogue_, + class CollectiveEpilogueOrThreadblockSwizzle_, + class TileScheduler_ = void, + class Enable = void +> +class GemmUniversalNew; + } // namespace cutlass::gemm::kernel diff --git a/include/cutlass/gemm/kernel/xe_gemm.hpp b/include/cutlass/gemm/kernel/xe_gemm.hpp index 43e9d033e1..63d4467cd4 100644 --- a/include/cutlass/gemm/kernel/xe_gemm.hpp +++ b/include/cutlass/gemm/kernel/xe_gemm.hpp @@ -47,7 +47,7 @@ template < class CollectiveEpilogue_, class TileScheduler_ > -class GemmUniversal< +class GemmUniversalNew< ProblemShape_, CollectiveMainloop_, CollectiveEpilogue_, diff --git a/include/cutlass/gemm/kernel/xe_gemm_legacy.hpp b/include/cutlass/gemm/kernel/xe_gemm_legacy.hpp new file mode 100644 index 0000000000..04e6ecfc99 --- /dev/null +++ b/include/cutlass/gemm/kernel/xe_gemm_legacy.hpp @@ -0,0 +1,284 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/kernel_hardware_info.hpp" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/dispatch_policy.hpp" + +#include "cute/tensor.hpp" + +namespace cutlass::gemm::kernel { + +/////////////////////////////////////////////////////////////////////////////// + +template < + class ProblemShape_, + class CollectiveMainloop_, + class CollectiveEpilogue_, + class TileScheduler_ +> +class GemmUniversal< + ProblemShape_, + CollectiveMainloop_, + CollectiveEpilogue_, + TileScheduler_, + cute::enable_if_t>> +{ +public: + // + // Type Aliases + // + using ProblemShape = ProblemShape_; + + static_assert(rank(ProblemShape{}) == 3 or rank(ProblemShape{}) == 4, + "ProblemShape{} should be or "); + + // Mainloop derived types + using CollectiveMainloop = CollectiveMainloop_; + using TileShape = typename CollectiveMainloop::WorkgroupTileShape; + using WorkgroupTileShape = TileShape; + using TiledMma = typename CollectiveMainloop::TiledMma; + using ArchTag = typename CollectiveMainloop::ArchTag; + using ElementA = typename CollectiveMainloop::ElementA; + using StrideA = typename CollectiveMainloop::StrideA; + using ElementB = typename CollectiveMainloop::ElementB; + using StrideB = typename CollectiveMainloop::StrideB; + using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy; + using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator; + using MainloopArguments = typename CollectiveMainloop::Arguments; + using ClusterShape = typename DispatchPolicy::ClusterShape; + using MainloopParams = typename CollectiveMainloop::Params; + + static_assert(cute::is_void_v or cute::is_same_v, + "Intel Xe does not support specializing the tile scheduler."); + using TileSchedulerTag = TileScheduler_; + using TileScheduler = typename detail::TileSchedulerSelector< + TileScheduler_, ArchTag, WorkgroupTileShape, + cute::Shape, cute::Int<1>, cute::Int<1>>>::Scheduler; + using TileSchedulerArguments = typename TileScheduler::Arguments; + using TileSchedulerParams = typename TileScheduler::Params; + + // Epilogue derived types + using CollectiveEpilogue = CollectiveEpilogue_; + using ElementC = typename CollectiveEpilogue::ElementC; + using 
StrideC = typename CollectiveEpilogue::StrideC; + using ElementD = typename CollectiveEpilogue::ElementD; + using StrideD = typename CollectiveEpilogue::StrideD; + using EpilogueArguments = typename CollectiveEpilogue::Arguments; + using EpilogueParams = typename CollectiveEpilogue::Params; + static_assert(cute::is_same_v, + "Mainloop and epilogue do not agree on accumulator value type."); + + // MSVC requires the cast to fix a warning-as-error. + static constexpr int SharedStorageSize = 0; + + static constexpr int SubgroupSize = CollectiveMainloop::SubgroupSize; // sub_group size + static constexpr uint32_t MaxThreadsPerBlock = CollectiveMainloop::MaxThreadsPerBlock; + using MmaAtomShape = typename CollectiveMainloop::MmaAtomShape; + using SubgroupTileShape = typename CollectiveMainloop::SubgroupTileShape; + + // Kernel level shared memory storage + struct SharedStorage { + using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage; + EpilogueTensorStorage epilogue; + }; + + // Device side arguments + struct Arguments { + GemmUniversalMode mode{}; + ProblemShape problem_shape{}; + MainloopArguments mainloop{}; + EpilogueArguments epilogue{}; + KernelHardwareInfo hw_info{}; + TileSchedulerArguments scheduler{}; + }; + + // Kernel entry point API + struct Params { + GemmUniversalMode mode{}; + ProblemShape problem_shape{}; + MainloopParams mainloop{}; + EpilogueParams epilogue{}; + KernelHardwareInfo hw_info{}; + TileSchedulerParams scheduler{}; + }; + + // + // Methods + // + + // Convert to underlying arguments. In this case, a simple copy for the aliased type. + static + Params + to_underlying_arguments(Arguments const& args, void* workspace) { + (void) workspace; + auto problem_shape_MNKL = append<4>(args.problem_shape, 1); + + auto mainloop_args = CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, workspace); + TileSchedulerParams scheduler = TileScheduler::to_underlying_arguments( + problem_shape_MNKL, TileShape{}, ClusterShape{}, args.hw_info, args.scheduler, &workspace); + return { + args.mode, + args.problem_shape, + mainloop_args, + CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, workspace), + args.hw_info, + scheduler + }; + } + + static bool + can_implement(Arguments const& args) { + bool implementable = true; + + implementable = implementable && (args.mode == GemmUniversalMode::kGemm || + (args.mode == GemmUniversalMode::kBatched && rank(ProblemShape{}) == 4)); + + implementable &= TileScheduler::can_implement(args.scheduler); + + implementable &= CollectiveMainloop::can_implement(args.problem_shape, args.mainloop); + implementable &= CollectiveEpilogue::can_implement(args.problem_shape, args.epilogue); + + return implementable; + } + + static int + get_workspace_size(Arguments const& args) { + return 0; + } + + static + cutlass::Status + initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr, + CudaHostAdapter* cuda_adapter = nullptr) { + return Status::kSuccess; + } + + static dim3 + get_grid_shape(Params const& params) { + dim3 grid = TileScheduler::get_tiled_cta_shape_mnl(params.problem_shape, TileShape{}, ClusterShape{}); + if(params.scheduler.raster_order_ == TileScheduler::RasterOrder::AlongN) { + return {grid.y, grid.x, grid.z}; + } else { + return {grid.x, grid.y, grid.z}; + } + } + + static dim3 + get_block_shape() { + return dim3(MaxThreadsPerBlock, 1, 1); + } + + CUTLASS_DEVICE + void + operator()(Params const& params, char* smem_buf) { + 
SharedStorage& shared_storage = *reinterpret_cast(smem_buf); + // Preconditions + CUTE_STATIC_ASSERT(is_static::value); + + // Separate out problem shape for convenience + // Optionally append 1s until problem shape is rank-4 in case its is only rank-3 (MNK) + auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{}); + auto M = get<0>(problem_shape_MNKL); + auto N = get<1>(problem_shape_MNKL); + auto K = get<2>(problem_shape_MNKL); + auto L = get<3>(problem_shape_MNKL); + + // Preconditions + static_assert(cute::rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>."); + static_assert(cute::rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>."); + static_assert(cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>."); + static_assert(cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>."); + + // Get the appropriate blocks for this sub_group -- potential for sub_group locality + int thread_idx = int(ThreadIdxX()); + auto blk_shape = TileShape{}; + int m_coord, n_coord, l_coord; + if (params.scheduler.raster_order_ == TileScheduler::RasterOrder::AlongN) { + m_coord = BlockIdxY(); + n_coord = BlockIdxX(); + l_coord = BlockIdxZ(); + } else { + m_coord = BlockIdxX(); + n_coord = BlockIdxY(); + l_coord = BlockIdxZ(); + } + + auto blk_coord_mnkl = make_coord(m_coord, n_coord, _, l_coord); + constexpr auto workgroup_shape = WorkgroupTileShape{}; // (SUB_M,SUB_N,SUB_K) + constexpr auto subgroup_shape = SubgroupTileShape{}; + + Tensor mA_mkl = cute::get_xe_tensor(make_shape(M,K,L)); //(m,k,l) + Tensor mB_nkl = cute::get_xe_tensor(make_shape(N,K,L)); //(n,k,l) + + Tensor gA = local_tile(mA_mkl, select<0,2>(blk_shape), make_coord(m_coord,_,l_coord)); + Tensor gB = local_tile(mB_nkl, select<1,2>(blk_shape), make_coord(n_coord,_,l_coord)); + + // Allocate the tiled_mma and the accumulators for the (M,N) subgroup_shape + TiledMma tiled_mma; + + Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape)); + clear(accumulators); + + auto k_tile_iter = cute::make_coord_iterator(idx2crd(0, make_shape(K)), make_shape(K)); + int k_tile_count = ceil_div(K, get<2>(workgroup_shape)); + + // Perform the collective scoped MMA + CollectiveMainloop collective_mma; + collective_mma( + accumulators, + gA, + gB, + accumulators, + k_tile_iter, k_tile_count, + blk_coord_mnkl, // TODO(codeplay): Remove this once unneeded in xe_mma_mixed_input.hpp + K, + thread_idx, + params.mainloop + ); + + CollectiveEpilogue epilogue{params.epilogue, shared_storage.epilogue}; + epilogue( + problem_shape_MNKL, + subgroup_shape, // TODO(codeplay): Inconsistency here w/ blk_coord_mnkl + blk_coord_mnkl, + accumulators, + tiled_mma, + thread_idx + ); + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::kernel \ No newline at end of file From f210ba363a1208cc031b305976e5d5c9930126df Mon Sep 17 00:00:00 2001 From: Anamika Chatterjee Date: Tue, 30 Sep 2025 15:42:09 +0300 Subject: [PATCH 04/15] delete unwanted file --- .vscode/settings.json | 24 ------------------------ 1 file changed, 24 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 43b3531021..0000000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - 
"files.associations": { - "compare": "cpp", - "string": "cpp", - "unordered_map": "cpp", - "vector": "cpp", - "exception": "cpp", - "memory": "cpp", - "memory_resource": "cpp", - "string_view": "cpp", - "random": "cpp", - "initializer_list": "cpp", - "istream": "cpp", - "new": "cpp", - "ostream": "cpp", - "sstream": "cpp", - "stdexcept": "cpp", - "streambuf": "cpp", - "system_error": "cpp", - "tuple": "cpp", - "type_traits": "cpp", - "typeinfo": "cpp" - } -} \ No newline at end of file From 5f5a8b789523d94557d98aca076afd2bfd024a63 Mon Sep 17 00:00:00 2001 From: Anamika Chatterjee Date: Wed, 1 Oct 2025 12:22:31 +0300 Subject: [PATCH 05/15] Changes added based on feedback --- examples/00_bmg_gemm/00_bmg_gemm.cpp | 64 ++----------------- .../gemm/collective/collective_mma_decl.hpp | 20 ------ include/cutlass/gemm/collective/xe_mma.hpp | 8 +-- include/cutlass/gemm/dispatch_policy.hpp | 14 ++++ .../cutlass/gemm/kernel/gemm_universal.hpp | 1 - .../cutlass/gemm/kernel/gemm_universal_decl.h | 9 --- include/cutlass/gemm/kernel/xe_gemm.hpp | 4 +- 7 files changed, 26 insertions(+), 94 deletions(-) diff --git a/examples/00_bmg_gemm/00_bmg_gemm.cpp b/examples/00_bmg_gemm/00_bmg_gemm.cpp index 312dcd23d0..ada885eb4f 100644 --- a/examples/00_bmg_gemm/00_bmg_gemm.cpp +++ b/examples/00_bmg_gemm/00_bmg_gemm.cpp @@ -81,58 +81,6 @@ using namespace cute; -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Helper template to check if a type is complete -template -struct is_complete : std::false_type {}; - -template -struct is_complete : std::true_type {}; - -template -static constexpr bool is_complete_v = is_complete::value; - - -template -auto -choose_mma_op() -{ - if constexpr (is_complete_v>) - return XE_DPAS_TT<8, TC, TA, TB>{}; - else if constexpr (is_same_v, cute::bfloat16_t>) - return XE_DPAS_TT<8, float, cute::bfloat16_t>{}; - else /* Use f16 by default as upconversion sequences are typically faster */ - return XE_DPAS_TT<8, float, cute::half_t>{}; -} - -// Helper function to choose tiled MMA based on tensor properties -template -auto -choose_tiled_mma() -{ - - auto op = choose_mma_op(); - - constexpr bool byte = (cute::max(sizeof_bits_v, sizeof_bits_v) <= 8); - - constexpr bool is_A_transposed = std::is_same_v; - constexpr bool is_B_transposed = std::is_same_v; - constexpr bool use_1x_dpas_per_k = is_A_transposed || (byte && is_B_transposed); - - - using _K = conditional_t, C>; - - using WGTile = Shape<_256, _256, _K>; // 256x256 WG tile size - using SGLayout = Layout, Stride<_4, _1, _0>>; // 8x4 SG tiling, n-major - - using MMA = typename TiledMMAHelper, Layout, SGLayout>::TiledMMA; - - return MMA{}; -} - - /////////////////////////////////////////////////////////////////////////////////////////////////// // Command line options parsing @@ -401,13 +349,13 @@ int main(int argc, const char** argv) using GmemTiledCopyA = XE_2D_U16x32x32_LD_N; using GmemTiledCopyB = XE_2D_U16x32x32_LD_V; - // Workgroup-level tile - using TiledMma = decltype(choose_tiled_mma()); - using TileShape = decltype(TiledMma{}.tile_mnk()); + // New MMA atom XE_DPAS_TT using workgroup-level tile shape of 256×256×32 + using TileShape = Shape<_256, _256, _32>; + using TiledMma = typename TiledMMAHelper>, Layout, Layout, Stride<_4, _1, _0>>>::TiledMMA; // For Intel BMG, PipelineStages defines how many k-blocks ahead to prefetch from A and B. 
constexpr int PipelineStages = 2; - using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelXeXMX16; + using GEMMDispatchPolicy = cutlass::gemm::MainloopXeL1Staged; using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeXMX16; // This is the 'default' epilogue operation (Linear Combination) which performs everything in: @@ -437,7 +385,7 @@ int main(int argc, const char** argv) void, void>; // GEMM Mainloop - iteration over blocks in K dimension - using CollectiveMainloop = cutlass::gemm::collective::CollectiveMmaNew< + using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< GEMMDispatchPolicy, TileShape, ElementInputA, @@ -450,7 +398,7 @@ int main(int argc, const char** argv) >; // Define the whole kernel (mainloop and epilogue) - using GemmKernel = cutlass::gemm::kernel::GemmUniversalNew< + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< Shape, // Defer global problem shape definition to runtime CollectiveMainloop, CollectiveEpilogue diff --git a/include/cutlass/gemm/collective/collective_mma_decl.hpp b/include/cutlass/gemm/collective/collective_mma_decl.hpp index a8d2572a71..a2faa1ff28 100644 --- a/include/cutlass/gemm/collective/collective_mma_decl.hpp +++ b/include/cutlass/gemm/collective/collective_mma_decl.hpp @@ -58,26 +58,6 @@ struct CollectiveMma { static_assert(cutlass::detail::dependent_false, "Could not find a mainloop specialization."); }; -template < - class DispatchPolicy, - class TileShape, - class ElementA, - class StrideA, - class ElementB, - class StrideB, - class TiledMma, - class GmemTiledCopyA, - class SmemLayoutAtomA, - class SmemCopyAtomA, - class TransformA, - class GmemTiledCopyB, - class SmemLayoutAtomB, - class SmemCopyAtomB, - class TransformB -> -struct CollectiveMmaNew { - static_assert(cutlass::detail::dependent_false, "Could not find a mainloop specialization."); -}; ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace cutlass::gemm::collective diff --git a/include/cutlass/gemm/collective/xe_mma.hpp b/include/cutlass/gemm/collective/xe_mma.hpp index 668a088939..8df8945a7e 100644 --- a/include/cutlass/gemm/collective/xe_mma.hpp +++ b/include/cutlass/gemm/collective/xe_mma.hpp @@ -47,13 +47,13 @@ using namespace cute; template -struct CollectiveMmaNew, TileShape_, ElementA_, StrideA_, ElementB_, StrideB_, TiledMma_, +struct CollectiveMma, TileShape_, ElementA_, StrideA_, ElementB_, StrideB_, TiledMma_, GmemTiledCopyA_, SmemLayoutAtomA_, SmemCopyAtomA_, TransformA_, GmemTiledCopyB_, SmemLayoutAtomB_, SmemCopyAtomB_, TransformB_> { // // Type Aliases // - using DispatchPolicy = MainloopIntelXeXMX16; + using DispatchPolicy = MainloopXeL1Staged; using WorkgroupTileShape = TileShape_; using ElementA = ElementA_; using StrideA = StrideA_; @@ -71,7 +71,7 @@ struct CollectiveMmaNew, TileShape_, Elem using TransformB = TransformB_; using ArchTag = typename DispatchPolicy::ArchTag; - static_assert(platform::is_same::value, "MainloopIntelXeXMX16 requires that A and B have same type."); + static_assert(platform::is_same::value, "MainloopXeL1Staged requires that A and B have same type."); static_assert(std::is_same_v, "Transformation for A is not currently supported on Intel PVC"); static_assert(std::is_same_v, "Transformation for B is not currently supported on Intel PVC"); @@ -120,7 +120,7 @@ struct CollectiveMmaNew, TileShape_, Elem // Methods // - CollectiveMmaNew() = default; + CollectiveMma() = default; template static constexpr Params diff --git 
a/include/cutlass/gemm/dispatch_policy.hpp b/include/cutlass/gemm/dispatch_policy.hpp
index b742dfd76f..423e14d10f 100644
--- a/include/cutlass/gemm/dispatch_policy.hpp
+++ b/include/cutlass/gemm/dispatch_policy.hpp
@@ -1247,6 +1247,20 @@ struct MainloopDeviceAgnostic {
   using Schedule = KernelMultistage;
 };
 #endif
+
+#if defined(CUTLASS_ENABLE_SYCL)
+// Note: This dispatch policy is specifically added for CollectiveMma to support
+// the integration of new MMA atoms (XE_DPAS_TT) and copy atoms for the Intel XE architecture
+template
+struct MainloopXeL1Staged {
+  constexpr static int Stages = Stages_;
+  constexpr static int SubgroupSize = 16;
+  using ArchTag = arch::IntelXe;
+  using Schedule = KernelSchedule;
+  using ClusterShape = Shape<_1,_1,_1>;
+};
+#endif
+
 // n-buffer in smem, pipelined with Blackwell UMMA and TMA, Warp specialized dynamic schedule
 template<
   int LoadABPipelineStageCount_,
diff --git a/include/cutlass/gemm/kernel/gemm_universal.hpp b/include/cutlass/gemm/kernel/gemm_universal.hpp
index 3f6fa9696a..69137d2114 100644
--- a/include/cutlass/gemm/kernel/gemm_universal.hpp
+++ b/include/cutlass/gemm/kernel/gemm_universal.hpp
@@ -78,7 +78,6 @@ struct IsCutlass3ArrayKernel
 class GemmUniversal;
-template <
-  class ProblemShapeOrThreadblockMma_, // (m, n, k) or (m, n, k, l)
-  class CollectiveMainloopOrEpilogue_,
-  class CollectiveEpilogueOrThreadblockSwizzle_,
-  class TileScheduler_ = void,
-  class Enable = void
->
-class GemmUniversalNew;
-
 } // namespace cutlass::gemm::kernel
diff --git a/include/cutlass/gemm/kernel/xe_gemm.hpp b/include/cutlass/gemm/kernel/xe_gemm.hpp
index 63d4467cd4..7a13d0fc1f 100644
--- a/include/cutlass/gemm/kernel/xe_gemm.hpp
+++ b/include/cutlass/gemm/kernel/xe_gemm.hpp
@@ -47,7 +47,7 @@ template <
   class CollectiveEpilogue_,
   class TileScheduler_
 >
-class GemmUniversalNew<
+class GemmUniversal<
   ProblemShape_,
   CollectiveMainloop_,
   CollectiveEpilogue_,
@@ -239,7 +239,7 @@ class GemmUniversalNew<
     constexpr auto subgroup_shape = SubgroupTileShape{};
 
     Tensor cA = make_identity_tensor(make_shape(M,K,L));   // (M,K,L)
-    Tensor cB = make_identity_tensor(make_shape(M,K,L));   // (N,K,L)
+    Tensor cB = make_identity_tensor(make_shape(N,K,L));   // (N,K,L)
 
     Tensor gA = local_tile(cA, select<0,2>(blk_shape), make_coord(m_coord,_,l_coord));
     Tensor gB = local_tile(cB, select<1,2>(blk_shape), make_coord(n_coord,_,l_coord));

From c55ac28007df91d94b2037c25e642578d0ef5610 Mon Sep 17 00:00:00 2001
From: Anamika Chatterjee
Date: Wed, 1 Oct 2025 12:29:23 +0300
Subject: [PATCH 06/15] Remove xe_gemm_legacy as it is no longer used

---
 .../cutlass/gemm/kernel/xe_gemm_legacy.hpp | 284 ------------------
 1 file changed, 284 deletions(-)
 delete mode 100644 include/cutlass/gemm/kernel/xe_gemm_legacy.hpp

diff --git a/include/cutlass/gemm/kernel/xe_gemm_legacy.hpp b/include/cutlass/gemm/kernel/xe_gemm_legacy.hpp
deleted file mode 100644
index 04e6ecfc99..0000000000
--- a/include/cutlass/gemm/kernel/xe_gemm_legacy.hpp
+++ /dev/null
@@ -1,284 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2.
Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -#pragma once - -#include "cutlass/cutlass.h" -#include "cutlass/kernel_hardware_info.hpp" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/dispatch_policy.hpp" - -#include "cute/tensor.hpp" - -namespace cutlass::gemm::kernel { - -/////////////////////////////////////////////////////////////////////////////// - -template < - class ProblemShape_, - class CollectiveMainloop_, - class CollectiveEpilogue_, - class TileScheduler_ -> -class GemmUniversal< - ProblemShape_, - CollectiveMainloop_, - CollectiveEpilogue_, - TileScheduler_, - cute::enable_if_t>> -{ -public: - // - // Type Aliases - // - using ProblemShape = ProblemShape_; - - static_assert(rank(ProblemShape{}) == 3 or rank(ProblemShape{}) == 4, - "ProblemShape{} should be or "); - - // Mainloop derived types - using CollectiveMainloop = CollectiveMainloop_; - using TileShape = typename CollectiveMainloop::WorkgroupTileShape; - using WorkgroupTileShape = TileShape; - using TiledMma = typename CollectiveMainloop::TiledMma; - using ArchTag = typename CollectiveMainloop::ArchTag; - using ElementA = typename CollectiveMainloop::ElementA; - using StrideA = typename CollectiveMainloop::StrideA; - using ElementB = typename CollectiveMainloop::ElementB; - using StrideB = typename CollectiveMainloop::StrideB; - using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy; - using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator; - using MainloopArguments = typename CollectiveMainloop::Arguments; - using ClusterShape = typename DispatchPolicy::ClusterShape; - using MainloopParams = typename CollectiveMainloop::Params; - - static_assert(cute::is_void_v or cute::is_same_v, - "Intel Xe does not support specializing the tile scheduler."); - using TileSchedulerTag = TileScheduler_; - using TileScheduler = typename detail::TileSchedulerSelector< - TileScheduler_, ArchTag, WorkgroupTileShape, - cute::Shape, cute::Int<1>, cute::Int<1>>>::Scheduler; - using TileSchedulerArguments = typename TileScheduler::Arguments; - using TileSchedulerParams = typename TileScheduler::Params; - - // Epilogue derived types - using CollectiveEpilogue = CollectiveEpilogue_; - using ElementC = typename CollectiveEpilogue::ElementC; - using 
StrideC = typename CollectiveEpilogue::StrideC; - using ElementD = typename CollectiveEpilogue::ElementD; - using StrideD = typename CollectiveEpilogue::StrideD; - using EpilogueArguments = typename CollectiveEpilogue::Arguments; - using EpilogueParams = typename CollectiveEpilogue::Params; - static_assert(cute::is_same_v, - "Mainloop and epilogue do not agree on accumulator value type."); - - // MSVC requires the cast to fix a warning-as-error. - static constexpr int SharedStorageSize = 0; - - static constexpr int SubgroupSize = CollectiveMainloop::SubgroupSize; // sub_group size - static constexpr uint32_t MaxThreadsPerBlock = CollectiveMainloop::MaxThreadsPerBlock; - using MmaAtomShape = typename CollectiveMainloop::MmaAtomShape; - using SubgroupTileShape = typename CollectiveMainloop::SubgroupTileShape; - - // Kernel level shared memory storage - struct SharedStorage { - using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage; - EpilogueTensorStorage epilogue; - }; - - // Device side arguments - struct Arguments { - GemmUniversalMode mode{}; - ProblemShape problem_shape{}; - MainloopArguments mainloop{}; - EpilogueArguments epilogue{}; - KernelHardwareInfo hw_info{}; - TileSchedulerArguments scheduler{}; - }; - - // Kernel entry point API - struct Params { - GemmUniversalMode mode{}; - ProblemShape problem_shape{}; - MainloopParams mainloop{}; - EpilogueParams epilogue{}; - KernelHardwareInfo hw_info{}; - TileSchedulerParams scheduler{}; - }; - - // - // Methods - // - - // Convert to underlying arguments. In this case, a simple copy for the aliased type. - static - Params - to_underlying_arguments(Arguments const& args, void* workspace) { - (void) workspace; - auto problem_shape_MNKL = append<4>(args.problem_shape, 1); - - auto mainloop_args = CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, workspace); - TileSchedulerParams scheduler = TileScheduler::to_underlying_arguments( - problem_shape_MNKL, TileShape{}, ClusterShape{}, args.hw_info, args.scheduler, &workspace); - return { - args.mode, - args.problem_shape, - mainloop_args, - CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, workspace), - args.hw_info, - scheduler - }; - } - - static bool - can_implement(Arguments const& args) { - bool implementable = true; - - implementable = implementable && (args.mode == GemmUniversalMode::kGemm || - (args.mode == GemmUniversalMode::kBatched && rank(ProblemShape{}) == 4)); - - implementable &= TileScheduler::can_implement(args.scheduler); - - implementable &= CollectiveMainloop::can_implement(args.problem_shape, args.mainloop); - implementable &= CollectiveEpilogue::can_implement(args.problem_shape, args.epilogue); - - return implementable; - } - - static int - get_workspace_size(Arguments const& args) { - return 0; - } - - static - cutlass::Status - initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr, - CudaHostAdapter* cuda_adapter = nullptr) { - return Status::kSuccess; - } - - static dim3 - get_grid_shape(Params const& params) { - dim3 grid = TileScheduler::get_tiled_cta_shape_mnl(params.problem_shape, TileShape{}, ClusterShape{}); - if(params.scheduler.raster_order_ == TileScheduler::RasterOrder::AlongN) { - return {grid.y, grid.x, grid.z}; - } else { - return {grid.x, grid.y, grid.z}; - } - } - - static dim3 - get_block_shape() { - return dim3(MaxThreadsPerBlock, 1, 1); - } - - CUTLASS_DEVICE - void - operator()(Params const& params, char* smem_buf) { - 
SharedStorage& shared_storage = *reinterpret_cast(smem_buf); - // Preconditions - CUTE_STATIC_ASSERT(is_static::value); - - // Separate out problem shape for convenience - // Optionally append 1s until problem shape is rank-4 in case its is only rank-3 (MNK) - auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{}); - auto M = get<0>(problem_shape_MNKL); - auto N = get<1>(problem_shape_MNKL); - auto K = get<2>(problem_shape_MNKL); - auto L = get<3>(problem_shape_MNKL); - - // Preconditions - static_assert(cute::rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>."); - static_assert(cute::rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>."); - static_assert(cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>."); - static_assert(cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>."); - - // Get the appropriate blocks for this sub_group -- potential for sub_group locality - int thread_idx = int(ThreadIdxX()); - auto blk_shape = TileShape{}; - int m_coord, n_coord, l_coord; - if (params.scheduler.raster_order_ == TileScheduler::RasterOrder::AlongN) { - m_coord = BlockIdxY(); - n_coord = BlockIdxX(); - l_coord = BlockIdxZ(); - } else { - m_coord = BlockIdxX(); - n_coord = BlockIdxY(); - l_coord = BlockIdxZ(); - } - - auto blk_coord_mnkl = make_coord(m_coord, n_coord, _, l_coord); - constexpr auto workgroup_shape = WorkgroupTileShape{}; // (SUB_M,SUB_N,SUB_K) - constexpr auto subgroup_shape = SubgroupTileShape{}; - - Tensor mA_mkl = cute::get_xe_tensor(make_shape(M,K,L)); //(m,k,l) - Tensor mB_nkl = cute::get_xe_tensor(make_shape(N,K,L)); //(n,k,l) - - Tensor gA = local_tile(mA_mkl, select<0,2>(blk_shape), make_coord(m_coord,_,l_coord)); - Tensor gB = local_tile(mB_nkl, select<1,2>(blk_shape), make_coord(n_coord,_,l_coord)); - - // Allocate the tiled_mma and the accumulators for the (M,N) subgroup_shape - TiledMma tiled_mma; - - Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape)); - clear(accumulators); - - auto k_tile_iter = cute::make_coord_iterator(idx2crd(0, make_shape(K)), make_shape(K)); - int k_tile_count = ceil_div(K, get<2>(workgroup_shape)); - - // Perform the collective scoped MMA - CollectiveMainloop collective_mma; - collective_mma( - accumulators, - gA, - gB, - accumulators, - k_tile_iter, k_tile_count, - blk_coord_mnkl, // TODO(codeplay): Remove this once unneeded in xe_mma_mixed_input.hpp - K, - thread_idx, - params.mainloop - ); - - CollectiveEpilogue epilogue{params.epilogue, shared_storage.epilogue}; - epilogue( - problem_shape_MNKL, - subgroup_shape, // TODO(codeplay): Inconsistency here w/ blk_coord_mnkl - blk_coord_mnkl, - accumulators, - tiled_mma, - thread_idx - ); - } -}; - -/////////////////////////////////////////////////////////////////////////////// - -} // namespace cutlass::gemm::kernel \ No newline at end of file From 946b46cccd2eac3c3259a9364e2be9e37ab7817e Mon Sep 17 00:00:00 2001 From: Anamika Chatterjee Date: Fri, 3 Oct 2025 17:16:11 +0300 Subject: [PATCH 07/15] Changes added based on feedback --- examples/00_bmg_gemm/00_bmg_gemm.cpp | 30 +- examples/00_bmg_gemm/00_bmg_gemm_legacy.cpp | 429 -------------------- examples/00_bmg_gemm/CMakeLists.txt | 9 - include/cutlass/gemm/collective/xe_mma.hpp | 64 ++- 4 files changed, 70 insertions(+), 462 deletions(-) delete mode 100644 
examples/00_bmg_gemm/00_bmg_gemm_legacy.cpp diff --git a/examples/00_bmg_gemm/00_bmg_gemm.cpp b/examples/00_bmg_gemm/00_bmg_gemm.cpp index ada885eb4f..cf64eb3113 100644 --- a/examples/00_bmg_gemm/00_bmg_gemm.cpp +++ b/examples/00_bmg_gemm/00_bmg_gemm.cpp @@ -345,12 +345,19 @@ int main(int argc, const char** argv) using LayoutC = cutlass::layout::RowMajor; using LayoutD = cutlass::layout::RowMajor; - // The 2D block copy operations used for the A and B matrices - using GmemTiledCopyA = XE_2D_U16x32x32_LD_N; - using GmemTiledCopyB = XE_2D_U16x32x32_LD_V; - - // New MMA atom XE_DPAS_TT using workgroup-level tile shape of 256×256×32 - using TileShape = Shape<_256, _256, _32>; + // Workgroup-level tile + using TileShape = Shape<_256, _256, _32>; + + // A TiledMMA struct defines a tiling of an MMA atom over M, N and K, combining both additional + // hardware (sub-groups for Intel BMG) and iterations by each sub-group. + // + // The TiledMMAHelper struct defines a specific TiledMMA for a given MMA atom. This example uses + // the XE_DPAS_TT<8, float, cute::bfloat16_t> atom, which represents an 8x16x16 DPAS operation with float32 accumulation and bfloat16 inputs, TileShape (<256, 256, 32>) and sub-group layout (8x4x1). + // The TiledMMA constructed using TiledMMAHelper has the property that each sub-group operates on a + // single contiguous chunk of the work-group TileShape. For this configuration, this implies that + // each sub-group operates on a contiguous 32x64x32 chunk (4x4x2 iterations). See + // 0t_mma_atom.md#TiledMMAs for more info. Sub-groups are arranged row-major (stride 4,1,0) for + // performance reasons. using TiledMma = typename TiledMMAHelper>, Layout, Layout, Stride<_4, _1, _0>>>::TiledMMA; // For Intel BMG, PipelineStages defines how many k-blocks ahead to prefetch from A and B. @@ -385,6 +392,13 @@ int main(int argc, const char** argv) void, void>; // GEMM Mainloop - iteration over blocks in K dimension + // + // Copy operations for A and B matrices: + // - Use 'void' (as shown below) to automatically select new 2D block copy operations + // - To use legacy copy operations, replace 'void' with specific copy atoms, e.g.: + // using GmemTiledCopyA = XE_2D_U16x32x32_LD_N; + // using GmemTiledCopyB = XE_2D_U16x32x32_LD_V; + // Then replace the first 'void' with GmemTiledCopyA and fifth 'void' with GmemTiledCopyB using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< GEMMDispatchPolicy, TileShape, @@ -393,8 +407,8 @@ int main(int argc, const char** argv) ElementInputB, cutlass::gemm::TagToStrideB_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation TiledMma, - GmemTiledCopyA, void, void, cute::identity, // A - GmemTiledCopyB, void, void, cute::identity // B + void, void, void, cute::identity, // A + void, void, void, cute::identity // B >; // Define the whole kernel (mainloop and epilogue) diff --git a/examples/00_bmg_gemm/00_bmg_gemm_legacy.cpp b/examples/00_bmg_gemm/00_bmg_gemm_legacy.cpp deleted file mode 100644 index 7e9291227e..0000000000 --- a/examples/00_bmg_gemm/00_bmg_gemm_legacy.cpp +++ /dev/null @@ -1,429 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. - * Copyright (C) 2025 Intel Corporation, All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. 
Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief CUTLASS Intel BMG Gemm Example. - - This example constructs and executes a simple CUTLASS GEMM kernel on Intel BMG hardware, and - verifies its correctness with a reference implementation - (cutlass::reference::device::GemmComplex). The example also provides a performance measurement - for the GEMM in TFLOPS. - - This example makes use of BMGs subgroup cooperative 2d-block copy operations and DPAS instructions. - - The shapes of the A and B matrix are defined at runtime by `options.m`, `.n` and `.k`, and the - batch size is defined by `options.l`. The tile shape, which defines how much work is executed by - a single work-group, is defined at compile time by: - ``` - using TileShape = Shape<_256, _256, _32>; - ``` - That is, each work-group processes a tile of M=256, N=256, and iterates over `options.k` in - blocks of K=32. - - Performance of GEMM on BMG is heavily dependent on prefetching the A and B matrices. That is, - executing Intel specific prefetch instructions for future iterations to ensure that the required - blocks of A and B are resident in cache before they are needed. 
- - To build & run this example (from your build dir): - - $ ninja 00_bmg_gemm - $ ./examples/sycl/00_bmg_gemm/00_bmg_gemm - - Call with `--help` for information about available options -*/ - -#include "cutlass/epilogue/collective/default_epilogue.hpp" -#include "cutlass/epilogue/collective/xe_epilogue.hpp" -#include "cutlass/epilogue/fusion/xe_callbacks.hpp" -#include "cutlass/gemm/device/gemm_universal.h" -#include "cutlass/gemm/device/gemm_universal_adapter.h" -#include "cutlass/gemm/collective/collective_mma.hpp" -#include "cutlass/util/GPU_Clock.hpp" - -#include -#include - -#include "cutlass/util/command_line.h" -#include "cutlass/util/device_memory.h" -#include "cutlass/util/packed_stride.hpp" -#include "cutlass/util/reference/device/gemm_complex.h" -#include "cutlass/util/reference/device/tensor_compare.h" -#include "sycl_common.hpp" -#include "helper.h" - -using namespace cute; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Command line options parsing -struct Options { - - bool help; - bool error; - - int m, n, k, l, iterations; - float alpha, beta; - - Options(): - help(false), - error(false), - m(5120), n(4096), k(4096), l(1), iterations(20), - alpha(1.f), beta(0.f) - { } - - // Parses the command line - void parse(int argc, char const **args) { - cutlass::CommandLine cmd(argc, args); - - if (cmd.check_cmd_line_flag("help")) { - help = true; - return; - } - - cmd.get_cmd_line_argument("m", m, 5120); - cmd.get_cmd_line_argument("n", n, 4096); - cmd.get_cmd_line_argument("k", k, 4096); - cmd.get_cmd_line_argument("l", l, 1); - cmd.get_cmd_line_argument("alpha", alpha, 1.f); - cmd.get_cmd_line_argument("beta", beta, 0.f); - cmd.get_cmd_line_argument("iterations", iterations, 100); - } - - /// Prints the usage statement. 
- std::ostream & print_usage(std::ostream &out) const { - - out << "BMG GEMM Example\n\n" - << "Options:\n\n" - << " --help If specified, displays this usage statement\n\n" - << " --m= Sets the M extent of the GEMM\n" - << " --n= Sets the N extent of the GEMM\n" - << " --k= Sets the K extent of the GEMM\n" - << " --l= Sets the L extent (batch count) of the GEMM\n" - << " --alpha= Epilogue scalar alpha\n" - << " --beta= Epilogue scalar beta\n\n" - << " --iterations= Iterations\n\n"; - - return out; - } -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -template < - class Gemm -> -struct ExampleRunner { - - using StrideA = typename Gemm::GemmKernel::StrideA; - using StrideB = typename Gemm::GemmKernel::StrideB; - using StrideC = typename Gemm::GemmKernel::StrideC; - using StrideD = typename Gemm::GemmKernel::StrideD; - - using LayoutA = typename Gemm::LayoutA; - using LayoutB = typename Gemm::LayoutB; - using LayoutC = typename Gemm::LayoutC; - using LayoutD = typename Gemm::LayoutD; - - using ElementA = typename Gemm::ElementA; - using ElementB = typename Gemm::ElementB; - using ElementAcc = typename Gemm::ElementAccumulator; - - using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; - using ElementC = typename Gemm::ElementC; - using ElementOutput = typename CollectiveEpilogue::ElementOutput; - using ElementCompute = typename CollectiveEpilogue::ElementCompute; - using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator; - - using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; - - // - // Data members - // - - /// Initialization - StrideA stride_A; - StrideB stride_B; - StrideC stride_C; - StrideD stride_D; - uint64_t seed = 0; - - cutlass::DeviceAllocation block_A; - cutlass::DeviceAllocation block_B; - cutlass::DeviceAllocation block_C; - cutlass::DeviceAllocation block_D; - cutlass::DeviceAllocation block_ref_D; // Reference GEMM result for verification - - // - // Methods - // - - bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { - auto [M, N, K, L] = problem_size; - - cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K})); - cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N})); - cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N})); - cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); - - cutlass::reference::device::GemmComplex( - {M, N, K}, - alpha, - ref_A, - cutlass::ComplexTransform::kNone, - ref_B, - cutlass::ComplexTransform::kNone, - beta, - ref_C, - ref_D, - ElementAccumulator(0), - L, // batch_count - M * K, // batch_stride_A - K * N, // batch_stride_B - M * N, // batch_stride_C - M * N // batch_stride_D - ); - - // CUTLASS on SYCL uses the compatibility library compat for e.g. 
default in-order queue - compat::wait(); - - // Check if output from CUTLASS kernel and reference kernel are equal or not - bool passed = cutlass::reference::device::BlockCompareEqual( - block_ref_D.get(), block_D.get(), block_D.size()); - - return passed; - } - - /// Initialize operands to be used in the GEMM and reference GEMM - void initialize(const ProblemShapeType& problem_size) { - auto problem_shape_MNKL = cute::append<4>(problem_size, 1); - auto [M, N, K, L] = problem_shape_MNKL; - - // Complete the stride by combining static layout info (StrideA) with runtime size info (M,K,L) - stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); - stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); - stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); - stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - - block_A.reset(static_cast(M) * K * L); - block_B.reset(static_cast(K) * N * L); - block_C.reset(static_cast(M) * N * L); - block_D.reset(static_cast(M) * N * L); - block_ref_D.reset(static_cast(M) * N * L); - - initialize_block(block_A, seed + 2023); - initialize_block(block_B, seed + 2022); - initialize_block(block_C, seed + 2021); - } - - cutlass::Status run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { - ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; - - initialize(problem_size); - - typename Gemm::GemmKernel::Arguments arguments{ - cutlass::gemm::GemmUniversalMode::kGemm, - problem_size, - {block_A.get(), stride_A, block_B.get(), stride_B}, - {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, - hw_info - }; - - Gemm gemm_op; - - size_t workspace_size = Gemm::get_workspace_size(arguments); - cutlass::device_memory::allocation workspace(workspace_size); - - if (gemm_op.can_implement(arguments) != cutlass::Status::kSuccess){ - std::cout << "Invalid Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; - std::exit(1); - } - - CUTLASS_CHECK(gemm_op.initialize(arguments, workspace.get())); - - // Run the GEMM - CUTLASS_CHECK(gemm_op.run()); - - compat::wait(); - - // Verify that the result is correct - bool passed = verify(problem_size, options.alpha, options.beta); - std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl; - - if(!passed) return cutlass::Status::kErrorInternal; - - if (options.iterations > 0) { - GPU_Clock timer; - timer.start(); - for (int i = 0; i < options.iterations; ++i) { - gemm_op.run(); - } - compat::wait(); - - float cute_time = timer.seconds() / options.iterations; - double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; - std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; - printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000); - } - - return cutlass::Status::kSuccess; - } - -}; - -int main(int argc, const char** argv) -{ - // - // Parse options - // - - Options options; - - options.parse(argc, argv); - - if (options.help) { - options.print_usage(std::cout) << std::endl; - return 0; - } - - if (options.error) { - std::cerr << "Aborting execution." << std::endl; - return -1; - } - - // - // Run examples - // - - // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. 
This - // information is used by the underlying kernel. - cutlass::KernelHardwareInfo hw_info; - - // Change device_id to another value if you are running on a machine with multiple GPUs and wish - // to use a GPU other than that with device ID 0. - hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); - - bool passed; - - // The code section below describes datatype for input, output matrices and computation between - // elements in input matrices. - using ElementAccumulator = float; // <- data type of accumulator - using ElementComputeEpilogue = float; // <- data type of epilogue operations - using ElementInputA = bfloat16_t; // <- data type of elements in input matrix A - using ElementInputB = bfloat16_t; // <- data type of elements in input matrix B - using ElementOutput = float; // <- data type of elements in output matrix D - - using LayoutA = cutlass::layout::RowMajor; - using LayoutB = cutlass::layout::RowMajor; - using LayoutC = cutlass::layout::RowMajor; - using LayoutD = cutlass::layout::RowMajor; - - // The 2D block copy operations used for the A and B matrices - using GmemTiledCopyA = XE_2D_U16x32x32_LD_N; - using GmemTiledCopyB = XE_2D_U16x32x32_LD_V; - - // Workgroup-level tile - using TileShape = Shape<_256, _256, _32>; - - // A TiledMMA struct defines a tiling of an MMA atom over M, N and K, combining both additional - // hardware (sub-groups for Intel BMG) and iterations by each sub-group. - // - // The TiledMMAHelper struct defines a specific TiledMMA for a given MMA atom - // (XE_8x16x16_F32BF16BF16F32_TT), TileShape (<256, 256, 32>) and sub-group layout (8x4x1). The - // TiledMMA constructed using TiledMMAHelper has the property that each sub-group operates on a - // single contiguous chunk of the work-group TileShape. For this configuration, this implies that - // each sub-group operates on a contiguous 32x64x32 chunk (4x4x2 iterations). See - // 0t_mma_atom.md#TiledMMAs for more info. Sub-groups are arranged row-major (stride 4,1,0) for - // performance reasons. - using TiledMma = // M=8,N=16,K=16, D=f32,A=bf16,B=bf16,C=f32 - typename TiledMMAHelper, Layout, - Layout, Stride<_4, _1, _0>>>::TiledMMA; - - // For Intel BMG, PipelineStages defines how many k-blocks ahead to prefetch from A and B. - constexpr int PipelineStages = 2; - using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelXeXMX16; - using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeXMX16; - - // This is the 'default' epilogue operation (Linear Combination) which performs everything in: - // (D = alpha * (A*B) + beta * C) - // aside from the (A*B), which is handled by the GEMM. See 05_bmg_gemm_with_epilogues for more - // complex epilogue examples. - using EpilogueOp = cutlass::epilogue::fusion::LinearCombination; - - // FusionCallbacks ties the EpilogueOp to an implementation (based on the dispatch - // policy/architecture) and defines the epilogue arguments. 
- using FusionCallBacks = cutlass::epilogue::fusion::FusionCallbacks; - // GEMM Epilogue - loads & stores C/D matrices, performs epilogue operations & load/stores any - // auxiliary data required - using CollectiveEpilogue = cutlass::epilogue::collective::CollectiveEpilogue< - EpilogueDispatchPolicy, - TileShape, - ElementAccumulator, - cutlass::gemm::TagToStrideC_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation - ElementOutput, - cutlass::gemm::TagToStrideC_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation - FusionCallBacks, - XE_2D_U32x8x16_LD_N, // The copy atom used to load matrix C - void, void, - XE_2D_U32x8x16_ST_N, // The copy atom used to store matrix D - void, void>; - - // GEMM Mainloop - iteration over blocks in K dimension - using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< - GEMMDispatchPolicy, - TileShape, - ElementInputA, - cutlass::gemm::TagToStrideA_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation - ElementInputB, - cutlass::gemm::TagToStrideB_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation - TiledMma, - GmemTiledCopyA, void, void, cute::identity, // A - GmemTiledCopyB, void, void, cute::identity // B - >; - - // Define the whole kernel (mainloop and epilogue) - using GemmKernel = cutlass::gemm::kernel::GemmUniversal< - Shape, // Defer global problem shape definition to runtime - CollectiveMainloop, - CollectiveEpilogue - >; - - // The GemmUniversalAdapter wraps the defined GEMM kernel and handles the launch, and e.g. - // persistent scratch memory if required. - using Gemm = cutlass::gemm::device::GemmUniversalAdapter; - - ExampleRunner runner; - - CUTLASS_CHECK(runner.run(options, hw_info)); - - return 0; -} diff --git a/examples/00_bmg_gemm/CMakeLists.txt b/examples/00_bmg_gemm/CMakeLists.txt index 6564ad47a9..5bfc4a5e29 100644 --- a/examples/00_bmg_gemm/CMakeLists.txt +++ b/examples/00_bmg_gemm/CMakeLists.txt @@ -40,15 +40,6 @@ cutlass_example_add_executable( TEST_SMALL_SHAPE ) -cutlass_example_add_executable( - 00_bmg_gemm_legacy - 00_bmg_gemm_legacy.cpp - TEST_COMMAND_OPTIONS - TEST_BATCHES - TEST_LARGE - TEST_SMALL_SHAPE -) - set(TEST_SMALL_SHAPE_PADDABLE --m=1 --n=1 --k=2 --l=2) cutlass_example_add_executable( 00_bmg_gemm_padded diff --git a/include/cutlass/gemm/collective/xe_mma.hpp b/include/cutlass/gemm/collective/xe_mma.hpp index 8df8945a7e..9a55d5bc52 100644 --- a/include/cutlass/gemm/collective/xe_mma.hpp +++ b/include/cutlass/gemm/collective/xe_mma.hpp @@ -176,23 +176,42 @@ struct CollectiveMma, TileShape_, ElementA_ static_assert(is_rmem::value, "D tensor must be rmem resident."); static_assert(is_rmem::value, "C tensor must be rmem resident."); - auto mA_mkl = make_tensor(make_gmem_ptr(mainloop.ptr_A), - make_layout(make_shape(mainloop.M, mainloop.K, mainloop.L), cute::take<0,2>(mainloop.dA))); - auto mB_nkl = make_tensor(make_gmem_ptr(mainloop.ptr_B), - make_layout(make_shape(mainloop.N, mainloop.K, mainloop.L), cute::take<0,2>(mainloop.dB))); - auto copy_a = make_block_2d_copy_A(TiledMma{}, mA_mkl); - auto copy_b = make_block_2d_copy_B(TiledMma{}, mB_nkl); + auto copy_a = [&]() { + if constexpr (!std::is_void_v) { + // User provided copy operation - use full stride + auto mA_mkl = make_tensor(make_gmem_ptr(mainloop.ptr_A), + make_layout(make_shape(mainloop.M, mainloop.K, mainloop.L), mainloop.dA)); + using Copy_A = typename Copy_Traits::template DefaultTiledCopy; + return Copy_A{}.with(mA_mkl); + } else { + // Use new 2D copy operations with 2D stride + auto mA_mkl = 
make_tensor(make_gmem_ptr(mainloop.ptr_A), + make_layout(make_shape(mainloop.M, mainloop.K, mainloop.L), cute::take<0,2>(mainloop.dA))); + return make_block_2d_copy_A(TiledMma{}, mA_mkl); + } + }(); + + auto copy_b = [&]() { + if constexpr (!std::is_void_v) { + // User provided copy operation - use full stride + auto mB_nkl = make_tensor(make_gmem_ptr(mainloop.ptr_B), + make_layout(make_shape(mainloop.N, mainloop.K, mainloop.L), mainloop.dB)); + using Copy_B = typename Copy_Traits::template DefaultTiledCopy; + return Copy_B{}.with(mB_nkl); + } else { + // Use new 2D copy operations with 2D stride + auto mB_nkl = make_tensor(make_gmem_ptr(mainloop.ptr_B), + make_layout(make_shape(mainloop.N, mainloop.K, mainloop.L), cute::take<0,2>(mainloop.dB))); + return make_block_2d_copy_B(TiledMma{}, mB_nkl); + } + }(); auto thr_copy_a = copy_a.get_slice(thread_idx); auto thr_copy_b = copy_b.get_slice(thread_idx); // Instantiate the MMA object and get thread slice TiledMma tiled_mma; - // TODO(Codeplay): see if we can make this nicer - // To make all work items in a subgroup have the same global tensors pass in the index of work item 0 in each subgroup - auto sg = compat::get_nd_item<1>().get_sub_group(); - auto first_thread_in_sg_idx = sg.get_group_linear_id() * DispatchPolicy::SubgroupSize; - auto thr_mma = tiled_mma.get_slice(first_thread_in_sg_idx); + auto thr_mma = tiled_mma.get_slice(thread_idx); /* Register fragments for MMA */ auto tCrA = thr_mma.partition_sg_fragment_A(gA(_,_,0)); @@ -206,11 +225,24 @@ struct CollectiveMma, TileShape_, ElementA_ Tensor tAgA = thr_copy_a.partition_S(gA); Tensor tBgB = thr_copy_b.partition_S(gB); - /* Create prefetch TiledCopy instances */ - auto prefetch_a = make_block_2d_prefetch(copy_a); - auto prefetch_b = make_block_2d_prefetch(copy_b); - auto thr_prefetch_A = prefetch_a.get_slice(thread_idx); - auto thr_prefetch_B = prefetch_b.get_slice(thread_idx); + /* Create prefetch TiledCopy instances - different for legacy vs new copy operations */ + auto [prefetch_a, prefetch_b, thr_prefetch_A, thr_prefetch_B] = [&]() { + if constexpr (!std::is_void_v && !std::is_void_v) { + // Legacy copy operations - use prefetch_selector + auto tiled_prefetch_a = cute::prefetch_selector,Int>, Num_SGs>(copy_a); + auto tiled_prefetch_b = cute::prefetch_selector,Int>, Num_SGs>(copy_b); + auto thr_prefetch_A = tiled_prefetch_a.get_slice(thread_idx); + auto thr_prefetch_B = tiled_prefetch_b.get_slice(thread_idx); + return std::make_tuple(tiled_prefetch_a, tiled_prefetch_b, thr_prefetch_A, thr_prefetch_B); + } else { + // New 2D copy operations - use make_block_2d_prefetch + auto prefetch_a = make_block_2d_prefetch(copy_a); + auto prefetch_b = make_block_2d_prefetch(copy_b); + auto thr_prefetch_A = prefetch_a.get_slice(thread_idx); + auto thr_prefetch_B = prefetch_b.get_slice(thread_idx); + return std::make_tuple(prefetch_a, prefetch_b, thr_prefetch_A, thr_prefetch_B); + } + }(); /* Partition global tensor (proxies) for prefetch */ auto pAgA = thr_prefetch_A.partition_S(gA); From c97f011cab591b5f48d31bd9517ba45125f988a1 Mon Sep 17 00:00:00 2001 From: Anamika Chatterjee Date: Sat, 4 Oct 2025 17:52:43 +0300 Subject: [PATCH 08/15] Applied review comments --- examples/00_bmg_gemm/00_bmg_gemm.cpp | 15 ++++++--------- .../cutlass/gemm/collective/collective_mma.hpp | 2 +- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/examples/00_bmg_gemm/00_bmg_gemm.cpp b/examples/00_bmg_gemm/00_bmg_gemm.cpp index cf64eb3113..0f73fc525c 100644 --- a/examples/00_bmg_gemm/00_bmg_gemm.cpp +++ 
b/examples/00_bmg_gemm/00_bmg_gemm.cpp @@ -345,6 +345,10 @@ int main(int argc, const char** argv) using LayoutC = cutlass::layout::RowMajor; using LayoutD = cutlass::layout::RowMajor; + // [New Copy Atom] Automatically select 2D block copy operations used for the A and B matrices + using GmemTiledCopyA = void; // For older version of copy atom, use XE_2D_U16x32x32_LD_N + using GmemTiledCopyB = void; // For older version of copy atom, use XE_2D_U16x32x32_LD_V + // Workgroup-level tile using TileShape = Shape<_256, _256, _32>; @@ -392,13 +396,6 @@ int main(int argc, const char** argv) void, void>; // GEMM Mainloop - iteration over blocks in K dimension - // - // Copy operations for A and B matrices: - // - Use 'void' (as shown below) to automatically select new 2D block copy operations - // - To use legacy copy operations, replace 'void' with specific copy atoms, e.g.: - // using GmemTiledCopyA = XE_2D_U16x32x32_LD_N; - // using GmemTiledCopyB = XE_2D_U16x32x32_LD_V; - // Then replace the first 'void' with GmemTiledCopyA and fifth 'void' with GmemTiledCopyB using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< GEMMDispatchPolicy, TileShape, @@ -407,8 +404,8 @@ int main(int argc, const char** argv) ElementInputB, cutlass::gemm::TagToStrideB_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation TiledMma, - void, void, void, cute::identity, // A - void, void, void, cute::identity // B + GmemTiledCopyA, void, void, cute::identity, // A + GmemTiledCopyB, void, void, cute::identity // B >; // Define the whole kernel (mainloop and epilogue) diff --git a/include/cutlass/gemm/collective/collective_mma.hpp b/include/cutlass/gemm/collective/collective_mma.hpp index 3c02ca1efa..bb5e2017d8 100644 --- a/include/cutlass/gemm/collective/collective_mma.hpp +++ b/include/cutlass/gemm/collective/collective_mma.hpp @@ -77,8 +77,8 @@ #endif // !defined(__CUDACC_RTC__) #if defined(SYCL_INTEL_TARGET) -#include "cutlass/gemm/collective/xe_mma_legacy.hpp" #include "cutlass/gemm/collective/xe_mma.hpp" +#include "cutlass/gemm/collective/xe_mma_legacy.hpp" #include "cutlass/gemm/collective/xe_array_mma.hpp" #include "cutlass/gemm/collective/xe_array_mma_fp8.hpp" #include "cutlass/gemm/collective/xe_mma_mixed_input.hpp" From 9691e608e813f4875dda25d3454031907a4e4780 Mon Sep 17 00:00:00 2001 From: Anamika Chatterjee Date: Mon, 6 Oct 2025 12:42:30 +0300 Subject: [PATCH 09/15] Add compile-time checks to enforce new XE copy atoms in block 2D functions --- examples/00_bmg_gemm/00_bmg_gemm.cpp | 11 +++-- include/cute/atom/copy_traits_xe_2d.hpp | 36 +++++++++++++++ include/cutlass/gemm/collective/xe_mma.hpp | 54 ++++++++-------------- 3 files changed, 64 insertions(+), 37 deletions(-) diff --git a/examples/00_bmg_gemm/00_bmg_gemm.cpp b/examples/00_bmg_gemm/00_bmg_gemm.cpp index 0f73fc525c..0f7c665820 100644 --- a/examples/00_bmg_gemm/00_bmg_gemm.cpp +++ b/examples/00_bmg_gemm/00_bmg_gemm.cpp @@ -345,9 +345,13 @@ int main(int argc, const char** argv) using LayoutC = cutlass::layout::RowMajor; using LayoutD = cutlass::layout::RowMajor; - // [New Copy Atom] Automatically select 2D block copy operations used for the A and B matrices - using GmemTiledCopyA = void; // For older version of copy atom, use XE_2D_U16x32x32_LD_N - using GmemTiledCopyB = void; // For older version of copy atom, use XE_2D_U16x32x32_LD_V + // [New Copy Atom] When left unspecified (void), make_block_2d_copy_* automatically selects + // appropriate 2D block copy operations for matrices A and B. 
Alternatively, you can + // explicitly specify new copy atom operations such as XE_LOAD_2D, XE_LOAD_2D_VNNI + // (applicable only to matrix B), or XE_LOAD_2D_TRANSPOSE. + // Refer https://github.com/intel/sycl-tla/blob/petercad/rearchitecture/media/docs/cpp/xe_rearchitecture.md + using GmemTiledCopyA = void; //XE_LOAD_2D<16, 32, 32>; + using GmemTiledCopyB = void; //XE_LOAD_2D_VNNI<16, 32, 32>; // Workgroup-level tile using TileShape = Shape<_256, _256, _32>; @@ -366,6 +370,7 @@ int main(int argc, const char** argv) // For Intel BMG, PipelineStages defines how many k-blocks ahead to prefetch from A and B. constexpr int PipelineStages = 2; + // For older version of copy/mma atom, use cutlass::gemm::MainloopIntelXeXMX16 as dispatch policy using GEMMDispatchPolicy = cutlass::gemm::MainloopXeL1Staged; using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeXMX16; diff --git a/include/cute/atom/copy_traits_xe_2d.hpp b/include/cute/atom/copy_traits_xe_2d.hpp index 2df8ae0a38..d8f45207fc 100644 --- a/include/cute/atom/copy_traits_xe_2d.hpp +++ b/include/cute/atom/copy_traits_xe_2d.hpp @@ -756,6 +756,31 @@ make_block_2d_copy_X(CopyOp const& op, // Copy operation return make_block_2d_copy(op, gstride, x_mode, y_mode, atom_shape, sv_layout_t); } +// Helper trait to detect new XE copy ops +template +struct is_new_xe_atom : cute::false_type {}; + +// Helper trait specifically for XE_LOAD_2D_VNNI (for copy B) +template +struct is_new_xe_atom_vnni : cute::false_type {}; + +// Helper trait specifically for XE_STORE_2D (for copy C) +template +struct is_new_xe_atom_store : cute::false_type {}; + +// Check if T is an instantiation of XE_LOAD_2D +template +struct is_new_xe_atom> : cute::true_type {}; + +// Check if T is an instantiation of XE_LOAD_2D_TRANSPOSE +template +struct is_new_xe_atom> : cute::true_type {}; + +// Check if T is an instantiation of XE_LOAD_2D_VNNI +template +struct is_new_xe_atom_vnni> : cute::true_type {}; + + // MMA-focused TiledCopy creation functions. template CUTE_HOST_DEVICE @@ -774,6 +799,12 @@ make_block_2d_copy_A(CopyOp const& op, // Copy operation TiledMMA const& mma, // TiledMMA instance Tensor const& gmem) // Global tensor { + // This will pass for new atoms like XE_LOAD_2D<16, 32, 32> + // and fail for old atoms like XE_2D_U16x32x32_LD_N + static_assert(is_new_xe_atom::value, + "Old XE atom ops not compatible with make_block_2d_copy_A. " + "Please use the new templated atoms: XE_LOAD_2D or XE_LOAD_2D_TRANSPOSE. " + "Examples: XE_2D_U16x32x32_LD_N -> XE_LOAD_2D<16, 32, 32>, XE_2D_U16x32x32_LD_V -> XE_LOAD_2D_TRANSPOSE<16, 32, 32>"); using ValType = typename GEngine::value_type; return make_block_2d_copy_A(op, mma, gmem.stride()).with(gmem); } @@ -846,6 +877,11 @@ make_block_2d_copy_B(CopyOp const& op, // Copy operation TiledMMA const& mma, // TiledMMA instance Tensor const& gmem) // Global tensor { + // Only accept XE_LOAD_2D_VNNI for copy B + static_assert(is_new_xe_atom_vnni::value, + "Old XE atom ops not compatible with make_block_2d_copy_B. " + "Please use the new templated atom: XE_LOAD_2D_VNNI. 
" + "Examples: XE_2D_U16x32x32_LD_V -> XE_LOAD_2D_VNNI<16, 32, 32, 32>"); using ValType = typename GEngine::value_type; return make_block_2d_copy_B(op, mma, gmem.stride()).with(gmem); } diff --git a/include/cutlass/gemm/collective/xe_mma.hpp b/include/cutlass/gemm/collective/xe_mma.hpp index 9a55d5bc52..64f23fd93f 100644 --- a/include/cutlass/gemm/collective/xe_mma.hpp +++ b/include/cutlass/gemm/collective/xe_mma.hpp @@ -176,32 +176,26 @@ struct CollectiveMma, TileShape_, ElementA_ static_assert(is_rmem::value, "D tensor must be rmem resident."); static_assert(is_rmem::value, "C tensor must be rmem resident."); + auto mA_mkl = make_tensor(make_gmem_ptr(mainloop.ptr_A), + make_layout(make_shape(mainloop.M, mainloop.K, mainloop.L), cute::take<0,2>(mainloop.dA))); + auto mB_nkl = make_tensor(make_gmem_ptr(mainloop.ptr_B), + make_layout(make_shape(mainloop.N, mainloop.K, mainloop.L), cute::take<0,2>(mainloop.dB))); auto copy_a = [&]() { if constexpr (!std::is_void_v) { - // User provided copy operation - use full stride - auto mA_mkl = make_tensor(make_gmem_ptr(mainloop.ptr_A), - make_layout(make_shape(mainloop.M, mainloop.K, mainloop.L), mainloop.dA)); - using Copy_A = typename Copy_Traits::template DefaultTiledCopy; - return Copy_A{}.with(mA_mkl); + // User provided copy operation + return make_block_2d_copy_A(GmemTiledCopyA{}, TiledMma{}, mA_mkl); } else { - // Use new 2D copy operations with 2D stride - auto mA_mkl = make_tensor(make_gmem_ptr(mainloop.ptr_A), - make_layout(make_shape(mainloop.M, mainloop.K, mainloop.L), cute::take<0,2>(mainloop.dA))); + // make_block_2d_copy_A automatically selects copy operation return make_block_2d_copy_A(TiledMma{}, mA_mkl); } }(); auto copy_b = [&]() { if constexpr (!std::is_void_v) { - // User provided copy operation - use full stride - auto mB_nkl = make_tensor(make_gmem_ptr(mainloop.ptr_B), - make_layout(make_shape(mainloop.N, mainloop.K, mainloop.L), mainloop.dB)); - using Copy_B = typename Copy_Traits::template DefaultTiledCopy; - return Copy_B{}.with(mB_nkl); + // User provided copy operation + return make_block_2d_copy_B(GmemTiledCopyB{}, TiledMma{}, mB_nkl); } else { - // Use new 2D copy operations with 2D stride - auto mB_nkl = make_tensor(make_gmem_ptr(mainloop.ptr_B), - make_layout(make_shape(mainloop.N, mainloop.K, mainloop.L), cute::take<0,2>(mainloop.dB))); + // make_block_2d_copy_B automatically selects copy operation return make_block_2d_copy_B(TiledMma{}, mB_nkl); } }(); @@ -225,24 +219,12 @@ struct CollectiveMma, TileShape_, ElementA_ Tensor tAgA = thr_copy_a.partition_S(gA); Tensor tBgB = thr_copy_b.partition_S(gB); - /* Create prefetch TiledCopy instances - different for legacy vs new copy operations */ - auto [prefetch_a, prefetch_b, thr_prefetch_A, thr_prefetch_B] = [&]() { - if constexpr (!std::is_void_v && !std::is_void_v) { - // Legacy copy operations - use prefetch_selector - auto tiled_prefetch_a = cute::prefetch_selector,Int>, Num_SGs>(copy_a); - auto tiled_prefetch_b = cute::prefetch_selector,Int>, Num_SGs>(copy_b); - auto thr_prefetch_A = tiled_prefetch_a.get_slice(thread_idx); - auto thr_prefetch_B = tiled_prefetch_b.get_slice(thread_idx); - return std::make_tuple(tiled_prefetch_a, tiled_prefetch_b, thr_prefetch_A, thr_prefetch_B); - } else { - // New 2D copy operations - use make_block_2d_prefetch - auto prefetch_a = make_block_2d_prefetch(copy_a); - auto prefetch_b = make_block_2d_prefetch(copy_b); - auto thr_prefetch_A = prefetch_a.get_slice(thread_idx); - auto thr_prefetch_B = prefetch_b.get_slice(thread_idx); - 
return std::make_tuple(prefetch_a, prefetch_b, thr_prefetch_A, thr_prefetch_B); - } - }(); + /* Create prefetch TiledCopy instances */ + auto prefetch_a = make_block_2d_prefetch(copy_a); + auto prefetch_b = make_block_2d_prefetch(copy_b); + + auto thr_prefetch_A = prefetch_a.get_slice(thread_idx); + auto thr_prefetch_B = prefetch_b.get_slice(thread_idx); /* Partition global tensor (proxies) for prefetch */ auto pAgA = thr_prefetch_A.partition_S(gA); @@ -292,6 +274,10 @@ struct CollectiveMma, TileShape_, ElementA_ prefetch(prefetch_b, pBgB(_, _, _, prefetch_k)); } + /* Shuffle data from copy fragments to MMA fragments */ + reorder(tArA, tCrA); + reorder(tBrB, tCrB); + cute::gemm(tiled_mma, tCrA, tCrB, accum); barrier_wait(barrier_scope); } From 93b076a479dbf1a0cf6d292c17fe18bc9471d199 Mon Sep 17 00:00:00 2001 From: Anamika Chatterjee Date: Mon, 6 Oct 2025 13:27:57 +0300 Subject: [PATCH 10/15] Modified static assert message --- include/cute/atom/copy_traits_xe_2d.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/cute/atom/copy_traits_xe_2d.hpp b/include/cute/atom/copy_traits_xe_2d.hpp index d8f45207fc..f1658c7f2e 100644 --- a/include/cute/atom/copy_traits_xe_2d.hpp +++ b/include/cute/atom/copy_traits_xe_2d.hpp @@ -802,7 +802,7 @@ make_block_2d_copy_A(CopyOp const& op, // Copy operation // This will pass for new atoms like XE_LOAD_2D<16, 32, 32> // and fail for old atoms like XE_2D_U16x32x32_LD_N static_assert(is_new_xe_atom::value, - "Old XE atom ops not compatible with make_block_2d_copy_A. " + "Old XE copy atom ops not compatible with make_block_2d_copy_A. " "Please use the new templated atoms: XE_LOAD_2D or XE_LOAD_2D_TRANSPOSE. " "Examples: XE_2D_U16x32x32_LD_N -> XE_LOAD_2D<16, 32, 32>, XE_2D_U16x32x32_LD_V -> XE_LOAD_2D_TRANSPOSE<16, 32, 32>"); using ValType = typename GEngine::value_type; @@ -879,7 +879,7 @@ make_block_2d_copy_B(CopyOp const& op, // Copy operation { // Only accept XE_LOAD_2D_VNNI for copy B static_assert(is_new_xe_atom_vnni::value, - "Old XE atom ops not compatible with make_block_2d_copy_B. " + "Old XE copy atom ops not compatible with make_block_2d_copy_B. " "Please use the new templated atom: XE_LOAD_2D_VNNI. " "Examples: XE_2D_U16x32x32_LD_V -> XE_LOAD_2D_VNNI<16, 32, 32, 32>"); using ValType = typename GEngine::value_type; From a6f068c58fc0041d4c0353f017ec5a2f1cd85901 Mon Sep 17 00:00:00 2001 From: Anamika Chatterjee Date: Mon, 6 Oct 2025 13:28:54 +0300 Subject: [PATCH 11/15] Modified static assert message --- include/cute/atom/copy_traits_xe_2d.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/cute/atom/copy_traits_xe_2d.hpp b/include/cute/atom/copy_traits_xe_2d.hpp index f1658c7f2e..e6cbe664c8 100644 --- a/include/cute/atom/copy_traits_xe_2d.hpp +++ b/include/cute/atom/copy_traits_xe_2d.hpp @@ -802,7 +802,7 @@ make_block_2d_copy_A(CopyOp const& op, // Copy operation // This will pass for new atoms like XE_LOAD_2D<16, 32, 32> // and fail for old atoms like XE_2D_U16x32x32_LD_N static_assert(is_new_xe_atom::value, - "Old XE copy atom ops not compatible with make_block_2d_copy_A. " + "Legacy XE copy atom ops not compatible with make_block_2d_copy_A. " "Please use the new templated atoms: XE_LOAD_2D or XE_LOAD_2D_TRANSPOSE. 
" "Examples: XE_2D_U16x32x32_LD_N -> XE_LOAD_2D<16, 32, 32>, XE_2D_U16x32x32_LD_V -> XE_LOAD_2D_TRANSPOSE<16, 32, 32>"); using ValType = typename GEngine::value_type; @@ -879,7 +879,7 @@ make_block_2d_copy_B(CopyOp const& op, // Copy operation { // Only accept XE_LOAD_2D_VNNI for copy B static_assert(is_new_xe_atom_vnni::value, - "Old XE copy atom ops not compatible with make_block_2d_copy_B. " + "Legacy XE copy atom ops not compatible with make_block_2d_copy_B. " "Please use the new templated atom: XE_LOAD_2D_VNNI. " "Examples: XE_2D_U16x32x32_LD_V -> XE_LOAD_2D_VNNI<16, 32, 32, 32>"); using ValType = typename GEngine::value_type; From e1e64f7316ef342e01a34be2ebbe60de42575a61 Mon Sep 17 00:00:00 2001 From: Anamika Chatterjee Date: Wed, 8 Oct 2025 14:11:30 +0300 Subject: [PATCH 12/15] Move legacy example to legacy folder, pass 2D strides to make_block_2d_copy_*, and move tensor/copy initialization to host-side params in to_underlying_arguments --- examples/00_bmg_gemm/legacy/00_bmg_gemm.cpp | 429 ++++++++++++++++++++ examples/00_bmg_gemm/legacy/CMakeLists.txt | 41 ++ examples/CMakeLists.txt | 1 + include/cutlass/gemm/collective/xe_mma.hpp | 97 +++-- 4 files changed, 534 insertions(+), 34 deletions(-) create mode 100644 examples/00_bmg_gemm/legacy/00_bmg_gemm.cpp create mode 100644 examples/00_bmg_gemm/legacy/CMakeLists.txt diff --git a/examples/00_bmg_gemm/legacy/00_bmg_gemm.cpp b/examples/00_bmg_gemm/legacy/00_bmg_gemm.cpp new file mode 100644 index 0000000000..91139cf0d6 --- /dev/null +++ b/examples/00_bmg_gemm/legacy/00_bmg_gemm.cpp @@ -0,0 +1,429 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief CUTLASS Intel BMG Gemm Example. 
+ + This example constructs and executes a simple CUTLASS GEMM kernel on Intel BMG hardware, and + verifies its correctness with a reference implementation + (cutlass::reference::device::GemmComplex). The example also provides a performance measurement + for the GEMM in TFLOPS. + + This example makes use of BMGs subgroup cooperative 2d-block copy operations and DPAS instructions. + + The shapes of the A and B matrix are defined at runtime by `options.m`, `.n` and `.k`, and the + batch size is defined by `options.l`. The tile shape, which defines how much work is executed by + a single work-group, is defined at compile time by: + ``` + using TileShape = Shape<_256, _256, _32>; + ``` + That is, each work-group processes a tile of M=256, N=256, and iterates over `options.k` in + blocks of K=32. + + Performance of GEMM on BMG is heavily dependent on prefetching the A and B matrices. That is, + executing Intel specific prefetch instructions for future iterations to ensure that the required + blocks of A and B are resident in cache before they are needed. + + To build & run this example (from your build dir): + + $ ninja 00_bmg_gemm + $ ./examples/sycl/00_bmg_gemm/00_bmg_gemm + + Call with `--help` for information about available options +*/ + +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/epilogue/collective/xe_epilogue.hpp" +#include "cutlass/epilogue/fusion/xe_callbacks.hpp" +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/collective/collective_mma.hpp" +#include "cutlass/util/GPU_Clock.hpp" + +#include +#include + +#include "cutlass/util/command_line.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/reference/device/gemm_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "sycl_common.hpp" +#include "helper.h" + +using namespace cute; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + bool error; + + int m, n, k, l, iterations; + float alpha, beta; + + Options(): + help(false), + error(false), + m(5120), n(4096), k(4096), l(1), iterations(20), + alpha(1.f), beta(0.f) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m, 5120); + cmd.get_cmd_line_argument("n", n, 4096); + cmd.get_cmd_line_argument("k", k, 4096); + cmd.get_cmd_line_argument("l", l, 1); + cmd.get_cmd_line_argument("alpha", alpha, 1.f); + cmd.get_cmd_line_argument("beta", beta, 0.f); + cmd.get_cmd_line_argument("iterations", iterations, 100); + } + + /// Prints the usage statement. 
+ std::ostream & print_usage(std::ostream &out) const { + + out << "BMG GEMM Example\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= Sets the L extent (batch count) of the GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --iterations= Iterations\n\n"; + + return out; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + class Gemm +> +struct ExampleRunner { + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = typename Gemm::GemmKernel::StrideD; + + using LayoutA = typename Gemm::LayoutA; + using LayoutB = typename Gemm::LayoutB; + using LayoutC = typename Gemm::LayoutC; + using LayoutD = typename Gemm::LayoutD; + + using ElementA = typename Gemm::ElementA; + using ElementB = typename Gemm::ElementB; + using ElementAcc = typename Gemm::ElementAccumulator; + + using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; + using ElementC = typename Gemm::ElementC; + using ElementOutput = typename CollectiveEpilogue::ElementOutput; + using ElementCompute = typename CollectiveEpilogue::ElementCompute; + using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator; + + using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; + + // + // Data members + // + + /// Initialization + StrideA stride_A; + StrideB stride_B; + StrideC stride_C; + StrideD stride_D; + uint64_t seed = 0; + + cutlass::DeviceAllocation block_A; + cutlass::DeviceAllocation block_B; + cutlass::DeviceAllocation block_C; + cutlass::DeviceAllocation block_D; + cutlass::DeviceAllocation block_ref_D; // Reference GEMM result for verification + + // + // Methods + // + + bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { + auto [M, N, K, L] = problem_size; + + cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K})); + cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N})); + cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N})); + cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); + + cutlass::reference::device::GemmComplex( + {M, N, K}, + alpha, + ref_A, + cutlass::ComplexTransform::kNone, + ref_B, + cutlass::ComplexTransform::kNone, + beta, + ref_C, + ref_D, + ElementAccumulator(0), + L, // batch_count + M * K, // batch_stride_A + K * N, // batch_stride_B + M * N, // batch_stride_C + M * N // batch_stride_D + ); + + // CUTLASS on SYCL uses the compatibility library compat for e.g. 
default in-order queue + compat::wait(); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + bool passed = cutlass::reference::device::BlockCompareEqual( + block_ref_D.get(), block_D.get(), block_D.size()); + + return passed; + } + + /// Initialize operands to be used in the GEMM and reference GEMM + void initialize(const ProblemShapeType& problem_size) { + auto problem_shape_MNKL = cute::append<4>(problem_size, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + // Complete the stride by combining static layout info (StrideA) with runtime size info (M,K,L) + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); + stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); + + block_A.reset(static_cast(M) * K * L); + block_B.reset(static_cast(K) * N * L); + block_C.reset(static_cast(M) * N * L); + block_D.reset(static_cast(M) * N * L); + block_ref_D.reset(static_cast(M) * N * L); + + initialize_block(block_A, seed + 2023); + initialize_block(block_B, seed + 2022); + initialize_block(block_C, seed + 2021); + } + + cutlass::Status run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { + ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; + + initialize(problem_size); + + typename Gemm::GemmKernel::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + {block_A.get(), stride_A, block_B.get(), stride_B}, + {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, + hw_info + }; + + Gemm gemm_op; + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + if (gemm_op.can_implement(arguments) != cutlass::Status::kSuccess){ + std::cout << "Invalid Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; + std::exit(1); + } + + CUTLASS_CHECK(gemm_op.initialize(arguments, workspace.get())); + + // Run the GEMM + CUTLASS_CHECK(gemm_op.run()); + + compat::wait(); + + // Verify that the result is correct + bool passed = verify(problem_size, options.alpha, options.beta); + std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl; + + if(!passed) return cutlass::Status::kErrorInternal; + + if (options.iterations > 0) { + GPU_Clock timer; + timer.start(); + for (int i = 0; i < options.iterations; ++i) { + gemm_op.run(); + } + compat::wait(); + + float cute_time = timer.seconds() / options.iterations; + double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; + std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; + printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000); + } + + return cutlass::Status::kSuccess; + } + +}; + +int main(int argc, const char** argv) +{ + // + // Parse options + // + + Options options; + + options.parse(argc, argv); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." << std::endl; + return -1; + } + + // + // Run examples + // + + // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. 
This + // information is used by the underlying kernel. + cutlass::KernelHardwareInfo hw_info; + + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + + bool passed; + + // The code section below describes datatype for input, output matrices and computation between + // elements in input matrices. + using ElementAccumulator = float; // <- data type of accumulator + using ElementComputeEpilogue = float; // <- data type of epilogue operations + using ElementInputA = bfloat16_t; // <- data type of elements in input matrix A + using ElementInputB = bfloat16_t; // <- data type of elements in input matrix B + using ElementOutput = float; // <- data type of elements in output matrix D + + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + using LayoutC = cutlass::layout::RowMajor; + using LayoutD = cutlass::layout::RowMajor; + + // The 2D block copy operations used for the A and B matrices + using GmemTiledCopyA = XE_2D_U16x32x32_LD_N; + using GmemTiledCopyB = XE_2D_U16x32x32_LD_V; + + // Workgroup-level tile + using TileShape = Shape<_256, _256, _32>; + + // A TiledMMA struct defines a tiling of an MMA atom over M, N and K, combining both additional + // hardware (sub-groups for Intel BMG) and iterations by each sub-group. + // + // The TiledMMAHelper struct defines a specific TiledMMA for a given MMA atom + // (XE_8x16x16_F32BF16BF16F32_TT), TileShape (<256, 256, 32>) and sub-group layout (8x4x1). The + // TiledMMA constructed using TiledMMAHelper has the property that each sub-group operates on a + // single contiguous chunk of the work-group TileShape. For this configuration, this implies that + // each sub-group operates on a contiguous 32x64x32 chunk (4x4x2 iterations). See + // 0t_mma_atom.md#TiledMMAs for more info. Sub-groups are arranged row-major (stride 4,1,0) for + // performance reasons. + using TiledMma = // M=8,N=16,K=16, D=f32,A=bf16,B=bf16,C=f32 + typename TiledMMAHelper, Layout, + Layout, Stride<_4, _1, _0>>>::TiledMMA; + + // For Intel BMG, PipelineStages defines how many k-blocks ahead to prefetch from A and B. + constexpr int PipelineStages = 2; + using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelXeXMX16; + using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeXMX16; + + // This is the 'default' epilogue operation (Linear Combination) which performs everything in: + // (D = alpha * (A*B) + beta * C) + // aside from the (A*B), which is handled by the GEMM. See 05_bmg_gemm_with_epilogues for more + // complex epilogue examples. + using EpilogueOp = cutlass::epilogue::fusion::LinearCombination; + + // FusionCallbacks ties the EpilogueOp to an implementation (based on the dispatch + // policy/architecture) and defines the epilogue arguments. 
+ using FusionCallBacks = cutlass::epilogue::fusion::FusionCallbacks; + // GEMM Epilogue - loads & stores C/D matrices, performs epilogue operations & load/stores any + // auxiliary data required + using CollectiveEpilogue = cutlass::epilogue::collective::CollectiveEpilogue< + EpilogueDispatchPolicy, + TileShape, + ElementAccumulator, + cutlass::gemm::TagToStrideC_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation + ElementOutput, + cutlass::gemm::TagToStrideC_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation + FusionCallBacks, + XE_2D_U32x8x16_LD_N, // The copy atom used to load matrix C + void, void, + XE_2D_U32x8x16_ST_N, // The copy atom used to store matrix D + void, void>; + + // GEMM Mainloop - iteration over blocks in K dimension + using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< + GEMMDispatchPolicy, + TileShape, + ElementInputA, + cutlass::gemm::TagToStrideA_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation + ElementInputB, + cutlass::gemm::TagToStrideB_t, // Converts CUTLASS 2.x to CUTLASS 3.x representation + TiledMma, + GmemTiledCopyA, void, void, cute::identity, // A + GmemTiledCopyB, void, void, cute::identity // B + >; + + // Define the whole kernel (mainloop and epilogue) + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, // Defer global problem shape definition to runtime + CollectiveMainloop, + CollectiveEpilogue + >; + + // The GemmUniversalAdapter wraps the defined GEMM kernel and handles the launch, and e.g. + // persistent scratch memory if required. + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + ExampleRunner runner; + + CUTLASS_CHECK(runner.run(options, hw_info)); + + return 0; +} \ No newline at end of file diff --git a/examples/00_bmg_gemm/legacy/CMakeLists.txt b/examples/00_bmg_gemm/legacy/CMakeLists.txt new file mode 100644 index 0000000000..1d40199221 --- /dev/null +++ b/examples/00_bmg_gemm/legacy/CMakeLists.txt @@ -0,0 +1,41 @@ +# Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +set(TEST_BATCHES --l=2) +set(TEST_LARGE "--l=513 --m=8 --n=16384 --k=512") # B matrix capacity > uint32_max +set(TEST_SMALL_SHAPE --m=4 --n=8 --k=8 --l=2) + + +cutlass_example_add_executable( + 00_bmg_gemm_legacy + 00_bmg_gemm.cpp + TEST_COMMAND_OPTIONS + TEST_BATCHES + TEST_LARGE + TEST_SMALL_SHAPE +) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d141f5b7de..38c6b34b75 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -101,6 +101,7 @@ if(CUTLASS_ENABLE_SYCL) message(STATUS "Building examples for Intel GPU targets") foreach(EXAMPLE 00_bmg_gemm + 00_bmg_gemm/legacy 01_bmg_gemm_with_collective_builder 02_bmg_gemm_mixed_dtype 03_bmg_gemm_streamk diff --git a/include/cutlass/gemm/collective/xe_mma.hpp b/include/cutlass/gemm/collective/xe_mma.hpp index 64f23fd93f..bbd6add94e 100644 --- a/include/cutlass/gemm/collective/xe_mma.hpp +++ b/include/cutlass/gemm/collective/xe_mma.hpp @@ -100,6 +100,37 @@ struct CollectiveMma, TileShape_, ElementA_ static constexpr auto Num_SGs = ATOM_N * ATOM_M * ATOM_K; static constexpr uint32_t MaxThreadsPerBlock = size(TiledMma{}); + // Helper struct to deduce CopyOpA type + template + struct CopyOpAHelper { + static auto get() { + auto tmp = make_tensor(make_gmem_ptr(static_cast(nullptr)), + make_layout(make_shape(Int{}, Int{}, Int<1>{}), SA{})); + if constexpr (!std::is_void_v) { + return make_block_2d_copy_A(GTCA{}, TM{}, tmp(_,_,0)); + } else { + return make_block_2d_copy_A(TM{}, tmp(_,_,0)); + } + } + }; + + // Helper struct to deduce CopyOpB type + template + struct CopyOpBHelper { + static auto get() { + auto tmp = make_tensor(make_gmem_ptr(static_cast(nullptr)), + make_layout(make_shape(Int{}, Int{}, Int<1>{}), SB{})); + if constexpr (!std::is_void_v) { + return make_block_2d_copy_B(GTCB{}, TM{}, tmp(_,_,0)); + } else { + return make_block_2d_copy_B(TM{}, tmp(_,_,0)); + } + } + }; + + using CopyOpA = decltype(CopyOpAHelper::get()); + using CopyOpB = decltype(CopyOpBHelper::get()); + // Host side kernel arguments struct Arguments { ElementA const* ptr_A; @@ -113,7 +144,8 @@ struct CollectiveMma, TileShape_, ElementA_ StrideA dA; ElementB const* ptr_B; StrideB dB; - int M, N, K, L; + CopyOpA copy_a; + CopyOpB copy_b; }; // @@ -129,11 +161,32 @@ struct CollectiveMma, TileShape_, ElementA_ auto [M,N,K,L] = problem_shape; + auto mA_mkl = make_tensor(make_gmem_ptr(args.ptr_A), + make_layout(make_shape(M, K, L), args.dA)); + auto mB_nkl = make_tensor(make_gmem_ptr(args.ptr_B), + make_layout(make_shape(N, K, L), args.dB)); + + CopyOpA copy_a = [&]() { + if constexpr (!std::is_void_v) { + return make_block_2d_copy_A(GmemTiledCopyA{}, TiledMma{}, mA_mkl(_,_,0)); + } else { + return make_block_2d_copy_A(TiledMma{}, mA_mkl(_,_,0)); + } + }(); + + CopyOpB copy_b = [&]() { + if constexpr (!std::is_void_v) { + return make_block_2d_copy_B(GmemTiledCopyB{}, TiledMma{}, mB_nkl(_,_,0)); + } else { + return make_block_2d_copy_B(TiledMma{}, mB_nkl(_,_,0)); + } + }(); + return Params{args.ptr_A, args.dA, 
args.ptr_B, args.dB, - M, N, K, L}; + copy_a, copy_b}; } template @@ -176,32 +229,8 @@ struct CollectiveMma, TileShape_, ElementA_ static_assert(is_rmem::value, "D tensor must be rmem resident."); static_assert(is_rmem::value, "C tensor must be rmem resident."); - auto mA_mkl = make_tensor(make_gmem_ptr(mainloop.ptr_A), - make_layout(make_shape(mainloop.M, mainloop.K, mainloop.L), cute::take<0,2>(mainloop.dA))); - auto mB_nkl = make_tensor(make_gmem_ptr(mainloop.ptr_B), - make_layout(make_shape(mainloop.N, mainloop.K, mainloop.L), cute::take<0,2>(mainloop.dB))); - auto copy_a = [&]() { - if constexpr (!std::is_void_v) { - // User provided copy operation - return make_block_2d_copy_A(GmemTiledCopyA{}, TiledMma{}, mA_mkl); - } else { - // make_block_2d_copy_A automatically selects copy operation - return make_block_2d_copy_A(TiledMma{}, mA_mkl); - } - }(); - - auto copy_b = [&]() { - if constexpr (!std::is_void_v) { - // User provided copy operation - return make_block_2d_copy_B(GmemTiledCopyB{}, TiledMma{}, mB_nkl); - } else { - // make_block_2d_copy_B automatically selects copy operation - return make_block_2d_copy_B(TiledMma{}, mB_nkl); - } - }(); - - auto thr_copy_a = copy_a.get_slice(thread_idx); - auto thr_copy_b = copy_b.get_slice(thread_idx); + auto thr_copy_a = mainloop.copy_a.get_slice(thread_idx); + auto thr_copy_b = mainloop.copy_b.get_slice(thread_idx); // Instantiate the MMA object and get thread slice TiledMma tiled_mma; @@ -220,8 +249,8 @@ struct CollectiveMma, TileShape_, ElementA_ Tensor tBgB = thr_copy_b.partition_S(gB); /* Create prefetch TiledCopy instances */ - auto prefetch_a = make_block_2d_prefetch(copy_a); - auto prefetch_b = make_block_2d_prefetch(copy_b); + auto prefetch_a = make_block_2d_prefetch(mainloop.copy_a); + auto prefetch_b = make_block_2d_prefetch(mainloop.copy_b); auto thr_prefetch_A = prefetch_a.get_slice(thread_idx); auto thr_prefetch_B = prefetch_b.get_slice(thread_idx); @@ -238,14 +267,14 @@ struct CollectiveMma, TileShape_, ElementA_ PRINT(tCrA); PRINT(tArA); - PRINT(copy_a); + PRINT(mainloop.copy_a); print("======================= B: \n"); PRINT(tBgB); PRINT(tCrB); PRINT(tBrB); - PRINT(copy_b); + PRINT(mainloop.copy_b); } #undef PRINT #endif @@ -266,8 +295,8 @@ struct CollectiveMma, TileShape_, ElementA_ for (int k_tile = k_start_idx; k_tile < k_tile_count + k_start_idx; k_tile++, prefetch_k++) { barrier_arrive(barrier_scope); // Copy gmem to rmem for the first k_tile - copy(copy_a, tAgA(_,_,_,k_tile), tArA); - copy(copy_b, tBgB(_,_,_,k_tile), tBrB); + copy(mainloop.copy_a, tAgA(_,_,_,k_tile), tArA); + copy(mainloop.copy_b, tBgB(_,_,_,k_tile), tBrB); if (prefetch_k < k_tile_count) { prefetch(prefetch_a, pAgA(_, _, _, prefetch_k)); From ea67069e04beca4db21d8df3388dd66ae5ba2567 Mon Sep 17 00:00:00 2001 From: Anamika Chatterjee Date: Fri, 10 Oct 2025 11:32:35 +0300 Subject: [PATCH 13/15] Applied reviwer comment --- examples/00_bmg_gemm/00_bmg_gemm.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/00_bmg_gemm/00_bmg_gemm.cpp b/examples/00_bmg_gemm/00_bmg_gemm.cpp index 0f7c665820..874fce9c75 100644 --- a/examples/00_bmg_gemm/00_bmg_gemm.cpp +++ b/examples/00_bmg_gemm/00_bmg_gemm.cpp @@ -277,8 +277,6 @@ struct ExampleRunner { bool passed = verify(problem_size, options.alpha, options.beta); std::cout << "Disposition: " << (passed ? 
"Passed" : "Failed") << std::endl; - if(!passed) return cutlass::Status::kErrorInternal; - if (options.iterations > 0) { GPU_Clock timer; timer.start(); @@ -345,7 +343,7 @@ int main(int argc, const char** argv) using LayoutC = cutlass::layout::RowMajor; using LayoutD = cutlass::layout::RowMajor; - // [New Copy Atom] When left unspecified (void), make_block_2d_copy_* automatically selects + // [New Copy Atom] When left unspecified (void), MainloopXeL1Staged automatically selects // appropriate 2D block copy operations for matrices A and B. Alternatively, you can // explicitly specify new copy atom operations such as XE_LOAD_2D, XE_LOAD_2D_VNNI // (applicable only to matrix B), or XE_LOAD_2D_TRANSPOSE. From e9878b9c164a4be800e324457074ebc9bf2c1f9e Mon Sep 17 00:00:00 2001 From: Anamika Chatterjee Date: Fri, 10 Oct 2025 11:37:45 +0300 Subject: [PATCH 14/15] This is an empty commit From fbb7bb5c58a52bf2ffacd2473f97945b0c8a199b Mon Sep 17 00:00:00 2001 From: Anamika Chatterjee Date: Fri, 10 Oct 2025 16:59:05 +0530 Subject: [PATCH 15/15] Preventing exceptions on older IGC versions --- examples/00_bmg_gemm/00_bmg_gemm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/00_bmg_gemm/00_bmg_gemm.cpp b/examples/00_bmg_gemm/00_bmg_gemm.cpp index 874fce9c75..508c2e5063 100644 --- a/examples/00_bmg_gemm/00_bmg_gemm.cpp +++ b/examples/00_bmg_gemm/00_bmg_gemm.cpp @@ -277,7 +277,7 @@ struct ExampleRunner { bool passed = verify(problem_size, options.alpha, options.beta); std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl; - if (options.iterations > 0) { + if (passed && options.iterations > 0) { GPU_Clock timer; timer.start(); for (int i = 0; i < options.iterations; ++i) {