intel · jojivk73 · Sep 4, 2025 · Sep 11, 2025 · Sep 16, 2025
diff --git a/include/cutlass/arch/config.h b/include/cutlass/arch/config.h
@@ -39,10 +39,19 @@
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
+
+// This is a workaround and should be removed in the future.
+// Issue in next #if (cuda flags not getting set) need to be fixed for SYCL.
+#if (!defined(CUTLASS_ARCH_MMA_SM90_ENABLED) && defined(__SYCL_CUDA_ARCH__) && (__SYCL_CUDA_ARCH__ >= 900))
+  #define CUTLASS_ARCH_MMA_SM90_ENABLED 1
+  #pragma message("FROM THE PREPROCESSOR CUTLASS_ARCH_MMA_SM90_ENABLED.")
+#endif
+
 // SM90
 #if (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 0))
   #define CUTLASS_ARCH_MMA_SM90_SUPPORTED 1
-  #if (!defined(CUTLASS_ARCH_MMA_SM90_ENABLED) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 900)
+  #if ((!defined(CUTLASS_ARCH_MMA_SM90_ENABLED) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 900) || \
+       (defined(__SYCL_CUDA_ARCH__) && (__SYCL_CUDA_ARCH__ >= 900)))
     #define CUTLASS_ARCH_MMA_SM90_ENABLED 1
 
     #if (!defined(CUTLASS_ARCH_MMA_SM90A_ENABLED) && defined(__CUDA_ARCH_FEAT_SM90_ALL))

diff --git a/test/unit/common/cutlass_unit_test.h b/test/unit/common/cutlass_unit_test.h
@@ -93,7 +93,7 @@ int CutlassUnitTestProblemCount();
 #define CUTLASS_TEST_UNIT_ENABLE_WARNINGS false
 #endif
 
-#if (__CUDACC_VER_MAJOR__ >= 12)
+#if ((__CUDACC_VER_MAJOR__ >= 12) || (defined(__SYCL_CUDA_ARCH__) && (__SYCL_CUDA_ARCH__ >= 900)))
   #define CUDA_12_0_SM90_FEATURES_SUPPORTED true
 #else
   #define CUDA_12_0_SM90_FEATURES_SUPPORTED false

diff --git a/test/unit/cute/CMakeLists.txt b/test/unit/cute/CMakeLists.txt
@@ -74,7 +74,7 @@ else()
     add_subdirectory(volta)
     add_subdirectory(turing)
     add_subdirectory(ampere)
-    # add_subdirectory(hopper) // Hopper test support to come later once complete support in DPCPP
+    add_subdirectory(hopper)
 
   endif()
 
@@ -87,7 +87,7 @@ else()
       cutlass_test_unit_cute_volta
       cutlass_test_unit_cute_turing
       cutlass_test_unit_cute_ampere
-      #cutlass_test_unit_cute_hopper
+      cutlass_test_unit_cute_hopper
       cutlass_test_unit_cute_msvc_compilation
       )
 
@@ -99,7 +99,7 @@ else()
       test_unit_cute_volta
       test_unit_cute_ampere
       test_unit_cute_turing
-      #test_unit_cute_hopper
+      test_unit_cute_hopper
       test_unit_cute_msvc_compilation
       )
   else()

diff --git a/test/unit/cute/hopper/bulk_load.cu b/test/unit/cute/hopper/bulk_load.cu
@@ -63,10 +63,10 @@ bulk_copy_test_device_cute(T const* g_in,
 {
   // Use Shared Storage structure to allocate and distribute aligned SMEM addresses
   #if defined(__SYCL_DEVICE_ONLY__)
-  auto smem = sycl_ext::get_dynamic_work_group_memory<char>().get();
+  auto shared_memory = sycl_ext::get_work_group_scratch_memory();
   #endif
   #if defined(CUTLASS_ENABLE_SYCL) && !defined(__SYCL_DEVICE_ONLY__)
-    char* smem; // dummy declaration to avoid compilation errors during the host compilation phase
+    char* shared_memory; // dummy declaration to avoid compilation errors during the host compilation phase
   #endif
   #if !defined(CUTLASS_ENABLE_SYCL)
     extern CUTLASS_SHARED char shared_memory[];
@@ -146,7 +146,7 @@ void run_and_validate(GLayout gmem_layout,
   #if defined(CUTLASS_ENABLE_SYCL)
     sc_exp::launch<bulk_copy_test_device_cute<T, GLayout, SLayout>>
     ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(128), 
-      sc_exp::launch_properties{sycl_ext::work_group_static_size(smem_size)}},
+      sc_exp::launch_properties{sycl_ext::work_group_scratch_size(smem_size)}},
       d_in.data(), d_out.data(), gmem_layout, smem_layout);
     sc::wait_and_throw();
   #else

diff --git a/test/unit/cute/hopper/bulk_store.cu b/test/unit/cute/hopper/bulk_store.cu
@@ -62,10 +62,10 @@ bulk_copy_test_device_cute(T const* g_in,
 {
   // Use Shared Storage structure to allocate and distribute aligned SMEM addresses
   #if defined(__SYCL_DEVICE_ONLY__)
-  auto smem = sycl_ext::get_dynamic_work_group_memory<char>().get();
+  auto shared_memory = sycl_ext::get_work_group_scratch_memory();
   #endif
   #if defined(CUTLASS_ENABLE_SYCL) && !defined(__SYCL_DEVICE_ONLY__)
-    char* smem; // dummy declaration to avoid compilation errors during the host compilation phase
+    char* shared_memory; // dummy declaration to avoid compilation errors during the host compilation phase
   #endif
   #if !defined(CUTLASS_ENABLE_SYCL)
     extern CUTLASS_SHARED char shared_memory[];
@@ -83,9 +83,16 @@ bulk_copy_test_device_cute(T const* g_in,
   //
 
   // Input gmem -> smem
+  #if defined(CUTLASS_ENABLE_SYCL) || defined(__SYCL_DEVICE_ONLY__)
+  for (int i = ThreadIdxX(); i < size(sA); i +=  BlockDimX()) {
+    sA(i) = gA(i);
+  }
+  #else
   for (int i = threadIdx.x; i < size(sA); i += blockDim.x) {
     sA(i) = gA(i);
   }
+  #endif
+
 
   cp_async_fence();
   cp_async_wait<0>();
@@ -128,7 +135,7 @@ void run_and_validate(GLayout gmem_layout,
   #if defined(CUTLASS_ENABLE_SYCL)
   sc_exp::launch<bulk_copy_test_device_cute<T, GLayout, SLayout>>
   ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(128), 
-    sc_exp::launch_properties{sycl_ext::work_group_static_size(smem_size)}},
+    sc_exp::launch_properties{sycl_ext::work_group_scratch_size(smem_size)}},
     d_in.data(), d_out.data(), gmem_layout, smem_layout);
   sc::wait_and_throw();
   #else

diff --git a/test/unit/cute/hopper/stsm.cu b/test/unit/cute/hopper/stsm.cu
@@ -59,7 +59,7 @@ stsm_test_device(uint16_t* g_in, uint16_t* g_out)
   }
 
   #if defined(__SYCL_DEVICE_ONLY__)
-    auto smem = sycl_ext::get_dynamic_work_group_memory<uint32_t>().get();
+    auto smem = sycl_ext::get_work_group_scratch_memory();
   #endif
   #if defined(CUTLASS_ENABLE_SYCL) && !defined(__SYCL_DEVICE_ONLY__)
     uint32_t* smem; // dummy declaration to avoid compilation errors during the host compilation phase
@@ -69,15 +69,15 @@ stsm_test_device(uint16_t* g_in, uint16_t* g_out)
   #endif
 
   // load rmem -> smem using STSM
-  uint128_t* smem_ptr = reinterpret_cast<uint128_t*>(smem) + tid;
+  uint32_t* smem_ptr = reinterpret_cast<uint32_t*>(smem) + tid;
   T*         rmem_ptr = reinterpret_cast<T*>(reg);
   cute::copy_stsm(rmem_ptr, smem_ptr);
 
   syncthreads();
 
   // store output smem -> gmem
   for (int i = 0; i < (sizeof(T) / 4); i++) {
-    reinterpret_cast<uint32_t*>(g_out)[tid + (stride * i)] = smem[tid + (stride * i)];
+    reinterpret_cast<uint32_t*>(g_out)[tid + (stride * i)] = smem_ptr[tid + (stride * i)];
   }
 }
 
@@ -89,10 +89,10 @@ stsm_test_device_cute(uint16_t* g_in, uint16_t* g_out,
   using namespace cute;
 
   #if defined(__SYCL_DEVICE_ONLY__)
-    auto smem = sycl_ext::get_dynamic_work_group_memory<uint16_t>().get();
+    char* smem = sycl_ext::dynamic_work_group_memory<char[]>().get();
   #endif
   #if defined(CUTLASS_ENABLE_SYCL) && !defined(__SYCL_DEVICE_ONLY__)
-    char* smem_buf; // dummy declaration to avoid compilation errors during the host compilation phase
+    char* smem; // dummy declaration to avoid compilation errors during the host compilation phase
   #endif
   #if !defined(CUTLASS_ENABLE_SYCL)
     CUTLASS_SHARED uint16_t smem[size(smem_layout)];
@@ -151,8 +151,8 @@ TEST(SM90_CuTe_Hopper, Stsm)
   device_vector<uint16_t> d_out(count);
   #if defined(CUTLASS_ENABLE_SYCL)
   sc_exp::launch<stsm_test_device<uint32_t>>
-  ( sc::launch_policy{sc::dim3(1), sc::dim3(32), 
-    sc_exp::launch_properties{sycl_ext::work_group_static_size(sizeof(uint32_t) / 4 * 32)}},
+  ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(32),
+    sc_exp::launch_properties{sycl_ext::work_group_scratch_size(sizeof(uint32_t) / 4 * 32)}},
     d_in.data(), d_out.data());
   sc::wait_and_throw();
   #else
@@ -176,8 +176,8 @@ TEST(SM90_CuTe_Hopper, Stsm)
   device_vector<uint16_t> d_out(count);
   #if defined(CUTLASS_ENABLE_SYCL)
   sc_exp::launch<stsm_test_device<uint64_t>>
-  ( sc::launch_policy{sc::dim3(1), sc::dim3(32), 
-    sc_exp::launch_properties{sycl_ext::work_group_static_size(sizeof(uint64_t) / 4 * 32)}},
+  ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(32),
+    sc_exp::launch_properties{sycl_ext::work_group_scratch_size(sizeof(uint64_t) / 4 * 32)}},
     d_in.data(), d_out.data());
   sc::wait_and_throw();
   #else
@@ -201,8 +201,8 @@ TEST(SM90_CuTe_Hopper, Stsm)
   device_vector<uint16_t> d_out(count);
   #if defined(CUTLASS_ENABLE_SYCL)
   sc_exp::launch<stsm_test_device<uint128_t>>
-  ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(32), 
-    sc_exp::launch_properties{sycl_ext::work_group_static_size(sizeof(uint128_t) / 4 * 32)}},
+  ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(32),
+    sc_exp::launch_properties{sycl_ext::work_group_scratch_size(sizeof(uint128_t) / 4 * 32)}},
     d_in.data(), d_out.data());
   sc::wait_and_throw();
   #else
@@ -232,8 +232,8 @@ TEST(SM90_CuTe_Hopper, Stsm)
                                     Layout<Shape< _1,_8>>{});
   #if defined(CUTLASS_ENABLE_SYCL)
   sc_exp::launch<stsm_test_device_cute<decltype(tiled_copy), decltype(smem_layout)>>
-  ( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), 
-    sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}},
+  ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
+    sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}},
     d_in.data(), d_out.data(), tiled_copy, smem_layout);
   sc::wait_and_throw();
   #else
@@ -262,8 +262,8 @@ TEST(SM90_CuTe_Hopper, Stsm)
 
   #if defined(CUTLASS_ENABLE_SYCL)
   sc_exp::launch<stsm_test_device_cute<decltype(tiled_copy), decltype(smem_layout)>>
-  ( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), 
-    sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}},
+  ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
+    sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}},
     d_in.data(), d_out.data(), tiled_copy, smem_layout);
   sc::wait_and_throw();
   #else
@@ -291,8 +291,8 @@ TEST(SM90_CuTe_Hopper, Stsm)
                                     Layout<Shape< _1,_8>>{});
   #if defined(CUTLASS_ENABLE_SYCL)
   sc_exp::launch<stsm_test_device_cute<decltype(tiled_copy), decltype(smem_layout)>>
-  ( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), 
-    sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}},
+  ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
+    sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}},
     d_in.data(), d_out.data(), tiled_copy, smem_layout);
   sc::wait_and_throw();
   #else
@@ -321,8 +321,8 @@ TEST(SM90_CuTe_Hopper, Stsm)
 
   #if defined(CUTLASS_ENABLE_SYCL)
   sc_exp::launch<stsm_test_device_cute<decltype(tiled_copy), decltype(smem_layout)>>
-  ( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), 
-    sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}},
+  ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
+    sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}},
     d_in.data(), d_out.data(), tiled_copy, smem_layout);
   sc::wait_and_throw();
   #else
@@ -350,8 +350,8 @@ TEST(SM90_CuTe_Hopper, Stsm)
                                     Layout<Shape< _2,_4>>{});
   #if defined(CUTLASS_ENABLE_SYCL)
   sc_exp::launch<stsm_test_device_cute<decltype(tiled_copy), decltype(smem_layout)>>
-  ( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), 
-    sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}},
+  ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
+    sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}},
     d_in.data(), d_out.data(), tiled_copy, smem_layout);
   sc::wait_and_throw();
   #else
@@ -379,8 +379,8 @@ TEST(SM90_CuTe_Hopper, Stsm)
                                     Layout<Shape< _2,_4>>{});
   #if defined(CUTLASS_ENABLE_SYCL)
   sc_exp::launch<stsm_test_device_cute<decltype(tiled_copy), decltype(smem_layout)>>
-  ( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), 
-    sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}},
+  ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
+    sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}},
     d_in.data(), d_out.data(), tiled_copy, smem_layout);
   sc::wait_and_throw();
   #else
@@ -408,8 +408,8 @@ TEST(SM90_CuTe_Hopper, Stsm)
                                     Layout<Shape< _2,_4>>{});
   #if defined(CUTLASS_ENABLE_SYCL)
   sc_exp::launch<stsm_test_device_cute<decltype(tiled_copy), decltype(smem_layout)>>
-  ( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), 
-    sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}},
+  ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
+    sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}},
     d_in.data(), d_out.data(), tiled_copy, smem_layout);
   sc::wait_and_throw();
   #else
@@ -437,8 +437,8 @@ TEST(SM90_CuTe_Hopper, Stsm)
                                     Layout<Shape< _2,_4>>{});
   #if defined(CUTLASS_ENABLE_SYCL)
   sc_exp::launch<stsm_test_device_cute<decltype(tiled_copy), decltype(smem_layout)>>
-  ( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), 
-    sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}},
+  ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
+    sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}},
     d_in.data(), d_out.data(), tiled_copy, smem_layout);
   sc::wait_and_throw();
   #else
@@ -466,8 +466,8 @@ TEST(SM90_CuTe_Hopper, Stsm)
                                     Layout<Shape<_2,_1>>{});
   #if defined(CUTLASS_ENABLE_SYCL)
   sc_exp::launch<stsm_test_device_cute<decltype(tiled_copy), decltype(smem_layout)>>
-  ( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), 
-    sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}},
+  ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
+    sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}},
     d_in.data(), d_out.data(), tiled_copy, smem_layout);
   sc::wait_and_throw();
   #else
@@ -495,8 +495,8 @@ TEST(SM90_CuTe_Hopper, Stsm)
                                     Layout<Shape<_4,_1>>{});
   #if defined(CUTLASS_ENABLE_SYCL)
   sc_exp::launch<stsm_test_device_cute<decltype(tiled_copy), decltype(smem_layout)>>
-  ( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), 
-    sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}},
+  ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
+    sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}},
     d_in.data(), d_out.data(), tiled_copy, smem_layout);
   sc::wait_and_throw();
   #else
@@ -524,8 +524,8 @@ TEST(SM90_CuTe_Hopper, Stsm)
                                     Layout<Shape<_8,_1>>{});
   #if defined(CUTLASS_ENABLE_SYCL)
   sc_exp::launch<stsm_test_device_cute<decltype(tiled_copy), decltype(smem_layout)>>
-  ( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), 
-    sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}},
+  ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
+    sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}},
     d_in.data(), d_out.data(), tiled_copy, smem_layout);
   sc::wait_and_throw();
   #else

diff --git a/test/unit/cute/hopper/tma_load.cu b/test/unit/cute/hopper/tma_load.cu
@@ -31,7 +31,7 @@
 
 #include "cutlass_unit_test.h"
 
-#include "../hopper/tma_load_testbed.hpp"
+#include "tma_load_testbed.hpp"
 
 using namespace cute;
 using namespace cutlass::test;

diff --git a/test/unit/cute/hopper/tma_load_testbed.hpp b/test/unit/cute/hopper/tma_load_testbed.hpp
@@ -66,10 +66,10 @@ tma_test_device_cute(T const* g_in, T* g_out,
 
   // Use Shared Storage structure to allocate and distribute aligned SMEM addresses
   #if defined(__SYCL_DEVICE_ONLY__)
-    auto smem = sycl_ext::get_dynamic_work_group_memory<char>().get();
+    auto shared_memory = sycl_ext::get_work_group_scratch_memory();
   #endif
   #if defined(CUTLASS_ENABLE_SYCL) && !defined(__SYCL_DEVICE_ONLY__)
-    char* smem; // dummy declaration to avoid compilation errors during the host compilation phase
+    char* shared_memory; // dummy declaration to avoid compilation errors during the host compilation phase
   #endif
   #if !defined(CUTLASS_ENABLE_SYCL)
     extern CUTLASS_SHARED char shared_memory[];
@@ -172,6 +172,7 @@ tma_test_device_cute(T const* g_in, T* g_out,
   }
 }
 
+template <class> class PT;
 template <class T, class TmaType = T, class CopyOp, class GMEM_Layout, class SMEM_Layout, class CTA_Tile>
 auto
 test_tma_load(CopyOp      const& copy_op,
@@ -201,16 +202,18 @@ test_tma_load(CopyOp      const& copy_op,
   // Launch
   int smem_size = int(sizeof(SharedStorage<T, decltype(smem_layout)>));
   #if defined(CUTLASS_ENABLE_SYCL)
-  auto kernel = tma_test_device_cute<T, 
+  constexpr auto kernel = tma_test_device_cute<T,
                                     decltype(tma), 
                                     CTA_Tile,
-                                    GmemLayout,
-                                    SmemLayout>;
+                                    GMEM_Layout,
+                                    SMEM_Layout>;
+
   sc_exp::launch<kernel>
   ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(128), 
-    sc_exp::launch_properties{sycl_ext::work_group_static_size(smem_size)}},
-    d_in.data(), d_out.data(), tma, cta_tile,
-    gmem_layout, smem_layout);
+    sc_exp::launch_properties{sycl_ext::work_group_scratch_size(smem_size)}},
+    reinterpret_cast<T const*>(raw_pointer_cast(d_in.data())),
+    reinterpret_cast<T*>      (raw_pointer_cast(d_out.data())),
+    tma, cta_tile, gmem_layout, smem_layout);
   sc::wait_and_throw();
   #else
   tma_test_device_cute<<<1, 128, smem_size>>>(