diff --git a/include/cutlass/arch/config.h b/include/cutlass/arch/config.h index e5daf8292b..ea1cda3e83 100644 --- a/include/cutlass/arch/config.h +++ b/include/cutlass/arch/config.h @@ -39,10 +39,19 @@ ///////////////////////////////////////////////////////////////////////////////////////////////// + +// This is a workaround and should be removed in the future. +// Issue in next #if (cuda flags not getting set) need to be fixed for SYCL. +#if (!defined(CUTLASS_ARCH_MMA_SM90_ENABLED) && defined(__SYCL_CUDA_ARCH__) && (__SYCL_CUDA_ARCH__ >= 900)) + #define CUTLASS_ARCH_MMA_SM90_ENABLED 1 + #pragma message("FROM THE PREPROCESSOR CUTLASS_ARCH_MMA_SM90_ENABLED.") +#endif + // SM90 #if (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 0)) #define CUTLASS_ARCH_MMA_SM90_SUPPORTED 1 - #if (!defined(CUTLASS_ARCH_MMA_SM90_ENABLED) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 900) + #if ((!defined(CUTLASS_ARCH_MMA_SM90_ENABLED) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 900) || \ + (defined(__SYCL_CUDA_ARCH__) && (__SYCL_CUDA_ARCH__ >= 900))) #define CUTLASS_ARCH_MMA_SM90_ENABLED 1 #if (!defined(CUTLASS_ARCH_MMA_SM90A_ENABLED) && defined(__CUDA_ARCH_FEAT_SM90_ALL)) diff --git a/test/unit/common/cutlass_unit_test.h b/test/unit/common/cutlass_unit_test.h index 2fe080d2ca..02d082da60 100644 --- a/test/unit/common/cutlass_unit_test.h +++ b/test/unit/common/cutlass_unit_test.h @@ -93,7 +93,7 @@ int CutlassUnitTestProblemCount(); #define CUTLASS_TEST_UNIT_ENABLE_WARNINGS false #endif -#if (__CUDACC_VER_MAJOR__ >= 12) +#if ((__CUDACC_VER_MAJOR__ >= 12) || (defined(__SYCL_CUDA_ARCH__) && (__SYCL_CUDA_ARCH__ >= 900))) #define CUDA_12_0_SM90_FEATURES_SUPPORTED true #else #define CUDA_12_0_SM90_FEATURES_SUPPORTED false diff --git a/test/unit/cute/CMakeLists.txt b/test/unit/cute/CMakeLists.txt index 98c124abb3..e3b0ff6e5f 100644 --- a/test/unit/cute/CMakeLists.txt +++ b/test/unit/cute/CMakeLists.txt @@ -74,7 +74,7 @@ else() add_subdirectory(volta) add_subdirectory(turing) add_subdirectory(ampere) - # add_subdirectory(hopper) // Hopper test support to come later once complete support in DPCPP + add_subdirectory(hopper) endif() @@ -87,7 +87,7 @@ else() cutlass_test_unit_cute_volta cutlass_test_unit_cute_turing cutlass_test_unit_cute_ampere - #cutlass_test_unit_cute_hopper + cutlass_test_unit_cute_hopper cutlass_test_unit_cute_msvc_compilation ) @@ -99,7 +99,7 @@ else() test_unit_cute_volta test_unit_cute_ampere test_unit_cute_turing - #test_unit_cute_hopper + test_unit_cute_hopper test_unit_cute_msvc_compilation ) else() diff --git a/test/unit/cute/hopper/bulk_load.cu b/test/unit/cute/hopper/bulk_load.cu index e04d0ebd06..ebef5b7818 100644 --- a/test/unit/cute/hopper/bulk_load.cu +++ b/test/unit/cute/hopper/bulk_load.cu @@ -63,10 +63,10 @@ bulk_copy_test_device_cute(T const* g_in, { // Use Shared Storage structure to allocate and distribute aligned SMEM addresses #if defined(__SYCL_DEVICE_ONLY__) - auto smem = sycl_ext::get_dynamic_work_group_memory().get(); + auto shared_memory = sycl_ext::get_work_group_scratch_memory(); #endif #if defined(CUTLASS_ENABLE_SYCL) && !defined(__SYCL_DEVICE_ONLY__) - char* smem; // dummy declaration to avoid compilation errors during the host compilation phase + char* shared_memory; // dummy declaration to avoid compilation errors during the host compilation phase #endif #if !defined(CUTLASS_ENABLE_SYCL) extern CUTLASS_SHARED char shared_memory[]; @@ -146,7 +146,7 @@ void run_and_validate(GLayout gmem_layout, #if defined(CUTLASS_ENABLE_SYCL) sc_exp::launch> ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(128), - sc_exp::launch_properties{sycl_ext::work_group_static_size(smem_size)}}, + sc_exp::launch_properties{sycl_ext::work_group_scratch_size(smem_size)}}, d_in.data(), d_out.data(), gmem_layout, smem_layout); sc::wait_and_throw(); #else diff --git a/test/unit/cute/hopper/bulk_store.cu b/test/unit/cute/hopper/bulk_store.cu index af31095095..ca4b7f4b05 100644 --- a/test/unit/cute/hopper/bulk_store.cu +++ b/test/unit/cute/hopper/bulk_store.cu @@ -62,10 +62,10 @@ bulk_copy_test_device_cute(T const* g_in, { // Use Shared Storage structure to allocate and distribute aligned SMEM addresses #if defined(__SYCL_DEVICE_ONLY__) - auto smem = sycl_ext::get_dynamic_work_group_memory().get(); + auto shared_memory = sycl_ext::get_work_group_scratch_memory(); #endif #if defined(CUTLASS_ENABLE_SYCL) && !defined(__SYCL_DEVICE_ONLY__) - char* smem; // dummy declaration to avoid compilation errors during the host compilation phase + char* shared_memory; // dummy declaration to avoid compilation errors during the host compilation phase #endif #if !defined(CUTLASS_ENABLE_SYCL) extern CUTLASS_SHARED char shared_memory[]; @@ -83,9 +83,16 @@ bulk_copy_test_device_cute(T const* g_in, // // Input gmem -> smem + #if defined(CUTLASS_ENABLE_SYCL) || defined(__SYCL_DEVICE_ONLY__) + for (int i = ThreadIdxX(); i < size(sA); i += BlockDimX()) { + sA(i) = gA(i); + } + #else for (int i = threadIdx.x; i < size(sA); i += blockDim.x) { sA(i) = gA(i); } + #endif + cp_async_fence(); cp_async_wait<0>(); @@ -128,7 +135,7 @@ void run_and_validate(GLayout gmem_layout, #if defined(CUTLASS_ENABLE_SYCL) sc_exp::launch> ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(128), - sc_exp::launch_properties{sycl_ext::work_group_static_size(smem_size)}}, + sc_exp::launch_properties{sycl_ext::work_group_scratch_size(smem_size)}}, d_in.data(), d_out.data(), gmem_layout, smem_layout); sc::wait_and_throw(); #else diff --git a/test/unit/cute/hopper/stsm.cu b/test/unit/cute/hopper/stsm.cu index 5b026af5d5..eb88b58215 100644 --- a/test/unit/cute/hopper/stsm.cu +++ b/test/unit/cute/hopper/stsm.cu @@ -59,7 +59,7 @@ stsm_test_device(uint16_t* g_in, uint16_t* g_out) } #if defined(__SYCL_DEVICE_ONLY__) - auto smem = sycl_ext::get_dynamic_work_group_memory().get(); + auto smem = sycl_ext::get_work_group_scratch_memory(); #endif #if defined(CUTLASS_ENABLE_SYCL) && !defined(__SYCL_DEVICE_ONLY__) uint32_t* smem; // dummy declaration to avoid compilation errors during the host compilation phase @@ -69,7 +69,7 @@ stsm_test_device(uint16_t* g_in, uint16_t* g_out) #endif // load rmem -> smem using STSM - uint128_t* smem_ptr = reinterpret_cast(smem) + tid; + uint32_t* smem_ptr = reinterpret_cast(smem) + tid; T* rmem_ptr = reinterpret_cast(reg); cute::copy_stsm(rmem_ptr, smem_ptr); @@ -77,7 +77,7 @@ stsm_test_device(uint16_t* g_in, uint16_t* g_out) // store output smem -> gmem for (int i = 0; i < (sizeof(T) / 4); i++) { - reinterpret_cast(g_out)[tid + (stride * i)] = smem[tid + (stride * i)]; + reinterpret_cast(g_out)[tid + (stride * i)] = smem_ptr[tid + (stride * i)]; } } @@ -89,10 +89,10 @@ stsm_test_device_cute(uint16_t* g_in, uint16_t* g_out, using namespace cute; #if defined(__SYCL_DEVICE_ONLY__) - auto smem = sycl_ext::get_dynamic_work_group_memory().get(); + char* smem = sycl_ext::dynamic_work_group_memory().get(); #endif #if defined(CUTLASS_ENABLE_SYCL) && !defined(__SYCL_DEVICE_ONLY__) - char* smem_buf; // dummy declaration to avoid compilation errors during the host compilation phase + char* smem; // dummy declaration to avoid compilation errors during the host compilation phase #endif #if !defined(CUTLASS_ENABLE_SYCL) CUTLASS_SHARED uint16_t smem[size(smem_layout)]; @@ -151,8 +151,8 @@ TEST(SM90_CuTe_Hopper, Stsm) device_vector d_out(count); #if defined(CUTLASS_ENABLE_SYCL) sc_exp::launch> - ( sc::launch_policy{sc::dim3(1), sc::dim3(32), - sc_exp::launch_properties{sycl_ext::work_group_static_size(sizeof(uint32_t) / 4 * 32)}}, + ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(32), + sc_exp::launch_properties{sycl_ext::work_group_scratch_size(sizeof(uint32_t) / 4 * 32)}}, d_in.data(), d_out.data()); sc::wait_and_throw(); #else @@ -176,8 +176,8 @@ TEST(SM90_CuTe_Hopper, Stsm) device_vector d_out(count); #if defined(CUTLASS_ENABLE_SYCL) sc_exp::launch> - ( sc::launch_policy{sc::dim3(1), sc::dim3(32), - sc_exp::launch_properties{sycl_ext::work_group_static_size(sizeof(uint64_t) / 4 * 32)}}, + ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(32), + sc_exp::launch_properties{sycl_ext::work_group_scratch_size(sizeof(uint64_t) / 4 * 32)}}, d_in.data(), d_out.data()); sc::wait_and_throw(); #else @@ -201,8 +201,8 @@ TEST(SM90_CuTe_Hopper, Stsm) device_vector d_out(count); #if defined(CUTLASS_ENABLE_SYCL) sc_exp::launch> - ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(32), - sc_exp::launch_properties{sycl_ext::work_group_static_size(sizeof(uint128_t) / 4 * 32)}}, + ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(32), + sc_exp::launch_properties{sycl_ext::work_group_scratch_size(sizeof(uint128_t) / 4 * 32)}}, d_in.data(), d_out.data()); sc::wait_and_throw(); #else @@ -232,8 +232,8 @@ TEST(SM90_CuTe_Hopper, Stsm) Layout>{}); #if defined(CUTLASS_ENABLE_SYCL) sc_exp::launch> - ( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), - sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}}, + ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), + sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}}, d_in.data(), d_out.data(), tiled_copy, smem_layout); sc::wait_and_throw(); #else @@ -262,8 +262,8 @@ TEST(SM90_CuTe_Hopper, Stsm) #if defined(CUTLASS_ENABLE_SYCL) sc_exp::launch> - ( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), - sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}}, + ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), + sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}}, d_in.data(), d_out.data(), tiled_copy, smem_layout); sc::wait_and_throw(); #else @@ -291,8 +291,8 @@ TEST(SM90_CuTe_Hopper, Stsm) Layout>{}); #if defined(CUTLASS_ENABLE_SYCL) sc_exp::launch> - ( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), - sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}}, + ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), + sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}}, d_in.data(), d_out.data(), tiled_copy, smem_layout); sc::wait_and_throw(); #else @@ -321,8 +321,8 @@ TEST(SM90_CuTe_Hopper, Stsm) #if defined(CUTLASS_ENABLE_SYCL) sc_exp::launch> - ( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), - sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}}, + ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), + sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}}, d_in.data(), d_out.data(), tiled_copy, smem_layout); sc::wait_and_throw(); #else @@ -350,8 +350,8 @@ TEST(SM90_CuTe_Hopper, Stsm) Layout>{}); #if defined(CUTLASS_ENABLE_SYCL) sc_exp::launch> - ( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), - sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}}, + ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), + sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}}, d_in.data(), d_out.data(), tiled_copy, smem_layout); sc::wait_and_throw(); #else @@ -379,8 +379,8 @@ TEST(SM90_CuTe_Hopper, Stsm) Layout>{}); #if defined(CUTLASS_ENABLE_SYCL) sc_exp::launch> - ( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), - sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}}, + ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), + sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}}, d_in.data(), d_out.data(), tiled_copy, smem_layout); sc::wait_and_throw(); #else @@ -408,8 +408,8 @@ TEST(SM90_CuTe_Hopper, Stsm) Layout>{}); #if defined(CUTLASS_ENABLE_SYCL) sc_exp::launch> - ( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), - sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}}, + ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), + sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}}, d_in.data(), d_out.data(), tiled_copy, smem_layout); sc::wait_and_throw(); #else @@ -437,8 +437,8 @@ TEST(SM90_CuTe_Hopper, Stsm) Layout>{}); #if defined(CUTLASS_ENABLE_SYCL) sc_exp::launch> - ( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), - sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}}, + ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), + sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}}, d_in.data(), d_out.data(), tiled_copy, smem_layout); sc::wait_and_throw(); #else @@ -466,8 +466,8 @@ TEST(SM90_CuTe_Hopper, Stsm) Layout>{}); #if defined(CUTLASS_ENABLE_SYCL) sc_exp::launch> - ( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), - sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}}, + ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), + sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}}, d_in.data(), d_out.data(), tiled_copy, smem_layout); sc::wait_and_throw(); #else @@ -495,8 +495,8 @@ TEST(SM90_CuTe_Hopper, Stsm) Layout>{}); #if defined(CUTLASS_ENABLE_SYCL) sc_exp::launch> - ( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), - sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}}, + ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), + sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}}, d_in.data(), d_out.data(), tiled_copy, smem_layout); sc::wait_and_throw(); #else @@ -524,8 +524,8 @@ TEST(SM90_CuTe_Hopper, Stsm) Layout>{}); #if defined(CUTLASS_ENABLE_SYCL) sc_exp::launch> - ( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), - sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}}, + ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))), + sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}}, d_in.data(), d_out.data(), tiled_copy, smem_layout); sc::wait_and_throw(); #else diff --git a/test/unit/cute/hopper/tma_load.cu b/test/unit/cute/hopper/tma_load.cu index c2719ff1d3..024d3dc3bc 100644 --- a/test/unit/cute/hopper/tma_load.cu +++ b/test/unit/cute/hopper/tma_load.cu @@ -31,7 +31,7 @@ #include "cutlass_unit_test.h" -#include "../hopper/tma_load_testbed.hpp" +#include "tma_load_testbed.hpp" using namespace cute; using namespace cutlass::test; diff --git a/test/unit/cute/hopper/tma_load_testbed.hpp b/test/unit/cute/hopper/tma_load_testbed.hpp index 8f9aaf3171..26d46f28e4 100644 --- a/test/unit/cute/hopper/tma_load_testbed.hpp +++ b/test/unit/cute/hopper/tma_load_testbed.hpp @@ -66,10 +66,10 @@ tma_test_device_cute(T const* g_in, T* g_out, // Use Shared Storage structure to allocate and distribute aligned SMEM addresses #if defined(__SYCL_DEVICE_ONLY__) - auto smem = sycl_ext::get_dynamic_work_group_memory().get(); + auto shared_memory = sycl_ext::get_work_group_scratch_memory(); #endif #if defined(CUTLASS_ENABLE_SYCL) && !defined(__SYCL_DEVICE_ONLY__) - char* smem; // dummy declaration to avoid compilation errors during the host compilation phase + char* shared_memory; // dummy declaration to avoid compilation errors during the host compilation phase #endif #if !defined(CUTLASS_ENABLE_SYCL) extern CUTLASS_SHARED char shared_memory[]; @@ -172,6 +172,7 @@ tma_test_device_cute(T const* g_in, T* g_out, } } +template class PT; template auto test_tma_load(CopyOp const& copy_op, @@ -201,16 +202,18 @@ test_tma_load(CopyOp const& copy_op, // Launch int smem_size = int(sizeof(SharedStorage)); #if defined(CUTLASS_ENABLE_SYCL) - auto kernel = tma_test_device_cute; + GMEM_Layout, + SMEM_Layout>; + sc_exp::launch ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(128), - sc_exp::launch_properties{sycl_ext::work_group_static_size(smem_size)}}, - d_in.data(), d_out.data(), tma, cta_tile, - gmem_layout, smem_layout); + sc_exp::launch_properties{sycl_ext::work_group_scratch_size(smem_size)}}, + reinterpret_cast(raw_pointer_cast(d_in.data())), + reinterpret_cast (raw_pointer_cast(d_out.data())), + tma, cta_tile, gmem_layout, smem_layout); sc::wait_and_throw(); #else tma_test_device_cute<<<1, 128, smem_size>>>( diff --git a/test/unit/cute/hopper/tma_mcast_load_testbed.hpp b/test/unit/cute/hopper/tma_mcast_load_testbed.hpp index 535b16a22a..7d72300e03 100644 --- a/test/unit/cute/hopper/tma_mcast_load_testbed.hpp +++ b/test/unit/cute/hopper/tma_mcast_load_testbed.hpp @@ -40,6 +40,12 @@ #include #include +#if defined(CUTLASS_ENABLE_SYCL) +namespace sc = syclcompat; +namespace sc_exp = syclcompat::experimental; +namespace sycl_ext = sycl::ext::oneapi::experimental; +#endif + namespace cutlass::test { template @@ -62,12 +68,12 @@ tma_test_device_cute(T const* g_in, T* g_out, GmemLayout gmem_layout, SmemLayout // Use Shared Storage structure to allocate and distribute aligned SMEM addresses #if defined(CUTLASS_ENABLE_SYCL) - auto smem = sycl_ext::get_dynamic_work_group_memory().get(); - #endif - #if defined(CUTLASS_ENABLE_SYCL) && !defined(__SYCL_DEVICE_ONLY__) - char* smem; // dummy declaration to avoid compilation errors during the host compilation phase + #if !defined(__SYCL_DEVICE_ONLY__) + void *shared_memory ; // dummy declaration to avoid compilation errors during the host compilation phase + #else + auto shared_memory = sycl_ext::get_work_group_scratch_memory(); #endif - #if !defined(CUTLASS_ENABLE_SYCL) + #else extern CUTLASS_SHARED char shared_memory[]; #endif using SharedStorage = SharedStorage; @@ -208,24 +214,27 @@ test_tma_load(CopyOp const& copy_op, auto tma = make_tma_atom(copy_op, gA, smem_layout, cta_tiler, cluster_size); //print(tma); + int smem_size = sizeof(SharedStorage); // Launch #if defined(CUTLASS_ENABLE_SYCL) - auto kernel = tma_test_device_cute; + sc_exp::launch - ( sc_exp::launch_policy::{sc::dim3(1), sc::dim3(32), - sc_exp::launch_properties{sycl_ext::work_group_static_size(smem_size), + ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(32), + sc_exp::launch_properties{sycl_ext::work_group_scratch_size(smem_size), sycl_ext::cuda::cluster_size(sycl::range<1>(sc::dim3(size(cluster_size))))}}, - d_in.data(), d_out.data(), + reinterpret_cast(raw_pointer_cast(d_in.data())), + reinterpret_cast(raw_pointer_cast(d_out.data())), gmem_layout, smem_layout, tma, cta_tiler, cluster_size); + sc::wait_and_throw(); #else dim3 dimBlock(32); dim3 dimCluster(size(cluster_size)); dim3 dimGrid = dimCluster; - int smem_size = sizeof(SharedStorage); void* kernel_ptr = (void*) &tma_test_device_cute; diff --git a/test/unit/cute/hopper/tma_store_testbed.hpp b/test/unit/cute/hopper/tma_store_testbed.hpp index 6a34faef65..0027e0bcf1 100644 --- a/test/unit/cute/hopper/tma_store_testbed.hpp +++ b/test/unit/cute/hopper/tma_store_testbed.hpp @@ -38,6 +38,12 @@ #include +#if defined(CUTLASS_ENABLE_SYCL) +namespace sc = syclcompat; +namespace sc_exp = syclcompat::experimental; +namespace sycl_ext = sycl::ext::oneapi::experimental; +#endif + namespace cutlass::test { template @@ -59,10 +65,10 @@ tma_test_device_cute(T const* g_in, T* g_out, // Use Shared Storage structure to allocate and distribute aligned SMEM addresses #if defined(__SYCL_DEVICE_ONLY__) - auto smem = sycl_ext::get_dynamic_work_group_memory().get(); + auto shared_memory = sycl_ext::get_work_group_scratch_memory(); #endif #if defined(CUTLASS_ENABLE_SYCL) && !defined(__SYCL_DEVICE_ONLY__) - char* smem; // dummy declaration to avoid compilation errors during the host compilation phase + char* shared_memory; // dummy declaration to avoid compilation errors during the host compilation phase #endif #if !defined(CUTLASS_ENABLE_SYCL) extern CUTLASS_SHARED char shared_memory[]; @@ -181,15 +187,17 @@ test_tma_store(CopyOp const& copy_op, // Launch int smem_size = int(sizeof(SharedStorage)); #if defined(CUTLASS_ENABLE_SYCL) - auto kernel = tma_test_device_cute; + GMEM_Layout, + SMEM_Layout>; sc_exp::launch ( sc_exp::launch_policy{sc::dim3(1), sc::dim3(128), - sc_exp::launch_properties{sycl_ext::work_group_static_size(smem_size)}}, - d_in.data(), d_out.data(), tma, cta_tile, + sc_exp::launch_properties{sycl_ext::work_group_scratch_size(smem_size)}}, + reinterpret_cast(raw_pointer_cast(d_in.data())), + reinterpret_cast (raw_pointer_cast(d_out.data())), + tma, cta_tile, gmem_layout, smem_layout); sc::wait_and_throw(); #else