Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion include/cutlass/arch/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,19 @@

/////////////////////////////////////////////////////////////////////////////////////////////////


// This is a workaround and should be removed in the future.
// Issue in next #if (cuda flags not getting set) need to be fixed for SYCL.
#if (!defined(CUTLASS_ARCH_MMA_SM90_ENABLED) && defined(__SYCL_CUDA_ARCH__) && (__SYCL_CUDA_ARCH__ >= 900))
#define CUTLASS_ARCH_MMA_SM90_ENABLED 1
#pragma message("FROM THE PREPROCESSOR CUTLASS_ARCH_MMA_SM90_ENABLED.")
#endif

// SM90
#if (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 0))
#define CUTLASS_ARCH_MMA_SM90_SUPPORTED 1
#if (!defined(CUTLASS_ARCH_MMA_SM90_ENABLED) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 900)
#if ((!defined(CUTLASS_ARCH_MMA_SM90_ENABLED) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 900) || \
(defined(__SYCL_CUDA_ARCH__) && (__SYCL_CUDA_ARCH__ >= 900)))
#define CUTLASS_ARCH_MMA_SM90_ENABLED 1

#if (!defined(CUTLASS_ARCH_MMA_SM90A_ENABLED) && defined(__CUDA_ARCH_FEAT_SM90_ALL))
Expand Down
2 changes: 1 addition & 1 deletion test/unit/common/cutlass_unit_test.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ int CutlassUnitTestProblemCount();
#define CUTLASS_TEST_UNIT_ENABLE_WARNINGS false
#endif

#if (__CUDACC_VER_MAJOR__ >= 12)
#if ((__CUDACC_VER_MAJOR__ >= 12) || (defined(__SYCL_CUDA_ARCH__) && (__SYCL_CUDA_ARCH__ >= 900)))
#define CUDA_12_0_SM90_FEATURES_SUPPORTED true
#else
#define CUDA_12_0_SM90_FEATURES_SUPPORTED false
Expand Down
6 changes: 3 additions & 3 deletions test/unit/cute/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ else()
add_subdirectory(volta)
add_subdirectory(turing)
add_subdirectory(ampere)
# add_subdirectory(hopper) // Hopper test support to come later once complete support in DPCPP
add_subdirectory(hopper)

endif()

Expand All @@ -87,7 +87,7 @@ else()
cutlass_test_unit_cute_volta
cutlass_test_unit_cute_turing
cutlass_test_unit_cute_ampere
#cutlass_test_unit_cute_hopper
cutlass_test_unit_cute_hopper
cutlass_test_unit_cute_msvc_compilation
)

Expand All @@ -99,7 +99,7 @@ else()
test_unit_cute_volta
test_unit_cute_ampere
test_unit_cute_turing
#test_unit_cute_hopper
test_unit_cute_hopper
test_unit_cute_msvc_compilation
)
else()
Expand Down
6 changes: 3 additions & 3 deletions test/unit/cute/hopper/bulk_load.cu
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,10 @@ bulk_copy_test_device_cute(T const* g_in,
{
// Use Shared Storage structure to allocate and distribute aligned SMEM addresses
#if defined(__SYCL_DEVICE_ONLY__)
auto smem = sycl_ext::get_dynamic_work_group_memory<char>().get();
auto shared_memory = sycl_ext::get_work_group_scratch_memory();
#endif
#if defined(CUTLASS_ENABLE_SYCL) && !defined(__SYCL_DEVICE_ONLY__)
char* smem; // dummy declaration to avoid compilation errors during the host compilation phase
char* shared_memory; // dummy declaration to avoid compilation errors during the host compilation phase
#endif
#if !defined(CUTLASS_ENABLE_SYCL)
extern CUTLASS_SHARED char shared_memory[];
Expand Down Expand Up @@ -146,7 +146,7 @@ void run_and_validate(GLayout gmem_layout,
#if defined(CUTLASS_ENABLE_SYCL)
sc_exp::launch<bulk_copy_test_device_cute<T, GLayout, SLayout>>
( sc_exp::launch_policy{sc::dim3(1), sc::dim3(128),
sc_exp::launch_properties{sycl_ext::work_group_static_size(smem_size)}},
sc_exp::launch_properties{sycl_ext::work_group_scratch_size(smem_size)}},
d_in.data(), d_out.data(), gmem_layout, smem_layout);
sc::wait_and_throw();
#else
Expand Down
13 changes: 10 additions & 3 deletions test/unit/cute/hopper/bulk_store.cu
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,10 @@ bulk_copy_test_device_cute(T const* g_in,
{
// Use Shared Storage structure to allocate and distribute aligned SMEM addresses
#if defined(__SYCL_DEVICE_ONLY__)
auto smem = sycl_ext::get_dynamic_work_group_memory<char>().get();
auto shared_memory = sycl_ext::get_work_group_scratch_memory();
#endif
#if defined(CUTLASS_ENABLE_SYCL) && !defined(__SYCL_DEVICE_ONLY__)
char* smem; // dummy declaration to avoid compilation errors during the host compilation phase
char* shared_memory; // dummy declaration to avoid compilation errors during the host compilation phase
#endif
#if !defined(CUTLASS_ENABLE_SYCL)
extern CUTLASS_SHARED char shared_memory[];
Expand All @@ -83,9 +83,16 @@ bulk_copy_test_device_cute(T const* g_in,
//

// Input gmem -> smem
#if defined(CUTLASS_ENABLE_SYCL) || defined(__SYCL_DEVICE_ONLY__)
for (int i = ThreadIdxX(); i < size(sA); i += BlockDimX()) {
sA(i) = gA(i);
}
#else
for (int i = threadIdx.x; i < size(sA); i += blockDim.x) {
sA(i) = gA(i);
}
#endif


cp_async_fence();
cp_async_wait<0>();
Expand Down Expand Up @@ -128,7 +135,7 @@ void run_and_validate(GLayout gmem_layout,
#if defined(CUTLASS_ENABLE_SYCL)
sc_exp::launch<bulk_copy_test_device_cute<T, GLayout, SLayout>>
( sc_exp::launch_policy{sc::dim3(1), sc::dim3(128),
sc_exp::launch_properties{sycl_ext::work_group_static_size(smem_size)}},
sc_exp::launch_properties{sycl_ext::work_group_scratch_size(smem_size)}},
d_in.data(), d_out.data(), gmem_layout, smem_layout);
sc::wait_and_throw();
#else
Expand Down
66 changes: 33 additions & 33 deletions test/unit/cute/hopper/stsm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ stsm_test_device(uint16_t* g_in, uint16_t* g_out)
}

#if defined(__SYCL_DEVICE_ONLY__)
auto smem = sycl_ext::get_dynamic_work_group_memory<uint32_t>().get();
auto smem = sycl_ext::get_work_group_scratch_memory();
#endif
#if defined(CUTLASS_ENABLE_SYCL) && !defined(__SYCL_DEVICE_ONLY__)
uint32_t* smem; // dummy declaration to avoid compilation errors during the host compilation phase
Expand All @@ -69,15 +69,15 @@ stsm_test_device(uint16_t* g_in, uint16_t* g_out)
#endif

// load rmem -> smem using STSM
uint128_t* smem_ptr = reinterpret_cast<uint128_t*>(smem) + tid;
uint32_t* smem_ptr = reinterpret_cast<uint32_t*>(smem) + tid;
T* rmem_ptr = reinterpret_cast<T*>(reg);
cute::copy_stsm(rmem_ptr, smem_ptr);

syncthreads();

// store output smem -> gmem
for (int i = 0; i < (sizeof(T) / 4); i++) {
reinterpret_cast<uint32_t*>(g_out)[tid + (stride * i)] = smem[tid + (stride * i)];
reinterpret_cast<uint32_t*>(g_out)[tid + (stride * i)] = smem_ptr[tid + (stride * i)];
}
}

Expand All @@ -89,10 +89,10 @@ stsm_test_device_cute(uint16_t* g_in, uint16_t* g_out,
using namespace cute;

#if defined(__SYCL_DEVICE_ONLY__)
auto smem = sycl_ext::get_dynamic_work_group_memory<uint16_t>().get();
char* smem = sycl_ext::dynamic_work_group_memory<char[]>().get();
#endif
#if defined(CUTLASS_ENABLE_SYCL) && !defined(__SYCL_DEVICE_ONLY__)
char* smem_buf; // dummy declaration to avoid compilation errors during the host compilation phase
char* smem; // dummy declaration to avoid compilation errors during the host compilation phase
#endif
#if !defined(CUTLASS_ENABLE_SYCL)
CUTLASS_SHARED uint16_t smem[size(smem_layout)];
Expand Down Expand Up @@ -151,8 +151,8 @@ TEST(SM90_CuTe_Hopper, Stsm)
device_vector<uint16_t> d_out(count);
#if defined(CUTLASS_ENABLE_SYCL)
sc_exp::launch<stsm_test_device<uint32_t>>
( sc::launch_policy{sc::dim3(1), sc::dim3(32),
sc_exp::launch_properties{sycl_ext::work_group_static_size(sizeof(uint32_t) / 4 * 32)}},
( sc_exp::launch_policy{sc::dim3(1), sc::dim3(32),
sc_exp::launch_properties{sycl_ext::work_group_scratch_size(sizeof(uint32_t) / 4 * 32)}},
d_in.data(), d_out.data());
sc::wait_and_throw();
#else
Expand All @@ -176,8 +176,8 @@ TEST(SM90_CuTe_Hopper, Stsm)
device_vector<uint16_t> d_out(count);
#if defined(CUTLASS_ENABLE_SYCL)
sc_exp::launch<stsm_test_device<uint64_t>>
( sc::launch_policy{sc::dim3(1), sc::dim3(32),
sc_exp::launch_properties{sycl_ext::work_group_static_size(sizeof(uint64_t) / 4 * 32)}},
( sc_exp::launch_policy{sc::dim3(1), sc::dim3(32),
sc_exp::launch_properties{sycl_ext::work_group_scratch_size(sizeof(uint64_t) / 4 * 32)}},
d_in.data(), d_out.data());
sc::wait_and_throw();
#else
Expand All @@ -201,8 +201,8 @@ TEST(SM90_CuTe_Hopper, Stsm)
device_vector<uint16_t> d_out(count);
#if defined(CUTLASS_ENABLE_SYCL)
sc_exp::launch<stsm_test_device<uint128_t>>
( sc_exp::launch_policy{sc::dim3(1), sc::dim3(32),
sc_exp::launch_properties{sycl_ext::work_group_static_size(sizeof(uint128_t) / 4 * 32)}},
( sc_exp::launch_policy{sc::dim3(1), sc::dim3(32),
sc_exp::launch_properties{sycl_ext::work_group_scratch_size(sizeof(uint128_t) / 4 * 32)}},
d_in.data(), d_out.data());
sc::wait_and_throw();
#else
Expand Down Expand Up @@ -232,8 +232,8 @@ TEST(SM90_CuTe_Hopper, Stsm)
Layout<Shape< _1,_8>>{});
#if defined(CUTLASS_ENABLE_SYCL)
sc_exp::launch<stsm_test_device_cute<decltype(tiled_copy), decltype(smem_layout)>>
( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}},
( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}},
d_in.data(), d_out.data(), tiled_copy, smem_layout);
sc::wait_and_throw();
#else
Expand Down Expand Up @@ -262,8 +262,8 @@ TEST(SM90_CuTe_Hopper, Stsm)

#if defined(CUTLASS_ENABLE_SYCL)
sc_exp::launch<stsm_test_device_cute<decltype(tiled_copy), decltype(smem_layout)>>
( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}},
( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}},
d_in.data(), d_out.data(), tiled_copy, smem_layout);
sc::wait_and_throw();
#else
Expand Down Expand Up @@ -291,8 +291,8 @@ TEST(SM90_CuTe_Hopper, Stsm)
Layout<Shape< _1,_8>>{});
#if defined(CUTLASS_ENABLE_SYCL)
sc_exp::launch<stsm_test_device_cute<decltype(tiled_copy), decltype(smem_layout)>>
( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}},
( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}},
d_in.data(), d_out.data(), tiled_copy, smem_layout);
sc::wait_and_throw();
#else
Expand Down Expand Up @@ -321,8 +321,8 @@ TEST(SM90_CuTe_Hopper, Stsm)

#if defined(CUTLASS_ENABLE_SYCL)
sc_exp::launch<stsm_test_device_cute<decltype(tiled_copy), decltype(smem_layout)>>
( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}},
( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}},
d_in.data(), d_out.data(), tiled_copy, smem_layout);
sc::wait_and_throw();
#else
Expand Down Expand Up @@ -350,8 +350,8 @@ TEST(SM90_CuTe_Hopper, Stsm)
Layout<Shape< _2,_4>>{});
#if defined(CUTLASS_ENABLE_SYCL)
sc_exp::launch<stsm_test_device_cute<decltype(tiled_copy), decltype(smem_layout)>>
( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}},
( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}},
d_in.data(), d_out.data(), tiled_copy, smem_layout);
sc::wait_and_throw();
#else
Expand Down Expand Up @@ -379,8 +379,8 @@ TEST(SM90_CuTe_Hopper, Stsm)
Layout<Shape< _2,_4>>{});
#if defined(CUTLASS_ENABLE_SYCL)
sc_exp::launch<stsm_test_device_cute<decltype(tiled_copy), decltype(smem_layout)>>
( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}},
( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}},
d_in.data(), d_out.data(), tiled_copy, smem_layout);
sc::wait_and_throw();
#else
Expand Down Expand Up @@ -408,8 +408,8 @@ TEST(SM90_CuTe_Hopper, Stsm)
Layout<Shape< _2,_4>>{});
#if defined(CUTLASS_ENABLE_SYCL)
sc_exp::launch<stsm_test_device_cute<decltype(tiled_copy), decltype(smem_layout)>>
( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}},
( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}},
d_in.data(), d_out.data(), tiled_copy, smem_layout);
sc::wait_and_throw();
#else
Expand Down Expand Up @@ -437,8 +437,8 @@ TEST(SM90_CuTe_Hopper, Stsm)
Layout<Shape< _2,_4>>{});
#if defined(CUTLASS_ENABLE_SYCL)
sc_exp::launch<stsm_test_device_cute<decltype(tiled_copy), decltype(smem_layout)>>
( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}},
( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}},
d_in.data(), d_out.data(), tiled_copy, smem_layout);
sc::wait_and_throw();
#else
Expand Down Expand Up @@ -466,8 +466,8 @@ TEST(SM90_CuTe_Hopper, Stsm)
Layout<Shape<_2,_1>>{});
#if defined(CUTLASS_ENABLE_SYCL)
sc_exp::launch<stsm_test_device_cute<decltype(tiled_copy), decltype(smem_layout)>>
( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}},
( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}},
d_in.data(), d_out.data(), tiled_copy, smem_layout);
sc::wait_and_throw();
#else
Expand Down Expand Up @@ -495,8 +495,8 @@ TEST(SM90_CuTe_Hopper, Stsm)
Layout<Shape<_4,_1>>{});
#if defined(CUTLASS_ENABLE_SYCL)
sc_exp::launch<stsm_test_device_cute<decltype(tiled_copy), decltype(smem_layout)>>
( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}},
( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}},
d_in.data(), d_out.data(), tiled_copy, smem_layout);
sc::wait_and_throw();
#else
Expand Down Expand Up @@ -524,8 +524,8 @@ TEST(SM90_CuTe_Hopper, Stsm)
Layout<Shape<_8,_1>>{});
#if defined(CUTLASS_ENABLE_SYCL)
sc_exp::launch<stsm_test_device_cute<decltype(tiled_copy), decltype(smem_layout)>>
( sc::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
sc_exp::launch_properties{sycl_ext::work_group_static_size(size(smem_layout))}},
( sc_exp::launch_policy{sc::dim3(1), sc::dim3(int(size(tiled_copy))),
sc_exp::launch_properties{sycl_ext::work_group_scratch_size(size(smem_layout))}},
d_in.data(), d_out.data(), tiled_copy, smem_layout);
sc::wait_and_throw();
#else
Expand Down
2 changes: 1 addition & 1 deletion test/unit/cute/hopper/tma_load.cu
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@

#include "cutlass_unit_test.h"

#include "../hopper/tma_load_testbed.hpp"
#include "tma_load_testbed.hpp"

using namespace cute;
using namespace cutlass::test;
Expand Down
19 changes: 11 additions & 8 deletions test/unit/cute/hopper/tma_load_testbed.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,10 @@ tma_test_device_cute(T const* g_in, T* g_out,

// Use Shared Storage structure to allocate and distribute aligned SMEM addresses
#if defined(__SYCL_DEVICE_ONLY__)
auto smem = sycl_ext::get_dynamic_work_group_memory<char>().get();
auto shared_memory = sycl_ext::get_work_group_scratch_memory();
#endif
#if defined(CUTLASS_ENABLE_SYCL) && !defined(__SYCL_DEVICE_ONLY__)
char* smem; // dummy declaration to avoid compilation errors during the host compilation phase
char* shared_memory; // dummy declaration to avoid compilation errors during the host compilation phase
#endif
#if !defined(CUTLASS_ENABLE_SYCL)
extern CUTLASS_SHARED char shared_memory[];
Expand Down Expand Up @@ -172,6 +172,7 @@ tma_test_device_cute(T const* g_in, T* g_out,
}
}

template <class> class PT;
template <class T, class TmaType = T, class CopyOp, class GMEM_Layout, class SMEM_Layout, class CTA_Tile>
auto
test_tma_load(CopyOp const& copy_op,
Expand Down Expand Up @@ -201,16 +202,18 @@ test_tma_load(CopyOp const& copy_op,
// Launch
int smem_size = int(sizeof(SharedStorage<T, decltype(smem_layout)>));
#if defined(CUTLASS_ENABLE_SYCL)
auto kernel = tma_test_device_cute<T,
constexpr auto kernel = tma_test_device_cute<T,
decltype(tma),
CTA_Tile,
GmemLayout,
SmemLayout>;
GMEM_Layout,
SMEM_Layout>;

sc_exp::launch<kernel>
( sc_exp::launch_policy{sc::dim3(1), sc::dim3(128),
sc_exp::launch_properties{sycl_ext::work_group_static_size(smem_size)}},
d_in.data(), d_out.data(), tma, cta_tile,
gmem_layout, smem_layout);
sc_exp::launch_properties{sycl_ext::work_group_scratch_size(smem_size)}},
reinterpret_cast<T const*>(raw_pointer_cast(d_in.data())),
reinterpret_cast<T*> (raw_pointer_cast(d_out.data())),
tma, cta_tile, gmem_layout, smem_layout);
sc::wait_and_throw();
#else
tma_test_device_cute<<<1, 128, smem_size>>>(
Expand Down
Loading
Loading