
Commit 8ce1a95

LiyangLingIntel, rolandschulz, and ratnampa authored
G++ host compiler support (#490)
This pull request adds `-DDPCPP_HOST_COMPILER=g++` support to cutlass-sycl. The main changes include:
* Migrate `syclcompat` into this repo as `cutlasscompat`
* Fix most unsupported g++ compilation issues

---------

Co-authored-by: Roland Schulz <[email protected]>
Co-authored-by: ratnampa <[email protected]>
1 parent d9beb05 commit 8ce1a95
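For reference, the CI workflow added by this commit configures the build as sketched below; the g++ version and SYCL target are illustrative values taken from the workflow matrix, not required defaults.

```bash
# Sketch of a local configure/build with the new option (values are illustrative).
. setvars.sh                              # oneAPI environment providing the DPC++ driver
cmake -G Ninja -S . -B build \
  -DCUTLASS_ENABLE_SYCL=ON \
  -DDPCPP_SYCL_TARGET=intel_gpu_pvc \
  -DDPCPP_HOST_COMPILER=g++-13            # new: delegate host-side compilation to g++
cmake --build build
```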

File tree: 53 files changed, +447 / −203 lines

Lines changed: 134 additions & 0 deletions
@@ -0,0 +1,134 @@
name: "SYCL Intel G++ Host Compilation Test"

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]
  merge_group:
    branches: [ "main" ]
  workflow_dispatch:
    inputs:
      DPCPP_VERSION:
        description: "DPCPP version to use"
        type: string

permissions: {}

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  run-tests:
    strategy:
      matrix:
        include:
          - compiler: RELEASE
            gpu: BMG
            intel_graphics: ROLLING
            sycl_target: intel_gpu_bmg_g21
            runner: bmg108629-01
          - compiler: RELEASE
            gpu: PVC
            intel_graphics: ROLLING
            sycl_target: intel_gpu_pvc
            runner: pvc146162-01

    name: Run Intel ${{ matrix.compiler }} tests on ${{ matrix.gpu }} with intel-graphics ${{ matrix.intel_graphics }}
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 120

    steps:
      - name: Checkout repository
        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
      - name: Install Intel graphics drivers
        uses: ./.github/actions/install-intel-graphics
        with:
          GPU: ${{ matrix.gpu }}
          IGC: ${{ matrix.intel_graphics }}
      - name: Install DPC++
        uses: ./.github/actions/install-dpcpp
        with:
          DPCPP_RELEASE: ${{ matrix.compiler }}
          DPCPP_VERSION: ${{ inputs.DPCPP_VERSION }}
          GPU: ${{ matrix.gpu }}
          IGC: ${{ matrix.intel_graphics }}
      - name: Setup virtual environment
        shell: bash
        run: |
          # Install cmake and ninja if not already available
          if ! command -v cmake &> /dev/null || ! command -v ninja &> /dev/null; then
            echo "Installing cmake and/or ninja..."
            sudo apt update
            sudo apt install -y cmake ninja-build
          else
            echo "cmake and ninja already available"
          fi
          . setvars.sh
          export IGC_ExtraOCLOptions="-cl-intel-256-GRF-per-thread"
          export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file -gline-tables-only"
          export ONEAPI_DEVICE_SELECTOR=level_zero:gpu
          export IGC_VectorAliasBBThreshold=100000000000
          # Persist environment variables to following steps
          env >> $GITHUB_ENV
          which $CXX
          $CXX --version
          g++-13 --version
          sycl-ls
      - name: Build
        shell: bash
        run: |
          cmake -G Ninja \
            -DCUTLASS_ENABLE_SYCL=ON \
            -DDPCPP_SYCL_TARGET=${{ matrix.sycl_target }} \
            -DCUTLASS_SYCL_RUNNING_CI=ON \
            -DDPCPP_HOST_COMPILER=g++-13
          cmake --build .

      - name: Unit test
        shell: bash
        run: |
          # ninja test_unit_cute_core # Assertion failure in include/cutlass/integer_subbyte.h:105
          ninja test_unit_cute_intel_xe
          ninja test_unit_cute_layout
          ninja test_unit_cute_msvc_compilation

      - name: Examples
        shell: bash
        run: |
          cmake --build . --target test_examples -j 1

      - name: Benchmarks
        shell: bash
        run: |
          ninja cutlass_benchmarks

      - name: Cleanup DPC++
        if: always()
        shell: bash
        run: |
          echo "Cleaning up DPC++ installation..."
          # Remove DPCPP directory if it exists
          DPCPP_PATH="${{ inputs.DPCPP_PATH || '~/dpcpp' }}"
          DPCPP_PATH=$(eval echo $DPCPP_PATH) # Expand ~ to home directory
          if [ -d "$DPCPP_PATH" ]; then
            echo "Removing DPCPP directory: $DPCPP_PATH"
            sudo rm -rf "$DPCPP_PATH"
          fi
          # For RELEASE installs, remove OneAPI packages
          if [[ "${{ matrix.compiler }}" == "RELEASE" ]]; then
            echo "Removing OneAPI packages..."
            sudo apt remove -y intel-oneapi-runtime-libs intel-oneapi-compiler-dpcpp-cpp || true
            sudo rm -f /etc/apt/sources.list.d/oneAPI.list
            sudo rm -f /usr/share/keyrings/oneapi-archive-keyring.gpg
          fi
          # Clean up environment files
          rm -f setvars.sh
          # Clean up build artifacts
          rm -rf build/ || true
          # Reset environment variables that might interfere
          unset CC CXX CPLUS_INCLUDE_PATH C_INCLUDE_PATH LD_LIBRARY_PATH
          unset IGC_ExtraOCLOptions SYCL_PROGRAM_COMPILE_OPTIONS ONEAPI_DEVICE_SELECTOR IGC_VectorAliasBBThreshold
          echo "DPC++ cleanup completed"

applications/dual_gemm/collective/xe_dual_gemm_mma.hpp

Lines changed: 10 additions & 10 deletions
@@ -74,20 +74,20 @@ struct DualGemmMma<MainloopIntelXeXMX16<Stages, Schedule>, TileShape_, ElementA_
 
   using MmaAtomShape = typename TiledMma::AtomShape_MNK;
 
-  static constexpr auto BLK_M = get<0>(WorkgroupTileShape{});
-  static constexpr auto BLK_N = get<1>(WorkgroupTileShape{});
-  static constexpr auto BLK_K = get<2>(WorkgroupTileShape{});
+  static constexpr int BLK_M = get<0>(WorkgroupTileShape{});
+  static constexpr int BLK_N = get<1>(WorkgroupTileShape{});
+  static constexpr int BLK_K = get<2>(WorkgroupTileShape{});
 
-  static constexpr auto ATOM_M = get<1>(typename TiledMma::ThrLayoutVMNK{}.shape());
-  static constexpr auto ATOM_N = get<2>(typename TiledMma::ThrLayoutVMNK{}.shape());
-  static constexpr auto ATOM_K = get<3>(typename TiledMma::ThrLayoutVMNK{}.shape());
+  static constexpr int ATOM_M = get<1>(typename TiledMma::ThrLayoutVMNK{}.shape());
+  static constexpr int ATOM_N = get<2>(typename TiledMma::ThrLayoutVMNK{}.shape());
+  static constexpr int ATOM_K = get<3>(typename TiledMma::ThrLayoutVMNK{}.shape());
 
-  static constexpr auto SG_M = ceil_div(BLK_M, ATOM_M);
-  static constexpr auto SG_N = ceil_div(BLK_N, ATOM_N);
-  static constexpr auto SG_K = ceil_div(BLK_K, ATOM_K);
+  static constexpr int SG_M = ceil_div(BLK_M, ATOM_M);
+  static constexpr int SG_N = ceil_div(BLK_N, ATOM_N);
+  static constexpr int SG_K = ceil_div(BLK_K, ATOM_K);
   using SubgroupTileShape = Shape<decltype(SG_M), decltype(SG_N), decltype(SG_K)>;
 
-  static constexpr auto Num_SGs = ATOM_N * ATOM_M * ATOM_K;
+  static constexpr int Num_SGs = ATOM_N * ATOM_M * ATOM_K;
   static constexpr uint32_t MaxThreadsPerBlock = size(TiledMma{});
 
   using traits_load_A = Copy_Traits<GmemTiledCopyA, StrideA>;

applications/flash_attention_v2/kernel/xe_flash_attn_decode.hpp

Lines changed: 2 additions & 2 deletions
@@ -126,13 +126,13 @@ class FMHADecode {
   static constexpr int ATOM_N = CollectiveMainloop::ATOM_N;
   static constexpr int ATOM_K = CollectiveMainloop::ATOM_K;
 
-  static constexpr auto Num_SGs = ATOM_N * ATOM_M * ATOM_K;
+  static constexpr int Num_SGs = ATOM_N * ATOM_M * ATOM_K;
   static constexpr int Vec = CollectiveMainloop::Vec; // 8
   static constexpr int FragsM = CollectiveMainloop::FragsM; // 1
   static constexpr int FragsN = CollectiveMainloop::FragsNS; // 4
 
   static constexpr int VSlicer = get<1>(TileShapeOutput{}) / (get<1>(TileShapePV{}) * ATOM_N);
-  using AccumShape = decltype(make_shape(Int<Vec>{}, Int<FragsM>{}, Int<get<1>(TileShapePV{}) / get<1>(MmaAtomShape())>{}, Int<VSlicer>{}));
+  using AccumShape = decltype(make_shape(Int<Vec>{}, Int<FragsM>{}, get<1>(TileShapePV{}) / get<1>(MmaAtomShape()), Int<VSlicer>{}));
 
   static_assert(FragsM == 1, "Limit the seq_len_qo to 1 MMA Atom worth of data per work-group.");
 

benchmarks/flash_attention/flash_attention_decode/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
@@ -60,7 +60,8 @@ foreach(name IN LISTS LIB_LIST)
   target_include_directories(${name} PRIVATE ${CUTLASS_APPLICATIONS_DIR})
   target_link_libraries(${name} PRIVATE CUTLASS cutlass_tools_util_includes benchmark::benchmark)
   add_onemkl_to_target(TARGET ${name})
-  add_sycl_to_target(TARGET ${name})
+  # Add only SYCL include directories, not the full SYCL flags (to avoid duplication)
+  add_sycl_include_directories_to_target(${name})
 endforeach()
 
 cutlass_benchmark_add_executable(

benchmarks/flash_attention/flash_attention_decode/benchmark_runner.hpp

Lines changed: 2 additions & 2 deletions
@@ -619,7 +619,7 @@ template <class FMHADecodeConfiguration> struct BenchmarkRunnerFMHADecode {
 
 #if !defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY)
     using namespace compat::experimental;
-    auto event = launch<cutlass::device_kernel<FMHADecodeKernel>>(
+    auto event = launch<cutlass::device_kernel<FMHADecodeKernel>, FMHADecodeKernel>(
        launch_policy{sycl_grid, sycl_block, local_mem_size{static_cast<std::size_t>(smem_size)},
                      kernel_properties{sycl_exp::sub_group_size<FMHADecodeKernel::DispatchPolicy::SubgroupSize>}},
        params);
@@ -631,7 +631,7 @@ template <class FMHADecodeConfiguration> struct BenchmarkRunnerFMHADecode {
        sycl::ext::oneapi::experimental::sub_group_size<FMHADecodeKernel::DispatchPolicy::SubgroupSize>
    };
    compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props};
-    auto event = compat::experimental::launch<cutlass::device_kernel<FMHADecodeKernel>>(policy, params);
+    auto event = compat::experimental::launch<cutlass::device_kernel<FMHADecodeKernel>, FMHADecodeKernel>(policy, params);
 #endif
 
    EventManager::getInstance().addEvent(event);

benchmarks/flash_attention/flash_attention_prefill/benchmark_runner.hpp

Lines changed: 1 addition & 1 deletion
@@ -491,7 +491,7 @@ template <class FMHAPrefillConfiguration> struct BenchmarkRunnerFMHA {
        sycl::ext::oneapi::experimental::sub_group_size<GemmKernel::DispatchPolicy::SubgroupSize>
    };
    compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props};
-    auto event = compat::experimental::launch<cutlass::device_kernel<GemmKernel>>(policy, params);
+    auto event = compat::experimental::launch<cutlass::device_kernel<GemmKernel>, GemmKernel>(policy, params);
 #endif
 
    EventManager::getInstance().addEvent(event);

benchmarks/flash_attention/flash_attention_prefill_cachedKV/benchmark_runner.hpp

Lines changed: 1 addition & 1 deletion
@@ -584,7 +584,7 @@ template <class FMHAPrefillConfiguration> struct BenchmarkRunnerFMHA {
        sycl::ext::oneapi::experimental::sub_group_size<GemmKernel::DispatchPolicy::SubgroupSize>
    };
    compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props};
-    auto event = compat::experimental::launch<cutlass::device_kernel<GemmKernel>>(policy, params);
+    auto event = compat::experimental::launch<cutlass::device_kernel<GemmKernel>, GemmKernel>(policy, params);
 #endif
 
    EventManager::getInstance().addEvent(event);

cmake/FindDPCPP.cmake

Lines changed: 4 additions & 0 deletions
@@ -39,6 +39,10 @@ find_library(DPCPP_LIB_DIR NAMES sycl sycl6 PATHS "${DPCPP_BIN_DIR}/../lib")
 add_library(DPCPP::DPCPP INTERFACE IMPORTED)
 
 set(DPCPP_FLAGS "-fsycl;")
+if(DPCPP_HOST_COMPILER)
+  list(APPEND DPCPP_FLAGS "-fsycl-host-compiler=${DPCPP_HOST_COMPILER}")
+  list(APPEND DPCPP_FLAGS "-fsycl-host-compiler-options=-Wno-changes-meaning -D$<JOIN:$<TARGET_PROPERTY:COMPILE_DEFINITIONS>, -D> -I$<JOIN:$<TARGET_PROPERTY:INCLUDE_DIRECTORIES>, -I>")
+endif()
 set(DPCPP_COMPILE_ONLY_FLAGS "")
 set(DPCPP_LINK_ONLY_FLAGS "")
 
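For illustration, when `DPCPP_HOST_COMPILER` is set, the appended flags make the SYCL driver hand host-side compilation to g++ while device compilation stays with DPC++; the sketch below shows roughly what a resulting driver invocation could look like. The source file, defines, and include paths are placeholders, not values generated by the build.

```bash
# Rough sketch only: the source file, -D defines, and -I paths are placeholders.
clang++ -fsycl \
  -fsycl-host-compiler=g++-13 \
  "-fsycl-host-compiler-options=-Wno-changes-meaning -DCUTLASS_ENABLE_SYCL -I/path/to/cutlass/include" \
  -c flash_attention_example.cpp -o flash_attention_example.o
```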

examples/06_bmg_flash_attention/bmg_flash_attn_decode_runner.hpp

Lines changed: 2 additions & 9 deletions
@@ -200,21 +200,14 @@ template <class FMHAKernel, bool isVarLen> struct ExampleRunner {
  };
  PagedKVParams paged_kv_cache;
 
-  template <typename SrcT, typename DstT>
-  void convert_fp8_to_fp16(const SrcT* d_src, DstT* d_dst, size_t size) {
-    compat::get_default_queue().parallel_for(size, [=](auto indx) {
-      d_dst[indx] = static_cast<DstT>(d_src[indx]);
-    }).wait();
-  }
-
  template <typename T>
  static constexpr bool is_fp8_v = cute::is_any_of_v<T, cute::float_e5m2_t, cute::float_e4m3_t>;
 
  template <typename Tin> inline auto in_memory(cutlass::DeviceAllocation<Tin>& in) {
    using outType = cutlass::DeviceAllocation<cute::conditional_t<is_fp8_v<Tin>, half_t, Tin>>;
    if constexpr(is_fp8_v<Tin>) {
      cutlass::DeviceAllocation<half_t> out(in.size());
-      convert_fp8_to_fp16<Tin, half_t>(in.get(), out.get(), in.size());
+      convert_dtype<Tin, half_t, ExampleRunner>(in.get(), out.get(), in.size());
      return out;
    } else {
      return in;
@@ -651,7 +644,7 @@ template <class FMHAKernel, bool isVarLen> struct ExampleRunner {
        sycl::ext::oneapi::experimental::sub_group_size<FMHAKernel::DispatchPolicy::SubgroupSize>
    };
    compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props};
-    auto event = compat::experimental::launch<cutlass::device_kernel<FMHAKernel>>(policy, params);
+    auto event = compat::experimental::launch<cutlass::device_kernel<FMHAKernel>, FMHAKernel>(policy, params);
 #endif
 
    EventManager::getInstance().addEvent(event);

examples/06_bmg_flash_attention/bmg_flash_attn_prefill_cachedKV_runner.hpp

Lines changed: 1 addition & 1 deletion
@@ -627,7 +627,7 @@ template <class FMHAPrefillCachedKernel, bool isVarLen> struct ExampleRunner {
        sycl::ext::oneapi::experimental::sub_group_size<FMHAPrefillCachedKernel::DispatchPolicy::SubgroupSize>
    };
    compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props};
-    auto event = compat::experimental::launch<cutlass::device_kernel<FMHAPrefillCachedKernel>>(policy, params);
+    auto event = compat::experimental::launch<cutlass::device_kernel<FMHAPrefillCachedKernel>, FMHAPrefillCachedKernel>(policy, params);
 #endif
 
    EventManager::getInstance().addEvent(event);
