
Commit 8ce1a95

LiyangLingIntel, rolandschulz, and ratnampa authored
G++ host compiler support (#490)
This pull request adds `-DDPCPP_HOST_COMPILER=g++` support to cutlass-sycl. The main changes include:
* Migrate `syclcompat` into this repo as `cutlasscompat`
* Fix most unsupported g++ compilation issues

---------

Co-authored-by: Roland Schulz <[email protected]>
Co-authored-by: ratnampa <[email protected]>
1 parent d9beb05 commit 8ce1a95
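For reference, the CI workflow added by this commit configures the build as sketched below; the g++ version and SYCL target are illustrative values taken from the workflow matrix, not required defaults.

```bash
# Sketch of a local configure/build with the new option (values are illustrative).
. setvars.sh                              # oneAPI environment providing the DPC++ driver
cmake -G Ninja -S . -B build \
  -DCUTLASS_ENABLE_SYCL=ON \
  -DDPCPP_SYCL_TARGET=intel_gpu_pvc \
  -DDPCPP_HOST_COMPILER=g++-13            # new: delegate host-side compilation to g++
cmake --build build
```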

File tree: 53 files changed, +447 / −203 lines

Lines changed: 134 additions & 0 deletions
@@ -0,0 +1,134 @@
name: "SYCL Intel G++ Host Compilation Test"

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]
  merge_group:
    branches: [ "main" ]
  workflow_dispatch:
    inputs:
      DPCPP_VERSION:
        description: "DPCPP version to use"
        type: string

permissions: {}

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  run-tests:
    strategy:
      matrix:
        include:
          - compiler: RELEASE
            gpu: BMG
            intel_graphics: ROLLING
            sycl_target: intel_gpu_bmg_g21
            runner: bmg108629-01
          - compiler: RELEASE
            gpu: PVC
            intel_graphics: ROLLING
            sycl_target: intel_gpu_pvc
            runner: pvc146162-01

    name: Run Intel ${{ matrix.compiler }} tests on ${{ matrix.gpu }} with intel-graphics ${{ matrix.intel_graphics }}
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 120

    steps:
      - name: Checkout repository
        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
      - name: Install Intel graphics drivers
        uses: ./.github/actions/install-intel-graphics
        with:
          GPU: ${{ matrix.gpu }}
          IGC: ${{ matrix.intel_graphics }}
      - name: Install DPC++
        uses: ./.github/actions/install-dpcpp
        with:
          DPCPP_RELEASE: ${{ matrix.compiler }}
          DPCPP_VERSION: ${{ inputs.DPCPP_VERSION }}
          GPU: ${{ matrix.gpu }}
          IGC: ${{ matrix.intel_graphics }}
      - name: Setup virtual environment
        shell: bash
        run: |
          # Install cmake and ninja if not already available
          if ! command -v cmake &> /dev/null || ! command -v ninja &> /dev/null; then
            echo "Installing cmake and/or ninja..."
            sudo apt update
            sudo apt install -y cmake ninja-build
          else
            echo "cmake and ninja already available"
          fi
          . setvars.sh
          export IGC_ExtraOCLOptions="-cl-intel-256-GRF-per-thread"
          export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file -gline-tables-only"
          export ONEAPI_DEVICE_SELECTOR=level_zero:gpu
          export IGC_VectorAliasBBThreshold=100000000000
          # Persist environment variables to following steps
          env >> $GITHUB_ENV
          which $CXX
          $CXX --version
          g++-13 --version
          sycl-ls
      - name: Build
        shell: bash
        run: |
          cmake -G Ninja \
            -DCUTLASS_ENABLE_SYCL=ON \
            -DDPCPP_SYCL_TARGET=${{ matrix.sycl_target }} \
            -DCUTLASS_SYCL_RUNNING_CI=ON \
            -DDPCPP_HOST_COMPILER=g++-13
          cmake --build .

      - name: Unit test
        shell: bash
        run: |
          # ninja test_unit_cute_core # Assertion failure in include/cutlass/integer_subbyte.h:105
          ninja test_unit_cute_intel_xe
          ninja test_unit_cute_layout
          ninja test_unit_cute_msvc_compilation

      - name: Examples
        shell: bash
        run: |
          cmake --build . --target test_examples -j 1

      - name: Benchmarks
        shell: bash
        run: |
          ninja cutlass_benchmarks

      - name: Cleanup DPC++
        if: always()
        shell: bash
        run: |
          echo "Cleaning up DPC++ installation..."
          # Remove DPCPP directory if it exists
          DPCPP_PATH="${{ inputs.DPCPP_PATH || '~/dpcpp' }}"
          DPCPP_PATH=$(eval echo $DPCPP_PATH) # Expand ~ to home directory
          if [ -d "$DPCPP_PATH" ]; then
            echo "Removing DPCPP directory: $DPCPP_PATH"
            sudo rm -rf "$DPCPP_PATH"
          fi
          # For RELEASE installs, remove OneAPI packages
          if [[ "${{ matrix.compiler }}" == "RELEASE" ]]; then
            echo "Removing OneAPI packages..."
            sudo apt remove -y intel-oneapi-runtime-libs intel-oneapi-compiler-dpcpp-cpp || true
            sudo rm -f /etc/apt/sources.list.d/oneAPI.list
            sudo rm -f /usr/share/keyrings/oneapi-archive-keyring.gpg
          fi
          # Clean up environment files
          rm -f setvars.sh
          # Clean up build artifacts
          rm -rf build/ || true
          # Reset environment variables that might interfere
          unset CC CXX CPLUS_INCLUDE_PATH C_INCLUDE_PATH LD_LIBRARY_PATH
          unset IGC_ExtraOCLOptions SYCL_PROGRAM_COMPILE_OPTIONS ONEAPI_DEVICE_SELECTOR IGC_VectorAliasBBThreshold
          echo "DPC++ cleanup completed"

applications/dual_gemm/collective/xe_dual_gemm_mma.hpp

Lines changed: 10 additions & 10 deletions
@@ -74,20 +74,20 @@ struct DualGemmMma<MainloopIntelXeXMX16<Stages, Schedule>, TileShape_, ElementA_
 
   using MmaAtomShape = typename TiledMma::AtomShape_MNK;
 
-  static constexpr auto BLK_M = get<0>(WorkgroupTileShape{});
-  static constexpr auto BLK_N = get<1>(WorkgroupTileShape{});
-  static constexpr auto BLK_K = get<2>(WorkgroupTileShape{});
+  static constexpr int BLK_M = get<0>(WorkgroupTileShape{});
+  static constexpr int BLK_N = get<1>(WorkgroupTileShape{});
+  static constexpr int BLK_K = get<2>(WorkgroupTileShape{});
 
-  static constexpr auto ATOM_M = get<1>(typename TiledMma::ThrLayoutVMNK{}.shape());
-  static constexpr auto ATOM_N = get<2>(typename TiledMma::ThrLayoutVMNK{}.shape());
-  static constexpr auto ATOM_K = get<3>(typename TiledMma::ThrLayoutVMNK{}.shape());
+  static constexpr int ATOM_M = get<1>(typename TiledMma::ThrLayoutVMNK{}.shape());
+  static constexpr int ATOM_N = get<2>(typename TiledMma::ThrLayoutVMNK{}.shape());
+  static constexpr int ATOM_K = get<3>(typename TiledMma::ThrLayoutVMNK{}.shape());
 
-  static constexpr auto SG_M = ceil_div(BLK_M, ATOM_M);
-  static constexpr auto SG_N = ceil_div(BLK_N, ATOM_N);
-  static constexpr auto SG_K = ceil_div(BLK_K, ATOM_K);
+  static constexpr int SG_M = ceil_div(BLK_M, ATOM_M);
+  static constexpr int SG_N = ceil_div(BLK_N, ATOM_N);
+  static constexpr int SG_K = ceil_div(BLK_K, ATOM_K);
   using SubgroupTileShape = Shape<decltype(SG_M), decltype(SG_N), decltype(SG_K)>;
 
-  static constexpr auto Num_SGs = ATOM_N * ATOM_M * ATOM_K;
+  static constexpr int Num_SGs = ATOM_N * ATOM_M * ATOM_K;
   static constexpr uint32_t MaxThreadsPerBlock = size(TiledMma{});
 
   using traits_load_A = Copy_Traits<GmemTiledCopyA, StrideA>;

applications/flash_attention_v2/kernel/xe_flash_attn_decode.hpp

Lines changed: 2 additions & 2 deletions
@@ -126,13 +126,13 @@ class FMHADecode {
   static constexpr int ATOM_N = CollectiveMainloop::ATOM_N;
   static constexpr int ATOM_K = CollectiveMainloop::ATOM_K;
 
-  static constexpr auto Num_SGs = ATOM_N * ATOM_M * ATOM_K;
+  static constexpr int Num_SGs = ATOM_N * ATOM_M * ATOM_K;
   static constexpr int Vec = CollectiveMainloop::Vec; // 8
   static constexpr int FragsM = CollectiveMainloop::FragsM; // 1
   static constexpr int FragsN = CollectiveMainloop::FragsNS; // 4
 
   static constexpr int VSlicer = get<1>(TileShapeOutput{}) / (get<1>(TileShapePV{}) * ATOM_N);
-  using AccumShape = decltype(make_shape(Int<Vec>{}, Int<FragsM>{}, Int<get<1>(TileShapePV{}) / get<1>(MmaAtomShape())>{}, Int<VSlicer>{}));
+  using AccumShape = decltype(make_shape(Int<Vec>{}, Int<FragsM>{}, get<1>(TileShapePV{}) / get<1>(MmaAtomShape()), Int<VSlicer>{}));
 
   static_assert(FragsM == 1, "Limit the seq_len_qo to 1 MMA Atom worth of data per work-group.");
 

benchmarks/flash_attention/flash_attention_decode/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
@@ -60,7 +60,8 @@ foreach(name IN LISTS LIB_LIST)
   target_include_directories(${name} PRIVATE ${CUTLASS_APPLICATIONS_DIR})
   target_link_libraries(${name} PRIVATE CUTLASS cutlass_tools_util_includes benchmark::benchmark)
   add_onemkl_to_target(TARGET ${name})
-  add_sycl_to_target(TARGET ${name})
+  # Add only SYCL include directories, not the full SYCL flags (to avoid duplication)
+  add_sycl_include_directories_to_target(${name})
 endforeach()
 
 cutlass_benchmark_add_executable(

benchmarks/flash_attention/flash_attention_decode/benchmark_runner.hpp

Lines changed: 2 additions & 2 deletions
@@ -619,7 +619,7 @@ template <class FMHADecodeConfiguration> struct BenchmarkRunnerFMHADecode {
 
 #if !defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY)
     using namespace compat::experimental;
-    auto event = launch<cutlass::device_kernel<FMHADecodeKernel>>(
+    auto event = launch<cutlass::device_kernel<FMHADecodeKernel>, FMHADecodeKernel>(
        launch_policy{sycl_grid, sycl_block, local_mem_size{static_cast<std::size_t>(smem_size)},
                      kernel_properties{sycl_exp::sub_group_size<FMHADecodeKernel::DispatchPolicy::SubgroupSize>}},
        params);
@@ -631,7 +631,7 @@ template <class FMHADecodeConfiguration> struct BenchmarkRunnerFMHADecode {
        sycl::ext::oneapi::experimental::sub_group_size<FMHADecodeKernel::DispatchPolicy::SubgroupSize>
    };
    compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props};
-    auto event = compat::experimental::launch<cutlass::device_kernel<FMHADecodeKernel>>(policy, params);
+    auto event = compat::experimental::launch<cutlass::device_kernel<FMHADecodeKernel>, FMHADecodeKernel>(policy, params);
 #endif
 
    EventManager::getInstance().addEvent(event);

benchmarks/flash_attention/flash_attention_prefill/benchmark_runner.hpp

Lines changed: 1 addition & 1 deletion
@@ -491,7 +491,7 @@ template <class FMHAPrefillConfiguration> struct BenchmarkRunnerFMHA {
        sycl::ext::oneapi::experimental::sub_group_size<GemmKernel::DispatchPolicy::SubgroupSize>
    };
    compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props};
-    auto event = compat::experimental::launch<cutlass::device_kernel<GemmKernel>>(policy, params);
+    auto event = compat::experimental::launch<cutlass::device_kernel<GemmKernel>, GemmKernel>(policy, params);
 #endif
 
    EventManager::getInstance().addEvent(event);

benchmarks/flash_attention/flash_attention_prefill_cachedKV/benchmark_runner.hpp

Lines changed: 1 addition & 1 deletion
@@ -584,7 +584,7 @@ template <class FMHAPrefillConfiguration> struct BenchmarkRunnerFMHA {
        sycl::ext::oneapi::experimental::sub_group_size<GemmKernel::DispatchPolicy::SubgroupSize>
    };
    compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props};
-    auto event = compat::experimental::launch<cutlass::device_kernel<GemmKernel>>(policy, params);
+    auto event = compat::experimental::launch<cutlass::device_kernel<GemmKernel>, GemmKernel>(policy, params);
 #endif
 
    EventManager::getInstance().addEvent(event);

cmake/FindDPCPP.cmake

Lines changed: 4 additions & 0 deletions
@@ -39,6 +39,10 @@ find_library(DPCPP_LIB_DIR NAMES sycl sycl6 PATHS "${DPCPP_BIN_DIR}/../lib")
 add_library(DPCPP::DPCPP INTERFACE IMPORTED)
 
 set(DPCPP_FLAGS "-fsycl;")
+if(DPCPP_HOST_COMPILER)
+  list(APPEND DPCPP_FLAGS "-fsycl-host-compiler=${DPCPP_HOST_COMPILER}")
+  list(APPEND DPCPP_FLAGS "-fsycl-host-compiler-options=-Wno-changes-meaning -D$<JOIN:$<TARGET_PROPERTY:COMPILE_DEFINITIONS>, -D> -I$<JOIN:$<TARGET_PROPERTY:INCLUDE_DIRECTORIES>, -I>")
+endif()
 set(DPCPP_COMPILE_ONLY_FLAGS "")
 set(DPCPP_LINK_ONLY_FLAGS "")
 
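For illustration, when `DPCPP_HOST_COMPILER` is set, the appended flags make the SYCL driver hand host-side compilation to g++ while device compilation stays with DPC++; the sketch below shows roughly what a resulting driver invocation could look like. The source file, defines, and include paths are placeholders, not values generated by the build.

```bash
# Rough sketch only: the source file, -D defines, and -I paths are placeholders.
clang++ -fsycl \
  -fsycl-host-compiler=g++-13 \
  "-fsycl-host-compiler-options=-Wno-changes-meaning -DCUTLASS_ENABLE_SYCL -I/path/to/cutlass/include" \
  -c flash_attention_example.cpp -o flash_attention_example.o
```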

examples/06_bmg_flash_attention/bmg_flash_attn_decode_runner.hpp

Lines changed: 2 additions & 9 deletions
@@ -200,21 +200,14 @@ template <class FMHAKernel, bool isVarLen> struct ExampleRunner {
  };
  PagedKVParams paged_kv_cache;
 
-  template <typename SrcT, typename DstT>
-  void convert_fp8_to_fp16(const SrcT* d_src, DstT* d_dst, size_t size) {
-    compat::get_default_queue().parallel_for(size, [=](auto indx) {
-      d_dst[indx] = static_cast<DstT>(d_src[indx]);
-    }).wait();
-  }
-
  template <typename T>
  static constexpr bool is_fp8_v = cute::is_any_of_v<T, cute::float_e5m2_t, cute::float_e4m3_t>;
 
  template <typename Tin> inline auto in_memory(cutlass::DeviceAllocation<Tin>& in) {
    using outType = cutlass::DeviceAllocation<cute::conditional_t<is_fp8_v<Tin>, half_t, Tin>>;
    if constexpr(is_fp8_v<Tin>) {
      cutlass::DeviceAllocation<half_t> out(in.size());
-      convert_fp8_to_fp16<Tin, half_t>(in.get(), out.get(), in.size());
+      convert_dtype<Tin, half_t, ExampleRunner>(in.get(), out.get(), in.size());
      return out;
    } else {
      return in;
@@ -651,7 +644,7 @@ template <class FMHAKernel, bool isVarLen> struct ExampleRunner {
        sycl::ext::oneapi::experimental::sub_group_size<FMHAKernel::DispatchPolicy::SubgroupSize>
    };
    compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props};
-    auto event = compat::experimental::launch<cutlass::device_kernel<FMHAKernel>>(policy, params);
+    auto event = compat::experimental::launch<cutlass::device_kernel<FMHAKernel>, FMHAKernel>(policy, params);
 #endif
 
    EventManager::getInstance().addEvent(event);

examples/06_bmg_flash_attention/bmg_flash_attn_prefill_cachedKV_runner.hpp

Lines changed: 1 addition & 1 deletion
@@ -627,7 +627,7 @@ template <class FMHAPrefillCachedKernel, bool isVarLen> struct ExampleRunner {
        sycl::ext::oneapi::experimental::sub_group_size<FMHAPrefillCachedKernel::DispatchPolicy::SubgroupSize>
    };
    compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props};
-    auto event = compat::experimental::launch<cutlass::device_kernel<FMHAPrefillCachedKernel>>(policy, params);
+    auto event = compat::experimental::launch<cutlass::device_kernel<FMHAPrefillCachedKernel>, FMHAPrefillCachedKernel>(policy, params);
 #endif
 
    EventManager::getInstance().addEvent(event);
