Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
2a5d95c
Add more tests and benchmark configurations
muhammad-tanvir-1211 Jun 5, 2025
beabe4d
Merge branch 'sycl-develop' of https://github.com/codeplaysoftware/cu…
muhammad-tanvir-1211 Jun 10, 2025
55030f0
Merge branch 'sycl-develop' of https://github.com/codeplaysoftware/cu…
muhammad-tanvir-1211 Jun 10, 2025
df9a1e1
Merge branch 'sycl-develop' into flash_decode_separate_out_configs
muhammad-tanvir-1211 Jun 11, 2025
d30c6be
Fix license year
muhammad-tanvir-1211 Jun 13, 2025
338d7fe
Workaround to skip today's DPCPP nightly on CI (#425)
aacostadiaz Jun 11, 2025
a1811a4
Split example for prefill attention with cachedkv (#409)
aacostadiaz Jun 12, 2025
649f904
Avoid failures if latest nightly DPCPP tag didn't provide binaries (…
carlewis Jun 12, 2025
80e4b83
Add BF16BF16FP32 CUTE Example on BMG (#422)
leslie-fang-intel Jun 13, 2025
580e8c8
Merge branch 'sycl-develop' of https://github.com/codeplaysoftware/cu…
muhammad-tanvir-1211 Jun 13, 2025
2ad93de
Merge branch 'sycl-develop' of https://github.com/codeplaysoftware/cu…
muhammad-tanvir-1211 Jun 13, 2025
ea4376f
Simplify test generation
muhammad-tanvir-1211 Jun 13, 2025
540084a
Merge branch 'sycl-develop' into flash_decode_separate_out_configs
muhammad-tanvir-1211 Jun 13, 2025
ae03894
Merge branch 'sycl-develop' of https://github.com/codeplaysoftware/cu…
muhammad-tanvir-1211 Jun 16, 2025
0975c01
Fix benchmark api
muhammad-tanvir-1211 Jun 16, 2025
188fdce
Merge branch 'flash_decode_separate_out_configs' of https://github.co…
muhammad-tanvir-1211 Jun 16, 2025
ff198f5
Fix benchmark names
muhammad-tanvir-1211 Jun 16, 2025
4d446bb
Change intel workflow
muhammad-tanvir-1211 Jun 17, 2025
30c3a79
Merge branch 'sycl-develop' of https://github.com/codeplaysoftware/cu…
muhammad-tanvir-1211 Jun 17, 2025
7f45907
Simplify benchmark generation
muhammad-tanvir-1211 Jun 17, 2025
90d7637
Merge branch 'sycl-develop' of https://github.com/codeplaysoftware/cu…
muhammad-tanvir-1211 Jun 19, 2025
4327493
Increase timeout
muhammad-tanvir-1211 Jun 20, 2025
a859948
Added check for head_size_vo
muhammad-tanvir-1211 Jun 23, 2025
a9173c0
Fix the CI
muhammad-tanvir-1211 Jun 23, 2025
e4f8462
Merge branch 'sycl-develop' of https://github.com/codeplaysoftware/cu…
muhammad-tanvir-1211 Jun 23, 2025
c64c66f
Merge branch 'sycl-develop' of https://github.com/codeplaysoftware/cu…
muhammad-tanvir-1211 Jun 23, 2025
8568330
Merge branch 'sycl-develop' of https://github.com/codeplaysoftware/cu…
muhammad-tanvir-1211 Jul 8, 2025
287b5af
Remove test changes, hardcode head_size_vo
muhammad-tanvir-1211 Jul 8, 2025
96ba5a7
Merge branch 'sycl-develop' into flash_decode_simplify_benchmarks
muhammad-tanvir-1211 Jul 15, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions .github/workflows/intel_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ jobs:

name: Run Intel ${{ matrix.compiler }} tests on ${{ matrix.gpu }} with intel-graphics ${{ matrix.intel_graphics }}
runs-on: ${{ matrix.runner }}
timeout-minutes: 30
timeout-minutes: 45

steps:
- name: Checkout repository
Expand Down Expand Up @@ -95,7 +95,8 @@ jobs:
cmake -G Ninja \
-DCUTLASS_ENABLE_SYCL=ON \
-DDPCPP_SYCL_TARGET=${{ matrix.sycl_target }} \
-DCUTLASS_SYCL_RUNNING_CI=ON
-DCUTLASS_SYCL_RUNNING_CI=ON \

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CUTLASS_SYCL_RUNNING_CI doesn't seem to do anything as far as I can tell?

-DCUTLASS_ENABLE_BENCHMARKS=OFF
cmake --build .
- name: Unit test
shell: bash
Expand All @@ -108,4 +109,9 @@ jobs:
- name: Benchmarks
shell: bash
run: |
cmake --build . --target cutlass_benchmarks
cmake -G Ninja \
-DCUTLASS_ENABLE_SYCL=ON \
-DDPCPP_SYCL_TARGET=${{ matrix.sycl_target }} \
-DCUTLASS_SYCL_RUNNING_CI=ON \
-DCUTLASS_ENABLE_BENCHMARKS=ON
cmake --build . --target cutlass_benchmarks -j 8

Large diffs are not rendered by default.

Large diffs are not rendered by default.

49 changes: 11 additions & 38 deletions benchmarks/flash_attention/flash_attention_decode/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,46 +28,19 @@

set(CUTLASS_APPLICATIONS_DIR ${CMAKE_SOURCE_DIR}/applications)

# Pass these configuration files for the CI
set(CONFIG_FILE_NO_KV_CACHE --config_file=${CMAKE_SOURCE_DIR}/benchmarks/device/bmg/input_files/input_sglang_flash_attention_decode_nokvcache.in)

cutlass_benchmark_add_suite(cutlass_benchmarks_flash_attention_decode
SUPERSUITE cutlass_benchmarks_flash_attention)

add_library(decode_h64 SHARED
benchmarks_h64_512_nonpaged.cpp
benchmarks_h64_1024_nonpaged.cpp
)

add_library(decode_h96 SHARED
benchmarks_h96_512_nonpaged.cpp
benchmarks_h96_1024_nonpaged.cpp
)

add_library(decode_h128 SHARED
benchmarks_h128_512_nonpaged.cpp
benchmarks_h128_1024_nonpaged.cpp
)

add_library(decode_h192 SHARED
benchmarks_h192_512_nonpaged.cpp
benchmarks_h192_1024_nonpaged.cpp
)

set(LIB_LIST decode_h64 decode_h96 decode_h128 decode_h192)

foreach(name IN LISTS LIB_LIST)
target_include_directories(${name} PRIVATE ${CUTLASS_APPLICATIONS_DIR})
target_link_libraries(${name} PRIVATE CUTLASS cutlass_tools_util_includes benchmark::benchmark)
add_onemkl_to_target(TARGET ${name})
add_sycl_to_target(TARGET ${name})
endforeach()

cutlass_benchmark_add_executable(
cutlass_benchmarks_flash_attention_decode_xe
foreach(HEAD_DIM 64 96 128 192)
set(input_name "cutlass_benchmarks_flash_attention_decode_h${HEAD_DIM}")
set(out_exe "${input_name}_xe")
set(SHAPE_H "Shape_h${HEAD_DIM}")
cutlass_benchmark_add_executable(
${out_exe}
main.cpp
TEST_COMMAND_OPTIONS CONFIG_FILE_NO_KV_CACHE
LIBRARIES decode_h64 decode_h96 decode_h128 decode_h192
INCLUDES ${CMAKE_CURRENT_SOURCE_DIR}
SUITE cutlass_benchmarks_flash_attention_decode
)
)
target_compile_definitions(${out_exe} PRIVATE
HEAD_DIM=${HEAD_DIM}
SHAPE_H=${SHAPE_H})
endforeach()
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved.
* Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -74,14 +74,14 @@ struct FMHADecodeOptions {
void parse(int argc, char const **args) {
cutlass::CommandLine cmd(argc, args);

head_size_vo = HEAD_DIM;
cmd.get_cmd_line_argument("batch", batch, 32);
cmd.get_cmd_line_argument("num_heads_q", num_heads_q, 16);
cmd.get_cmd_line_argument("num_heads_kv", num_heads_kv, num_heads_q);
cmd.get_cmd_line_argument("seq_len_qo", seq_len_qo, 1);
cmd.get_cmd_line_argument("seq_len_kv", seq_len_kv, seq_len_qo);
cmd.get_cmd_line_argument("seq_len_kv_cache", seq_len_kv_cache, 0);
cmd.get_cmd_line_argument("page_size", page_size, 128);
cmd.get_cmd_line_argument("head_size_vo", head_size_vo, 128);
cmd.get_cmd_line_argument("head_size_qk", head_size_qk, head_size_vo);
cmd.get_cmd_line_argument("iterations", iterations, 100);
cmd.get_cmd_line_argument("bm_name", bm_name, std::string("Flash Attention v2"));
Expand Down Expand Up @@ -787,14 +787,3 @@ template <class FMHADecodeConfiguration> struct BenchmarkRunnerFMHADecode {
};

}

#define CUTLASS_FMHA_DECODE_BENCHMARK(F) cutlass::benchmark::BenchmarkRegistry<cutlass::benchmark::FMHADecodeOptions>::Register(#F, &F##_func)

#define CUTLASS_CREATE_FMHA_DECODE_BENCHMARK(F) \
static void F##_func( \
::benchmark::State& state, \
cutlass::benchmark::FMHADecodeOptions const& options, \
cutlass::KernelHardwareInfo const& hw_info) { \
auto bench = cutlass::benchmark::BenchmarkRunnerFMHADecode<F>(); \
bench.run(state, options, hw_info); \
}
102 changes: 83 additions & 19 deletions benchmarks/flash_attention/flash_attention_decode/benchmarks.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved.
* Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -31,22 +31,86 @@

#pragma once

#include <benchmarks_h64_512_nonpaged.cpp>
#include <benchmarks_h64_1024_nonpaged.cpp>
#include <benchmarks_h96_512_nonpaged.cpp>
#include <benchmarks_h96_1024_nonpaged.cpp>
#include <benchmarks_h128_512_nonpaged.cpp>
#include <benchmarks_h128_1024_nonpaged.cpp>
#include <benchmarks_h192_512_nonpaged.cpp>
#include <benchmarks_h192_1024_nonpaged.cpp>

static void register_flash_attention_decode_benchmarks() {
register_flash_attention_decode_benchmarks_nonpaged_h64_512();
register_flash_attention_decode_benchmarks_nonpaged_h96_512();
register_flash_attention_decode_benchmarks_nonpaged_h128_512();
register_flash_attention_decode_benchmarks_nonpaged_h192_512();
register_flash_attention_decode_benchmarks_nonpaged_h64_1024();
register_flash_attention_decode_benchmarks_nonpaged_h96_1024();
register_flash_attention_decode_benchmarks_nonpaged_h128_1024();
register_flash_attention_decode_benchmarks_nonpaged_h192_1024();
#include "benchmark_runner.hpp"
#include "fmha_decode_configuration.hpp"

using namespace cutlass;
using namespace cutlass::flash_attention;


/// Adapter bridging a concrete FMHA decode kernel configuration to the
/// google-benchmark callback signature: builds the runner for the given
/// kernel type and executes it with the parsed options and hardware info.
template <typename FMHADecode>
inline static void FMHADecodeFunc(::benchmark::State& state,
                                  cutlass::benchmark::FMHADecodeOptions const& options,
                                  KernelHardwareInfo const& hw_info) {
  cutlass::benchmark::BenchmarkRunnerFMHADecode<FMHADecode> runner{};
  runner.run(state, options, hw_info);
}

/// Compile-time axes enumerated by the generate_benchmarks_* drivers below.
/// Each getter returns a tuple whose elements are visited index-by-index.
struct FMHADecodeBenchGenConfig {
  /// Values tried for every boolean axis (PagedKV / VarLen / Causal).
  static constexpr auto get_bool_tuple() {
    return std::tuple{true, false};
  }

  /// KV-tile sizes; paired index-wise with get_numsg_tuple().
  static constexpr auto get_kvtile_tuple() {
    return std::tuple{512, 1024};
  }

  /// Sub-group counts, one per KV-tile size above.
  static constexpr auto get_numsg_tuple() {
    return std::tuple{8, 16};
  }
};

// Registers one FMHA decode benchmark instantiation.
//
// The benchmark name encodes every compile-time axis so individual variants
// can be selected with --benchmark_filter, e.g.
//   FMHADecodeBF16BF16FP32FP32_RCR_NonPaged_KVTile512_h64_NonCausal_FixedLen
//
// HEAD_DIM and SHAPE_H are injected per-executable via CMake
// target_compile_definitions, so each binary covers exactly one head size.
//
// NOTE(review): the original declared this `constexpr`, but its only effect
// is mutating a runtime registry, so no specialization could ever be
// constant-evaluated — ill-formed NDR and misleading; the specifier is
// dropped. The name is also now built by in-place appends instead of six
// chained intermediate String temporaries.
template <typename String, typename InT, typename AccumT, typename OutT, bool Causal, bool VarLen, int KVTile, int NumSG, bool PagedKV>
static void generate_benchmarks() {
  using F = typename FMHADecodeConfigGen<InT, AccumT, OutT, Causal, VarLen, SHAPE_H<KVTile, NumSG>, PagedKV>::type;

  String bench_name{"FMHADecode"};
  bench_name += std::is_same_v<InT, bfloat16_t> ? "BF16BF16FP32" : "FP16FP16FP32";
  bench_name += std::is_same_v<OutT, bfloat16_t> ? "BF16_RCR_"
              : std::is_same_v<OutT, half_t>     ? "FP16_RCR_"
                                                 : "FP32_RCR_";
  bench_name += PagedKV ? "Paged_" : "NonPaged_";
  bench_name += String{"KVTile"} + std::to_string(KVTile) + "_";
  bench_name += String{"h"} + std::to_string(HEAD_DIM) + "_";
  bench_name += Causal ? "Causal_" : "NonCausal_";
  bench_name += VarLen ? "VarLen" : "FixedLen";

  cutlass::benchmark::BenchmarkRegistry<cutlass::benchmark::FMHADecodeOptions>::Register(bench_name, FMHADecodeFunc<F>);
}

// Expands the PagedKV axis: recurses over every entry of
// ConfigTupleGen::get_bool_tuple(), instantiating generate_benchmarks for
// each paged/non-paged value.
template <typename ConfigTupleGen, typename InT, typename AccumT, typename OutT, bool Causal, bool VarLen, int KVTile, int NumSG, int paged_idx = 0>
static constexpr void generate_benchmarks_paged() {
  constexpr auto paged_values = ConfigTupleGen::get_bool_tuple();
  if constexpr (paged_idx < std::tuple_size_v<decltype(paged_values)>) {
    constexpr bool kPagedKV = std::get<paged_idx>(paged_values);
    generate_benchmarks<std::string, InT, AccumT, OutT, Causal, VarLen, KVTile, NumSG, kPagedKV>();
    generate_benchmarks_paged<ConfigTupleGen, InT, AccumT, OutT, Causal, VarLen, KVTile, NumSG, paged_idx + 1>();
  }
}

// Expands the KV-tile axis. The sub-group count is paired index-wise with
// the KV-tile size (index i of get_kvtile_tuple() goes with index i of
// get_numsg_tuple()).
template <typename ConfigTupleGen, typename InT, typename AccumT, typename OutT, bool Causal, bool VarLen, int kvtile_idx = 0>
static constexpr void generate_benchmarks_kvtile() {
  constexpr auto kvtile_values = ConfigTupleGen::get_kvtile_tuple();
  if constexpr (kvtile_idx < std::tuple_size_v<decltype(kvtile_values)>) {
    constexpr int kKVTile = std::get<kvtile_idx>(kvtile_values);
    constexpr int kNumSG  = std::get<kvtile_idx>(ConfigTupleGen::get_numsg_tuple());
    generate_benchmarks_paged<ConfigTupleGen, InT, AccumT, OutT, Causal, VarLen, kKVTile, kNumSG>();
    generate_benchmarks_kvtile<ConfigTupleGen, InT, AccumT, OutT, Causal, VarLen, kvtile_idx + 1>();
  }
}

// Expands the variable-length axis (fixed vs variable sequence lengths)
// over every entry of ConfigTupleGen::get_bool_tuple().
template <typename ConfigTupleGen, typename InT, typename AccumT, typename OutT, bool Causal, int varlen_idx = 0>
static constexpr void generate_benchmarks_varlen() {
  constexpr auto varlen_values = ConfigTupleGen::get_bool_tuple();
  if constexpr (varlen_idx < std::tuple_size_v<decltype(varlen_values)>) {
    constexpr bool kVarLen = std::get<varlen_idx>(varlen_values);
    generate_benchmarks_kvtile<ConfigTupleGen, InT, AccumT, OutT, Causal, kVarLen>();
    generate_benchmarks_varlen<ConfigTupleGen, InT, AccumT, OutT, varlen_idx + 1>();
  }
}

// Outermost driver: expands the causal-masking axis over every entry of
// ConfigTupleGen::get_bool_tuple(), then delegates to the inner drivers.
template <typename ConfigTupleGen, typename InT, typename AccumT, typename OutT, int causal_idx = 0>
static constexpr void generate_benchmarks_causal() {
  constexpr auto causal_values = ConfigTupleGen::get_bool_tuple();
  if constexpr (causal_idx < std::tuple_size_v<decltype(causal_values)>) {
    constexpr bool kCausal = std::get<causal_idx>(causal_values);
    generate_benchmarks_varlen<ConfigTupleGen, InT, AccumT, OutT, kCausal>();
    generate_benchmarks_causal<ConfigTupleGen, InT, AccumT, OutT, causal_idx + 1>();
  }
}

/// Entry point called by each benchmark executable: registers every decode
/// benchmark variant (input/output dtype x causal x varlen x kvtile x paged)
/// for the single head size this binary was compiled for (HEAD_DIM/SHAPE_H
/// come from CMake). The accumulator type is always float.
///
/// NOTE(review): `constexpr` was removed — this non-template function only
/// mutates a runtime benchmark registry and can never appear in a constant
/// expression, making the original declaration ill-formed (no diagnostic
/// required).
static void register_flash_attention_decode_benchmarks() {
  generate_benchmarks_causal<FMHADecodeBenchGenConfig, cutlass::bfloat16_t, float, float>();
  generate_benchmarks_causal<FMHADecodeBenchGenConfig, cutlass::bfloat16_t, float, cutlass::bfloat16_t>();
  generate_benchmarks_causal<FMHADecodeBenchGenConfig, cutlass::half_t, float, float>();
  generate_benchmarks_causal<FMHADecodeBenchGenConfig, cutlass::half_t, float, cutlass::half_t>();
}

This file was deleted.

Loading
Loading