Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
2a5d95c
Add more tests and benchmark configurations
muhammad-tanvir-1211 Jun 5, 2025
beabe4d
Merge branch 'sycl-develop' of https://github.com/codeplaysoftware/cu…
muhammad-tanvir-1211 Jun 10, 2025
55030f0
Merge branch 'sycl-develop' of https://github.com/codeplaysoftware/cu…
muhammad-tanvir-1211 Jun 10, 2025
df9a1e1
Merge branch 'sycl-develop' into flash_decode_separate_out_configs
muhammad-tanvir-1211 Jun 11, 2025
d30c6be
Fix license year
muhammad-tanvir-1211 Jun 13, 2025
338d7fe
Workaround to skip today's DPCPP nightly on CI (#425)
aacostadiaz Jun 11, 2025
a1811a4
Split example for prefill attention with cachedkv (#409)
aacostadiaz Jun 12, 2025
649f904
Avoid failures if latest nightly DPCPP tag didn't provide binaries (…
carlewis Jun 12, 2025
80e4b83
Add BF16BF16FP32 CUTE Example on BMG (#422)
leslie-fang-intel Jun 13, 2025
580e8c8
Merge branch 'sycl-develop' of https://github.com/codeplaysoftware/cu…
muhammad-tanvir-1211 Jun 13, 2025
2ad93de
Merge branch 'sycl-develop' of https://github.com/codeplaysoftware/cu…
muhammad-tanvir-1211 Jun 13, 2025
ea4376f
Simplify test generation
muhammad-tanvir-1211 Jun 13, 2025
540084a
Merge branch 'sycl-develop' into flash_decode_separate_out_configs
muhammad-tanvir-1211 Jun 13, 2025
ae03894
Merge branch 'sycl-develop' of https://github.com/codeplaysoftware/cu…
muhammad-tanvir-1211 Jun 16, 2025
0975c01
Fix benchmark api
muhammad-tanvir-1211 Jun 16, 2025
188fdce
Merge branch 'flash_decode_separate_out_configs' of https://github.co…
muhammad-tanvir-1211 Jun 16, 2025
ff198f5
Fix benchmark names
muhammad-tanvir-1211 Jun 16, 2025
4d446bb
Change intel workflow
muhammad-tanvir-1211 Jun 17, 2025
30c3a79
Merge branch 'sycl-develop' of https://github.com/codeplaysoftware/cu…
muhammad-tanvir-1211 Jun 17, 2025
7f45907
Simplify benchmark generation
muhammad-tanvir-1211 Jun 17, 2025
90d7637
Merge branch 'sycl-develop' of https://github.com/codeplaysoftware/cu…
muhammad-tanvir-1211 Jun 19, 2025
4327493
Increase timeout
muhammad-tanvir-1211 Jun 20, 2025
a859948
Added check for head_size_vo
muhammad-tanvir-1211 Jun 23, 2025
a9173c0
Fix the CI
muhammad-tanvir-1211 Jun 23, 2025
e4f8462
Merge branch 'sycl-develop' of https://github.com/codeplaysoftware/cu…
muhammad-tanvir-1211 Jun 23, 2025
c64c66f
Merge branch 'sycl-develop' of https://github.com/codeplaysoftware/cu…
muhammad-tanvir-1211 Jun 23, 2025
8568330
Merge branch 'sycl-develop' of https://github.com/codeplaysoftware/cu…
muhammad-tanvir-1211 Jul 8, 2025
287b5af
Remove test changes, hardcode head_size_vo
muhammad-tanvir-1211 Jul 8, 2025
96ba5a7
Merge branch 'sycl-develop' into flash_decode_simplify_benchmarks
muhammad-tanvir-1211 Jul 15, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions .github/workflows/intel_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ jobs:

name: Run Intel ${{ matrix.compiler }} tests on ${{ matrix.gpu }} with intel-graphics ${{ matrix.intel_graphics }}
runs-on: ${{ matrix.runner }}
timeout-minutes: 30
timeout-minutes: 45

steps:
- name: Checkout repository
Expand Down Expand Up @@ -95,7 +95,8 @@ jobs:
cmake -G Ninja \
-DCUTLASS_ENABLE_SYCL=ON \
-DDPCPP_SYCL_TARGET=${{ matrix.sycl_target }} \
-DCUTLASS_SYCL_RUNNING_CI=ON
-DCUTLASS_SYCL_RUNNING_CI=ON \

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CUTLASS_SYCL_RUNNING_CI doesn't seem to do anything as far as I can tell?

-DCUTLASS_ENABLE_BENCHMARKS=OFF
cmake --build .
- name: Unit test
shell: bash
Expand All @@ -108,4 +109,9 @@ jobs:
- name: Benchmarks
shell: bash
run: |
cmake --build . --target cutlass_benchmarks
cmake -G Ninja \
-DCUTLASS_ENABLE_SYCL=ON \
-DDPCPP_SYCL_TARGET=${{ matrix.sycl_target }} \
-DCUTLASS_SYCL_RUNNING_CI=ON \
-DCUTLASS_ENABLE_BENCHMARKS=ON
cmake --build . --target cutlass_benchmarks -j 8

Large diffs are not rendered by default.

Large diffs are not rendered by default.

49 changes: 11 additions & 38 deletions benchmarks/flash_attention/flash_attention_decode/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,46 +28,19 @@

set(CUTLASS_APPLICATIONS_DIR ${CMAKE_SOURCE_DIR}/applications)

# Pass these configuration files for the CI
set(CONFIG_FILE_NO_KV_CACHE --config_file=${CMAKE_SOURCE_DIR}/benchmarks/device/bmg/input_files/input_sglang_flash_attention_decode_nokvcache.in)

cutlass_benchmark_add_suite(cutlass_benchmarks_flash_attention_decode
SUPERSUITE cutlass_benchmarks_flash_attention)

add_library(decode_h64 SHARED
benchmarks_h64_512_nonpaged.cpp
benchmarks_h64_1024_nonpaged.cpp
)

add_library(decode_h96 SHARED
benchmarks_h96_512_nonpaged.cpp
benchmarks_h96_1024_nonpaged.cpp
)

add_library(decode_h128 SHARED
benchmarks_h128_512_nonpaged.cpp
benchmarks_h128_1024_nonpaged.cpp
)

add_library(decode_h192 SHARED
benchmarks_h192_512_nonpaged.cpp
benchmarks_h192_1024_nonpaged.cpp
)

set(LIB_LIST decode_h64 decode_h96 decode_h128 decode_h192)

foreach(name IN LISTS LIB_LIST)
target_include_directories(${name} PRIVATE ${CUTLASS_APPLICATIONS_DIR})
target_link_libraries(${name} PRIVATE CUTLASS cutlass_tools_util_includes benchmark::benchmark)
add_onemkl_to_target(TARGET ${name})
add_sycl_to_target(TARGET ${name})
endforeach()

cutlass_benchmark_add_executable(
cutlass_benchmarks_flash_attention_decode_xe
foreach(HEAD_DIM 64 96 128 192)
set(input_name "cutlass_benchmarks_flash_attention_decode_h${HEAD_DIM}")
set(out_exe "${input_name}_xe")
set(SHAPE_H "Shape_h${HEAD_DIM}")
cutlass_benchmark_add_executable(
${out_exe}
main.cpp
TEST_COMMAND_OPTIONS CONFIG_FILE_NO_KV_CACHE
LIBRARIES decode_h64 decode_h96 decode_h128 decode_h192
INCLUDES ${CMAKE_CURRENT_SOURCE_DIR}
SUITE cutlass_benchmarks_flash_attention_decode
)
)
target_compile_definitions(${out_exe} PRIVATE
HEAD_DIM=${HEAD_DIM}
SHAPE_H=${SHAPE_H})
endforeach()
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved.
* Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -74,14 +74,14 @@ struct FMHADecodeOptions {
void parse(int argc, char const **args) {
cutlass::CommandLine cmd(argc, args);

head_size_vo = HEAD_DIM;
cmd.get_cmd_line_argument("batch", batch, 32);
cmd.get_cmd_line_argument("num_heads_q", num_heads_q, 16);
cmd.get_cmd_line_argument("num_heads_kv", num_heads_kv, num_heads_q);
cmd.get_cmd_line_argument("seq_len_qo", seq_len_qo, 1);
cmd.get_cmd_line_argument("seq_len_kv", seq_len_kv, seq_len_qo);
cmd.get_cmd_line_argument("seq_len_kv_cache", seq_len_kv_cache, 0);
cmd.get_cmd_line_argument("page_size", page_size, 128);
cmd.get_cmd_line_argument("head_size_vo", head_size_vo, 128);
cmd.get_cmd_line_argument("head_size_qk", head_size_qk, head_size_vo);
cmd.get_cmd_line_argument("iterations", iterations, 100);
cmd.get_cmd_line_argument("bm_name", bm_name, std::string("Flash Attention v2"));
Expand Down Expand Up @@ -787,14 +787,3 @@ template <class FMHADecodeConfiguration> struct BenchmarkRunnerFMHADecode {
};

}

#define CUTLASS_FMHA_DECODE_BENCHMARK(F) cutlass::benchmark::BenchmarkRegistry<cutlass::benchmark::FMHADecodeOptions>::Register(#F, &F##_func)

#define CUTLASS_CREATE_FMHA_DECODE_BENCHMARK(F) \
static void F##_func( \
::benchmark::State& state, \
cutlass::benchmark::FMHADecodeOptions const& options, \
cutlass::KernelHardwareInfo const& hw_info) { \
auto bench = cutlass::benchmark::BenchmarkRunnerFMHADecode<F>(); \
bench.run(state, options, hw_info); \
}
102 changes: 83 additions & 19 deletions benchmarks/flash_attention/flash_attention_decode/benchmarks.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved.
* Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -31,22 +31,86 @@

#pragma once

#include <benchmarks_h64_512_nonpaged.cpp>
#include <benchmarks_h64_1024_nonpaged.cpp>
#include <benchmarks_h96_512_nonpaged.cpp>
#include <benchmarks_h96_1024_nonpaged.cpp>
#include <benchmarks_h128_512_nonpaged.cpp>
#include <benchmarks_h128_1024_nonpaged.cpp>
#include <benchmarks_h192_512_nonpaged.cpp>
#include <benchmarks_h192_1024_nonpaged.cpp>

static void register_flash_attention_decode_benchmarks() {
register_flash_attention_decode_benchmarks_nonpaged_h64_512();
register_flash_attention_decode_benchmarks_nonpaged_h96_512();
register_flash_attention_decode_benchmarks_nonpaged_h128_512();
register_flash_attention_decode_benchmarks_nonpaged_h192_512();
register_flash_attention_decode_benchmarks_nonpaged_h64_1024();
register_flash_attention_decode_benchmarks_nonpaged_h96_1024();
register_flash_attention_decode_benchmarks_nonpaged_h128_1024();
register_flash_attention_decode_benchmarks_nonpaged_h192_1024();
#include "benchmark_runner.hpp"
#include "fmha_decode_configuration.hpp"

using namespace cutlass;
using namespace cutlass::flash_attention;


/// Adapter bridging a concrete FMHA decode kernel configuration to the
/// google-benchmark callback signature: builds the runner for the given
/// kernel type and executes it with the parsed options and hardware info.
template <typename FMHADecode>
inline static void FMHADecodeFunc(::benchmark::State& state,
                                  cutlass::benchmark::FMHADecodeOptions const& options,
                                  KernelHardwareInfo const& hw_info) {
  cutlass::benchmark::BenchmarkRunnerFMHADecode<FMHADecode> runner{};
  runner.run(state, options, hw_info);
}

/// Compile-time axes enumerated by the generate_benchmarks_* drivers below.
/// Each getter returns a tuple whose elements are visited index-by-index.
struct FMHADecodeBenchGenConfig {
  /// Values tried for every boolean axis (PagedKV / VarLen / Causal).
  static constexpr auto get_bool_tuple() {
    return std::tuple{true, false};
  }

  /// KV-tile sizes; paired index-wise with get_numsg_tuple().
  static constexpr auto get_kvtile_tuple() {
    return std::tuple{512, 1024};
  }

  /// Sub-group counts, one per KV-tile size above.
  static constexpr auto get_numsg_tuple() {
    return std::tuple{8, 16};
  }
};

// Registers one FMHA decode benchmark instantiation.
//
// The benchmark name encodes every compile-time axis so individual variants
// can be selected with --benchmark_filter, e.g.
//   FMHADecodeBF16BF16FP32FP32_RCR_NonPaged_KVTile512_h64_NonCausal_FixedLen
//
// HEAD_DIM and SHAPE_H are injected per-executable via CMake
// target_compile_definitions, so each binary covers exactly one head size.
//
// NOTE(review): the original declared this `constexpr`, but its only effect
// is mutating a runtime registry, so no specialization could ever be
// constant-evaluated — ill-formed NDR and misleading; the specifier is
// dropped. The name is also now built by in-place appends instead of six
// chained intermediate String temporaries.
template <typename String, typename InT, typename AccumT, typename OutT, bool Causal, bool VarLen, int KVTile, int NumSG, bool PagedKV>
static void generate_benchmarks() {
  using F = typename FMHADecodeConfigGen<InT, AccumT, OutT, Causal, VarLen, SHAPE_H<KVTile, NumSG>, PagedKV>::type;

  String bench_name{"FMHADecode"};
  bench_name += std::is_same_v<InT, bfloat16_t> ? "BF16BF16FP32" : "FP16FP16FP32";
  bench_name += std::is_same_v<OutT, bfloat16_t> ? "BF16_RCR_"
              : std::is_same_v<OutT, half_t>     ? "FP16_RCR_"
                                                 : "FP32_RCR_";
  bench_name += PagedKV ? "Paged_" : "NonPaged_";
  bench_name += String{"KVTile"} + std::to_string(KVTile) + "_";
  bench_name += String{"h"} + std::to_string(HEAD_DIM) + "_";
  bench_name += Causal ? "Causal_" : "NonCausal_";
  bench_name += VarLen ? "VarLen" : "FixedLen";

  cutlass::benchmark::BenchmarkRegistry<cutlass::benchmark::FMHADecodeOptions>::Register(bench_name, FMHADecodeFunc<F>);
}

// Expands the PagedKV axis: recurses over every entry of
// ConfigTupleGen::get_bool_tuple(), instantiating generate_benchmarks for
// each paged/non-paged value.
template <typename ConfigTupleGen, typename InT, typename AccumT, typename OutT, bool Causal, bool VarLen, int KVTile, int NumSG, int paged_idx = 0>
static constexpr void generate_benchmarks_paged() {
  constexpr auto paged_values = ConfigTupleGen::get_bool_tuple();
  if constexpr (paged_idx < std::tuple_size_v<decltype(paged_values)>) {
    constexpr bool kPagedKV = std::get<paged_idx>(paged_values);
    generate_benchmarks<std::string, InT, AccumT, OutT, Causal, VarLen, KVTile, NumSG, kPagedKV>();
    generate_benchmarks_paged<ConfigTupleGen, InT, AccumT, OutT, Causal, VarLen, KVTile, NumSG, paged_idx + 1>();
  }
}

// Expands the KV-tile axis. The sub-group count is paired index-wise with
// the KV-tile size (index i of get_kvtile_tuple() goes with index i of
// get_numsg_tuple()).
template <typename ConfigTupleGen, typename InT, typename AccumT, typename OutT, bool Causal, bool VarLen, int kvtile_idx = 0>
static constexpr void generate_benchmarks_kvtile() {
  constexpr auto kvtile_values = ConfigTupleGen::get_kvtile_tuple();
  if constexpr (kvtile_idx < std::tuple_size_v<decltype(kvtile_values)>) {
    constexpr int kKVTile = std::get<kvtile_idx>(kvtile_values);
    constexpr int kNumSG  = std::get<kvtile_idx>(ConfigTupleGen::get_numsg_tuple());
    generate_benchmarks_paged<ConfigTupleGen, InT, AccumT, OutT, Causal, VarLen, kKVTile, kNumSG>();
    generate_benchmarks_kvtile<ConfigTupleGen, InT, AccumT, OutT, Causal, VarLen, kvtile_idx + 1>();
  }
}

// Expands the variable-length axis (fixed vs variable sequence lengths)
// over every entry of ConfigTupleGen::get_bool_tuple().
template <typename ConfigTupleGen, typename InT, typename AccumT, typename OutT, bool Causal, int varlen_idx = 0>
static constexpr void generate_benchmarks_varlen() {
  constexpr auto varlen_values = ConfigTupleGen::get_bool_tuple();
  if constexpr (varlen_idx < std::tuple_size_v<decltype(varlen_values)>) {
    constexpr bool kVarLen = std::get<varlen_idx>(varlen_values);
    generate_benchmarks_kvtile<ConfigTupleGen, InT, AccumT, OutT, Causal, kVarLen>();
    generate_benchmarks_varlen<ConfigTupleGen, InT, AccumT, OutT, varlen_idx + 1>();
  }
}

// Outermost driver: expands the causal-masking axis over every entry of
// ConfigTupleGen::get_bool_tuple(), then delegates to the inner drivers.
template <typename ConfigTupleGen, typename InT, typename AccumT, typename OutT, int causal_idx = 0>
static constexpr void generate_benchmarks_causal() {
  constexpr auto causal_values = ConfigTupleGen::get_bool_tuple();
  if constexpr (causal_idx < std::tuple_size_v<decltype(causal_values)>) {
    constexpr bool kCausal = std::get<causal_idx>(causal_values);
    generate_benchmarks_varlen<ConfigTupleGen, InT, AccumT, OutT, kCausal>();
    generate_benchmarks_causal<ConfigTupleGen, InT, AccumT, OutT, causal_idx + 1>();
  }
}

/// Entry point called by each benchmark executable: registers every decode
/// benchmark variant (input/output dtype x causal x varlen x kvtile x paged)
/// for the single head size this binary was compiled for (HEAD_DIM/SHAPE_H
/// come from CMake). The accumulator type is always float.
///
/// NOTE(review): `constexpr` was removed — this non-template function only
/// mutates a runtime benchmark registry and can never appear in a constant
/// expression, making the original declaration ill-formed (no diagnostic
/// required).
static void register_flash_attention_decode_benchmarks() {
  generate_benchmarks_causal<FMHADecodeBenchGenConfig, cutlass::bfloat16_t, float, float>();
  generate_benchmarks_causal<FMHADecodeBenchGenConfig, cutlass::bfloat16_t, float, cutlass::bfloat16_t>();
  generate_benchmarks_causal<FMHADecodeBenchGenConfig, cutlass::half_t, float, float>();
  generate_benchmarks_causal<FMHADecodeBenchGenConfig, cutlass::half_t, float, cutlass::half_t>();
}

This file was deleted.

Loading
Loading