Skip to content
Merged
Show file tree
Hide file tree
Changes from 46 commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
d41cf57
add flash attention interface
jikunshang Jul 31, 2025
ce9f31d
update interface
jikunshang Aug 1, 2025
fb6784f
add cutlass deps (#1)
jikunshang Aug 4, 2025
ce27fa2
add chunk_prefill step<1>
YizhouZ Aug 7, 2025
ed0f846
fix register
YizhouZ Aug 7, 2025
b02a5a8
fix cmake
YizhouZ Aug 7, 2025
a4a76ee
debug msg
YizhouZ Aug 8, 2025
ee1b719
functional ready
YizhouZ Aug 11, 2025
4ef938f
dev base
Liangliang-Ma Aug 21, 2025
480c72f
base of grouped_gemm_fp8
Liangliang-Ma Aug 22, 2025
24709b8
update func
Liangliang-Ma Aug 26, 2025
f5757a9
add test
Liangliang-Ma Aug 29, 2025
435e6df
update functor
Liangliang-Ma Aug 29, 2025
f76fb97
update grouped_gemm
Liangliang-Ma Aug 30, 2025
9408e94
build ready
Liangliang-Ma Aug 31, 2025
439cf3c
base integration done
Liangliang-Ma Sep 1, 2025
48abd9f
grouped gemm base ready
Liangliang-Ma Sep 2, 2025
67eeb47
gemm2 use cutlass grouped_mm
Liangliang-Ma Sep 3, 2025
a62752f
gemm1 use cutlass group_mm
Liangliang-Ma Sep 3, 2025
cfb724b
rm flash_attn in this pr
Liangliang-Ma Sep 4, 2025
f7518e0
rebase CMakeLists
Liangliang-Ma Sep 4, 2025
083bde5
use main Cmakes
Liangliang-Ma Sep 4, 2025
48a4808
use main setup
Liangliang-Ma Sep 4, 2025
22d1ade
mv utils
Liangliang-Ma Sep 4, 2025
c0e70c4
Merge branch 'main' into grouped_gemm_cutlass
Liangliang-Ma Sep 4, 2025
1c7f46d
finish rebase
Liangliang-Ma Sep 4, 2025
df0b915
add profile and change to col-maj
Liangliang-Ma Sep 5, 2025
76fe4bc
dont not reserve block_C
Liangliang-Ma Sep 9, 2025
ad0fdd6
remove redundant allocation
Liangliang-Ma Sep 11, 2025
54e64a7
e2e debug
Liangliang-Ma Sep 11, 2025
3c40008
add release func
Liangliang-Ma Sep 11, 2025
985004d
gemm args allocate once
Liangliang-Ma Sep 11, 2025
9c18092
hidden_states copy
Liangliang-Ma Sep 11, 2025
a47ecef
output bf16
Liangliang-Ma Sep 14, 2025
1a2d655
use static tensor buffer
Liangliang-Ma Sep 14, 2025
f7dee65
remove ptr_C
Liangliang-Ma Sep 15, 2025
ad2dc48
fix device lost
Liangliang-Ma Sep 17, 2025
56cb570
acc and oom fixed
Liangliang-Ma Sep 19, 2025
81555ab
Fix acc and oom issue
Liangliang-Ma Sep 19, 2025
d1edf17
base
Liangliang-Ma Sep 23, 2025
55f36a8
update CMakeLists
Liangliang-Ma Sep 23, 2025
54e7219
Merge branch 'main' into grouped_gemm_cutlass
Liangliang-Ma Sep 23, 2025
513377a
refactor csrc of cutlass
Liangliang-Ma Sep 23, 2025
534c7c3
put src in vllm
Liangliang-Ma Sep 23, 2025
1fc6959
add adapter src
Liangliang-Ma Sep 23, 2025
db6b292
clean up
Liangliang-Ma Sep 24, 2025
d651d9d
add test
Liangliang-Ma Sep 24, 2025
a29cfa6
clean up
Liangliang-Ma Sep 24, 2025
c66f152
fix format
Liangliang-Ma Sep 24, 2025
a681e73
fix format f841
Liangliang-Ma Sep 25, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -171,12 +171,13 @@ if(VLLM_GPU_LANG STREQUAL "SYCL")
set(CUTLASS_ENABLE_HEADERS_ONLY "ON" CACHE BOOL "Enable only the header library")

# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
set(CUTLASS_REVISION "main" CACHE STRING "CUTLASS revision to use")
set(CUTLASS_REVISION "dev" CACHE STRING "CUTLASS revision to use")

# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
FetchContent_Declare(
cutlass-sycl
GIT_REPOSITORY https://github.com/intel/cutlass-sycl
GIT_REPOSITORY https://github.com/Liangliang-Ma/cutlass-sycl
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this using a private fork of cutlass-sycl?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will rebase to cutlass-sycl/main.


# Please keep this in sync with CUTLASS_REVISION line above.
GIT_TAG ${CUTLASS_REVISION}
GIT_PROGRESS TRUE
Expand All @@ -196,7 +197,6 @@ if(VLLM_GPU_LANG STREQUAL "SYCL")
set(CUTLASS_ENABLE_GDC_FOR_SM100_DEFAULT OFF CACHE BOOL "DISABLE CUDA")
# list(APPEND CMAKE_CXX_FLAGS "-ftemplate-backtrace-limit=0 " )
# list(APPEND CMAKE_CXX_FLAGS "-fdiagnostics-color=always " )


FetchContent_MakeAvailable(cutlass-sycl)
set(CUTLASS_INCLUDE_DIR ${cutlass-sycl_SOURCE_DIR}/include CACHE PATH "CUTLASS Header Library")
Expand Down Expand Up @@ -269,11 +269,15 @@ endif ()
#
# xpu only ops/kernels, implemented with cutlass/onednn/sycl.
#
# Collect every .cpp file under csrc/xpu/cutlass_kernels/ into
# CUTLASS_BACKEND_SRCS. CONFIGURE_DEPENDS makes the build re-check the glob on
# each build, so adding or removing a kernel source triggers a re-configure
# instead of being silently missed until CMake is re-run by hand.
file(GLOB CUTLASS_BACKEND_SRCS CONFIGURE_DEPENDS
  csrc/xpu/cutlass_kernels/*.cpp
)
if(VLLM_GPU_LANG STREQUAL "SYCL")
# Build the VLLM_EXT_XPU_SRC source list: the torch bindings, the lora_shrink
# and lora_expand sources, and every entry of CUTLASS_BACKEND_SRCS (expanded
# unquoted, so each globbed file becomes its own list element).
# NOTE(review): presumably consumed by the XPU extension target — confirm
# against the define_gpu_extension_target() call.
set(VLLM_EXT_XPU_SRC
"csrc/xpu/torch_bindings.cpp"
"csrc/xpu/lora/lora_shrink.cpp"
"csrc/xpu/lora/lora_expand.cpp"
${CUTLASS_BACKEND_SRCS}
)
include_directories("/usr/include")
set(CMPLR_ROOT $ENV{CMPLR_ROOT})
Expand All @@ -282,6 +286,12 @@ if(VLLM_GPU_LANG STREQUAL "SYCL")
# Base XPU settings: define VLLM_BUILD_XPU_OPS for compilation and append the
# SYCL link flags and link libraries.
list(APPEND VLLM_GPU_FLAGS
  "-DVLLM_BUILD_XPU_OPS"
)
list(APPEND VLLM_GPU_LINK_FLAGS
  "-fsycl"
  "-fsycl-targets=spir64"
)
list(APPEND VLLM_LINK_LIBRARIES
  "sycl"
  "OpenCL"
  "pthread"
  "m"
  "dl"
  "torch"
)

# CUTLASS compile flags, appended after the base flag above in the same order
# as before.
list(APPEND VLLM_GPU_FLAGS
  "-O3"
  "-DNDEBUG"
  "-gline-tables-only"
  "-fsycl"
  "-fsycl-targets=spir64_gen"
  "-ftemplate-backtrace-limit=10"
)

# CUTLASS link flags. The backend option string is deliberately a single
# quoted list element so it reaches the tool as one argument.
# NOTE(review): VLLM_GPU_LINK_FLAGS now carries both -fsycl-targets=spir64
# (base block) and -fsycl-targets=spir64_gen, and the device is hard-coded to
# bmg-g21-a0 — confirm both are intended.
list(APPEND VLLM_GPU_LINK_FLAGS
  "-fsycl"
  "-fsycl-targets=spir64_gen"
  -Xsycl-target-backend=spir64_gen
  "-device bmg-g21-a0 -internal_options -cl-intel-256-GRF-per-thread"
)
endif()

if(ONEDNN_FOUND)
Expand All @@ -305,6 +315,8 @@ define_gpu_extension_target(
ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
INCLUDE_DIRECTORIES ${CUTLASS_APP_INCLUDE_DIR}
INCLUDE_DIRECTORIES ${VLLM_INCLUDE_DIR}
USE_SABI 3
WITH_SOABI)

Expand Down
4 changes: 3 additions & 1 deletion csrc/core/registration.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once

#pragma push_macro("printf")
#undef printf
#include <Python.h>

#define _CONCAT(A, B) A##B
Expand Down Expand Up @@ -32,3 +33,4 @@
nullptr}; \
return PyModule_Create(&module); \
}
#pragma pop_macro("printf")
Loading
Loading