Commit 83eed6e

Add collective communication kernels (#3163)
* comm abstraction
* add custom
* fused rms norm
* refactor
* push-based kernel
* optimize for small hidden dims
* integration
* clean up
* export options & fix things
* allgather2d & VMM allocation
* optimize allgather2d
* remove obsolete comm utils
* handle non-multi-gpu build
* fix lint
* fix lint
* avoid using mscclpp repo (some deps are not needed)
* fix lint
* fix nccl version & clean up deps
* fix lint
* custom -> native
* rename
* fix p-lora
* fix lm head
* log fatal exception explicitly
* fix memory location
* fix lora buff size
* pad max_fwd_token_num
* fix allocation for `context_logits_buf_`
* simplify
1 parent c3ecd10 commit 83eed6e
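The option surfaced by this commit is `communicator` on `TurbomindEngineConfig` (choices `nccl` and `native`), wired through the CLI and benchmark scripts in the diffs below. A minimal, hedged usage sketch; the model id is a placeholder and not part of this commit:

```python
# Sketch: choose the multi-GPU communication backend for the TurboMind engine.
# 'nccl' keeps the NCCL path; 'native' selects the in-house kernels added by this commit.
from lmdeploy import pipeline, TurbomindEngineConfig

engine_config = TurbomindEngineConfig(
    tp=2,                   # tensor parallelism over two GPUs
    communicator='native',  # default is 'nccl'
)

# Placeholder model id, for illustration only.
pipe = pipeline('internlm/internlm2_5-7b-chat', backend_config=engine_config)
print(pipe(['Hello, TurboMind!']))
```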

62 files changed: +3612 -2019 lines

CMakeLists.txt

Lines changed: 10 additions & 213 deletions
@@ -17,28 +17,16 @@ project(TurboMind LANGUAGES CXX CUDA)

 find_package(CUDA 10.2 REQUIRED)

+find_package(CUDAToolkit REQUIRED)
+
 if(${CUDA_VERSION_MAJOR} VERSION_GREATER_EQUAL "11")
 add_definitions("-DENABLE_BF16")
 message("CUDA_VERSION ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} is greater or equal than 11.0, enable -DENABLE_BF16 flag")
 endif()

-# if((${CUDA_VERSION_MAJOR} VERSION_GREATER_EQUAL "11" AND ${CUDA_VERSION_MINOR} VERSION_GREATER_EQUAL "8") OR (${CUDA_VERSION_MAJOR} VERSION_GREATER_EQUAL "12"))
-# add_definitions("-DENABLE_FP8")
-# option(ENABLE_FP8 "ENABLE_FP8" OFF)
-# if(ENABLE_FP8)
-# message("CUDA_VERSION ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} is greater or equal than 11.8, enable -DENABLE_FP8 flag")
-# endif()
-# endif()
-
 set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)

-option(BUILD_PYT "Build in PyTorch TorchScript class mode" OFF)
-if(NOT BUILD_MULTI_GPU)
-option(BUILD_MULTI_GPU "Build project about multi-GPU" OFF)
-endif()
-if(NOT USE_TRITONSERVER_DATATYPE)
-option(USE_TRITONSERVER_DATATYPE "Build triton backend for triton server" OFF)
-endif()
+option(BUILD_MULTI_GPU "Build multi-gpu support" ON)
 option(BUILD_PY_FFI "Build python ffi" ON)
 option(BUILD_TEST "Build tests" OFF)
@@ -89,43 +77,24 @@ if (LMDEPLOY_UBSAN_ENABLE)
 endif ()

 if(BUILD_MULTI_GPU)
-message(STATUS "Add DBUILD_MULTI_GPU, requires MPI and NCCL")
-add_definitions("-DBUILD_MULTI_GPU")
-set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
-find_package(MPI REQUIRED)
-find_package(NCCL REQUIRED)
-set(CMAKE_MODULE_PATH "") # prevent the bugs for pytorch building
+add_definitions("-DBUILD_MULTI_GPU=1")
+set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
+find_package(NCCL)
+if (NCCL_FOUND)
+set(USE_NCCL ON)
+add_definitions("-DUSE_NCCL=1")
+endif ()
 endif()

-if(BUILD_PYT)
-if(DEFINED ENV{NVIDIA_PYTORCH_VERSION})
-if($ENV{NVIDIA_PYTORCH_VERSION} VERSION_LESS "20.03")
-message(FATAL_ERROR "NVIDIA PyTorch image is too old for TorchScript mode.")
-endif()
-if($ENV{NVIDIA_PYTORCH_VERSION} VERSION_EQUAL "20.03")
-add_definitions(-DLEGACY_THS=1)
-endif()
-endif()
-endif()
-
-if(USE_TRITONSERVER_DATATYPE)
-message("-- USE_TRITONSERVER_DATATYPE")
-add_definitions("-DUSE_TRITONSERVER_DATATYPE")
-endif()

 set(CXX_STD "17" CACHE STRING "C++ standard")
 # enable gold linker for binary and .so
 set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=gold")
 set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=gold")
 set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR})

-set(TF_PATH "" CACHE STRING "TensorFlow path")
 set(CUSPARSELT_PATH "" CACHE STRING "cuSPARSELt path")

-if((BUILD_TF OR BUILD_TF2) AND NOT TF_PATH)
-message(FATAL_ERROR "TF_PATH must be set if BUILD_TF or BUILD_TF2 (=TensorFlow mode) is on.")
-endif()
-
 list(APPEND CMAKE_MODULE_PATH ${CUDA_PATH}/lib64)

 # profiling
@@ -204,64 +173,8 @@ if (SPARSITY_SUPPORT)
 add_definitions(-DSPARSITY_ENABLED=1)
 endif()

-if(BUILD_TF)
-list(APPEND COMMON_HEADER_DIRS ${TF_PATH}/include)
-list(APPEND COMMON_LIB_DIRS ${TF_PATH})
-add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)
-endif()
-
-if(BUILD_TF2)
-list(APPEND COMMON_HEADER_DIRS ${TF_PATH}/include)
-list(APPEND COMMON_LIB_DIRS ${TF_PATH})
-add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1)
-endif()

 set(PYTHON_PATH "python" CACHE STRING "Python path")
-if(BUILD_PYT)
-execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import torch; print(torch.__version__,end='');"
-RESULT_VARIABLE _PYTHON_SUCCESS
-OUTPUT_VARIABLE TORCH_VERSION)
-if (TORCH_VERSION VERSION_LESS "1.5.0")
-message(FATAL_ERROR "PyTorch >= 1.5.0 is needed for TorchScript mode.")
-endif()
-execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import os; import torch;
-print(os.path.dirname(torch.__file__),end='');"
-RESULT_VARIABLE _PYTHON_SUCCESS
-OUTPUT_VARIABLE TORCH_DIR)
-if (NOT _PYTHON_SUCCESS MATCHES 0)
-message(FATAL_ERROR "Torch config Error.")
-endif()
-list(APPEND CMAKE_PREFIX_PATH ${TORCH_DIR})
-find_package(Torch REQUIRED)
-execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; from distutils import sysconfig;
-print(sysconfig.get_python_inc());"
-RESULT_VARIABLE _PYTHON_SUCCESS
-OUTPUT_VARIABLE PY_INCLUDE_DIR)
-if (NOT _PYTHON_SUCCESS MATCHES 0)
-message(FATAL_ERROR "Python config Error.")
-endif()
-list(APPEND COMMON_HEADER_DIRS ${PY_INCLUDE_DIR})
-execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import torch;
-print(torch._C._GLIBCXX_USE_CXX11_ABI,end='');"
-RESULT_VARIABLE _PYTHON_SUCCESS
-OUTPUT_VARIABLE USE_CXX11_ABI)
-message("-- USE_CXX11_ABI=${USE_CXX11_ABI}")
-if (USE_CXX11_ABI)
-set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "${CMAKE_CUDA_FLAGS_RELWITHDEBINFO} -D_GLIBCXX_USE_CXX11_ABI=1")
-set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -D_GLIBCXX_USE_CXX11_ABI=1")
-set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -D_GLIBCXX_USE_CXX11_ABI=1")
-set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -D_GLIBCXX_USE_CXX11_ABI=1")
-set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -D_GLIBCXX_USE_CXX11_ABI=1")
-set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -D_GLIBCXX_USE_CXX11_ABI=1")
-else()
-set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "${CMAKE_CUDA_FLAGS_RELWITHDEBINFO} -D_GLIBCXX_USE_CXX11_ABI=0")
-set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -D_GLIBCXX_USE_CXX11_ABI=0")
-set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -D_GLIBCXX_USE_CXX11_ABI=0")
-set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -D_GLIBCXX_USE_CXX11_ABI=0")
-set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -D_GLIBCXX_USE_CXX11_ABI=0")
-set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -D_GLIBCXX_USE_CXX11_ABI=0")
-endif()
-endif()

 # turn off warnings on windows
 if (MSVC)
@@ -286,14 +199,6 @@ if (MSVC)
 endforeach()
 endif()

-if (BUILD_MULTI_GPU)
-list(APPEND COMMON_HEADER_DIRS ${MPI_INCLUDE_PATH})
-endif()
-
-if(USE_TRITONSERVER_DATATYPE)
-list(APPEND COMMON_HEADER_DIRS ${PROJECT_SOURCE_DIR}/../repo-core-src/include)
-endif()
-
 include_directories(
 ${COMMON_HEADER_DIRS}
 )
@@ -314,111 +219,3 @@ endif()
 if (BUILD_PY_FFI)
 install(TARGETS _turbomind DESTINATION ${CMAKE_SOURCE_DIR}/lmdeploy/lib)
 endif ()
-
-if (MSVC)
-return()
-endif ()
-
-# # Mesaure the compile time
-option(MEASURE_BUILD_TIME "Measure the build time of each module" OFF)
-if (MEASURE_BUILD_TIME)
-set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CMAKE_COMMAND} -E time")
-set_property(GLOBAL PROPERTY RULE_LAUNCH_CUSTOM "${CMAKE_COMMAND} -E time")
-set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK "${CMAKE_COMMAND} -E time")
-endif()
-
-########################################
-
-add_library(transformer-shared SHARED
-$<TARGET_OBJECTS:DynamicDecodeLayer>
-$<TARGET_OBJECTS:Llama>
-$<TARGET_OBJECTS:LlamaTritonBackend>
-$<TARGET_OBJECTS:TransformerTritonBackend>
-$<TARGET_OBJECTS:activation_kernels>
-$<TARGET_OBJECTS:ban_bad_words>
-$<TARGET_OBJECTS:cublasAlgoMap>
-$<TARGET_OBJECTS:cublasMMWrapper>
-$<TARGET_OBJECTS:cuda_utils>
-$<TARGET_OBJECTS:custom_ar_comm>
-$<TARGET_OBJECTS:custom_ar_kernels>
-$<TARGET_OBJECTS:attention>
-$<TARGET_OBJECTS:decoding_kernels>
-$<TARGET_OBJECTS:gpt_kernels>
-$<TARGET_OBJECTS:logprob_kernels>
-$<TARGET_OBJECTS:logger>
-$<TARGET_OBJECTS:memory_utils>
-$<TARGET_OBJECTS:mpi_utils>
-$<TARGET_OBJECTS:nccl_utils>
-$<TARGET_OBJECTS:nvtx_utils>
-$<TARGET_OBJECTS:anomaly_handler>
-$<TARGET_OBJECTS:sampling_penalty_kernels>
-$<TARGET_OBJECTS:sampling_topk_kernels>
-$<TARGET_OBJECTS:sampling_topp_kernels>
-$<TARGET_OBJECTS:stop_criteria>
-$<TARGET_OBJECTS:tensor>
-$<TARGET_OBJECTS:unfused_attention_kernels>
-)
-
-if (BUILD_MULTI_GPU)
-target_link_libraries(transformer-shared PUBLIC
-${MPI_CXX_LIBRARIES}
-${NCCL_LIBRARIES}
-)
-endif()
-
-if(USE_NVTX)
-target_link_libraries(transformer-shared PUBLIC
--lnvToolsExt
-)
-endif()
-
-set_target_properties(transformer-shared PROPERTIES POSITION_INDEPENDENT_CODE ON)
-set_target_properties(transformer-shared PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-set_target_properties(transformer-shared PROPERTIES LINKER_LANGUAGE CXX)
-target_link_libraries(transformer-shared PUBLIC -lcudart -lcublas -lcublasLt -lcurand)
-
-include(GNUInstallDirs)
-set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TurboMind)
-
-include(CMakePackageConfigHelpers)
-configure_package_config_file(
-${CMAKE_CURRENT_LIST_DIR}/cmake/TurboMindConfig.cmake.in
-${CMAKE_CURRENT_BINARY_DIR}/TurboMindConfig.cmake
-INSTALL_DESTINATION ${INSTALL_CONFIGDIR}
-)
-
-install(
-FILES
-${CMAKE_CURRENT_BINARY_DIR}/TurboMindConfig.cmake
-DESTINATION ${INSTALL_CONFIGDIR}
-)
-
-install(
-TARGETS
-transformer-shared
-EXPORT
-transformer-shared-targets
-LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/turbomind
-ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/turbomind
-RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin
-)
-
-install(
-EXPORT
-transformer-shared-targets
-FILE
-TurboMindTargets.cmake
-DESTINATION
-${INSTALL_CONFIGDIR}
-)
-
-export(
-EXPORT
-transformer-shared-targets
-FILE
-${CMAKE_CURRENT_BINARY_DIR}/TurboMindTargets.cmake
-NAMESPACE
-TritonCore::
-)
-
-export(PACKAGE TurboMind)

benchmark/profile_pipeline_api.py

Lines changed: 2 additions & 0 deletions
@@ -167,6 +167,7 @@ def parse_args():
 ArgumentHelper.quant_policy(tb_group, default=0)
 ArgumentHelper.num_tokens_per_iter(tb_group)
 ArgumentHelper.max_prefill_iters(tb_group)
+ArgumentHelper.communicator(tb_group)

 args = parser.parse_args()
 return args
@@ -188,6 +189,7 @@ def main():
 num_tokens_per_iter=args.num_tokens_per_iter,
 max_prefill_iters=args.max_prefill_iters,
 enable_prefix_caching=args.enable_prefix_caching,
+communicator=args.communicator,
 )
 elif args.backend == 'pytorch':
 engine_config = PytorchEngineConfig(

benchmark/profile_throughput.py

Lines changed: 2 additions & 0 deletions
@@ -221,6 +221,7 @@ def parse_args():
 ArgumentHelper.model_format(tb_group, default='hf')
 ArgumentHelper.num_tokens_per_iter(tb_group)
 ArgumentHelper.max_prefill_iters(tb_group)
+ArgumentHelper.communicator(tb_group)

 args = parser.parse_args()
 return args
@@ -242,6 +243,7 @@ def main():
 max_prefill_iters=args.max_prefill_iters,
 enable_prefix_caching=args.enable_prefix_caching,
 dtype=args.dtype,
+communicator=args.communicator,
 )
 elif args.backend == 'pytorch':
 engine_config = PytorchEngineConfig(

lmdeploy/cli/serve.py

Lines changed: 6 additions & 2 deletions
@@ -76,6 +76,7 @@ def add_parser_gradio():
 ArgumentHelper.model_format(tb_group)
 ArgumentHelper.quant_policy(tb_group)
 ArgumentHelper.rope_scaling_factor(tb_group)
+ArgumentHelper.communicator(tb_group)

 @staticmethod
 def add_parser_api_server():
@@ -173,6 +174,7 @@ def add_parser_api_server():
 ArgumentHelper.rope_scaling_factor(tb_group)
 ArgumentHelper.num_tokens_per_iter(tb_group)
 ArgumentHelper.max_prefill_iters(tb_group)
+ArgumentHelper.communicator(tb_group)

 # vlm args
 vision_group = parser.add_argument_group('Vision model arguments')
@@ -255,7 +257,8 @@ def gradio(args):
 cache_max_entry_count=args.cache_max_entry_count,
 cache_block_seq_len=args.cache_block_seq_len,
 enable_prefix_caching=args.enable_prefix_caching,
-max_prefill_token_num=args.max_prefill_token_num)
+max_prefill_token_num=args.max_prefill_token_num,
+communicator=args.communicator)
 chat_template_config = get_chat_template(args.chat_template)
 run(args.model_path_or_server,
 server_name=args.server_name,
@@ -305,7 +308,8 @@ def api_server(args):
 cache_max_entry_count=args.cache_max_entry_count,
 cache_block_seq_len=args.cache_block_seq_len,
 enable_prefix_caching=args.enable_prefix_caching,
-max_prefill_token_num=args.max_prefill_token_num)
+max_prefill_token_num=args.max_prefill_token_num,
+communicator=args.communicator)
 chat_template_config = get_chat_template(args.chat_template)

 from lmdeploy.messages import VisionConfig

lmdeploy/cli/utils.py

Lines changed: 8 additions & 0 deletions
@@ -463,3 +463,11 @@ def eager_mode(parser):
 default=False,
 help='Whether to enable eager mode. '
 'If True, cuda graph would be disabled')
+
+@staticmethod
+def communicator(parser):
+return parser.add_argument('--communicator',
+type=str,
+default='nccl',
+choices=['nccl', 'native'],
+help='Communication backend for multi-GPU inference')
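A hedged sketch of how this helper is expected to be used, mirroring the serve.py changes above (the parser and group names here are illustrative, not from this commit):

```python
# Sketch: registering the new --communicator flag on an argparse group via ArgumentHelper.
import argparse

from lmdeploy.cli.utils import ArgumentHelper  # import path assumed from this file's location

parser = argparse.ArgumentParser('demo')
tb_group = parser.add_argument_group('TurboMind engine arguments')
ArgumentHelper.communicator(tb_group)  # adds --communicator {nccl,native}, default 'nccl'

args = parser.parse_args(['--communicator', 'native'])
assert args.communicator == 'native'
```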

lmdeploy/messages.py

Lines changed: 1 addition & 0 deletions
@@ -223,6 +223,7 @@ class TurbomindEngineConfig:
 max_prefill_token_num: int = 8192
 num_tokens_per_iter: int = 0
 max_prefill_iters: int = 1
+communicator: str = 'nccl'

 def __post_init__(self):
 """Check input validation."""

lmdeploy/turbomind/chat.py

Lines changed: 3 additions & 1 deletion
@@ -69,6 +69,7 @@ def main(model_path: str,
 stream_output: bool = True,
 request_output_len: int = 1024,
 chat_template_config: ChatTemplateConfig = None,
+communicator: str = 'nccl',
 **kwargs):
 """An example to perform model inference through the command line
 interface.
@@ -130,7 +131,8 @@ def main(model_path: str,
 quant_policy=quant_policy,
 rope_scaling_factor=rope_scaling_factor,
 dtype=dtype,
-tp=tp)
+tp=tp,
+communicator=communicator)
 print('engine_cfg:\n', engine_cfg, sep='', flush=True)
 tokenizer = Tokenizer(model_path)
 from lmdeploy import turbomind as tm
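Because `main` forwards the new keyword into `TurbomindEngineConfig`, the chat entry point can also be driven programmatically. A hedged sketch; the model path is a placeholder and the call enters the interactive chat loop:

```python
# Sketch: selecting the communication backend when invoking the TurboMind chat CLI entry point.
from lmdeploy.turbomind.chat import main

main('/path/to/hf/model',       # placeholder path, not part of this commit
     tp=2,                      # multi-GPU inference, where the communicator matters
     communicator='native')     # or 'nccl' (default)
```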
