
Commit 5a6133d

add chunked_prefill + mtp cases on deepseekr1
Signed-off-by: Ivy Zhang <[email protected]>
1 parent 347dd9d commit 5a6133d

File tree

3 files changed, +116 −0 lines changed
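
This commit adds two GSM8K accuracy tests that exercise chunked prefill together with MTP (multi-token prediction) speculative decoding on DeepSeek-R1: one against the NVFP4 checkpoint and one against the FP8 block-scale checkpoint. Both follow the same configuration pattern, sketched below as a minimal illustration distilled from the diff; the model path, parallel sizes, and import locations are assumptions, not copied verbatim from the test module.

# Minimal sketch, not the test itself: chunked prefill + MTP speculative
# decoding via the LLM API. The path, sizes, and import locations are
# illustrative assumptions.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig, MTPDecodingConfig

kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.70)
# Three MTP next-token-prediction layers, as in the "latency" case below.
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3)

with LLM("/models/DeepSeek-R1/DeepSeek-R1-FP4",  # illustrative path
         tensor_parallel_size=8,
         moe_expert_parallel_size=4,
         kv_cache_config=kv_cache_config,
         speculative_config=mtp_config,
         enable_chunked_prefill=True) as llm:  # the feature under test
    out = llm.generate(["What is 12 * 7?"])
    print(out[0].outputs[0].text)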

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 107 additions & 0 deletions
@@ -1871,6 +1871,68 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
         # task.evaluate(llm,
         #               extra_evaluator_kwargs=dict(apply_chat_template=True))
 
+    @skip_pre_blackwell
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size,moe_backend",
+        [
+            # Use a larger batch_size to speed up the tests
+            pytest.param(8,
+                         1,
+                         4,
+                         3,
+                         False,
+                         False,
+                         True,
+                         True,
+                         32,
+                         "CUTLASS",
+                         marks=pytest.mark.skip_less_mpi_world_size(8)),
+            pytest.param(4,
+                         1,
+                         1,
+                         0,
+                         True,
+                         True,
+                         True,
+                         True,
+                         16,
+                         "CUTLASS",
+                         marks=pytest.mark.skip_less_mpi_world_size(4)),
+        ],
+        ids=["latency", "throughput_tp4"])
+    def test_nvfp4_multi_gpus_chunked_prefill(self, tp_size, pp_size, ep_size,
+                                              mtp_nextn, fp8kv, attention_dp,
+                                              cuda_graph, overlap_scheduler,
+                                              max_batch_size, moe_backend):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.70)
+        pytorch_config = dict(
+            disable_overlap_scheduler=not overlap_scheduler,
+            cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
+            moe_config=MoeConfig(backend=moe_backend))
+
+        if fp8kv:
+            kv_cache_config.dtype = "fp8"
+
+        mtp_config = None
+        if mtp_nextn > 0:
+            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
+        with LLM(f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-FP4",
+                 max_batch_size=max_batch_size,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 **pytorch_config,
+                 enable_attention_dp=attention_dp,
+                 speculative_config=mtp_config,
+                 enable_chunked_prefill=True) as llm:
+
+            assert llm.args.moe_config.backend == moe_backend
+            assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
     @pytest.mark.skip_less_mpi_world_size(8)
     @skip_pre_hopper
     @pytest.mark.parametrize(
@@ -1917,6 +1979,51 @@ def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
         task = GSM8K(self.MODEL_NAME)
         task.evaluate(llm)
 
+    @pytest.mark.skip_less_mpi_world_size(8)
+    @skip_pre_hopper
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size",
+        [(8, 1, 4, 3, False, False, True, True, 1),
+         (8, 1, 8, 0, True, True, True, True, 24)],
+        ids=["latency", "throughput"])
+    def test_fp8_blockscale_chunked_prefill(self, tp_size, pp_size, ep_size,
+                                            mtp_nextn, fp8kv, attention_dp,
+                                            cuda_graph, overlap_scheduler,
+                                            max_batch_size):
+        if get_sm_version() == 100:
+            moe_config = MoeConfig(backend="DEEPGEMM", max_num_tokens=16384)
+            kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
+        else:
+            moe_config = MoeConfig()
+            kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+
+        pytorch_config = dict(
+            disable_overlap_scheduler=not overlap_scheduler,
+            cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
+            moe_config=moe_config,
+        )
+
+        if fp8kv:
+            kv_cache_config.dtype = "fp8"
+
+        mtp_config = None
+        if mtp_nextn > 0:
+            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
+        with LLM(f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1",
+                 max_batch_size=max_batch_size,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 **pytorch_config,
+                 enable_attention_dp=attention_dp,
+                 speculative_config=mtp_config,
+                 enable_chunked_prefill=True) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
+
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
 
     @pytest.mark.timeout(7200)
     @pytest.mark.skip_less_device_memory(100000)
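
A note on the SM-version branch in the FP8 block-scale test: get_sm_version() == 100 corresponds to Blackwell (compute capability 10.0). On that architecture the test pins the DEEPGEMM MoE backend with max_num_tokens=16384 and uses a smaller KV-cache fraction (0.6 versus 0.9 elsewhere), presumably to leave memory headroom for that backend's workspace alongside chunked prefill.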

tests/integration/test_lists/qa/llm_function_full.txt

Lines changed: 4 additions & 0 deletions
@@ -527,7 +527,11 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp4]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_chunked_prefill[latency]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_chunked_prefill[throughput_tp4]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale_chunked_prefill[latency]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale_chunked_prefill[throughput]
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=True]

tests/integration/test_lists/qa/llm_function_sanity.txt

Lines changed: 5 additions & 0 deletions
@@ -35,6 +35,11 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp4]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_chunked_prefill[latency]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_chunked_prefill[throughput_tp4]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale_chunked_prefill[latency]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale_chunked_prefill[throughput]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=0]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]
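
The bracketed suffixes in these list entries ([latency], [throughput_tp4], and so on) come directly from the ids= arguments of the @pytest.mark.parametrize decorators above, so each line addresses exactly one parametrized case. For example, accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_chunked_prefill[latency] selects the first parameter set: the 8-GPU CUTLASS configuration with mtp_nextn=3.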
