@@ -1871,6 +1871,68 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
         # task.evaluate(llm,
         #               extra_evaluator_kwargs=dict(apply_chat_template=True))

+    @skip_pre_blackwell
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size,moe_backend",
+        [
+            # Use a larger batch_size to speed up the tests
+            pytest.param(8,
+                         1,
+                         4,
+                         3,
+                         False,
+                         False,
+                         True,
+                         True,
+                         32,
+                         "CUTLASS",
+                         marks=pytest.mark.skip_less_mpi_world_size(8)),
+            pytest.param(4,
+                         1,
+                         1,
+                         0,
+                         True,
+                         True,
+                         True,
+                         True,
+                         16,
+                         "CUTLASS",
+                         marks=pytest.mark.skip_less_mpi_world_size(4)),
+        ],
+        ids=["latency", "throughput_tp4"])
+    def test_nvfp4_multi_gpus_chunked_prefill(self, tp_size, pp_size, ep_size,
+                                              mtp_nextn, fp8kv, attention_dp,
+                                              cuda_graph, overlap_scheduler,
+                                              max_batch_size, moe_backend):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.70)
+        pytorch_config = dict(
+            disable_overlap_scheduler=not overlap_scheduler,
+            cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
+            moe_config=MoeConfig(backend=moe_backend))
+
+        if fp8kv:
+            kv_cache_config.dtype = "fp8"
+
+        mtp_config = None
+        if mtp_nextn > 0:
+            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
+        with LLM(f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-FP4",
+                 max_batch_size=max_batch_size,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 **pytorch_config,
+                 enable_attention_dp=attention_dp,
+                 speculative_config=mtp_config,
+                 enable_chunked_prefill=True) as llm:
+
+            assert llm.args.moe_config.backend == moe_backend
+            assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
     @pytest.mark.skip_less_mpi_world_size(8)
     @skip_pre_hopper
     @pytest.mark.parametrize(
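Reviewer note: the hunk above is an otherwise standard LLM construction with `enable_chunked_prefill=True` flipped on. A minimal standalone sketch of that pattern follows, for exercising the path outside the accuracy harness. Assumptions not taken from this diff: the import paths (`tensorrt_llm`, `tensorrt_llm.llmapi`), the placeholder checkpoint path, and the shape of the object returned by `generate`.

    # Sketch only: drives chunked prefill the same way the new test does.
    from tensorrt_llm import LLM
    from tensorrt_llm.llmapi import (CudaGraphConfig, KvCacheConfig,
                                     MTPDecodingConfig)

    llm = LLM("/models/DeepSeek-R1/DeepSeek-R1-FP4",  # placeholder path
              tensor_parallel_size=8,
              moe_expert_parallel_size=4,
              kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.70),
              cuda_graph_config=CudaGraphConfig(),
              speculative_config=MTPDecodingConfig(num_nextn_predict_layers=3),
              enable_chunked_prefill=True)  # the option under test
    with llm:
        # Long prompts are split into chunks during prefill; accuracy should
        # match the enable_chunked_prefill=False path, which is what the
        # GSM8K evaluation in the test checks.
        print(llm.generate("What is 2 + 2?").outputs[0].text)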
@@ -1917,6 +1979,51 @@ def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)

+    @pytest.mark.skip_less_mpi_world_size(8)
+    @skip_pre_hopper
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size",
+        [(8, 1, 4, 3, False, False, True, True, 1),
+         (8, 1, 8, 0, True, True, True, True, 24)],
+        ids=["latency", "throughput"])
+    def test_fp8_blockscale_chunked_prefill(self, tp_size, pp_size, ep_size,
+                                            mtp_nextn, fp8kv, attention_dp,
+                                            cuda_graph, overlap_scheduler,
+                                            max_batch_size):
+        if get_sm_version() == 100:
+            moe_config = MoeConfig(backend="DEEPGEMM", max_num_tokens=16384)
+            kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
+        else:
+            moe_config = MoeConfig()
+            kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+
+        pytorch_config = dict(
+            disable_overlap_scheduler=not overlap_scheduler,
+            cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
+            moe_config=moe_config,
+        )
+
+        if fp8kv:
+            kv_cache_config.dtype = "fp8"
+
+        mtp_config = None
+        if mtp_nextn > 0:
+            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
+        with LLM(f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1",
+                 max_batch_size=max_batch_size,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 **pytorch_config,
+                 enable_attention_dp=attention_dp,
+                 speculative_config=mtp_config,
+                 enable_chunked_prefill=True) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
+
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+

     @pytest.mark.timeout(7200)
     @pytest.mark.skip_less_device_memory(100000)
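Reviewer note: the only branching added in this hunk is the SM-version gate at the top of the new test, which selects the DEEPGEMM MoE backend with a 16384-token cap and a smaller KV-cache share (0.6 vs 0.9) on SM 100. A hedged sketch of that gate as a standalone helper, assuming the same `get_sm_version`, `MoeConfig`, and `KvCacheConfig` names this file already imports:

    def fp8_blockscale_configs():
        # Mirrors the branch in test_fp8_blockscale_chunked_prefill; sketch
        # only, not part of the diff. Returns (moe_config, kv_cache_config).
        if get_sm_version() == 100:
            # SM 100: DEEPGEMM backend with a capped MoE token count and a
            # smaller KV-cache memory fraction.
            return (MoeConfig(backend="DEEPGEMM", max_num_tokens=16384),
                    KvCacheConfig(free_gpu_memory_fraction=0.6))
        # Other architectures: default MoE backend, larger KV-cache fraction.
        return MoeConfig(), KvCacheConfig(free_gpu_memory_fraction=0.9)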