@@ -1947,6 +1947,7 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
        # task.evaluate(llm,
        #               extra_evaluator_kwargs=dict(apply_chat_template=True))

+<<<<<<< HEAD
    @skip_pre_blackwell
    @pytest.mark.parametrize(
        "tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size,moe_backend",
@@ -2006,6 +2007,41 @@ def test_nvfp4_multi_gpus_chunked_prefill(self, tp_size, pp_size, ep_size,
        assert llm.args.moe_config.backend == moe_backend
        assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4

+=======
+    def test_nvfp4_multi_gpus_corner_case(self):
+        """
+        Tests a corner case of the NVFP4 model.
+        When max_seq_len and max_num_tokens are set to the same value, there are
+        not enough KV blocks for the dummy requests during CUDA graph warmup when
+        the py_executor is created before the KV cache is estimated. CUDA graph
+        capture is then triggered during KV cache estimation, which may cause errors.
+        More info in https://nvbugs/5485325.
+        """
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.80,
+                                        dtype="fp8",
+                                        enable_block_reuse=False)
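+        # Enable padded CUDA graph capture with a large max batch size and the
+        # TRTLLM MoE backend.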
+        pytorch_config = dict(disable_overlap_scheduler=False,
+                              cuda_graph_config=CudaGraphConfig(
+                                  enable_padding=True, max_batch_size=1024),
+                              moe_config=MoeConfig(backend="TRTLLM"))
+
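+        # MTP speculative decoding with one next-n predict layer; the warmup
+        # issue is triggered when speculative decoding is enabled.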
+        mtp_config = MTPDecodingConfig(num_nextn_predict_layers=1)
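+        # max_seq_len == max_num_tokens reproduces the KV-block shortage during
+        # CUDA graph warmup described in the docstring (https://nvbugs/5485325).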
+        with LLM(f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-FP4",
+                 tensor_parallel_size=8,
+                 pipeline_parallel_size=1,
+                 moe_expert_parallel_size=8,
+                 kv_cache_config=kv_cache_config,
+                 **pytorch_config,
+                 enable_attention_dp=False,
+                 speculative_config=mtp_config,
+                 max_seq_len=5120,
+                 max_num_tokens=5120) as llm:
+
+            assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+>>>>>>> 777679303 ([https://nvbugs/5485325][fix] Add a postprocess to the model engine to fix the CUDA graph warmup issue when using speculative decoding (#7373))
        task = GSM8K(self.MODEL_NAME)
        task.evaluate(llm)