@@ -427,7 +427,7 @@ def test_auto_spec_decode(self):
             CudaGraphConfig(batch_sizes=[1, 32, 64], enable_padding=True)
         }
         kv_cache_config = KvCacheConfig(enable_block_reuse=False,
-                                        free_gpu_memory_fraction=0.5)
+                                        free_gpu_memory_fraction=0.7)
         spec_config = AutoDecodingConfig()
         with LLM(model=self.MODEL_PATH,
                  **pytorch_config,
@@ -469,7 +469,7 @@ def test_auto_dtype_beam_search(self, enable_cuda_graph, enable_padding,
 
         with LLM(
                 model=self.MODEL_PATH,
-                kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.5),
+                kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.7),
                 max_batch_size=max_beam_width,
                 max_seq_len=2048,
                 max_beam_width=max_beam_width,
@@ -514,7 +514,7 @@ def test_fp8_beam_search(self, enable_cuda_graph, enable_padding,
 
         llm = LLM(
             model=model_path,
-            kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.5),
+            kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.7),
             max_batch_size=max_beam_width,
             max_seq_len=2048,
             max_beam_width=max_beam_width,
@@ -656,7 +656,7 @@ def test_fp8_eagle3_tp8(self, eagle3_one_model, torch_compile):
     @parametrize_with_ids("torch_compile", [False, True])
     def test_fp8_tp4(self, torch_compile):
         model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct-FP8"
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
         torch_compile_config = _get_default_torch_compile_config(torch_compile)
         with LLM(model_path,
                  tensor_parallel_size=4,
@@ -683,7 +683,7 @@ def test_fp8_tp4(self, torch_compile):
     @parametrize_with_ids("torch_compile", [False, True])
     def test_nvfp4_tp4(self, torch_compile):
         model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct-FP4"
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
         torch_compile_config = _get_default_torch_compile_config(torch_compile)
         with LLM(model_path,
                  tensor_parallel_size=4,
@@ -710,7 +710,7 @@ def test_nvfp4_tp4(self, torch_compile):
     @parametrize_with_ids("torch_compile", [False, True])
     def test_fp4_tp2pp2(self, enable_gemm_allreduce_fusion, torch_compile):
         model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct-FP4"
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
         torch_compile_config = _get_default_torch_compile_config(torch_compile)
 
         with (mock.patch.dict(
@@ -1052,7 +1052,7 @@ def test_auto_dtype(self):
         kv_cache_config = KvCacheConfig(
             enable_block_reuse=False,
             enable_partial_reuse=False,
-            free_gpu_memory_fraction=0.5,
+            free_gpu_memory_fraction=0.7,
         )
         # We use FlashInfer as the attention backend for Gemma3 VLM to support custom mask for images.
         # So, testing with it here.
@@ -1179,7 +1179,7 @@ def test_auto_dtype_vswa_without_reuse_low_memory_available(self):
             enable_block_reuse=False,
             enable_partial_reuse=False,
             max_attention_window=[512, 512, 512, 512, 512, 32768],
-            free_gpu_memory_fraction=0.1,
+            free_gpu_memory_fraction=0.6,
         )
 
         with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
@@ -1256,7 +1256,7 @@ def test_auto_dtype_vswa_reuse_low_memory_available_no_partial_reuse(self):
             enable_block_reuse=True,
             enable_partial_reuse=False,
             max_attention_window=[512, 512, 512, 512, 512, 32768],
-            free_gpu_memory_fraction=0.1,
+            free_gpu_memory_fraction=0.6,
         )
 
         with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
@@ -1271,7 +1271,7 @@ def test_auto_dtype_vswa_reuse_low_memory_available_partial_reuse(self):
             enable_block_reuse=True,
             enable_partial_reuse=True,
             max_attention_window=[512, 512, 512, 512, 512, 32768],
-            free_gpu_memory_fraction=0.1,
+            free_gpu_memory_fraction=0.6,
         )
 
         with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
@@ -1399,7 +1399,7 @@ def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
 
     @pytest.mark.skip_less_device_memory(60000)
     def test_bfloat16_2_model_mtp(self):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
         pytorch_config = dict(
             disable_overlap_scheduler=True,
             cuda_graph_config=CudaGraphConfig(),
@@ -4178,7 +4178,7 @@ def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
             moe_config=MoeConfig(backend=moe_backend))
 
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         enable_block_reuse=not eagle3)
         spec_config = None
         if eagle3:
@@ -4226,7 +4226,7 @@ def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
             moe_config=MoeConfig(backend=moe_backend))
 
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         enable_block_reuse=not eagle3)
         spec_config = None
         if eagle3:
@@ -4482,7 +4482,7 @@ def test_w4_1gpu(self, kv_cache_dtype, moe_backend, cuda_graph,
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
 
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype=kv_cache_dtype)
 
         llm = LLM(self.MODEL_PATH,
@@ -4660,7 +4660,7 @@ def test_w4a16(self, kv_cache_dtype, tp_size, pp_size, ep_size,
                           {"scores_filter": "exact_match,flexible-extract"})
         monkeypatch.setenv("OVERRIDE_QUANT_ALGO", "W4A16_MXFP4")
 
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype=kv_cache_dtype)
 
         llm = LLM(self.MODEL_PATH,
@@ -4702,7 +4702,7 @@ def test_w4_2gpus(self, kv_cache_dtype, moe_backend, tp_size, pp_size,
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
 
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype=kv_cache_dtype)
 
         llm = LLM(self.MODEL_PATH,
@@ -4740,7 +4740,7 @@ def test_w4_2gpus_nvfp4(self, tp_size, pp_size, ep_size, attention_dp,
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
 
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype="auto")
 
         llm = LLM("./nvfp4ckpt",
@@ -4858,7 +4858,7 @@ def test_eagle3_4gpus(self, moe_backend, one_model, overlap_scheduler,
         # https://nvbugs/5590408: 2-Model overlap scheduling has accuracy issue
         pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler,
                               cuda_graph_config=CudaGraphConfig())
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype="auto")
 
         eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3"
@@ -4922,7 +4922,7 @@ def test_eagle3_vswa_reuse_4gpus(self, one_model, mocker):
         mocker.patch.object(GPQADiamond, "MAX_INPUT_LEN", MAX_INPUT_LEN)
 
         pytorch_config = dict(cuda_graph_config=CudaGraphConfig())
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype="auto",
                                         enable_block_reuse=True,
                                         max_attention_window=[128, 32768])
@@ -4988,7 +4988,7 @@ def test_eagle3_guided_decoding_4gpus(self, one_model, mocker):
         mocker.patch.object(GPQADiamond, "MAX_INPUT_LEN", MAX_INPUT_LEN)
 
         pytorch_config = dict(cuda_graph_config=CudaGraphConfig())
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype="auto")
 
         eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3"
@@ -5044,7 +5044,7 @@ def test_eagle3_2gpus(self, moe_backend, one_model, overlap_scheduler,
             max_batch_size=8,
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig(max_batch_size=8))
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype="auto")
 
         eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3"
@@ -5112,7 +5112,7 @@ def test_w4_4gpus_online_eplb(self, kv_cache_dtype, enable_configurable_moe,
         mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
                           {"scores_filter": "exact_match,flexible-extract"})
 
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype=kv_cache_dtype)
 
         # Configure online expert parallel load balancer
@@ -5160,7 +5160,7 @@ class TestQwQ_32B(LlmapiAccuracyTestHarness):
     @pytest.mark.skip_less_device_memory(80000)
     @pytest.mark.skip_less_device(4)
     def test_auto_dtype_tp4(self):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
 
         with LLM(self.MODEL_PATH,
                  max_num_tokens=16384,
@@ -5326,7 +5326,7 @@ def test_fp8_8gpus(self):
             pytest.skip(f"Model directory {model_dir} does not exist")
 
         # Configure model settings
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         enable_block_reuse=True,
                                         enable_partial_reuse=False,
                                         dtype="fp8")
@@ -5527,7 +5527,7 @@ def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
             moe_config=MoeConfig(backend=moe_backend))
 
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         enable_block_reuse=not eagle3)
         spec_config = None
         if eagle3:
@@ -5578,7 +5578,7 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
             moe_config=MoeConfig(backend=moe_backend))
 
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         enable_block_reuse=not eagle3)
         spec_config = None
         if eagle3:
@@ -5716,7 +5716,7 @@ def test_fp8_4gpus(self, attention_dp):
             kv_cache_config=KvCacheConfig(
                 enable_block_reuse=False,
                 mamba_ssm_cache_dtype="float16",
-                free_gpu_memory_fraction=0.5,
+                free_gpu_memory_fraction=0.7,
             ),
             max_batch_size=32,
             tensor_parallel_size=4,
@@ -5756,7 +5756,7 @@ def test_nvfp4_8gpus(self, attention_dp):
             kv_cache_config=KvCacheConfig(
                 enable_block_reuse=False,
                 mamba_ssm_cache_dtype="float16",
-                free_gpu_memory_fraction=0.5,
+                free_gpu_memory_fraction=0.7,
             ),
             max_batch_size=32,
             tensor_parallel_size=8,
@@ -5791,7 +5791,7 @@ def test_nvfp4_8gpus_mtp(self):
             kv_cache_config=KvCacheConfig(
                 enable_block_reuse=False,
                 mamba_ssm_cache_dtype="float16",
-                free_gpu_memory_fraction=0.5,
+                free_gpu_memory_fraction=0.7,
             ),
             max_batch_size=128,
             tensor_parallel_size=8,
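
For reference, a minimal sketch of the configuration pattern every hunk above touches, assuming the tensorrt_llm LLM API as exercised by these tests; the model path and prompt are placeholders, not values from the diff:

```python
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

# free_gpu_memory_fraction controls how much of the remaining GPU memory
# is reserved for the KV cache after model weights are loaded. The hunks
# above raise it (e.g. 0.4/0.5 -> 0.7) so the tests allocate a larger cache.
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                enable_block_reuse=False)

# "/path/to/model" is a placeholder; the tests resolve real checkpoint
# directories via llm_models_root().
with LLM(model="/path/to/model", kv_cache_config=kv_cache_config) as llm:
    outputs = llm.generate(["The capital of France is"])
    print(outputs[0].outputs[0].text)
```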