@@ -3696,10 +3696,14 @@ class TestQwen2_7BInstruct(LlmapiAccuracyTestHarness):
36963696 )
36973697
def test_auto_dtype(self):
    """Accuracy smoke test (CNN/DailyMail) using the checkpoint's native dtype.

    Caps the KV cache at 60% of free GPU memory so this test can coexist
    with other GPU consumers without OOM-ing.
    """
    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
    # NOTE(review): kv_cache_config is an LLM-constructor argument — the
    # sibling Qwen3 tests in this same change pass it to LLM(...) — whereas
    # task.evaluate() has no kv_cache_config parameter, so passing it there
    # would fail. Moved onto the LLM constructor.
    with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
        task = CnnDailymail(self.MODEL_NAME)
        task.evaluate(llm,
                      extra_evaluator_kwargs=self.EXTRA_EVALUATOR_KWARGS)
37033707
37043708
37053709class TestQwen3_4B (LlmapiAccuracyTestHarness ):
@@ -3744,11 +3748,12 @@ def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp,
37443748 pytorch_config = dict (
37453749 disable_overlap_scheduler = not overlap_scheduler ,
37463750 cuda_graph_config = CudaGraphConfig () if cuda_graph else None )
3747-
3751+ kv_cache_config = KvCacheConfig ( free_gpu_memory_fraction = 0.5 )
37483752 with LLM (f"{ llm_models_root ()} /Qwen3/Qwen3-8B-FP8" ,
37493753 tensor_parallel_size = tp_size ,
37503754 pipeline_parallel_size = pp_size ,
37513755 moe_expert_parallel_size = ep_size ,
3756+ kv_cache_config = kv_cache_config ,
37523757 ** pytorch_config ,
37533758 enable_attention_dp = attention_dp ,
37543759 max_batch_size = 64 ) as llm :
@@ -3759,9 +3764,11 @@ def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp,
37593764
37603765 @skip_pre_hopper
37613766 def test_dummy_load_format (self ):
3767+ kv_cache_config = KvCacheConfig (free_gpu_memory_fraction = 0.5 )
37623768 llm = LLM (
37633769 f"{ llm_models_root ()} /Qwen3/Qwen3-8B-FP8" ,
37643770 load_format = "dummy" ,
3771+ kv_cache_config = kv_cache_config ,
37653772 )
37663773 with llm :
37673774 task = MMLU (self .MODEL_NAME )
@@ -3786,11 +3793,13 @@ def test_bf16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
37863793 disable_overlap_scheduler = not overlap_scheduler ,
37873794 cuda_graph_config = CudaGraphConfig () if cuda_graph else None )
37883795
3796+ kv_cache_config = KvCacheConfig (free_gpu_memory_fraction = 0.5 )
37893797 with LLM (f"{ llm_models_root ()} /Qwen3/Qwen3-8B"
37903798 if is_cached else "Qwen/Qwen3-8B" ,
37913799 tensor_parallel_size = tp_size ,
37923800 pipeline_parallel_size = pp_size ,
37933801 moe_expert_parallel_size = ep_size ,
3802+ kv_cache_config = kv_cache_config ,
37943803 ** pytorch_config ,
37953804 enable_attention_dp = attention_dp ) as llm :
37963805 task = CnnDailymail (self .MODEL_NAME )
0 commit comments