Commit 66feb7f

Modify kv cache fraction value in tests

Signed-off-by: Hui Gao <[email protected]>
1 parent 8a83e02, commit 66feb7f

32 files changed: +107 −89 lines
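The fractions changed below are KvCacheConfig's free_gpu_memory_fraction (kv_cache_free_gpu_memory_fraction in the scaffolding example), i.e. the share of currently free GPU memory the KV cache manager may claim. As a rough, hypothetical illustration: on an 80 GB GPU with about 20 GB already taken by weights, raising the fraction from 0.5 to 0.7 grows the KV cache budget from roughly 30 GB to roughly 42 GB. A minimal sketch of how the knob is passed through the LLM API, assuming the import paths used in the examples below and a placeholder model:

from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

# Let the KV cache claim up to 70% of the GPU memory that is free at startup.
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)

llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
          kv_cache_config=kv_cache_config)
print(llm.generate(["Hello, my name is"])[0].outputs[0].text)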

examples/llm-api/_tensorrt_engine/llm_lookahead_decoding.py
Lines changed: 1 addition & 1 deletion

@@ -15,7 +15,7 @@ def main():
         max_ngram_size=4,
         max_verification_set_size=4)

-    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
+    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
     llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
               kv_cache_config=kv_cache_config,
               build_config=build_config,

examples/llm-api/llm_runtime.py
Lines changed: 1 addition & 1 deletion

@@ -73,7 +73,7 @@ def example_cuda_graph_config():
         cuda_graph_config=cuda_graph_config, # Enable CUDA graphs
         max_batch_size=4,
         max_seq_len=512,
-        kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.5))
+        kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.7))

     prompts = [
         "Hello, my name is",

examples/ray_orchestrator/llm_inference_async_ray.py
Lines changed: 1 addition & 1 deletion

@@ -18,7 +18,7 @@ def main():
     )
     args = parser.parse_args()
     # Configure KV cache memory usage fraction.
-    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5,
+    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8,
                                     max_tokens=4096,
                                     enable_block_reuse=True)
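A note on this hunk: the config sets both free_gpu_memory_fraction and max_tokens. My understanding of KvCacheConfig is that the smaller of the two budgets wins, so max_tokens=4096 remains the binding limit even at a fraction of 0.8. A hedged sketch of that bookkeeping with made-up numbers:

# Hypothetical numbers, only to show how the two caps interact.
free_bytes = 40 * 1024**3   # free GPU memory at init: 40 GiB
bytes_per_token = 4096      # illustrative KV bytes per token
fraction = 0.8
max_tokens = 4096

tokens_by_fraction = int(fraction * free_bytes) // bytes_per_token
effective_tokens = min(tokens_by_fraction, max_tokens)   # smaller budget wins
print(tokens_by_fraction, effective_tokens)              # 8388608 4096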

examples/scaffolding/run_best_of_n_with_reward.py
Lines changed: 2 additions & 2 deletions

@@ -32,13 +32,13 @@ def main():
         backend="pytorch",
         max_batch_size=args.sample_num,
         max_num_tokens=8192,
-        kv_cache_free_gpu_memory_fraction=0.1)
+        kv_cache_free_gpu_memory_fraction=0.6)
     reward_worker = TRTLLMWorker.init_with_new_llm(
         args.reward_model,
         backend="pytorch",
         max_batch_size=args.sample_num,
         max_num_tokens=8192,
-        kv_cache_free_gpu_memory_fraction=0.2,
+        kv_cache_free_gpu_memory_fraction=0.6,
         disable_overlap_scheduler=True)
     workers[NativeGenerationController.WorkerTag.GENERATION] = gen_worker
     workers[PRMController.WorkerTag.REWARD] = reward_worker

tensorrt_llm/_torch/pyexecutor/_util.py
Lines changed: 1 addition & 1 deletion

@@ -230,8 +230,8 @@ def _create_kv_cache_manager(
         return kv_cache_manager

     def build_managers(self, resources: Dict) -> None:
-        self.configure_kv_cache_capacity()
         """Construct KV caches for model and draft model (if applicable)."""
+        self.configure_kv_cache_capacity()
         kv_cache_manager = self._create_kv_cache_manager(self._model_engine, )

         if self._kv_connector_manager is not None and self._draft_model_engine is not None:
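The reorder in this hunk matters for introspection: in Python a string literal is bound to __doc__ only when it is the first statement of the function body, so calling configure_kv_cache_capacity() ahead of the string left build_managers without a docstring. A tiny standalone sketch with hypothetical names:

def do_work():
    pass

def before():
    do_work()
    """Construct KV caches."""  # not the first statement, so just an unused expression

def after():
    """Construct KV caches."""  # first statement, bound to after.__doc__
    do_work()

assert before.__doc__ is None
assert after.__doc__ == "Construct KV caches."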

tensorrt_llm/_torch/pyexecutor/resource_manager.py
Lines changed: 18 additions & 0 deletions

@@ -266,6 +266,24 @@ def append_to_kv_heads_per_layer(num_kv_heads_per_layer: List[int],
         # Determine if this is VSWA (Variable Sliding Window Attention)
         self.is_vswa = len(set(self.max_attention_window_vec)) > 1

+        self.blocks_in_primary_pool = int(kv_cache_config.max_tokens //
+                                          tokens_per_block)
+
+        host_cache_size = kv_cache_config.host_cache_size if kv_cache_config.host_cache_size else 0
+        max_tokens_secondary = host_cache_size // self.get_cache_bytes_per_token(
+        )
+        self.blocks_in_secondary_pool = int(max_tokens_secondary //
+                                            tokens_per_block)
+
+        blocks_per_window = {
+            window_size:
+            (self.blocks_in_primary_pool, self.blocks_in_secondary_pool)
+            for window_size in set(self.max_attention_window_vec)
+        }
+        logger.info(
+            f"[kv cache manager] Primary/secondary blocks for window sizes set to {blocks_per_window} for estimation dry run"
+        )
+
         # Calculate kv cache blocks for each window size
         # FIXME: flashinfer.py accesses kv_cache_manager.blocks_in_primary_pool
         # This dependency should be adjusted as it only covers the single window
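The block counts added here follow directly from the config: the primary (GPU) pool holds max_tokens / tokens_per_block blocks, and the secondary (host) pool holds host_cache_size / bytes-per-token / tokens_per_block blocks, with the same pair reported for every attention window during the estimation dry run. A standalone sketch with illustrative numbers (the real values come from KvCacheConfig and get_cache_bytes_per_token()):

# Illustrative numbers only; not taken from this commit.
max_tokens = 32768              # kv_cache_config.max_tokens
tokens_per_block = 32
host_cache_size = 2 * 1024**3   # 2 GiB of host cache (kv_cache_config.host_cache_size)
bytes_per_token = 4096          # stand-in for get_cache_bytes_per_token()

blocks_in_primary_pool = max_tokens // tokens_per_block              # 1024 GPU blocks
max_tokens_secondary = host_cache_size // bytes_per_token            # tokens that fit on host
blocks_in_secondary_pool = max_tokens_secondary // tokens_per_block  # 16384 host blocks

print(blocks_in_primary_pool, blocks_in_secondary_pool)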

tests/integration/defs/accuracy/test_llm_api_pytorch.py
Lines changed: 29 additions & 29 deletions

@@ -427,7 +427,7 @@ def test_auto_spec_decode(self):
             CudaGraphConfig(batch_sizes=[1, 32, 64], enable_padding=True)
         }
         kv_cache_config = KvCacheConfig(enable_block_reuse=False,
-                                        free_gpu_memory_fraction=0.5)
+                                        free_gpu_memory_fraction=0.7)
         spec_config = AutoDecodingConfig()
         with LLM(model=self.MODEL_PATH,
                  **pytorch_config,
@@ -469,7 +469,7 @@ def test_auto_dtype_beam_search(self, enable_cuda_graph, enable_padding,

         with LLM(
                 model=self.MODEL_PATH,
-                kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.5),
+                kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.7),
                 max_batch_size=max_beam_width,
                 max_seq_len=2048,
                 max_beam_width=max_beam_width,
@@ -514,7 +514,7 @@ def test_fp8_beam_search(self, enable_cuda_graph, enable_padding,

         llm = LLM(
             model=model_path,
-            kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.5),
+            kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.7),
             max_batch_size=max_beam_width,
             max_seq_len=2048,
             max_beam_width=max_beam_width,
@@ -656,7 +656,7 @@ def test_fp8_eagle3_tp8(self, eagle3_one_model, torch_compile):
     @parametrize_with_ids("torch_compile", [False, True])
     def test_fp8_tp4(self, torch_compile):
         model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct-FP8"
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
         torch_compile_config = _get_default_torch_compile_config(torch_compile)
         with LLM(model_path,
                  tensor_parallel_size=4,
@@ -683,7 +683,7 @@ def test_fp8_tp4(self, torch_compile):
     @parametrize_with_ids("torch_compile", [False, True])
     def test_nvfp4_tp4(self, torch_compile):
         model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct-FP4"
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
         torch_compile_config = _get_default_torch_compile_config(torch_compile)
         with LLM(model_path,
                  tensor_parallel_size=4,
@@ -710,7 +710,7 @@ def test_nvfp4_tp4(self, torch_compile):
     @parametrize_with_ids("torch_compile", [False, True])
     def test_fp4_tp2pp2(self, enable_gemm_allreduce_fusion, torch_compile):
         model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct-FP4"
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
         torch_compile_config = _get_default_torch_compile_config(torch_compile)

         with (mock.patch.dict(
@@ -1052,7 +1052,7 @@ def test_auto_dtype(self):
         kv_cache_config = KvCacheConfig(
             enable_block_reuse=False,
             enable_partial_reuse=False,
-            free_gpu_memory_fraction=0.5,
+            free_gpu_memory_fraction=0.7,
         )
         # We use FlashInfer as the attention backend for Gemma3 VLM to support custom mask for images.
         # So, testing with it here.
@@ -1179,7 +1179,7 @@ def test_auto_dtype_vswa_without_reuse_low_memory_available(self):
             enable_block_reuse=False,
             enable_partial_reuse=False,
             max_attention_window=[512, 512, 512, 512, 512, 32768],
-            free_gpu_memory_fraction=0.1,
+            free_gpu_memory_fraction=0.6,
         )

         with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
@@ -1256,7 +1256,7 @@ def test_auto_dtype_vswa_reuse_low_memory_available_no_partial_reuse(self):
             enable_block_reuse=True,
             enable_partial_reuse=False,
             max_attention_window=[512, 512, 512, 512, 512, 32768],
-            free_gpu_memory_fraction=0.1,
+            free_gpu_memory_fraction=0.6,
         )

         with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
@@ -1271,7 +1271,7 @@ def test_auto_dtype_vswa_reuse_low_memory_available_partial_reuse(self):
             enable_block_reuse=True,
             enable_partial_reuse=True,
             max_attention_window=[512, 512, 512, 512, 512, 32768],
-            free_gpu_memory_fraction=0.1,
+            free_gpu_memory_fraction=0.6,
         )

         with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
@@ -1399,7 +1399,7 @@ def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,

     @pytest.mark.skip_less_device_memory(60000)
     def test_bfloat16_2_model_mtp(self):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
         pytorch_config = dict(
             disable_overlap_scheduler=True,
             cuda_graph_config=CudaGraphConfig(),
@@ -4178,7 +4178,7 @@ def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
             moe_config=MoeConfig(backend=moe_backend))

-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         enable_block_reuse=not eagle3)
         spec_config = None
         if eagle3:
@@ -4226,7 +4226,7 @@ def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
             moe_config=MoeConfig(backend=moe_backend))

-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         enable_block_reuse=not eagle3)
         spec_config = None
         if eagle3:
@@ -4482,7 +4482,7 @@ def test_w4_1gpu(self, kv_cache_dtype, moe_backend, cuda_graph,
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)

-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype=kv_cache_dtype)

         llm = LLM(self.MODEL_PATH,
@@ -4660,7 +4660,7 @@ def test_w4a16(self, kv_cache_dtype, tp_size, pp_size, ep_size,
                           {"scores_filter": "exact_match,flexible-extract"})
         monkeypatch.setenv("OVERRIDE_QUANT_ALGO", "W4A16_MXFP4")

-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype=kv_cache_dtype)

         llm = LLM(self.MODEL_PATH,
@@ -4702,7 +4702,7 @@ def test_w4_2gpus(self, kv_cache_dtype, moe_backend, tp_size, pp_size,
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)

-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype=kv_cache_dtype)

         llm = LLM(self.MODEL_PATH,
@@ -4740,7 +4740,7 @@ def test_w4_2gpus_nvfp4(self, tp_size, pp_size, ep_size, attention_dp,
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)

-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype="auto")

         llm = LLM("./nvfp4ckpt",
@@ -4858,7 +4858,7 @@ def test_eagle3_4gpus(self, moe_backend, one_model, overlap_scheduler,
         # https://nvbugs/5590408: 2-Model overlap scheduling has accuracy issue
         pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler,
                               cuda_graph_config=CudaGraphConfig())
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype="auto")

         eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3"
@@ -4922,7 +4922,7 @@ def test_eagle3_vswa_reuse_4gpus(self, one_model, mocker):
         mocker.patch.object(GPQADiamond, "MAX_INPUT_LEN", MAX_INPUT_LEN)

         pytorch_config = dict(cuda_graph_config=CudaGraphConfig())
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype="auto",
                                         enable_block_reuse=True,
                                         max_attention_window=[128, 32768])
@@ -4988,7 +4988,7 @@ def test_eagle3_guided_decoding_4gpus(self, one_model, mocker):
         mocker.patch.object(GPQADiamond, "MAX_INPUT_LEN", MAX_INPUT_LEN)

         pytorch_config = dict(cuda_graph_config=CudaGraphConfig())
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype="auto")

         eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3"
@@ -5044,7 +5044,7 @@ def test_eagle3_2gpus(self, moe_backend, one_model, overlap_scheduler,
             max_batch_size=8,
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig(max_batch_size=8))
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype="auto")

         eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3"
@@ -5112,7 +5112,7 @@ def test_w4_4gpus_online_eplb(self, kv_cache_dtype, enable_configurable_moe,
         mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
                           {"scores_filter": "exact_match,flexible-extract"})

-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype=kv_cache_dtype)

         # Configure online expert parallel load balancer
@@ -5160,7 +5160,7 @@ class TestQwQ_32B(LlmapiAccuracyTestHarness):
     @pytest.mark.skip_less_device_memory(80000)
     @pytest.mark.skip_less_device(4)
     def test_auto_dtype_tp4(self):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)

         with LLM(self.MODEL_PATH,
                  max_num_tokens=16384,
@@ -5326,7 +5326,7 @@ def test_fp8_8gpus(self):
             pytest.skip(f"Model directory {model_dir} does not exist")

         # Configure model settings
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         enable_block_reuse=True,
                                         enable_partial_reuse=False,
                                         dtype="fp8")
@@ -5527,7 +5527,7 @@ def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
             moe_config=MoeConfig(backend=moe_backend))

-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         enable_block_reuse=not eagle3)
         spec_config = None
         if eagle3:
@@ -5578,7 +5578,7 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
             moe_config=MoeConfig(backend=moe_backend))

-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         enable_block_reuse=not eagle3)
         spec_config = None
         if eagle3:
@@ -5716,7 +5716,7 @@ def test_fp8_4gpus(self, attention_dp):
             kv_cache_config=KvCacheConfig(
                 enable_block_reuse=False,
                 mamba_ssm_cache_dtype="float16",
-                free_gpu_memory_fraction=0.5,
+                free_gpu_memory_fraction=0.7,
             ),
             max_batch_size=32,
             tensor_parallel_size=4,
@@ -5756,7 +5756,7 @@ def test_nvfp4_8gpus(self, attention_dp):
             kv_cache_config=KvCacheConfig(
                 enable_block_reuse=False,
                 mamba_ssm_cache_dtype="float16",
-                free_gpu_memory_fraction=0.5,
+                free_gpu_memory_fraction=0.7,
             ),
             max_batch_size=32,
             tensor_parallel_size=8,
@@ -5791,7 +5791,7 @@ def test_nvfp4_8gpus_mtp(self):
             kv_cache_config=KvCacheConfig(
                 enable_block_reuse=False,
                 mamba_ssm_cache_dtype="float16",
-                free_gpu_memory_fraction=0.5,
+                free_gpu_memory_fraction=0.7,
             ),
            max_batch_size=128,
            tensor_parallel_size=8,

tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py
Lines changed: 5 additions & 5 deletions

@@ -116,7 +116,7 @@ class TestLlava_V1_6_Mistral_7B(LlmapiAccuracyTestHarness):
         stop="<|endoftext|>",
     )

-    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
+    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)

     def test_auto_dtype(self):
         with LLM(
@@ -141,7 +141,7 @@ class TestNVILA_8B(LlmapiAccuracyTestHarness):
     )

     kv_cache_config = KvCacheConfig(
-        free_gpu_memory_fraction=0.6,
+        free_gpu_memory_fraction=0.8,
         # NOTE: VILA models do not support block reuse.
         enable_block_reuse=False,
     )
@@ -169,7 +169,7 @@ class TestVILA1_5_3B(LlmapiAccuracyTestHarness):
     )

     kv_cache_config = KvCacheConfig(
-        free_gpu_memory_fraction=0.6,
+        free_gpu_memory_fraction=0.8,
         # NOTE: VILA models do not support block reuse.
         enable_block_reuse=False,
     )
@@ -256,7 +256,7 @@ class TestGemma3_27BInstruct(LlmapiAccuracyTestHarness):
     kv_cache_config = KvCacheConfig(
         enable_block_reuse=False,
         enable_partial_reuse=False,
-        free_gpu_memory_fraction=0.4,
+        free_gpu_memory_fraction=0.8,
         dtype="fp8",
     )

@@ -348,7 +348,7 @@ def test_nvfp4_4gpus(
             moe_config=MoeConfig(backend=moe_backend),
         )

-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)

         with LLM(
             self.MODEL_PATH,
