Commit 66feb7f

Modify kv cache fraction value in tests

Signed-off-by: Hui Gao <[email protected]>
1 parent 8a83e02, commit 66feb7f

32 files changed: +107 −89 lines
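The fractions changed below are KvCacheConfig's free_gpu_memory_fraction (kv_cache_free_gpu_memory_fraction in the scaffolding example), i.e. the share of currently free GPU memory the KV cache manager may claim. As a rough, hypothetical illustration: on an 80 GB GPU with about 20 GB already taken by weights, raising the fraction from 0.5 to 0.7 grows the KV cache budget from roughly 30 GB to roughly 42 GB. A minimal sketch of how the knob is passed through the LLM API, assuming the import paths used in the examples below and a placeholder model:

from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

# Let the KV cache claim up to 70% of the GPU memory that is free at startup.
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)

llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
          kv_cache_config=kv_cache_config)
print(llm.generate(["Hello, my name is"])[0].outputs[0].text)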

examples/llm-api/_tensorrt_engine/llm_lookahead_decoding.py
Lines changed: 1 addition & 1 deletion

@@ -15,7 +15,7 @@ def main():
         max_ngram_size=4,
         max_verification_set_size=4)

-    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
+    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
     llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
               kv_cache_config=kv_cache_config,
               build_config=build_config,

examples/llm-api/llm_runtime.py
Lines changed: 1 addition & 1 deletion

@@ -73,7 +73,7 @@ def example_cuda_graph_config():
         cuda_graph_config=cuda_graph_config, # Enable CUDA graphs
         max_batch_size=4,
         max_seq_len=512,
-        kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.5))
+        kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.7))

     prompts = [
         "Hello, my name is",

examples/ray_orchestrator/llm_inference_async_ray.py
Lines changed: 1 addition & 1 deletion

@@ -18,7 +18,7 @@ def main():
     )
     args = parser.parse_args()
     # Configure KV cache memory usage fraction.
-    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5,
+    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8,
                                     max_tokens=4096,
                                     enable_block_reuse=True)
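A note on this hunk: the config sets both free_gpu_memory_fraction and max_tokens. My understanding of KvCacheConfig is that the smaller of the two budgets wins, so max_tokens=4096 remains the binding limit even at a fraction of 0.8. A hedged sketch of that bookkeeping with made-up numbers:

# Hypothetical numbers, only to show how the two caps interact.
free_bytes = 40 * 1024**3   # free GPU memory at init: 40 GiB
bytes_per_token = 4096      # illustrative KV bytes per token
fraction = 0.8
max_tokens = 4096

tokens_by_fraction = int(fraction * free_bytes) // bytes_per_token
effective_tokens = min(tokens_by_fraction, max_tokens)   # smaller budget wins
print(tokens_by_fraction, effective_tokens)              # 8388608 4096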

examples/scaffolding/run_best_of_n_with_reward.py
Lines changed: 2 additions & 2 deletions

@@ -32,13 +32,13 @@ def main():
         backend="pytorch",
         max_batch_size=args.sample_num,
         max_num_tokens=8192,
-        kv_cache_free_gpu_memory_fraction=0.1)
+        kv_cache_free_gpu_memory_fraction=0.6)
     reward_worker = TRTLLMWorker.init_with_new_llm(
         args.reward_model,
         backend="pytorch",
         max_batch_size=args.sample_num,
         max_num_tokens=8192,
-        kv_cache_free_gpu_memory_fraction=0.2,
+        kv_cache_free_gpu_memory_fraction=0.6,
         disable_overlap_scheduler=True)
     workers[NativeGenerationController.WorkerTag.GENERATION] = gen_worker
     workers[PRMController.WorkerTag.REWARD] = reward_worker

tensorrt_llm/_torch/pyexecutor/_util.py
Lines changed: 1 addition & 1 deletion

@@ -230,8 +230,8 @@ def _create_kv_cache_manager(
         return kv_cache_manager

     def build_managers(self, resources: Dict) -> None:
-        self.configure_kv_cache_capacity()
         """Construct KV caches for model and draft model (if applicable)."""
+        self.configure_kv_cache_capacity()
         kv_cache_manager = self._create_kv_cache_manager(self._model_engine, )

         if self._kv_connector_manager is not None and self._draft_model_engine is not None:
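The reorder in this hunk matters for introspection: in Python a string literal is bound to __doc__ only when it is the first statement of the function body, so calling configure_kv_cache_capacity() ahead of the string left build_managers without a docstring. A tiny standalone sketch with hypothetical names:

def do_work():
    pass

def before():
    do_work()
    """Construct KV caches."""  # not the first statement, so just an unused expression

def after():
    """Construct KV caches."""  # first statement, bound to after.__doc__
    do_work()

assert before.__doc__ is None
assert after.__doc__ == "Construct KV caches."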

tensorrt_llm/_torch/pyexecutor/resource_manager.py
Lines changed: 18 additions & 0 deletions

@@ -266,6 +266,24 @@ def append_to_kv_heads_per_layer(num_kv_heads_per_layer: List[int],
         # Determine if this is VSWA (Variable Sliding Window Attention)
         self.is_vswa = len(set(self.max_attention_window_vec)) > 1

+        self.blocks_in_primary_pool = int(kv_cache_config.max_tokens //
+                                          tokens_per_block)
+
+        host_cache_size = kv_cache_config.host_cache_size if kv_cache_config.host_cache_size else 0
+        max_tokens_secondary = host_cache_size // self.get_cache_bytes_per_token(
+        )
+        self.blocks_in_secondary_pool = int(max_tokens_secondary //
+                                            tokens_per_block)
+
+        blocks_per_window = {
+            window_size:
+            (self.blocks_in_primary_pool, self.blocks_in_secondary_pool)
+            for window_size in set(self.max_attention_window_vec)
+        }
+        logger.info(
+            f"[kv cache manager] Primary/secondary blocks for window sizes set to {blocks_per_window} for estimation dry run"
+        )
+
         # Calculate kv cache blocks for each window size
         # FIXME: flashinfer.py accesses kv_cache_manager.blocks_in_primary_pool
         # This dependency should be adjusted as it only covers the single window
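The block counts added here follow directly from the config: the primary (GPU) pool holds max_tokens / tokens_per_block blocks, and the secondary (host) pool holds host_cache_size / bytes-per-token / tokens_per_block blocks, with the same pair reported for every attention window during the estimation dry run. A standalone sketch with illustrative numbers (the real values come from KvCacheConfig and get_cache_bytes_per_token()):

# Illustrative numbers only; not taken from this commit.
max_tokens = 32768              # kv_cache_config.max_tokens
tokens_per_block = 32
host_cache_size = 2 * 1024**3   # 2 GiB of host cache (kv_cache_config.host_cache_size)
bytes_per_token = 4096          # stand-in for get_cache_bytes_per_token()

blocks_in_primary_pool = max_tokens // tokens_per_block              # 1024 GPU blocks
max_tokens_secondary = host_cache_size // bytes_per_token            # tokens that fit on host
blocks_in_secondary_pool = max_tokens_secondary // tokens_per_block  # 16384 host blocks

print(blocks_in_primary_pool, blocks_in_secondary_pool)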

tests/integration/defs/accuracy/test_llm_api_pytorch.py
Lines changed: 29 additions & 29 deletions

@@ -427,7 +427,7 @@ def test_auto_spec_decode(self):
             CudaGraphConfig(batch_sizes=[1, 32, 64], enable_padding=True)
         }
         kv_cache_config = KvCacheConfig(enable_block_reuse=False,
-                                        free_gpu_memory_fraction=0.5)
+                                        free_gpu_memory_fraction=0.7)
         spec_config = AutoDecodingConfig()
         with LLM(model=self.MODEL_PATH,
                  **pytorch_config,
@@ -469,7 +469,7 @@ def test_auto_dtype_beam_search(self, enable_cuda_graph, enable_padding,

         with LLM(
                 model=self.MODEL_PATH,
-                kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.5),
+                kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.7),
                 max_batch_size=max_beam_width,
                 max_seq_len=2048,
                 max_beam_width=max_beam_width,
@@ -514,7 +514,7 @@ def test_fp8_beam_search(self, enable_cuda_graph, enable_padding,

         llm = LLM(
             model=model_path,
-            kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.5),
+            kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.7),
             max_batch_size=max_beam_width,
             max_seq_len=2048,
             max_beam_width=max_beam_width,
@@ -656,7 +656,7 @@ def test_fp8_eagle3_tp8(self, eagle3_one_model, torch_compile):
     @parametrize_with_ids("torch_compile", [False, True])
     def test_fp8_tp4(self, torch_compile):
         model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct-FP8"
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
         torch_compile_config = _get_default_torch_compile_config(torch_compile)
         with LLM(model_path,
                  tensor_parallel_size=4,
@@ -683,7 +683,7 @@ def test_fp8_tp4(self, torch_compile):
     @parametrize_with_ids("torch_compile", [False, True])
     def test_nvfp4_tp4(self, torch_compile):
         model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct-FP4"
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
         torch_compile_config = _get_default_torch_compile_config(torch_compile)
         with LLM(model_path,
                  tensor_parallel_size=4,
@@ -710,7 +710,7 @@ def test_nvfp4_tp4(self, torch_compile):
     @parametrize_with_ids("torch_compile", [False, True])
     def test_fp4_tp2pp2(self, enable_gemm_allreduce_fusion, torch_compile):
         model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct-FP4"
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
         torch_compile_config = _get_default_torch_compile_config(torch_compile)

         with (mock.patch.dict(
@@ -1052,7 +1052,7 @@ def test_auto_dtype(self):
         kv_cache_config = KvCacheConfig(
             enable_block_reuse=False,
             enable_partial_reuse=False,
-            free_gpu_memory_fraction=0.5,
+            free_gpu_memory_fraction=0.7,
         )
         # We use FlashInfer as the attention backend for Gemma3 VLM to support custom mask for images.
         # So, testing with it here.
@@ -1179,7 +1179,7 @@ def test_auto_dtype_vswa_without_reuse_low_memory_available(self):
             enable_block_reuse=False,
             enable_partial_reuse=False,
             max_attention_window=[512, 512, 512, 512, 512, 32768],
-            free_gpu_memory_fraction=0.1,
+            free_gpu_memory_fraction=0.6,
         )

         with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
@@ -1256,7 +1256,7 @@ def test_auto_dtype_vswa_reuse_low_memory_available_no_partial_reuse(self):
             enable_block_reuse=True,
             enable_partial_reuse=False,
             max_attention_window=[512, 512, 512, 512, 512, 32768],
-            free_gpu_memory_fraction=0.1,
+            free_gpu_memory_fraction=0.6,
         )

         with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
@@ -1271,7 +1271,7 @@ def test_auto_dtype_vswa_reuse_low_memory_available_partial_reuse(self):
             enable_block_reuse=True,
             enable_partial_reuse=True,
             max_attention_window=[512, 512, 512, 512, 512, 32768],
-            free_gpu_memory_fraction=0.1,
+            free_gpu_memory_fraction=0.6,
         )

         with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
@@ -1399,7 +1399,7 @@ def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,

     @pytest.mark.skip_less_device_memory(60000)
     def test_bfloat16_2_model_mtp(self):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
         pytorch_config = dict(
             disable_overlap_scheduler=True,
             cuda_graph_config=CudaGraphConfig(),
@@ -4178,7 +4178,7 @@ def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
             moe_config=MoeConfig(backend=moe_backend))

-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         enable_block_reuse=not eagle3)
         spec_config = None
         if eagle3:
@@ -4226,7 +4226,7 @@ def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
             moe_config=MoeConfig(backend=moe_backend))

-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         enable_block_reuse=not eagle3)
         spec_config = None
         if eagle3:
@@ -4482,7 +4482,7 @@ def test_w4_1gpu(self, kv_cache_dtype, moe_backend, cuda_graph,
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)

-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype=kv_cache_dtype)

         llm = LLM(self.MODEL_PATH,
@@ -4660,7 +4660,7 @@ def test_w4a16(self, kv_cache_dtype, tp_size, pp_size, ep_size,
                           {"scores_filter": "exact_match,flexible-extract"})
         monkeypatch.setenv("OVERRIDE_QUANT_ALGO", "W4A16_MXFP4")

-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype=kv_cache_dtype)

         llm = LLM(self.MODEL_PATH,
@@ -4702,7 +4702,7 @@ def test_w4_2gpus(self, kv_cache_dtype, moe_backend, tp_size, pp_size,
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)

-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype=kv_cache_dtype)

         llm = LLM(self.MODEL_PATH,
@@ -4740,7 +4740,7 @@ def test_w4_2gpus_nvfp4(self, tp_size, pp_size, ep_size, attention_dp,
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)

-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype="auto")

         llm = LLM("./nvfp4ckpt",
@@ -4858,7 +4858,7 @@ def test_eagle3_4gpus(self, moe_backend, one_model, overlap_scheduler,
         # https://nvbugs/5590408: 2-Model overlap scheduling has accuracy issue
         pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler,
                               cuda_graph_config=CudaGraphConfig())
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype="auto")

         eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3"
@@ -4922,7 +4922,7 @@ def test_eagle3_vswa_reuse_4gpus(self, one_model, mocker):
         mocker.patch.object(GPQADiamond, "MAX_INPUT_LEN", MAX_INPUT_LEN)

         pytorch_config = dict(cuda_graph_config=CudaGraphConfig())
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype="auto",
                                         enable_block_reuse=True,
                                         max_attention_window=[128, 32768])
@@ -4988,7 +4988,7 @@ def test_eagle3_guided_decoding_4gpus(self, one_model, mocker):
         mocker.patch.object(GPQADiamond, "MAX_INPUT_LEN", MAX_INPUT_LEN)

         pytorch_config = dict(cuda_graph_config=CudaGraphConfig())
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype="auto")

         eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3"
@@ -5044,7 +5044,7 @@ def test_eagle3_2gpus(self, moe_backend, one_model, overlap_scheduler,
             max_batch_size=8,
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig(max_batch_size=8))
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype="auto")

         eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3"
@@ -5112,7 +5112,7 @@ def test_w4_4gpus_online_eplb(self, kv_cache_dtype, enable_configurable_moe,
         mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
                           {"scores_filter": "exact_match,flexible-extract"})

-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         dtype=kv_cache_dtype)

         # Configure online expert parallel load balancer
@@ -5160,7 +5160,7 @@ class TestQwQ_32B(LlmapiAccuracyTestHarness):
     @pytest.mark.skip_less_device_memory(80000)
     @pytest.mark.skip_less_device(4)
     def test_auto_dtype_tp4(self):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)

         with LLM(self.MODEL_PATH,
                  max_num_tokens=16384,
@@ -5326,7 +5326,7 @@ def test_fp8_8gpus(self):
             pytest.skip(f"Model directory {model_dir} does not exist")

         # Configure model settings
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         enable_block_reuse=True,
                                         enable_partial_reuse=False,
                                         dtype="fp8")
@@ -5527,7 +5527,7 @@ def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
             moe_config=MoeConfig(backend=moe_backend))

-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         enable_block_reuse=not eagle3)
         spec_config = None
         if eagle3:
@@ -5578,7 +5578,7 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
             moe_config=MoeConfig(backend=moe_backend))

-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         enable_block_reuse=not eagle3)
         spec_config = None
         if eagle3:
@@ -5716,7 +5716,7 @@ def test_fp8_4gpus(self, attention_dp):
             kv_cache_config=KvCacheConfig(
                 enable_block_reuse=False,
                 mamba_ssm_cache_dtype="float16",
-                free_gpu_memory_fraction=0.5,
+                free_gpu_memory_fraction=0.7,
             ),
             max_batch_size=32,
             tensor_parallel_size=4,
@@ -5756,7 +5756,7 @@ def test_nvfp4_8gpus(self, attention_dp):
             kv_cache_config=KvCacheConfig(
                 enable_block_reuse=False,
                 mamba_ssm_cache_dtype="float16",
-                free_gpu_memory_fraction=0.5,
+                free_gpu_memory_fraction=0.7,
             ),
             max_batch_size=32,
             tensor_parallel_size=8,
@@ -5791,7 +5791,7 @@ def test_nvfp4_8gpus_mtp(self):
             kv_cache_config=KvCacheConfig(
                 enable_block_reuse=False,
                 mamba_ssm_cache_dtype="float16",
-                free_gpu_memory_fraction=0.5,
+                free_gpu_memory_fraction=0.7,
             ),
            max_batch_size=128,
            tensor_parallel_size=8,

tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py
Lines changed: 5 additions & 5 deletions

@@ -116,7 +116,7 @@ class TestLlava_V1_6_Mistral_7B(LlmapiAccuracyTestHarness):
         stop="<|endoftext|>",
     )

-    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
+    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)

     def test_auto_dtype(self):
         with LLM(
@@ -141,7 +141,7 @@ class TestNVILA_8B(LlmapiAccuracyTestHarness):
     )

     kv_cache_config = KvCacheConfig(
-        free_gpu_memory_fraction=0.6,
+        free_gpu_memory_fraction=0.8,
         # NOTE: VILA models do not support block reuse.
         enable_block_reuse=False,
     )
@@ -169,7 +169,7 @@ class TestVILA1_5_3B(LlmapiAccuracyTestHarness):
     )

     kv_cache_config = KvCacheConfig(
-        free_gpu_memory_fraction=0.6,
+        free_gpu_memory_fraction=0.8,
         # NOTE: VILA models do not support block reuse.
         enable_block_reuse=False,
     )
@@ -256,7 +256,7 @@ class TestGemma3_27BInstruct(LlmapiAccuracyTestHarness):
     kv_cache_config = KvCacheConfig(
         enable_block_reuse=False,
         enable_partial_reuse=False,
-        free_gpu_memory_fraction=0.4,
+        free_gpu_memory_fraction=0.8,
         dtype="fp8",
     )

@@ -348,7 +348,7 @@ def test_nvfp4_4gpus(
             moe_config=MoeConfig(backend=moe_backend),
         )

-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)

         with LLM(
             self.MODEL_PATH,
