Commit 8a83e02

Remove memory estimation process

Signed-off-by: Hui Gao <[email protected]>
1 parent: c7a86f8
7 files changed (+65, -499 lines)
tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py

Lines changed: 0 additions & 1 deletion
@@ -259,7 +259,6 @@ def create_draft_kv_cache_manager_maybe(
         max_num_tokens=ad_config.max_num_tokens,
         max_beam_width=ad_config.max_beam_width,
         kv_connector_manager=None,  # KV connector manager not used in AutoDeploy (no disagg support)
-        estimating_kv_cache=False,
     )
tensorrt_llm/_torch/auto_deploy/shim/interface.py

Lines changed: 0 additions & 1 deletion
@@ -217,7 +217,6 @@ def _create_kv_cache_manager(self, max_tokens: Optional[int] = None) -> int:
             # we don't rely on free_gpu_memory_fraction inside the KVCacheManager. This is similar
             # to _torch.pyexecutor._util.KVCacheCreator, which explicitly estimates the max_tokens
             # outside of the KVCacheManager.
-            "is_estimating_kv_cache": False,
         }

         # update args if we are just doing a dummy cache manager
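The comment retained in this hunk notes that max_tokens is decided outside the KVCacheManager rather than via free_gpu_memory_fraction. A rough sketch of that split, with hypothetical numbers only; the real sizing lives in _torch.pyexecutor._util.KvCacheCreator:

    # Caller-side sizing sketch: pick a token budget first, then hand the
    # cache manager a fixed capacity instead of a memory fraction.
    tokens_per_block = 32            # hypothetical block granularity
    max_tokens = 65_536              # hypothetical budget chosen by the caller
    blocks_in_primary_pool = max_tokens // tokens_per_block
    assert blocks_in_primary_pool == 2048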

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 27 additions & 366 deletions
Large diffs are not rendered by default.

tensorrt_llm/_torch/pyexecutor/mamba_cache_manager.py

Lines changed: 0 additions & 2 deletions
@@ -377,7 +377,6 @@ def __init__(
         mapping: Mapping,
         dtype: DataType = DataType.HALF,
         spec_config: Optional["DecodingBaseConfig"] = None,
-        is_estimating_kv_cache: bool = False,
         execution_stream: Optional[torch.cuda.Stream] = None,
     ) -> None:

@@ -418,7 +417,6 @@ def __init__(
             dtype=dtype,
             spec_config=spec_config,
             layer_mask=layer_mask,
-            is_estimating_kv_cache=is_estimating_kv_cache,
             execution_stream=execution_stream,
         )

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 2 additions & 1 deletion
@@ -1302,7 +1302,8 @@ def _init_model_capacity(self):
         self._init_max_num_tokens()

     def _release_cuda_graphs(self):
-        self.cuda_graph_runner.clear()
+        if self.cuda_graph_runner is not None:
+            self.cuda_graph_runner.clear()

     def get_max_num_sequences(self) -> int:
         """

tensorrt_llm/_torch/pyexecutor/py_executor_creator.py

Lines changed: 12 additions & 80 deletions
@@ -33,9 +33,8 @@
                             get_spec_resource_manager)
 from ..virtual_memory import ExecutorMemoryType, RestoreMode
 from ..virtual_memory import scope as virtual_memory_scope
-from ._util import (KvCacheCreator, _adjust_torch_mem_fraction,
-                    create_py_executor_instance, instantiate_sampler, is_mla,
-                    validate_feature_combination)
+from ._util import (KvCacheCreator, create_py_executor_instance,
+                    instantiate_sampler, is_mla, validate_feature_combination)
 from .config_utils import is_mla
 from .guided_decoder import CapturableGuidedDecoder, GuidedDecoder
 from .kv_cache_connector import KvCacheConnectorManager
@@ -222,7 +221,6 @@ def create_py_executor(
         tokenizer: Optional[TokenizerBase] = None,
         profiling_stage_data: Optional[dict] = None,
 ) -> PyExecutor:
-    torch.cuda.set_per_process_memory_fraction(1.0)
     garbage_collection_gen0_threshold = llm_args.garbage_collection_gen0_threshold
     lora_config = llm_args.lora_config
     kv_connector_config = llm_args.kv_connector_config
@@ -434,7 +432,6 @@ def drafting_loop_wrapper(model):

     # PyTorchModelEngine modifies these fields, update them
     model_engine_max_seq_len = model_engine.max_seq_len
-    net_max_seq_len = model_engine_max_seq_len
     if not llm_args.disable_overlap_scheduler:
         model_engine_max_seq_len = model_engine.max_seq_len + 1
         if spec_config is not None:
@@ -604,7 +601,6 @@ def drafting_loop_wrapper(model):
     kv_connector_manager = None

     resources = {}
-    estimating_kv_cache = False
     kv_cache_creator = None

     # Create the execution stream for model forward operations
@@ -619,7 +615,6 @@ def drafting_loop_wrapper(model):
         model_engine=model_engine,
         draft_model_engine=draft_model_engine,
         mapping=mapping,
-        net_max_seq_len=net_max_seq_len,
         kv_connector_manager=kv_connector_manager,
         max_num_tokens=max_num_tokens,
         max_beam_width=max_beam_width,
@@ -633,11 +628,8 @@ def drafting_loop_wrapper(model):
         sparse_attention_config=sparse_attention_config,
         execution_stream=execution_stream,
     )
-    estimating_kv_cache = kv_cache_creator.try_prepare_estimation()
-    with allocation_scope(
-            ExecutorMemoryType.INIT_KV_CACHE if estimating_kv_cache else
-            ExecutorMemoryType.KV_CACHE, RestoreMode.NONE):
-        kv_cache_creator.build_managers(resources, estimating_kv_cache)
+    with allocation_scope(ExecutorMemoryType.KV_CACHE, RestoreMode.NONE):
+        kv_cache_creator.build_managers(resources)
     # Originally, max_seq_len might be mutated inside build_managers as field of executor config.
     # Since now, we are changing kv_cache_creator._max_seq_len instead. Restore max_seq_len here.
     max_seq_len = kv_cache_creator._max_seq_len
@@ -663,100 +655,40 @@ def drafting_loop_wrapper(model):
         spec_resource_manager=spec_resource_manager,
         guided_decoder=guided_decoder)

-    with allocation_scope(
-            ExecutorMemoryType.INIT_EXTRA_RESOURCES if estimating_kv_cache else
-            ExecutorMemoryType.EXTRA_RESOURCES, RestoreMode.PINNED):
+    with allocation_scope(ExecutorMemoryType.EXTRA_RESOURCES,
+                          RestoreMode.PINNED):
+
+        # run gc.collect() to free memory of the previous py_executor, avoid cudaFree overlap with cuda graph capture
+        gc.collect()
         py_executor = create_py_executor_instance(
             dist=dist,
             resources=resources,
             mapping=mapping,
             llm_args=llm_args,
             ctx_chunk_config=ctx_chunk_config,
             model_engine=model_engine,
-            start_worker=False,
             sampler=sampler,
             drafter=drafter,
             guided_decoder=guided_decoder,
             lora_config=lora_config,
             garbage_collection_gen0_threshold=garbage_collection_gen0_threshold,
-            kv_connector_manager=kv_connector_manager
-            if not estimating_kv_cache else None,
+            kv_connector_manager=kv_connector_manager,
             max_seq_len=max_seq_len,
             max_batch_size=max_batch_size,
             max_beam_width=max_beam_width,
             max_num_tokens=max_num_tokens,
             peft_cache_config=peft_cache_config,
             scheduler_config=scheduler_config,
             cache_transceiver_config=cache_transceiver_config,
-            virtual_memory_pools=vm_pools if not estimating_kv_cache else None,
+            virtual_memory_pools=vm_pools,
             execution_stream=execution_stream,
         )
+
         # Originally, peft_cache_config might be mutated inside
         # create_py_executor_instance. Restore it here.
         peft_cache_config = py_executor.peft_cache_config

-    if estimating_kv_cache:
-        assert kv_cache_creator is not None
-        with allocation_scope(ExecutorMemoryType.MODEL_EXTRA,
-                              RestoreMode.PINNED):
-            kv_cache_creator.configure_kv_cache_capacity(py_executor)
-            kv_cache_creator.teardown_managers(resources)
-            del py_executor  # free before constructing new
-
-        with allocation_scope(ExecutorMemoryType.KV_CACHE, RestoreMode.NONE):
-            # Before estimating KV cache size, a minimal KV cache has been allocated using
-            # create_kv_cache_manager above, which caps kv_cache_creator.max_seq_len. Restoring
-            # the original value before creating the final KV cache.
-            kv_cache_creator._max_seq_len = model_engine_max_seq_len
-            kv_cache_creator.build_managers(resources, False)
-            # Originally, max_seq_len might be mutated inside build_managers as field of executor config.
-            # Since now, we are changing kv_cache_creator._max_seq_len instead. Restore max_seq_len here.
-            max_seq_len = kv_cache_creator._max_seq_len
-            update_sampler_max_seq_len(max_seq_len, sampler)
-
-        for eng in [model_engine, draft_model_engine]:
-            if eng is None:
-                continue
-            if eng.attn_metadata is not None:
-                if llm_args.cuda_graph_config is not None:
-                    eng._release_cuda_graphs()
-                eng.attn_metadata = None
-
-        with allocation_scope(ExecutorMemoryType.EXTRA_RESOURCES,
-                              RestoreMode.PINNED):
-
-            # run gc.collect() to free memory of the previous py_executor, avoid cudaFree overlap with cuda graph capture
-            gc.collect()
-            py_executor = create_py_executor_instance(
-                dist=dist,
-                resources=resources,
-                mapping=mapping,
-                llm_args=llm_args,
-                ctx_chunk_config=ctx_chunk_config,
-                model_engine=model_engine,
-                start_worker=False,
-                sampler=sampler,
-                drafter=drafter,
-                guided_decoder=guided_decoder,
-                lora_config=lora_config,
-                garbage_collection_gen0_threshold=
-                garbage_collection_gen0_threshold,
-                kv_connector_manager=kv_connector_manager,
-                max_seq_len=max_seq_len,
-                max_batch_size=max_batch_size,
-                max_beam_width=max_beam_width,
-                max_num_tokens=max_num_tokens,
-                peft_cache_config=peft_cache_config,
-                scheduler_config=scheduler_config,
-                cache_transceiver_config=cache_transceiver_config,
-                virtual_memory_pools=vm_pools,
-                execution_stream=execution_stream,
-            )
-
-    _adjust_torch_mem_fraction()
-
     if mapping.rank == 0:
         logger.info(f"LLM Args:\n{llm_args}")

-    py_executor.start_worker()
     return py_executor
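Taken together, the hunks above collapse the old estimate-teardown-rebuild sequence into a single pass. A hedged sketch of the resulting flow follows; allocation_scope, ExecutorMemoryType, KvCacheCreator.build_managers, and create_py_executor_instance are names taken from the diff, while the placeholder enum, context manager, and build_executor_once wrapper are assumptions for illustration only:

    import gc
    from contextlib import contextmanager
    from enum import Enum, auto


    class ExecutorMemoryType(Enum):
        # Placeholder enum; the real one lives in tensorrt_llm/_torch/virtual_memory.
        KV_CACHE = auto()
        EXTRA_RESOURCES = auto()


    @contextmanager
    def allocation_scope(mem_type, restore_mode=None):
        # Placeholder scope; the real one tags allocations in a virtual memory pool.
        yield


    def build_executor_once(kv_cache_creator, resources, make_executor):
        # Build the KV cache managers exactly once, with no estimation dry run ...
        with allocation_scope(ExecutorMemoryType.KV_CACHE):
            kv_cache_creator.build_managers(resources)  # no estimating flag anymore
        # ... then construct the executor exactly once.
        with allocation_scope(ExecutorMemoryType.EXTRA_RESOURCES, restore_mode="pinned"):
            gc.collect()  # free leftovers before CUDA graph capture, as in the diff
            return make_executor(resources)  # stands in for create_py_executor_instance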

tensorrt_llm/_torch/pyexecutor/resource_manager.py

Lines changed: 24 additions & 48 deletions
@@ -170,7 +170,6 @@ def __init__(
         enable_indexer_k_cache: bool = False,
         indexer_k_cache_quant_block_size: int = 128,
         indexer_k_cache_index_head_dim: int = 0,
-        is_estimating_kv_cache: bool = False,
         execution_stream: Optional[torch.cuda.Stream] = None,
         **kwargs,
     ) -> None:
@@ -271,57 +270,34 @@ def append_to_kv_heads_per_layer(num_kv_heads_per_layer: List[int],
         # FIXME: flashinfer.py accesses kv_cache_manager.blocks_in_primary_pool
         # This dependency should be adjusted as it only covers the single window
         # case and not VSWA scheme.
-        if is_estimating_kv_cache:
-            # If this is an estimation dry run, we have already calculated the
-            # max_tokens under _util.py::try_prepare_estimation
-            # Since this is a dry run, assigning the same max_tokens capacity
-            # to all window sizes as they are full attentions is enough.
-            self.blocks_in_primary_pool = int(kv_cache_config.max_tokens //
-                                              tokens_per_block)
-
-            host_cache_size = kv_cache_config.host_cache_size if kv_cache_config.host_cache_size else 0
-            max_tokens_secondary = host_cache_size // self.get_cache_bytes_per_token(
+        if self.is_vswa:
+            # VSWA case: use C++ implementation for variable window sizes
+            if model_config is None:
+                raise ValueError(
+                    "model_config is required for VSWA (Variable Sliding Window Attention)"
+                )
+            assert isinstance(
+                kv_cache_config, KvCacheConfig
+            ), "calculate_max_num_blocks_from_cpp only accepts KvCacheConfig"
+            blocks_per_window = self.calculate_max_num_blocks_from_cpp(
+                kv_cache_config=kv_cache_config,
+                model_config=model_config,
+                extra_cost_memory=0,
+            )
+        else:
+            # Standard case: use original Python implementation
+            self.blocks_in_primary_pool, self.blocks_in_secondary_pool = self.calculate_max_num_blocks(
+                kv_cache_config=kv_cache_config,
+                head_dim=head_dim,
+                tokens_per_block=tokens_per_block,
+                mapping=mapping,
+                dtype=dtype,
+                kv_factor=self.kv_factor,
             )
-            self.blocks_in_secondary_pool = int(max_tokens_secondary //
-                                                tokens_per_block)
-
             blocks_per_window = {
-                window_size:
+                self.max_attention_window_vec[0]:
                 (self.blocks_in_primary_pool, self.blocks_in_secondary_pool)
-                for window_size in set(self.max_attention_window_vec)
             }
-            logger.info(
-                f"[kv cache manager] Primary/secondary blocks for window sizes set to {blocks_per_window} for estimation dry run"
-            )
-        else:
-            if self.is_vswa:
-                # VSWA case: use C++ implementation for variable window sizes
-                if model_config is None:
-                    raise ValueError(
-                        "model_config is required for VSWA (Variable Sliding Window Attention)"
-                    )
-                assert isinstance(
-                    kv_cache_config, KvCacheConfig
-                ), "calculate_max_num_blocks_from_cpp only accepts KvCacheConfig"
-                blocks_per_window = self.calculate_max_num_blocks_from_cpp(
-                    kv_cache_config=kv_cache_config,
-                    model_config=model_config,
-                    extra_cost_memory=0,
-                )
-            else:
-                # Standard case: use original Python implementation
-                self.blocks_in_primary_pool, self.blocks_in_secondary_pool = self.calculate_max_num_blocks(
-                    kv_cache_config=kv_cache_config,
-                    head_dim=head_dim,
-                    tokens_per_block=tokens_per_block,
-                    mapping=mapping,
-                    dtype=dtype,
-                    kv_factor=self.kv_factor,
-                )
-                blocks_per_window = {
-                    self.max_attention_window_vec[0]:
-                    (self.blocks_in_primary_pool, self.blocks_in_secondary_pool)
-                }

         # Validate and adjust attention windows against their upper bounds if needed
         blocks_per_window, self.max_seq_len, self.max_attention_window_vec = self._validate_and_adjust_attention_windows(
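For reference, the non-VSWA branch above still produces a blocks_per_window mapping keyed by the first attention window size. A small sketch with hypothetical numbers showing the resulting shape:

    # Hypothetical counts; real values come from calculate_max_num_blocks or
    # calculate_max_num_blocks_from_cpp.
    max_attention_window_vec = [4096]    # single full-attention window size
    blocks_in_primary_pool = 8192        # GPU-resident KV blocks (made up)
    blocks_in_secondary_pool = 1024      # host-offload KV blocks (made up)

    # Same structure as the dict built in the hunk above:
    # {window_size: (primary_blocks, secondary_blocks)}
    blocks_per_window = {
        max_attention_window_vec[0]:
        (blocks_in_primary_pool, blocks_in_secondary_pool)
    }
    assert blocks_per_window == {4096: (8192, 1024)}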
