Commit 8a83e02

Remove memory estimation process

Signed-off-by: Hui Gao <[email protected]>
1 parent: c7a86f8
7 files changed (+65, -499 lines)
tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py

Lines changed: 0 additions & 1 deletion
@@ -259,7 +259,6 @@ def create_draft_kv_cache_manager_maybe(
         max_num_tokens=ad_config.max_num_tokens,
         max_beam_width=ad_config.max_beam_width,
         kv_connector_manager=None,  # KV connector manager not used in AutoDeploy (no disagg support)
-        estimating_kv_cache=False,
     )
tensorrt_llm/_torch/auto_deploy/shim/interface.py

Lines changed: 0 additions & 1 deletion
@@ -217,7 +217,6 @@ def _create_kv_cache_manager(self, max_tokens: Optional[int] = None) -> int:
             # we don't rely on free_gpu_memory_fraction inside the KVCacheManager. This is similar
             # to _torch.pyexecutor._util.KVCacheCreator, which explicitly estimates the max_tokens
             # outside of the KVCacheManager.
-            "is_estimating_kv_cache": False,
         }

         # update args if we are just doing a dummy cache manager
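The comment retained in this hunk notes that max_tokens is decided outside the KVCacheManager rather than via free_gpu_memory_fraction. A rough sketch of that split, with hypothetical numbers only; the real sizing lives in _torch.pyexecutor._util.KvCacheCreator:

    # Caller-side sizing sketch: pick a token budget first, then hand the
    # cache manager a fixed capacity instead of a memory fraction.
    tokens_per_block = 32            # hypothetical block granularity
    max_tokens = 65_536              # hypothetical budget chosen by the caller
    blocks_in_primary_pool = max_tokens // tokens_per_block
    assert blocks_in_primary_pool == 2048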

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 27 additions & 366 deletions
Large diffs are not rendered by default.

tensorrt_llm/_torch/pyexecutor/mamba_cache_manager.py

Lines changed: 0 additions & 2 deletions
@@ -377,7 +377,6 @@ def __init__(
         mapping: Mapping,
         dtype: DataType = DataType.HALF,
         spec_config: Optional["DecodingBaseConfig"] = None,
-        is_estimating_kv_cache: bool = False,
         execution_stream: Optional[torch.cuda.Stream] = None,
     ) -> None:

@@ -418,7 +417,6 @@ def __init__(
             dtype=dtype,
             spec_config=spec_config,
             layer_mask=layer_mask,
-            is_estimating_kv_cache=is_estimating_kv_cache,
             execution_stream=execution_stream,
         )

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 2 additions & 1 deletion
@@ -1302,7 +1302,8 @@ def _init_model_capacity(self):
         self._init_max_num_tokens()

     def _release_cuda_graphs(self):
-        self.cuda_graph_runner.clear()
+        if self.cuda_graph_runner is not None:
+            self.cuda_graph_runner.clear()

     def get_max_num_sequences(self) -> int:
         """

tensorrt_llm/_torch/pyexecutor/py_executor_creator.py

Lines changed: 12 additions & 80 deletions
@@ -33,9 +33,8 @@
                             get_spec_resource_manager)
 from ..virtual_memory import ExecutorMemoryType, RestoreMode
 from ..virtual_memory import scope as virtual_memory_scope
-from ._util import (KvCacheCreator, _adjust_torch_mem_fraction,
-                    create_py_executor_instance, instantiate_sampler, is_mla,
-                    validate_feature_combination)
+from ._util import (KvCacheCreator, create_py_executor_instance,
+                    instantiate_sampler, is_mla, validate_feature_combination)
 from .config_utils import is_mla
 from .guided_decoder import CapturableGuidedDecoder, GuidedDecoder
 from .kv_cache_connector import KvCacheConnectorManager
@@ -222,7 +221,6 @@ def create_py_executor(
         tokenizer: Optional[TokenizerBase] = None,
         profiling_stage_data: Optional[dict] = None,
 ) -> PyExecutor:
-    torch.cuda.set_per_process_memory_fraction(1.0)
     garbage_collection_gen0_threshold = llm_args.garbage_collection_gen0_threshold
     lora_config = llm_args.lora_config
     kv_connector_config = llm_args.kv_connector_config
@@ -434,7 +432,6 @@ def drafting_loop_wrapper(model):

     # PyTorchModelEngine modifies these fields, update them
     model_engine_max_seq_len = model_engine.max_seq_len
-    net_max_seq_len = model_engine_max_seq_len
     if not llm_args.disable_overlap_scheduler:
         model_engine_max_seq_len = model_engine.max_seq_len + 1
         if spec_config is not None:
@@ -604,7 +601,6 @@ def drafting_loop_wrapper(model):
     kv_connector_manager = None

     resources = {}
-    estimating_kv_cache = False
     kv_cache_creator = None

     # Create the execution stream for model forward operations
@@ -619,7 +615,6 @@ def drafting_loop_wrapper(model):
         model_engine=model_engine,
         draft_model_engine=draft_model_engine,
         mapping=mapping,
-        net_max_seq_len=net_max_seq_len,
         kv_connector_manager=kv_connector_manager,
         max_num_tokens=max_num_tokens,
         max_beam_width=max_beam_width,
@@ -633,11 +628,8 @@ def drafting_loop_wrapper(model):
         sparse_attention_config=sparse_attention_config,
         execution_stream=execution_stream,
     )
-    estimating_kv_cache = kv_cache_creator.try_prepare_estimation()
-    with allocation_scope(
-            ExecutorMemoryType.INIT_KV_CACHE if estimating_kv_cache else
-            ExecutorMemoryType.KV_CACHE, RestoreMode.NONE):
-        kv_cache_creator.build_managers(resources, estimating_kv_cache)
+    with allocation_scope(ExecutorMemoryType.KV_CACHE, RestoreMode.NONE):
+        kv_cache_creator.build_managers(resources)
     # Originally, max_seq_len might be mutated inside build_managers as field of executor config.
     # Since now, we are changing kv_cache_creator._max_seq_len instead. Restore max_seq_len here.
     max_seq_len = kv_cache_creator._max_seq_len
@@ -663,100 +655,40 @@ def drafting_loop_wrapper(model):
         spec_resource_manager=spec_resource_manager,
         guided_decoder=guided_decoder)

-    with allocation_scope(
-            ExecutorMemoryType.INIT_EXTRA_RESOURCES if estimating_kv_cache else
-            ExecutorMemoryType.EXTRA_RESOURCES, RestoreMode.PINNED):
+    with allocation_scope(ExecutorMemoryType.EXTRA_RESOURCES,
+                          RestoreMode.PINNED):
+
+        # run gc.collect() to free memory of the previous py_executor, avoid cudaFree overlap with cuda graph capture
+        gc.collect()
         py_executor = create_py_executor_instance(
             dist=dist,
             resources=resources,
             mapping=mapping,
             llm_args=llm_args,
             ctx_chunk_config=ctx_chunk_config,
             model_engine=model_engine,
-            start_worker=False,
             sampler=sampler,
             drafter=drafter,
             guided_decoder=guided_decoder,
             lora_config=lora_config,
             garbage_collection_gen0_threshold=garbage_collection_gen0_threshold,
-            kv_connector_manager=kv_connector_manager
-            if not estimating_kv_cache else None,
+            kv_connector_manager=kv_connector_manager,
             max_seq_len=max_seq_len,
             max_batch_size=max_batch_size,
             max_beam_width=max_beam_width,
             max_num_tokens=max_num_tokens,
             peft_cache_config=peft_cache_config,
             scheduler_config=scheduler_config,
             cache_transceiver_config=cache_transceiver_config,
-            virtual_memory_pools=vm_pools if not estimating_kv_cache else None,
+            virtual_memory_pools=vm_pools,
             execution_stream=execution_stream,
         )
+
         # Originally, peft_cache_config might be mutated inside
         # create_py_executor_instance. Restore it here.
         peft_cache_config = py_executor.peft_cache_config

-    if estimating_kv_cache:
-        assert kv_cache_creator is not None
-        with allocation_scope(ExecutorMemoryType.MODEL_EXTRA,
-                              RestoreMode.PINNED):
-            kv_cache_creator.configure_kv_cache_capacity(py_executor)
-            kv_cache_creator.teardown_managers(resources)
-            del py_executor  # free before constructing new
-
-        with allocation_scope(ExecutorMemoryType.KV_CACHE, RestoreMode.NONE):
-            # Before estimating KV cache size, a minimal KV cache has been allocated using
-            # create_kv_cache_manager above, which caps kv_cache_creator.max_seq_len. Restoring
-            # the original value before creating the final KV cache.
-            kv_cache_creator._max_seq_len = model_engine_max_seq_len
-            kv_cache_creator.build_managers(resources, False)
-            # Originally, max_seq_len might be mutated inside build_managers as field of executor config.
-            # Since now, we are changing kv_cache_creator._max_seq_len instead. Restore max_seq_len here.
-            max_seq_len = kv_cache_creator._max_seq_len
-            update_sampler_max_seq_len(max_seq_len, sampler)
-
-        for eng in [model_engine, draft_model_engine]:
-            if eng is None:
-                continue
-            if eng.attn_metadata is not None:
-                if llm_args.cuda_graph_config is not None:
-                    eng._release_cuda_graphs()
-                eng.attn_metadata = None
-
-        with allocation_scope(ExecutorMemoryType.EXTRA_RESOURCES,
-                              RestoreMode.PINNED):
-
-            # run gc.collect() to free memory of the previous py_executor, avoid cudaFree overlap with cuda graph capture
-            gc.collect()
-            py_executor = create_py_executor_instance(
-                dist=dist,
-                resources=resources,
-                mapping=mapping,
-                llm_args=llm_args,
-                ctx_chunk_config=ctx_chunk_config,
-                model_engine=model_engine,
-                start_worker=False,
-                sampler=sampler,
-                drafter=drafter,
-                guided_decoder=guided_decoder,
-                lora_config=lora_config,
-                garbage_collection_gen0_threshold=
-                garbage_collection_gen0_threshold,
-                kv_connector_manager=kv_connector_manager,
-                max_seq_len=max_seq_len,
-                max_batch_size=max_batch_size,
-                max_beam_width=max_beam_width,
-                max_num_tokens=max_num_tokens,
-                peft_cache_config=peft_cache_config,
-                scheduler_config=scheduler_config,
-                cache_transceiver_config=cache_transceiver_config,
-                virtual_memory_pools=vm_pools,
-                execution_stream=execution_stream,
-            )
-
-    _adjust_torch_mem_fraction()
-
     if mapping.rank == 0:
         logger.info(f"LLM Args:\n{llm_args}")

-    py_executor.start_worker()
     return py_executor
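Taken together, the hunks above collapse the old estimate-teardown-rebuild sequence into a single pass. A hedged sketch of the resulting flow follows; allocation_scope, ExecutorMemoryType, KvCacheCreator.build_managers, and create_py_executor_instance are names taken from the diff, while the placeholder enum, context manager, and build_executor_once wrapper are assumptions for illustration only:

    import gc
    from contextlib import contextmanager
    from enum import Enum, auto


    class ExecutorMemoryType(Enum):
        # Placeholder enum; the real one lives in tensorrt_llm/_torch/virtual_memory.
        KV_CACHE = auto()
        EXTRA_RESOURCES = auto()


    @contextmanager
    def allocation_scope(mem_type, restore_mode=None):
        # Placeholder scope; the real one tags allocations in a virtual memory pool.
        yield


    def build_executor_once(kv_cache_creator, resources, make_executor):
        # Build the KV cache managers exactly once, with no estimation dry run ...
        with allocation_scope(ExecutorMemoryType.KV_CACHE):
            kv_cache_creator.build_managers(resources)  # no estimating flag anymore
        # ... then construct the executor exactly once.
        with allocation_scope(ExecutorMemoryType.EXTRA_RESOURCES, restore_mode="pinned"):
            gc.collect()  # free leftovers before CUDA graph capture, as in the diff
            return make_executor(resources)  # stands in for create_py_executor_instance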

tensorrt_llm/_torch/pyexecutor/resource_manager.py

Lines changed: 24 additions & 48 deletions
@@ -170,7 +170,6 @@ def __init__(
         enable_indexer_k_cache: bool = False,
         indexer_k_cache_quant_block_size: int = 128,
         indexer_k_cache_index_head_dim: int = 0,
-        is_estimating_kv_cache: bool = False,
         execution_stream: Optional[torch.cuda.Stream] = None,
         **kwargs,
     ) -> None:
@@ -271,57 +270,34 @@ def append_to_kv_heads_per_layer(num_kv_heads_per_layer: List[int],
         # FIXME: flashinfer.py accesses kv_cache_manager.blocks_in_primary_pool
         # This dependency should be adjusted as it only covers the single window
         # case and not VSWA scheme.
-        if is_estimating_kv_cache:
-            # If this is an estimation dry run, we have already calculated the
-            # max_tokens under _util.py::try_prepare_estimation
-            # Since this is a dry run, assigning the same max_tokens capacity
-            # to all window sizes as they are full attentions is enough.
-            self.blocks_in_primary_pool = int(kv_cache_config.max_tokens //
-                                              tokens_per_block)
-
-            host_cache_size = kv_cache_config.host_cache_size if kv_cache_config.host_cache_size else 0
-            max_tokens_secondary = host_cache_size // self.get_cache_bytes_per_token(
+        if self.is_vswa:
+            # VSWA case: use C++ implementation for variable window sizes
+            if model_config is None:
+                raise ValueError(
+                    "model_config is required for VSWA (Variable Sliding Window Attention)"
+                )
+            assert isinstance(
+                kv_cache_config, KvCacheConfig
+            ), "calculate_max_num_blocks_from_cpp only accepts KvCacheConfig"
+            blocks_per_window = self.calculate_max_num_blocks_from_cpp(
+                kv_cache_config=kv_cache_config,
+                model_config=model_config,
+                extra_cost_memory=0,
+            )
+        else:
+            # Standard case: use original Python implementation
+            self.blocks_in_primary_pool, self.blocks_in_secondary_pool = self.calculate_max_num_blocks(
+                kv_cache_config=kv_cache_config,
+                head_dim=head_dim,
+                tokens_per_block=tokens_per_block,
+                mapping=mapping,
+                dtype=dtype,
+                kv_factor=self.kv_factor,
             )
-            self.blocks_in_secondary_pool = int(max_tokens_secondary //
-                                                tokens_per_block)
-
             blocks_per_window = {
-                window_size:
+                self.max_attention_window_vec[0]:
                 (self.blocks_in_primary_pool, self.blocks_in_secondary_pool)
-                for window_size in set(self.max_attention_window_vec)
             }
-            logger.info(
-                f"[kv cache manager] Primary/secondary blocks for window sizes set to {blocks_per_window} for estimation dry run"
-            )
-        else:
-            if self.is_vswa:
-                # VSWA case: use C++ implementation for variable window sizes
-                if model_config is None:
-                    raise ValueError(
-                        "model_config is required for VSWA (Variable Sliding Window Attention)"
-                    )
-                assert isinstance(
-                    kv_cache_config, KvCacheConfig
-                ), "calculate_max_num_blocks_from_cpp only accepts KvCacheConfig"
-                blocks_per_window = self.calculate_max_num_blocks_from_cpp(
-                    kv_cache_config=kv_cache_config,
-                    model_config=model_config,
-                    extra_cost_memory=0,
-                )
-            else:
-                # Standard case: use original Python implementation
-                self.blocks_in_primary_pool, self.blocks_in_secondary_pool = self.calculate_max_num_blocks(
-                    kv_cache_config=kv_cache_config,
-                    head_dim=head_dim,
-                    tokens_per_block=tokens_per_block,
-                    mapping=mapping,
-                    dtype=dtype,
-                    kv_factor=self.kv_factor,
-                )
-                blocks_per_window = {
-                    self.max_attention_window_vec[0]:
-                    (self.blocks_in_primary_pool, self.blocks_in_secondary_pool)
-                }

         # Validate and adjust attention windows against their upper bounds if needed
         blocks_per_window, self.max_seq_len, self.max_attention_window_vec = self._validate_and_adjust_attention_windows(
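For reference, the non-VSWA branch above still produces a blocks_per_window mapping keyed by the first attention window size. A small sketch with hypothetical numbers showing the resulting shape:

    # Hypothetical counts; real values come from calculate_max_num_blocks or
    # calculate_max_num_blocks_from_cpp.
    max_attention_window_vec = [4096]    # single full-attention window size
    blocks_in_primary_pool = 8192        # GPU-resident KV blocks (made up)
    blocks_in_secondary_pool = 1024      # host-offload KV blocks (made up)

    # Same structure as the dict built in the hunk above:
    # {window_size: (primary_blocks, secondary_blocks)}
    blocks_per_window = {
        max_attention_window_vec[0]:
        (blocks_in_primary_pool, blocks_in_secondary_pool)
    }
    assert blocks_per_window == {4096: (8192, 1024)}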
