Commit c6830ff

Reduce the graph break overhead in the Python runtime
1 parent 56a8949 commit c6830ff

2 files changed: +16 -5 lines

py/torch_tensorrt/dynamo/_compiler.py (4 additions, 1 deletion)
@@ -873,7 +873,7 @@ def preserve_module_specs(
     trt_modules = {}
     # Iterate over all components that can be accelerated
     # Generate the corresponding TRT Module for those
-
+    trt_module = None
     for name, _ in partitioned_module.named_children():
         submodule = getattr(partitioned_module, name)
         # filter on the GraphModule

@@ -994,6 +994,9 @@ def preserve_module_specs(
             ) as f:
                 f.write(trt_module.get_layer_info())

+    if trt_module and settings.use_python_runtime:
+        trt_module.set_requires_unique_output(True)
+
     # Parse the graph I/O and store it in dryrun tracker
     parse_graph_io(gm, dryrun_tracker)

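The second hunk is the compiler-side half of this change: whenever the compilation settings select the Python runtime, every TRT module produced by the partitioner is told to hand out fresh output tensors on each call. A minimal sketch of how user code reaches that path, assuming the standard torch_tensorrt.compile entry point (ToyModel is a hypothetical placeholder):

import torch
import torch_tensorrt

# Hypothetical toy module; any network with TRT-convertible ops works.
class ToyModel(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.relu(x) + 1.0

model = ToyModel().eval().cuda()
inputs = [torch.randn(8, 8, device="cuda")]

# use_python_runtime=True makes each accelerated submodule a
# PythonTorchTensorRTModule, so the hunk above marks every one of
# them with set_requires_unique_output(True).
trt_model = torch_tensorrt.compile(
    model, ir="dynamo", inputs=inputs, use_python_runtime=True
)
print(trt_model(*inputs).shape)
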
py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py (12 additions, 4 deletions)
@@ -221,8 +221,16 @@ def __init__(
         self.use_output_allocator_outputs = False
         self.device = torch.cuda.current_device()
         self.cudagraphs_enabled = torch_tensorrt.runtime.get_cudagraphs_mode()
+        self.requires_unique_output = False
         if self.serialized_engine is not None and not self.settings.lazy_engine_init:
             self.setup_engine()
+            self.is_shape_inference_io = [
+                self.engine.is_shape_inference_io(input_name)
+                for input_name in self.input_names
+            ]
+
+    def set_requires_unique_output(self, requires_unique_output: bool) -> None:
+        self.requires_unique_output = requires_unique_output

     def get_streamable_device_memory_budget(self) -> Any:
         return self.engine.streamable_weights_size
@@ -269,10 +277,10 @@ def setup_engine(self) -> None:
         # otherwise, use the caller stream and disable stream synchronization
         self._caller_stream = torch.cuda.current_stream()
         if self._caller_stream == torch.cuda.default_stream():
-            self._engine_stream = torch.cuda.Stream()
+            self._engine_stream: torch.cuda.Stream = torch.cuda.Stream()
             self.sync_stream = True
         else:
-            self._engine_stream = self._caller_stream
+            self._engine_stream: torch.cuda.Stream = self._caller_stream
             self.sync_stream = False

         self.initialized = True
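This hunk only adds an explicit torch.cuda.Stream annotation to _engine_stream; the behavior is unchanged. For reference, the surrounding logic reads as below, sketched standalone under the assumption of a CUDA-capable device: a side stream is created only when the caller sits on the default stream, which is the one case that needs explicit synchronization.

import torch

# Standalone sketch of the stream selection in setup_engine (assumes CUDA).
caller_stream = torch.cuda.current_stream()
if caller_stream == torch.cuda.default_stream():
    # Caller is on the default stream: run engine work on a side stream
    # and synchronize it back explicitly afterwards.
    engine_stream: torch.cuda.Stream = torch.cuda.Stream()
    sync_stream = True
else:
    # Caller already provided a non-default stream: reuse it, no extra sync.
    engine_stream = caller_stream
    sync_stream = False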
@@ -396,7 +404,7 @@ def setup_input_tensors(

             # For shape tensors, we use CPU pointers and for data tensors, we use GPU pointers
             # as per TensorRT requirements
-            if self.engine.is_shape_inference_io(input_name):
+            if self.is_shape_inference_io[i]:
                 # Shape tensor inputs are casted to int64 explicitly
                 # Currently Torch CPU pointers are not working; numpy pointers are used instead
                 # to refer to underlying memory
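This is the hot-path payoff of the list built in __init__ above: engine.is_shape_inference_io(input_name) previously went through the TensorRT engine binding on every inference for every input, and the change replaces that with a plain Python list lookup. A self-contained sketch of the pattern, with a hypothetical FakeEngine standing in for the real binding:

from typing import List

class FakeEngine:
    # Hypothetical stand-in for the TensorRT engine binding; imagine
    # each call crossing the Python/C++ boundary.
    def is_shape_inference_io(self, name: str) -> bool:
        return name.startswith("shape")

class Module:
    def __init__(self, input_names: List[str]) -> None:
        self.engine = FakeEngine()
        self.input_names = input_names
        # Query the engine once at setup instead of once per inference.
        self.is_shape_inference_io = [
            self.engine.is_shape_inference_io(n) for n in input_names
        ]

    def setup_input(self, i: int) -> str:
        # Cheap list indexing on the hot path, mirroring the hunk above.
        if self.is_shape_inference_io[i]:
            return "CPU pointer (shape tensor)"
        return "GPU pointer (data tensor)"

m = Module(["shape_in", "data_in"])
print(m.setup_input(0))  # CPU pointer (shape tensor)
print(m.setup_input(1))  # GPU pointer (data tensor)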
@@ -500,7 +508,7 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
                     raise ValueError(
                         "Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes which is not currently supported."
                     )
-                if self.output_tensors is None:
+                if self.output_tensors is None or self.requires_unique_output:
                     self.output_tensors = self.create_output_tensors()
                 outputs = self.output_tensors

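Combined with the _compiler.py hunk, the policy is: cache and reuse the output tensors across calls by default, but allocate a fresh set on every call when the module was marked requires_unique_output, as the Python-runtime path is, plausibly so that a cached buffer is never overwritten while eager code after a graph break still holds it. A standalone sketch of just this policy, with create_output_tensors reduced to a stub:

import torch

class OutputReusePolicy:
    # Standalone sketch; names mirror the diff, but this is not the
    # real PythonTorchTensorRTModule class.
    def __init__(self, requires_unique_output: bool = False) -> None:
        self.requires_unique_output = requires_unique_output
        self.output_tensors = None

    def create_output_tensors(self):
        # Stub for allocating the engine's output buffers.
        return [torch.empty(4)]

    def run(self):
        # Reuse cached buffers unless a fresh set is required per call.
        if self.output_tensors is None or self.requires_unique_output:
            self.output_tensors = self.create_output_tensors()
        return self.output_tensors

reuse = OutputReusePolicy(False)
assert reuse.run() is reuse.run()        # same buffers reused across calls
unique = OutputReusePolicy(True)
assert unique.run() is not unique.run()  # fresh buffers on every call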