- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -321,7 +321,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -321,7 +321,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -321,7 +321,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -321,7 +321,7 @@
import logging
import platform
from enum import Enum
-from typing import Any, Callable, List, Optional, Sequence, Set
+from typing import Any, Callable, List, Optional, Sequence, Set, Union
import torch
import torch.fx
from torch_tensorrt._enums import dtype
-from torch_tensorrt._features import ENABLED_FEATURES
+from torch_tensorrt._features import ENABLED_FEATURES, needs_cross_compile
from torch_tensorrt._Input import Input
from torch_tensorrt.dynamo import _defaults
from torch_tensorrt.dynamo.runtime._CudaGraphsTorchTensorRTModule import (
@@ -659,7 +658,7 @@ Source code for torch_tensorrt._compile
inputs: Optional[Sequence[Input | torch.Tensor | InputTensorSpec]] = None,
arg_inputs: Optional[Sequence[Sequence[Any]]] = None,
kwarg_inputs: Optional[dict[Any, Any]] = None,
- enabled_precisions: Optional[Set[torch.dtype | dtype]] = None,
+ enabled_precisions: Optional[Set[Union[torch.dtype, dtype]]] = None,
**kwargs: Any,
) -> (
torch.nn.Module | torch.jit.ScriptModule | torch.fx.GraphModule | Callable[..., Any]
@@ -702,7 +701,7 @@ Source code for torch_tensorrt._compile
"""
input_list = inputs if inputs is not None else []
- enabled_precisions_set: Set[dtype | torch.dtype] = (
+ enabled_precisions_set: Set[Union[torch.dtype, dtype]] = (
enabled_precisions
if enabled_precisions is not None
else _defaults.ENABLED_PRECISIONS
@@ -790,13 +789,14 @@ Source code for torch_tensorrt._compile
raise RuntimeError("Module is an unknown format or the ir requested is unknown")
-[docs]def cross_compile_for_windows(
+@needs_cross_compile
+def cross_compile_for_windows(
module: torch.nn.Module,
file_path: str,
inputs: Optional[Sequence[Input | torch.Tensor]] = None,
arg_inputs: Optional[Sequence[Sequence[Any]]] = None,
kwarg_inputs: Optional[dict[Any, Any]] = None,
-enabled_precisions: Optional[Set[torch.dtype | dtype]] = None,
+enabled_precisions: Optional[Set[Union[torch.dtype, dtype]]] = None,
**kwargs: Any,
) -> None:
"""Compile a PyTorch module using TensorRT in Linux for Inference in Windows
@@ -886,7 +886,7 @@ Source code for torch_tensorrt._compile
)
dynamo_save_cross_compiled_exported_program(trt_gm, file_path)
-    logger.debug("successfully compiled and saved the module for windows")
+    logger.debug("successfully compiled and saved the module for windows")
def torch_compile(module: torch.nn.Module, **kwargs: Any) -> Any:
@@ -912,7 +912,7 @@ Source code for torch_tensorrt._compile
arg_inputs: Optional[Sequence[Sequence[Any]]] = None,
kwarg_inputs: Optional[dict[Any, Any]] = None,
ir: str = "default",
- enabled_precisions: Optional[Set[torch.dtype | dtype]] = None,
+ enabled_precisions: Optional[Set[Union[torch.dtype, dtype]]] = None,
**kwargs: Any,
) -> bytes:
"""Convert a TorchScript module method to a serialized TensorRT engine
@@ -1074,6 +1074,7 @@ Source code for torch_tensorrt._compile
kwarg_inputs: Optional[dict[str, Any]] = None,
retrace: bool = False,
pickle_protocol: int = 2,
+ **kwargs: Any,
) -> None:
"""
Save the model to disk in the specified output format.
@@ -1083,7 +1084,7 @@ Source code for torch_tensorrt._compile
inputs (torch.Tensor): Torch input tensors
arg_inputs (Tuple[Any, ...]): Same as inputs. Alias for better understanding with kwarg_inputs.
kwarg_inputs (dict[Any, ...]): Optional, kwarg inputs to the module forward function.
- output_format (str): Format to save the model. Options include exported_program | torchscript.
+ output_format (str): Format to save the model. Options include exported_program | torchscript | aot_inductor.
retrace (bool): When the module type is a fx.GraphModule, this option re-exports the graph using torch.export.export(strict=False) to save it.
This flag is experimental for now.
pickle_protocol (int): The pickle protocol to use to save the model. Default is 2. Increase this to 4 or higher for large models
@@ -1091,7 +1092,7 @@ Source code for torch_tensorrt._compile
if isinstance(module, CudaGraphsTorchTensorRTModule):
module = module.compiled_module
module_type = _parse_module_type(module)
- accepted_formats = {"exported_program", "torchscript"}
+ accepted_formats = {"exported_program", "torchscript", "aot_inductor"}
if arg_inputs is not None and not all(
isinstance(input, torch.Tensor) for input in arg_inputs
):
@@ -1114,6 +1115,10 @@ Source code for torch_tensorrt._compile
raise ValueError(
f"Provided output_format {output_format} is not supported. Supported options are exported_program | torchscript"
)
+ if output_format == "aot_inductor" and platform.system() != "Linux":
+ raise ValueError(
+ f"The AOT Inductor format is only supported on Linux, {platform.system()} is not a supported platform for this format"
+ )
if not file_path:
raise ValueError("File path cannot be empty. Please provide a valid file path")
@@ -1122,9 +1127,9 @@ Source code for torch_tensorrt._compile
"Input model is of type nn.Module. Saving nn.Module directly is not supported. Supported model types torch.jit.ScriptModule | torch.fx.GraphModule | torch.export.ExportedProgram."
)
elif module_type == _ModuleType.ts:
- if output_format == "exported_program":
+ if not all([output_format == f for f in ["exported_program", "aot_inductor"]]):
raise ValueError(
- "Provided model is a torch.jit.ScriptModule but the output_format specified is exported_program. Please verify the output_format"
+ "Provided model is a torch.jit.ScriptModule but the output_format specified is not torchscript. Other output formats are not supported"
)
else:
if arg_inputs is not None:
@@ -1142,7 +1147,22 @@ Source code for torch_tensorrt._compile
logger.warning(
"Provided model is a torch.export.ExportedProgram, inputs or arg_inputs is not necessary during save, it uses the inputs or arg_inputs provided during export and compile"
)
- torch.export.save(module, file_path)
+ if output_format == "exported_program":
+ torch.export.save(module, file_path, pickle_protocol=pickle_protocol)
+ elif output_format == "aot_inductor":
+ inductor_configs = {}
+ if "inductor_configs" in kwargs:
+ inductor_configs = kwargs["inductor_configs"]
+
+ torch._inductor.aoti_compile_and_package(
+ exp_program,
+ inductor_configs=inductor_configs,
+ package_path=file_path,
+ )
+ else:
+ raise RuntimeError(
+ "Attempted to serialize an exported program with an unsupported format. Exported programs support exported_program and aot_inductor"
+ )
elif module_type == _ModuleType.fx:
# The module type is torch.fx.GraphModule
if output_format == "torchscript":
@@ -1159,9 +1179,24 @@ Source code for torch_tensorrt._compile
"Provided model is a torch.fx.GraphModule and retrace is False, inputs or arg_inputs is not necessary during save."
)
exp_program = export(module)
- torch.export.save(
- exp_program, file_path, pickle_protocol=pickle_protocol
- )
+ if output_format == "exported_program":
+ torch.export.save(
+ exp_program, file_path, pickle_protocol=pickle_protocol
+ )
+ elif output_format == "aot_inductor":
+ inductor_configs = {}
+ if "inductor_configs" in kwargs:
+ inductor_configs = kwargs["inductor_configs"]
+
+ torch._inductor.aoti_compile_and_package(
+ exp_program,
+ inductor_configs=inductor_configs,
+ package_path=file_path,
+ )
+ else:
+ raise RuntimeError(
+ "Attempted to serialize an exported program with an unsupported format. Exported programs support exported_program and aot_inductor"
+ )
else:
if arg_inputs is None:
raise ValueError(
@@ -1173,9 +1208,25 @@ Source code for torch_tensorrt._compile
kwargs=kwarg_inputs,
strict=False,
)
- torch.export.save(
- exp_program, file_path, pickle_protocol=pickle_protocol
- )
+
+ if output_format == "exported_program":
+ torch.export.save(
+ exp_program, file_path, pickle_protocol=pickle_protocol
+ )
+ elif output_format == "aot_inductor":
+ inductor_configs = {}
+ if "inductor_configs" in kwargs:
+ inductor_configs = kwargs["inductor_configs"]
+
+ torch._inductor.aoti_compile_and_package(
+ exp_program,
+ inductor_configs=inductor_configs,
+ package_path=file_path,
+ )
+ else:
+ raise RuntimeError(
+ "Attempted to serialize an exported program with an unsupported format. Exported programs support exported_program and aot_inductor"
+ )
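A minimal usage sketch of the new `aot_inductor` output format added to `torch_tensorrt.save` above (the model, file names, and inductor config key are illustrative, not taken from this diff):

```python
import torch
import torch_tensorrt

class TinyModel(torch.nn.Module):  # placeholder model for illustration
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.relu(x) + 1

model = TinyModel().eval().cuda()
inputs = [torch.randn(2, 4).cuda()]
trt_gm = torch_tensorrt.compile(model, ir="dynamo", arg_inputs=inputs)

# Existing behavior: serialize as an ExportedProgram
torch_tensorrt.save(trt_gm, "trt_model.ep", arg_inputs=inputs)

# New: package with AOT Inductor (Linux only); inductor_configs is forwarded via **kwargs
# to torch._inductor.aoti_compile_and_package
torch_tensorrt.save(
    trt_gm,
    "trt_model.pt2",
    output_format="aot_inductor",
    arg_inputs=inputs,
    inductor_configs={"max_autotune": True},
)
```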
diff --git a/docs/_modules/torch_tensorrt/_enums.html b/docs/_modules/torch_tensorrt/_enums.html
index 6fc65ba01b..709a2fe055 100644
--- a/docs/_modules/torch_tensorrt/_enums.html
+++ b/docs/_modules/torch_tensorrt/_enums.html
@@ -9,7 +9,7 @@
- torch_tensorrt._enums — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ torch_tensorrt._enums — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -290,7 +290,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -321,7 +321,7 @@
Getting Started
User Guide
@@ -373,9 +373,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile frontend
-Compiling GPT2 using the dynamo backend
-Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
@@ -566,6 +565,12 @@
Source code for torch_tensorrt._enums
f8 = auto()
"""8 bit floating-point number, equivalent to ``dtype.fp8`` and ``dtype.float8``
+ :meta hide-value:
+ """
+
+ f4 = auto()
+ """4 bit floating-point number, equivalent to ``dtype.fp4`` and ``dtype.float4``
+
:meta hide-value:
"""
@@ -580,6 +585,9 @@ Source code for torch_tensorrt._enums
float8 = f8
fp8 = f8
+ float4 = f4
+ fp4 = f4
+
half = f16
fp16 = f16
float16 = f16
@@ -651,6 +659,8 @@ Source code for torch_tensorrt._enums
return dtype.i32
elif t == torch.float8_e4m3fn:
return dtype.f8
+ elif t == torch.float4_e2m1fn_x2:
+ return dtype.f4
elif t == torch.half:
return dtype.f16
elif t == torch.float:
@@ -677,6 +687,8 @@ Source code for torch_tensorrt._enums
return dtype.i8
elif t == trt.DataType.FP8:
return dtype.f8
+ elif t == trt.DataType.FP4:
+ return dtype.fp4
elif t == trt.DataType.INT32:
return dtype.i32
elif t == trt.DataType.INT64:
@@ -846,6 +858,8 @@ Source code for torch_tensorrt._enums
return torch.long
elif self == dtype.f8:
return torch.float8_e4m3fn
+ elif self == dtype.f4:
+ return torch.float4_e2m1fn_x2
elif self == dtype.f16:
return torch.half
elif self == dtype.f32:
@@ -883,6 +897,8 @@ Source code for torch_tensorrt._enums
return trt.DataType.BOOL
elif self == dtype.bf16:
return trt.DataType.BF16
+ elif self == dtype.f4:
+ return trt.DataType.FP4
elif use_default:
return trt.DataType.FLOAT
else:
@@ -899,6 +915,8 @@ Source code for torch_tensorrt._enums
return np.int64
elif self == dtype.f16:
return np.float16
+ elif self == dtype.f4:
+ return np.float4_e2m1fn_x2
elif self == dtype.f32:
return np.float32
elif self == dtype.f64:
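A short sketch of the new `dtype.f4` round-trip shown above (assumes a PyTorch build that exposes `torch.float4_e2m1fn_x2` and a TensorRT build with `trt.DataType.FP4`):

```python
import torch
from torch_tensorrt import dtype

# torch.float4_e2m1fn_x2 packs two FP4 values per element and now maps to dtype.f4
f4 = dtype._from(torch.float4_e2m1fn_x2)
assert f4 == dtype.f4 == dtype.fp4 == dtype.float4

# Round-trip back to the framework dtype
assert f4.to(torch.dtype) == torch.float4_e2m1fn_x2
```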
diff --git a/docs/_modules/torch_tensorrt/dynamo/_compiler.html b/docs/_modules/torch_tensorrt/dynamo/_compiler.html
index 5a74548a0d..6523ccfcb4 100644
--- a/docs/_modules/torch_tensorrt/dynamo/_compiler.html
+++ b/docs/_modules/torch_tensorrt/dynamo/_compiler.html
@@ -9,7 +9,7 @@
- torch_tensorrt.dynamo._compiler — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ torch_tensorrt.dynamo._compiler — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -290,7 +290,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -321,7 +321,7 @@
Getting Started
User Guide
@@ -373,9 +373,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile frontend
-Compiling GPT2 using the dynamo backend
-Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
@@ -491,6 +490,7 @@ Source code for torch_tensorrt.dynamo._compiler
import collections.abc
import logging
+import os
import platform
import warnings
from typing import Any, Collection, List, Optional, Sequence, Set, Tuple, Union
@@ -500,6 +500,7 @@ Source code for torch_tensorrt.dynamo._compiler
from torch.fx.node import Target
from torch_tensorrt._Device import Device
from torch_tensorrt._enums import EngineCapability, dtype
+from torch_tensorrt._features import needs_cross_compile
from torch_tensorrt._Input import Input
from torch_tensorrt.dynamo import _defaults, partitioning
from torch_tensorrt.dynamo._DryRunTracker import (
@@ -520,6 +521,8 @@ Source code for torch_tensorrt.dynamo._compiler
from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
    DYNAMO_CONVERTERS as CONVERTERS,
)
+from torch_tensorrt.dynamo.debug._DebuggerConfig import DebuggerConfig
+from torch_tensorrt.dynamo.debug._supports_debugger import fn_supports_debugger
from torch_tensorrt.dynamo.lowering import (
get_decompositions,
post_lowering,
@@ -531,7 +534,6 @@ Source code for torch_tensorrt.dynamo._compiler
    get_output_metadata,
    parse_graph_io,
    prepare_inputs,
-    set_log_level,
to_torch_device,
to_torch_tensorrt_device,
)
@@ -539,7 +541,8 @@ Source code for torch_tensorrt.dynamo._compiler
logger = logging.getLogger(__name__)
-[docs]def cross_compile_for_windows(
+@needs_cross_compile
+def cross_compile_for_windows(
exported_program: ExportedProgram,
inputs: Optional[Sequence[Sequence[Any]]] = None,
*,
@@ -553,7 +556,6 @@ Source code for torch_tensorrt.dynamo._compiler
        Set[Union[torch.dtype, dtype]], Tuple[Union[torch.dtype, dtype]]
    ] = _defaults.ENABLED_PRECISIONS,
    engine_capability: EngineCapability = _defaults.ENGINE_CAPABILITY,
-    debug: bool = _defaults.DEBUG,
num_avg_timing_iters: int = _defaults.NUM_AVG_TIMING_ITERS,
workspace_size: int = _defaults.WORKSPACE_SIZE,
dla_sram_size: int = _defaults.DLA_SRAM_SIZE,
@@ -627,7 +629,6 @@ Source code for torch_tensorrt.dynamo._compiler
assume_dynamic_shape_support (bool): Setting this to true enables the converters work for both dynamic and static shapes. Default: False
sparse_weights (bool): Enable sparsity for convolution and fully connected layers.
enabled_precision (Set(Union(torch.dtype, torch_tensorrt.dtype))): The set of datatypes that TensorRT can use when selecting kernels
-debug (bool): Enable debuggable engine
capability (torch_tensorrt.EngineCapability): Restrict kernel selection to safe gpu kernels or safe dla kernels
num_avg_timing_iters (int): Number of averaging timing iterations used to select kernels
workspace_size (int): Maximum size of workspace given to TensorRT
@@ -674,8 +675,12 @@ Source code for torch_tensorrt.dynamo._compiler
f"Cross compile for windows is only supported on x86-64 Linux architecture, current platform: {platform.system()=}, {platform.architecture()[0]=}"
)
-    if debug:
-        set_log_level(logger.parent, logging.DEBUG)
+    if kwargs.get("debug", False):
+        warnings.warn(
+            "`debug` is deprecated. Please use `with torch_tensorrt.dynamo.Debugger(...)` to wrap your compilation call to enable debugging functionality.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
if "truncate_long_and_double" in kwargs.keys():
if truncate_double is not _defaults.TRUNCATE_DOUBLE:
@@ -741,10 +746,11 @@ Source code for torch_tensorrt.dynamo._compiler
    if use_explicit_typing:
        if len(enabled_precisions) != 1 or not any(
-            x in enabled_precisions for x in {torch.float32, dtype.f32}
+            x in enabled_precisions
+            for x in {torch.float32, dtype.f32, torch.float4_e2m1fn_x2, dtype.f4}
        ):
            raise AssertionError(
-                f"When use_explicit_typing is enabled, only torch.float32 is allowed in the enabled_precisions but found {enabled_precisions}"
+                f"use_explicit_typing was set to True, however found that enabled_precisions was also specified (saw: {enabled_precisions}, expected: dtype.f32, dtype.f4). enabled_precisions should not be used when use_explicit_typing=True"
            )
if use_fp32_acc:
@@ -786,7 +792,6 @@ Source code for torch_tensorrt.dynamo._compiler
        "enabled_precisions": (
            enabled_precisions if enabled_precisions else _defaults.ENABLED_PRECISIONS
        ),
-        "debug": debug,
"device": device,
"assume_dynamic_shape_support": assume_dynamic_shape_support,
"workspace_size": workspace_size,
@@ -871,7 +876,7 @@ Source code for torch_tensorrt.dynamo._compiler
        trt_kwarg_inputs,
        settings,
    )
-    return trt_gm
+    return trt_gm
[docs]def compile(
@@ -888,7 +893,6 @@ Source code for torch_tensorrt.dynamo._compiler
        Set[Union[torch.dtype, dtype]], Tuple[Union[torch.dtype, dtype]]
    ] = _defaults.ENABLED_PRECISIONS,
    engine_capability: EngineCapability = _defaults.ENGINE_CAPABILITY,
-    debug: bool = _defaults.DEBUG,
num_avg_timing_iters: int = _defaults.NUM_AVG_TIMING_ITERS,
workspace_size: int = _defaults.WORKSPACE_SIZE,
dla_sram_size: int = _defaults.DLA_SRAM_SIZE,
@@ -964,7 +968,6 @@ Source code for torch_tensorrt.dynamo._compiler
assume_dynamic_shape_support (bool): Setting this to true enables the converters work for both dynamic and static shapes. Default: False
sparse_weights (bool): Enable sparsity for convolution and fully connected layers.
enabled_precision (Set(Union(torch.dtype, torch_tensorrt.dtype))): The set of datatypes that TensorRT can use when selecting kernels
-debug (bool): Enable debuggable engine
capability (torch_tensorrt.EngineCapability): Restrict kernel selection to safe gpu kernels or safe dla kernels
num_avg_timing_iters (int): Number of averaging timing iterations used to select kernels
workspace_size (int): Maximum size of workspace given to TensorRT
@@ -1007,8 +1010,13 @@ Source code for torch_tensorrt.dynamo._compiler
torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
"""
-    if debug:
-        set_log_level(logger.parent, logging.DEBUG)
+    if kwargs.get("debug", False):
+        warnings.warn(
+            "`debug` is deprecated. Please use `with torch_tensorrt.dynamo.Debugger(...)` to wrap your compilation call to enable debugging functionality",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
if "truncate_long_and_double" in kwargs.keys():
if truncate_double is not _defaults.TRUNCATE_DOUBLE:
raise ValueError(
@@ -1072,10 +1080,11 @@ Source code for torch_tensorrt.dynamo._compiler
    if use_explicit_typing:
        if len(enabled_precisions) != 1 or not any(
-            x in enabled_precisions for x in {torch.float32, dtype.f32}
+            x in enabled_precisions
+            for x in {torch.float32, dtype.f32, torch.float4_e2m1fn_x2, dtype.f4}
        ):
            raise AssertionError(
-                f"When use_explicit_typing is enabled, only torch.float32 is allowed in the enabled_precisions but found {enabled_precisions}"
+                f"use_explicit_typing was set to True, however found that enabled_precisions was also specified (saw: {enabled_precisions}, expected: dtype.f32, dtype.f4). enabled_precisions should not be used when use_explicit_typing=True"
            )
if use_fp32_acc:
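With the relaxed check above, FP4 can sit in `enabled_precisions` when strong typing is requested; a hedged sketch (`exp_program` and `example_inputs` are placeholders for an exported model and its inputs):

```python
import torch
import torch_tensorrt

trt_gm = torch_tensorrt.dynamo.compile(
    exp_program,                                  # torch.export.ExportedProgram (placeholder)
    arg_inputs=example_inputs,
    use_explicit_typing=True,                     # strong typing: kernel precisions follow the graph
    enabled_precisions={torch.float4_e2m1fn_x2},  # also accepted: {dtype.f4} or {torch.float32}
)
```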
@@ -1130,7 +1139,6 @@ Source code for torch_tensorrt.dynamo._compiler
        "enabled_precisions": (
            enabled_precisions if enabled_precisions else _defaults.ENABLED_PRECISIONS
        ),
-        "debug": debug,
"device": device,
"assume_dynamic_shape_support": assume_dynamic_shape_support,
"workspace_size": workspace_size,
@@ -1180,6 +1188,7 @@ Source code for torch_tensorrt.dynamo._compiler
)
gm = exported_program.module()
+
# Move the weights in the state_dict to CPU
logger.debug("Input graph: " + str(gm.graph))
# Apply lowering on the graph module
@@ -1204,12 +1213,15 @@ Source code for torch_tensorrt.dynamo._compiler
    return trt_gm
+@fn_supports_debugger
def compile_module(
    gm: torch.fx.GraphModule,
    sample_arg_inputs: Sequence[Input],
    sample_kwarg_inputs: Optional[dict[Any, Any]] = None,
    settings: CompilationSettings = CompilationSettings(),
    engine_cache: Optional[BaseEngineCache] = None,
+    *,
+    _debugger_config: Optional[DebuggerConfig] = None,
) -> torch.fx.GraphModule:
"""Compile a traced FX module
@@ -1233,7 +1245,7 @@ Source code for torch_tensorrt.dynamo._compiler
    # Check the number of supported operations in the graph
    num_supported_ops, total_ops = partitioning.get_graph_converter_support(
-        gm, settings.debug, settings.torch_executed_ops
+        gm, settings.torch_executed_ops
)
dryrun_tracker.total_ops_in_graph = total_ops
@@ -1277,6 +1289,28 @@ Source code for torch_tensorrt.dynamo._compiler
            "Some nodes do not have metadata (shape and dtype information). This could lead to problems sometimes if the graph has PyTorch and TensorRT segments."
        )
+    # Store the original input spec for later use
+    original_in_spec = getattr(gm, "_in_spec", None)
+    original_out_spec = getattr(gm, "_out_spec", None)
+
+    # Function to preserve and restore module specs
+    def preserve_module_specs(
+        in_spec: Any, out_spec: Any, target_module: torch.fx.GraphModule
+    ) -> None:
+        """
+        Applies input and output specs to the target module.
+
+        Args:
+            in_spec: The input spec to apply
+            out_spec: The output spec to apply
+            target_module: The module to apply specs to
+        """
+        # Apply specs to target module
+        if in_spec is not None:
+            target_module._in_spec = in_spec
+        if out_spec is not None:
+            target_module._out_spec = out_spec
+
# Partition module into components that can be TRT-accelerated
fast_partitioner_failed = False
# If specified, try using the fast partitioner and fall back to the global one on failure
@@ -1285,7 +1319,6 @@ Source code for torch_tensorrt.dynamo._compiler
            logger.info("Partitioning the graph via the fast partitioner")
            partitioned_module, supported_ops = partitioning.fast_partition(
                gm,
-                verbose=settings.debug,
min_block_size=settings.min_block_size,
torch_executed_ops=settings.torch_executed_ops,
require_full_compilation=settings.require_full_compilation,
@@ -1306,7 +1339,6 @@ Source code for torch_tensorrt.dynamo._compiler
        logger.info("Partitioning the graph via the global partitioner")
        partitioned_module, supported_ops = partitioning.global_partition(
            gm,
-            verbose=settings.debug,
min_block_size=settings.min_block_size,
torch_executed_ops=settings.torch_executed_ops,
require_full_compilation=settings.require_full_compilation,
@@ -1324,6 +1356,7 @@ Source code for torch_tensorrt.dynamo._compiler
            continue
        submodule_node_dict[node.name] = node
+    preserve_module_specs(original_in_spec, original_out_spec, partitioned_module)
# Store TRT replicas of Torch subgraphs
trt_modules = {}
# Iterate over all components that can be accelerated
@@ -1401,7 +1434,7 @@ Source code for torch_tensorrt.dynamo._compiler
            parse_graph_io(submodule, subgraph_data)
            dryrun_tracker.tensorrt_graph_count += 1
            dryrun_tracker.per_subgraph_data.append(subgraph_data)
-
+            torch.cuda.empty_cache()
# Create TRT engines from submodule
if not settings.dryrun:
trt_module = convert_module(
@@ -1414,6 +1447,41 @@ Source code for torch_tensorrt.dynamo._compiler
            trt_modules[name] = trt_module
+            if _debugger_config:
+
+                if _debugger_config.save_engine_profile:
+                    if settings.use_python_runtime:
+                        if _debugger_config.profile_format != "cudagraph":
+                            raise ValueError(
+                                "Profiling with TREX can only be enabled when using the C++ runtime. Python runtime profiling only support cudagraph visualization."
+                            )
+                        else:
+                            trt_module.enable_profiling()
+                    else:
+                        if _debugger_config.profile_format == "cudagraph":
+                            raise ValueError(
+                                "Profiling with Cudagraph can only be enabled when using the Python runtime. C++ runtime profiling only support TREX/Perfetto visualization."
+                            )
+                        else:
+                            path = os.path.join(
+                                _debugger_config.logging_dir,
+                                "engine_visualization_profile",
+                            )
+                            os.makedirs(path, exist_ok=True)
+                            trt_module.enable_profiling(
+                                profiling_results_dir=path,
+                                profile_format=_debugger_config.profile_format,
+                            )
+
+                if _debugger_config.save_layer_info:
+                    with open(
+                        os.path.join(
+                            _debugger_config.logging_dir, "engine_layer_info.json"
+                        ),
+                        "w",
+                    ) as f:
+                        f.write(trt_module.get_layer_info())
+
# Parse the graph I/O and store it in dryrun tracker
parse_graph_io(gm, dryrun_tracker)
@@ -1438,10 +1506,9 @@ Source code for torch_tensorrt.dynamo._compiler
    *,
    arg_inputs: Optional[Sequence[Sequence[Any]]] = None,
    kwarg_inputs: Optional[dict[Any, Any]] = None,
-    enabled_precisions: (
-        Set[torch.dtype | dtype] | Tuple[torch.dtype | dtype]
-    ) = _defaults.ENABLED_PRECISIONS,
-    debug: bool = _defaults.DEBUG,
+    enabled_precisions: Union[
+        Set[Union[torch.dtype, dtype]], Tuple[Union[torch.dtype, dtype]]
+    ] = _defaults.ENABLED_PRECISIONS,
assume_dynamic_shape_support: bool = _defaults.ASSUME_DYNAMIC_SHAPE_SUPPORT,
workspace_size: int = _defaults.WORKSPACE_SIZE,
min_block_size: int = _defaults.MIN_BLOCK_SIZE,
@@ -1503,7 +1570,6 @@ Source code for torch_tensorrt.dynamo._compiler
torch.randn((1, 3, 224, 244)) # Use an example tensor and let torch_tensorrt infer settings
]
enabled_precisions (Optional[Set[torch.dtype | _enums.dtype]]): The set of datatypes that TensorRT can use
-debug (bool): Whether to print out verbose debugging information
workspace_size (int): Workspace TRT is allowed to use for the module (0 is default)
min_block_size (int): Minimum number of operators per TRT-Engine Block
torch_executed_ops (Set[str]): Set of operations to run in Torch, regardless of converter coverage
@@ -1543,8 +1609,12 @@ Source code for torch_tensorrt.dynamo._compiler
    Returns:
        bytes: Serialized TensorRT engine, can either be saved to a file or deserialized via TensorRT APIs
    """
-    if debug:
-        set_log_level(logger.parent, logging.DEBUG)
+    if kwargs.get("debug", False):
+        warnings.warn(
+            "`debug` is deprecated. Please use `with torch_tensorrt.dynamo.Debugger(...)` to wrap your compilation call to enable debugging functionality.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
if "truncate_long_and_double" in kwargs.keys():
if truncate_double is not _defaults.TRUNCATE_DOUBLE:
@@ -1628,7 +1698,6 @@ Source code for torch_tensorrt.dynamo._compiler
    compilation_options = {
        "assume_dynamic_shape_support": assume_dynamic_shape_support,
        "enabled_precisions": enabled_precisions,
-        "debug": debug,
"workspace_size": workspace_size,
"min_block_size": min_block_size,
"torch_executed_ops": torch_executed_ops,
@@ -1712,7 +1781,8 @@ Source code for torch_tensorrt.dynamo._compiler
    return serialized_engine
-[docs]def save_cross_compiled_exported_program(
+@needs_cross_compile
+def save_cross_compiled_exported_program(
gm: torch.fx.GraphModule,
file_path: str,
) -> None:
@@ -1730,7 +1800,7 @@ Source code for torch_tensorrt.dynamo._compiler
    exp_program = export(gm, cross_compile_module=True)
    torch.export.save(exp_program, file_path)
-    logger.debug(f"successfully saved the module for windows at {file_path}")
+    logger.debug(f"successfully saved the module for windows at {file_path}")
[docs]def load_cross_compiled_exported_program(file_path: str = "") -> Any:
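Since `debug=True` is deprecated throughout this file, the replacement workflow is the `Debugger` context manager; a rough sketch whose keyword names mirror the `DebuggerConfig` fields used in `compile_module` (they are assumptions, not verified against the `Debugger` signature), with `exp_program` and `example_inputs` as placeholders:

```python
import torch_tensorrt

with torch_tensorrt.dynamo.Debugger(
    log_level="debug",
    logging_dir="/tmp/torch_tensorrt_debug",
    save_engine_profile=True,
    profile_format="trex",      # "cudagraph" is the only option with the Python runtime
    save_layer_info=True,
):
    trt_gm = torch_tensorrt.dynamo.compile(exp_program, arg_inputs=example_inputs)
```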
diff --git a/docs/_modules/torch_tensorrt/dynamo/_exporter.html b/docs/_modules/torch_tensorrt/dynamo/_exporter.html
index 738150945c..d9ed0f4997 100644
--- a/docs/_modules/torch_tensorrt/dynamo/_exporter.html
+++ b/docs/_modules/torch_tensorrt/dynamo/_exporter.html
@@ -9,7 +9,7 @@
- torch_tensorrt.dynamo._exporter — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ torch_tensorrt.dynamo._exporter — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -290,7 +290,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -321,7 +321,7 @@
Getting Started
User Guide
@@ -373,9 +373,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile frontend
-Compiling GPT2 using the dynamo backend
-Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
diff --git a/docs/_modules/torch_tensorrt/dynamo/_settings.html b/docs/_modules/torch_tensorrt/dynamo/_settings.html
index f1478a4457..9ea9f0838b 100644
--- a/docs/_modules/torch_tensorrt/dynamo/_settings.html
+++ b/docs/_modules/torch_tensorrt/dynamo/_settings.html
@@ -9,7 +9,7 @@
- torch_tensorrt.dynamo._settings — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ torch_tensorrt.dynamo._settings — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -290,7 +290,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -321,7 +321,7 @@
Getting Started
User Guide
@@ -373,9 +373,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile frontend
-Compiling GPT2 using the dynamo backend
-Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
@@ -488,7 +487,7 @@ Source code for torch_tensorrt.dynamo._settings
from dataclasses import dataclass, field
-from typing import Collection, Optional, Set, Tuple, Union
+from typing import Any, Collection, Optional, Set, Tuple, Union
from torch.fx.node import Target
from torch_tensorrt._Device import Device
@@ -496,7 +495,6 @@ Source code for torch_tensorrt.dynamo._settings
from torch_tensorrt.dynamo._defaults import (
    ASSUME_DYNAMIC_SHAPE_SUPPORT,
    CACHE_BUILT_ENGINES,
-    DEBUG,
DISABLE_TF32,
DLA_GLOBAL_DRAM_SIZE,
DLA_LOCAL_DRAM_SIZE,
@@ -590,7 +588,6 @@ Source code for torch_tensorrt.dynamo._settings
    """
    enabled_precisions: Set[dtype] = field(default_factory=lambda: ENABLED_PRECISIONS)
-    debug: bool = DEBUG
workspace_size: int = WORKSPACE_SIZE
min_block_size: int = MIN_BLOCK_SIZE
torch_executed_ops: Collection[Target] = field(default_factory=set)
@@ -630,7 +627,22 @@ Source code for torch_tensorrt.dynamo._settings
    tiling_optimization_level: str = TILING_OPTIMIZATION_LEVEL
    l2_limit_for_tiling: int = L2_LIMIT_FOR_TILING
    use_distributed_mode_trace: bool = USE_DISTRIBUTED_MODE_TRACE
-    offload_module_to_cpu: bool = OFFLOAD_MODULE_TO_CPU
+    offload_module_to_cpu: bool = OFFLOAD_MODULE_TO_CPU
+
+    def __getstate__(self) -> dict[str, Any]:
+        from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
+            ConverterRegistry,
+        )
+
+        state = self.__dict__.copy()
+        state["torch_executed_ops"] = {
+            op if isinstance(op, str) else ConverterRegistry.qualified_name_or_str(op)
+            for op in state["torch_executed_ops"]
+        }
+        return state
+
+    def __setstate__(self, state: dict[str, Any]) -> None:
+        self.__dict__.update(state)
_SETTINGS_TO_BE_ENGINE_INVARIANT = (
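The `__getstate__`/`__setstate__` pair above keeps a settings object holding FX `Target`s picklable (for example when settings are hashed or stored alongside a cached engine); a small sketch:

```python
import pickle

import torch
from torch_tensorrt.dynamo._settings import CompilationSettings

# torch_executed_ops may contain non-picklable Target objects (e.g. OpOverload);
# __getstate__ converts them to qualified-name strings before pickling.
settings = CompilationSettings(torch_executed_ops={torch.ops.aten.sub.Tensor})
restored = pickle.loads(pickle.dumps(settings))
print(restored.torch_executed_ops)  # string form, e.g. {'torch.ops.aten.sub.Tensor'}
```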
diff --git a/docs/_modules/torch_tensorrt/dynamo/_tracer.html b/docs/_modules/torch_tensorrt/dynamo/_tracer.html
index 3a06cd06ba..46a5d3ac24 100644
--- a/docs/_modules/torch_tensorrt/dynamo/_tracer.html
+++ b/docs/_modules/torch_tensorrt/dynamo/_tracer.html
@@ -9,7 +9,7 @@
- torch_tensorrt.dynamo._tracer — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ torch_tensorrt.dynamo._tracer — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -290,7 +290,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -321,7 +321,7 @@
Getting Started
User Guide
@@ -373,9 +373,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile frontend
-Compiling GPT2 using the dynamo backend
-Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
@@ -496,8 +495,8 @@ Source code for torch_tensorrt.dynamo._tracer
import torch
from torch.export import Dim, export
from torch_tensorrt._Input import Input
-from torch_tensorrt.dynamo._defaults import DEBUG, default_device
-from torch_tensorrt.dynamo.utils import get_torch_inputs, set_log_level, to_torch_device
+from torch_tensorrt.dynamo._defaults import default_device
+from torch_tensorrt.dynamo.utils import get_torch_inputs, to_torch_device
logger = logging.getLogger(__name__)
@@ -559,10 +558,6 @@ Source code for torch_tensorrt.dynamo._tracer
if kwarg_inputs is None:
kwarg_inputs = {}
- debug = kwargs.get("debug", DEBUG)
- if debug:
- set_log_level(logger.parent, logging.DEBUG)
-
device = to_torch_device(kwargs.get("device", default_device()))
torch_arg_inputs = get_torch_inputs(arg_inputs, device)
torch_kwarg_inputs = get_torch_inputs(kwarg_inputs, device)
@@ -574,6 +569,7 @@ Source code for torch_tensorrt.dynamo._tracer
tuple(torch_arg_inputs),
kwargs=torch_kwarg_inputs,
dynamic_shapes=dynamic_shapes,
+ strict=kwargs.get("strict", False),
)
return exp_program
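The tracer now forwards a `strict` kwarg to `torch.export.export`, defaulting to non-strict export; a brief sketch (`model` is a placeholder `nn.Module`):

```python
import torch
import torch_tensorrt

exp_program = torch_tensorrt.dynamo.trace(
    model,
    inputs=[torch.randn(1, 3, 224, 224).cuda()],
    strict=True,  # opt back into strict export tracing; omit to keep the new default (False)
)
```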
diff --git a/docs/_modules/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.html b/docs/_modules/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.html
index 12830cd4ee..fc2dbe8120 100644
--- a/docs/_modules/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.html
+++ b/docs/_modules/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.html
@@ -9,7 +9,7 @@
- torch_tensorrt.dynamo.runtime._MutableTorchTensorRTModule — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ torch_tensorrt.dynamo.runtime._MutableTorchTensorRTModule — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -290,7 +290,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -321,7 +321,7 @@
Getting Started
User Guide
@@ -373,9 +373,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile frontend
-Compiling GPT2 using the dynamo backend
-Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
@@ -489,21 +488,23 @@ Source code for torch_tensorrt.dynamo.runtime._MutableTorchTensorRTModule
import inspect
import logging
+import warnings
from copy import deepcopy
from enum import Enum, auto
-from typing import Any, Collection, Dict, Iterator, List, Optional, Set, Union
+from typing import Any, Dict, Iterator, Optional, Set, Union
import numpy as np
import torch
-from torch.fx.node import Target
+import torch_tensorrt
+from torch.export._trace import _export
from torch_tensorrt._Device import Device
-from torch_tensorrt._enums import EngineCapability, dtype
+from torch_tensorrt._enums import dtype
from torch_tensorrt.dynamo import _defaults
from torch_tensorrt.dynamo._compiler import compile as dynamo_compile
from torch_tensorrt.dynamo._refit import refit_module_weights
-from torch_tensorrt.dynamo._settings import CompilationSettings
from torch_tensorrt.dynamo.utils import (
check_output_equal,
+ deallocate_module,
to_torch_device,
to_torch_tensorrt_device,
)
@@ -552,35 +553,12 @@ Source code for torch_tensorrt.dynamo.runtime._MutableTorchTensorRTModule
pytorch_model: torch.nn.Module,
*,
device: Optional[Union[Device, torch.device, str]] = _defaults.DEVICE,
- disable_tf32: bool = _defaults.DISABLE_TF32,
- assume_dynamic_shape_support: bool = _defaults.ASSUME_DYNAMIC_SHAPE_SUPPORT,
- sparse_weights: bool = _defaults.SPARSE_WEIGHTS,
- enabled_precisions: Set[
- Union[torch.dtype, dtype]
- ] = _defaults.ENABLED_PRECISIONS,
- engine_capability: EngineCapability = _defaults.ENGINE_CAPABILITY,
- immutable_weights: bool = False,
- debug: bool = _defaults.DEBUG,
- num_avg_timing_iters: int = _defaults.NUM_AVG_TIMING_ITERS,
- workspace_size: int = _defaults.WORKSPACE_SIZE,
- dla_sram_size: int = _defaults.DLA_SRAM_SIZE,
- dla_local_dram_size: int = _defaults.DLA_LOCAL_DRAM_SIZE,
- dla_global_dram_size: int = _defaults.DLA_GLOBAL_DRAM_SIZE,
- truncate_double: bool = _defaults.TRUNCATE_DOUBLE,
- require_full_compilation: bool = _defaults.REQUIRE_FULL_COMPILATION,
- min_block_size: int = _defaults.MIN_BLOCK_SIZE,
- torch_executed_ops: Optional[Collection[Target]] = None,
- torch_executed_modules: Optional[List[str]] = None,
- pass_through_build_failures: bool = _defaults.PASS_THROUGH_BUILD_FAILURES,
- max_aux_streams: Optional[int] = _defaults.MAX_AUX_STREAMS,
- version_compatible: bool = _defaults.VERSION_COMPATIBLE,
- optimization_level: Optional[int] = _defaults.OPTIMIZATION_LEVEL,
use_python_runtime: bool = _defaults.USE_PYTHON_RUNTIME,
- use_fast_partitioner: bool = _defaults.USE_FAST_PARTITIONER,
- enable_experimental_decompositions: bool = _defaults.ENABLE_EXPERIMENTAL_DECOMPOSITIONS,
- dryrun: bool = _defaults.DRYRUN,
- hardware_compatible: bool = _defaults.HARDWARE_COMPATIBLE,
- timing_cache_path: str = _defaults.TIMING_CACHE_PATH,
+ immutable_weights: bool = False,
+ strict: bool = True,
+ allow_complex_guards_as_runtime_asserts: bool = False,
+ weight_streaming_budget: Optional[int] = None,
+ enabled_precisions: Optional[Set[Union[torch.dtype, dtype]]] = None,
**kwargs: Any,
) -> None:
"""
@@ -598,7 +576,6 @@ Source code for torch_tensorrt.dynamo.runtime._MutableTorchTensorRTModule
sparse_weights (bool): Enable sparsity for convolution and fully connected layers.
enabled_precision (Set(Union(torch.dtype, torch_tensorrt.dtype))): The set of datatypes that TensorRT can use when selecting kernels
immutable_weights (bool): Build non-refittable engines. This is useful for some layers that are not refittable.
- debug (bool): Enable debuggable engine
capability (torch_tensorrt.EngineCapability): Restrict kernel selection to safe gpu kernels or safe dla kernels
num_avg_timing_iters (int): Number of averaging timing iterations used to select kernels
workspace_size (int): Maximum size of workspace given to TensorRT
@@ -622,6 +599,7 @@ Source code for torch_tensorrt.dynamo.runtime._MutableTorchTensorRTModule
hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer)
timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation
lazy_engine_init (bool): Defer setting up engines until the compilation of all engines is complete. Can allow larger models with multiple graph breaks to compile but can lead to oversubscription of GPU memory at runtime.
+ enabled_precisions (Set(Union(torch.dtype, torch_tensorrt.dtype))): The set of datatypes that TensorRT can use when selecting kernels
**kwargs: Any,
Returns:
MutableTorchTensorRTModule
@@ -643,53 +621,38 @@ Source code for torch_tensorrt.dynamo.runtime._MutableTorchTensorRTModule
self.exp_program: Any = None
self.arg_inputs: tuple[Any, ...] = tuple()
self.kwarg_inputs: dict[str, Any] = {}
- device = to_torch_tensorrt_device(device)
- enabled_precisions = {dtype._from(p) for p in enabled_precisions}
+ self.additional_settings = kwargs
+ self.strict = strict
+ self.allow_complex_guards_as_runtime_asserts = (
+ allow_complex_guards_as_runtime_asserts
+ )
+ self.use_python_runtime = use_python_runtime
+ self.trt_device = to_torch_tensorrt_device(device)
assert (
not immutable_weights
- ), "`immutable_weights` has to be False for a MutableTorchTensorRTModule."
- compilation_options = {
- "enabled_precisions": (
- enabled_precisions
- if enabled_precisions
- else _defaults.ENABLED_PRECISIONS
- ),
- "debug": debug,
- "device": device,
- "assume_dynamic_shape_support": assume_dynamic_shape_support,
- "workspace_size": workspace_size,
- "min_block_size": min_block_size,
- "torch_executed_ops": (
- torch_executed_ops if torch_executed_ops is not None else set()
- ),
- "pass_through_build_failures": pass_through_build_failures,
- "max_aux_streams": max_aux_streams,
- "version_compatible": version_compatible,
- "optimization_level": optimization_level,
- "use_python_runtime": use_python_runtime,
- "truncate_double": truncate_double,
- "use_fast_partitioner": use_fast_partitioner,
- "num_avg_timing_iters": num_avg_timing_iters,
- "enable_experimental_decompositions": enable_experimental_decompositions,
- "require_full_compilation": require_full_compilation,
- "disable_tf32": disable_tf32,
- "sparse_weights": sparse_weights,
- "immutable_weights": immutable_weights,
- "engine_capability": engine_capability,
- "dla_sram_size": dla_sram_size,
- "dla_local_dram_size": dla_local_dram_size,
- "dla_global_dram_size": dla_global_dram_size,
- "dryrun": dryrun,
- "hardware_compatible": hardware_compatible,
- "timing_cache_path": timing_cache_path,
- }
+ ), "`immutable_weights has to be False for a MutableTorchTensorRTModule"
+
self.arg_dynamic_shapes: Optional[tuple[Any]] = None
self.kwarg_dynamic_shapes: Optional[dict[Any, Any]] = None
-
- self.settings = CompilationSettings(**compilation_options)
+ self.serializable_dynamic_shapes_dims: dict[str, tuple[str, int, int]] = {}
self.run_info: Optional[tuple[Any, ...]] = None
self.state_dict_metadata: dict[str, torch.Size] = {}
self._store_state_dict_metadata()
+ self.enable_weight_streaming = (
+ kwargs["enable_weight_streaming"]
+ if "enable_weight_streaming" in kwargs
+ else False
+ )
+ self.weight_streaming_ctx = None
+ self.weight_streaming_budget = weight_streaming_budget
+ if self.enable_weight_streaming:
+ if weight_streaming_budget is None:
+ logger.warning(
+ "Weight stremaing budget is not set. Using auto weight streaming budget"
+ )
+ self.enabled_precisions = enabled_precisions
+ if self.enabled_precisions is None:
+ self.enabled_precisions = _defaults.ENABLED_PRECISIONS
cls = self.__class__
self.__class__ = type(
@@ -782,10 +745,9 @@ Source code for torch_tensorrt.dynamo.runtime._MutableTorchTensorRTModule
# to determine whether refit/recompilation is needed. If the output is the same, no further process needed.
if self.run_info:
args, kwargs, result = self.run_info
- self.original_model.to(to_torch_device(self.settings.device))
+ self.original_model.to(to_torch_device(self.trt_device))
new_result = self.original_model(*args, **kwargs)
- self.original_model.cpu()
- torch.cuda.empty_cache()
+ deallocate_module(self.original_model, delete_module=False)
if check_output_equal(result, new_result):
self.refit_state.set_state(RefitFlag.LIVE)
return
@@ -814,17 +776,17 @@ Source code for torch_tensorrt.dynamo.runtime._MutableTorchTensorRTModule
MutableTorchTensorRTModule automatically catches weight value updates and call this function to refit the module.
If it fails to catch the changes, please call this function manually to update the TRT graph module.
"""
- self.original_model.to(to_torch_device(self.settings.device))
+
if self.exp_program is None:
- self.exp_program = torch.export.export(
- self.original_model, self.arg_inputs, kwargs=self.kwarg_inputs
- )
+ self.original_model.to(to_torch_device(self.trt_device))
+ self.exp_program = self.get_exported_program()
else:
self.exp_program._state_dict = (
MutableTorchTensorRTModule._transform_state_dict(
self.original_model.state_dict()
)
)
+ self.exp_program.module().to(to_torch_device(self.trt_device))
self.gm = refit_module_weights(
self.gm,
self.exp_program,
@@ -834,8 +796,46 @@ Source code for torch_tensorrt.dynamo.runtime._MutableTorchTensorRTModule
in_place=True,
)
- self.original_model.cpu()
- torch.cuda.empty_cache()
+ deallocate_module(self.original_model, delete_module=False)
+
+ def get_exported_program(self) -> torch.export.ExportedProgram:
+
+ def export_fn() -> torch.export.ExportedProgram:
+ if self.allow_complex_guards_as_runtime_asserts:
+ return _export(
+ self.original_model,
+ self.arg_inputs,
+ kwargs=self.kwarg_inputs,
+ dynamic_shapes=self._get_total_dynamic_shapes(),
+ strict=self.strict,
+ allow_complex_guards_as_runtime_asserts=self.allow_complex_guards_as_runtime_asserts,
+ )
+ else:
+ return torch.export.export(
+ self.original_model,
+ self.arg_inputs,
+ kwargs=self.kwarg_inputs,
+ dynamic_shapes=self._get_total_dynamic_shapes(),
+ strict=self.strict,
+ )
+
+ # Check if any quantization precision is enabled
+ if self.enabled_precisions and any(
+ precision in self.enabled_precisions
+ for precision in (torch.float8_e4m3fn, torch.int8, torch.float4_e2m1fn_x2)
+ ):
+ try:
+ from modelopt.torch.quantization.utils import export_torch_mode
+
+ assert torch.ops.tensorrt.quantize_op.default
+ except Exception as e:
+ logger.warning(
+ "Unable to import quantization op. Please install modelopt library (https://github.com/NVIDIA/TensorRT-Model-Optimizer?tab=readme-ov-file#installation) to add support for compiling quantized models"
+ )
+ with export_torch_mode():
+ return export_fn()
+ else:
+ return export_fn()
[docs] def compile(self) -> None:
"""
@@ -845,25 +845,37 @@
Source code for torch_tensorrt.dynamo.runtime._MutableTorchTensorRTModule
If it fails to catch the changes, please call this function manually to recompile the TRT graph module.
"""
# Export the module
- self.original_model.to(to_torch_device(self.settings.device))
- self.exp_program = torch.export.export(
- self.original_model,
- self.arg_inputs,
- kwargs=self.kwarg_inputs,
- dynamic_shapes=self._get_total_dynamic_shapes(),
- )
+ self.original_model.to(to_torch_device(self.trt_device))
+ self.exp_program = self.get_exported_program()
self.gm = dynamo_compile(
self.exp_program,
arg_inputs=self.arg_inputs,
kwarg_inputs=self.kwarg_inputs,
- **self.settings.__dict__,
+ immutable_weights=False,
+ use_python_runtime=self.use_python_runtime,
+ enabled_precisions=self.enabled_precisions,
+ **self.additional_settings,
+ )
+ deallocate_module(self.original_model, delete_module=False)
+ if self.enable_weight_streaming:
+ self.set_weight_streaming_ctx(self.weight_streaming_budget)
+
+[docs]    def set_weight_streaming_ctx(self, requested_budget: Optional[int] = None) -> None:
+        """
+        Set the weight streaming budget. If budget is not set, then automatic weight streaming budget
+        is used.
+        """
+        self.weight_streaming_ctx = torch_tensorrt.runtime.weight_streaming(self.gm)
+        requested_budget = (
+            requested_budget
+            if requested_budget is not None
+            else self.weight_streaming_ctx.get_automatic_weight_streaming_budget()
        )
-        self.original_model.cpu()
-        torch.cuda.empty_cache()
+        self.weight_streaming_ctx.device_budget = requested_budget
    def _validate_inputs(self, *args: Any, **kwargs: Any) -> None:
-        if not self.arg_inputs:
+        if not self.arg_inputs and not self.kwarg_inputs:
logger.info("First time compilation initiated. This may take some time.")
self.refit_state.set_state(RefitFlag.NEEDS_RECOMPILE)
self._store_inputs(args, kwargs)
@@ -953,6 +965,12 @@ Source code for torch_tensorrt.dynamo.runtime._MutableTorchTensorRTModule
        )
def forward(self, *args: Any, **kwargs: Any) -> Any:
+ warnings.warn(
+ "Direct calls to {self.__class__}.forward() are currently broken by due to https://github.com/pytorch/pytorch/issues/157183. Either call {self.__class__}(...) directly or use {self.__class__}._forward as a work around"
+ )
+ return self._forward(*args, **kwargs)
+
+ def _forward(self, *args: Any, **kwargs: Any) -> Any:
# Step 1: Check whether the input shape has changed
kwargs = MutableTorchTensorRTModule._process_kwarg_inputs(kwargs)
self._validate_inputs(*args, **kwargs)
@@ -980,14 +998,24 @@ Source code for torch_tensorrt.dynamo.runtime._MutableTorchTensorRTModule
self._store_state_dict_metadata()
self.refit_state.set_state(RefitFlag.LIVE)
+ weight_streaming_ctx = (
+ self.weight_streaming_ctx if self.enable_weight_streaming else None
+ )
result = self.gm(*args, **kwargs)
# Storing inputs and outputs for verification when the state is unknown
self.run_info = (args, kwargs, result)
return result
- def to(self, device: str) -> None:
- logger.warning("Original PyTorch model is moved. CPU offload may failed.")
- self.original_model.to(device)
+ def to(self, *args: Any, **kwargs: Any) -> None:
+ logger.warning(
+ "Trying to move the original PyTorch model. This will cause CPU offloading failing and increase GPU memory usage."
+ + "If this is absolute necessary, please call module.pytorch_model.to(...) \n"
+ + "The model is still on the original device."
+ )
+
+ @property
+ def device(self) -> torch.device:
+ return to_torch_device(self.trt_device)
def __deepcopy__(self, memo: Any) -> Any:
cls = self.__class__
@@ -1002,7 +1030,9 @@ Source code for torch_tensorrt.dynamo.runtime._MutableTorchTensorRTModule
return result
def __call__(self, *args: Any, **kwargs: Any) -> Any:
- return self.forward(*args, **kwargs)
+ # Due to https://github.com/pytorch/pytorch/issues/157183, we cannot use forward call, use _forward as a workaround.
+ # This is a temporary fix.
+ return self._forward(*args, **kwargs)
def __getattr__(self, name: str) -> Any:
if name in self.__dict__:
@@ -1113,18 +1143,58 @@ Source code for torch_tensorrt.dynamo.runtime._MutableTorchTensorRTModule
return True
+ def serialize_dynamic_shapes(self) -> None:
+ dims = self.serializable_dynamic_shapes_dims
+
+ def resursivly_serialize_dynamic_shape(obj: Any) -> None:
+ if isinstance(obj, dict):
+ for axis, v in obj.items():
+ if isinstance(v, torch.export.dynamic_shapes._Dim):
+ name = str(v).split("'")[1].split(".")[-1]
+ # We use string of the hash to be the unique identifier of Dim object
+ dims.setdefault(str(hash(v)), (name, v.min, v.max))
+ obj[axis] = str(hash(v))
+ else:
+ resursivly_serialize_dynamic_shape(v)
+ if isinstance(obj, (tuple, list)):
+ for v in obj:
+ resursivly_serialize_dynamic_shape(v)
+
+ resursivly_serialize_dynamic_shape(self.arg_dynamic_shapes)
+ resursivly_serialize_dynamic_shape(self.kwarg_dynamic_shapes)
+
+ def deserialize_dynamic_shapes(self) -> None:
+ dims = self.serializable_dynamic_shapes_dims
+
+ def resursivly_deserialize_dynamic_shape(obj: Any) -> None:
+ if isinstance(obj, dict):
+ for axis, v in obj.items():
+ if isinstance(v, str):
+ obj[axis] = torch.export.Dim(
+ dims[v][0], min=dims[v][1], max=dims[v][2]
+ )
+ else:
+ resursivly_deserialize_dynamic_shape(v)
+ if isinstance(obj, (tuple, list)):
+ for v in obj:
+ resursivly_deserialize_dynamic_shape(v)
+
+ resursivly_deserialize_dynamic_shape(self.arg_dynamic_shapes)
+ resursivly_deserialize_dynamic_shape(self.kwarg_dynamic_shapes)
+
@staticmethod
def save(module: Any, path: str) -> None:
# Cast the object back to MutableTorchTensorRTModule to save
assert (
- not module.settings.use_python_runtime
+ not module.use_python_runtime
), "Python runtime does not support serialization. Save failed."
module.init_finished = False
module.__class__ = MutableTorchTensorRTModule
exp_program = module.exp_program
module.pytorch_model = None
module.exp_program = None
- torch.save(module, path)
+ module.serialize_dynamic_shapes()
+ torch.save(module, path, pickle_protocol=4)
# Restore deleted attributes
module.exp_program = exp_program
module.pytorch_model = _make_refit_change_trigger(
@@ -1147,19 +1217,26 @@ Source code for torch_tensorrt.dynamo.runtime._MutableTorchTensorRTModule
module.pytorch_model = _make_refit_change_trigger(
module.original_model, module.refit_state
)
- module.original_model.to(to_torch_device(module.settings.device))
+ module.original_model.to(to_torch_device(module.device))
module.exp_program = torch.export.export(
module.original_model, module.arg_inputs, kwargs=module.kwarg_inputs
)
- module.original_model.to("cpu")
+ deallocate_module(module.original_model, delete_module=False)
cls = module.__class__
module.__class__ = type(
module.original_model.__class__.__name__,
(cls, module.original_model.__class__),
{},
)
+ module.deserialize_dynamic_shapes()
module.init_finished = True
- return module
+ return module
+
+ def _reset_stateful_cache(obj: Any) -> None:
+ """
+ Does nothing. Support Huggingface CPU offload hooks. Override the huggingface cache reset function because we don't want the TRT module to be handled by HuggingFace.
+ """
+ return
def recursively_remove_trigger(obj: Any) -> Any:
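A sketch of the slimmed-down `MutableTorchTensorRTModule` constructor and the weight-streaming/serialization behavior added above; the weight-streaming keywords and the `load()` call are assumptions based on the surrounding code, and `model` is a placeholder `nn.Module`:

```python
import torch
import torch_tensorrt

mutable_module = torch_tensorrt.MutableTorchTensorRTModule(
    model,
    use_python_runtime=False,          # the C++ runtime is required for save()
    enabled_precisions={torch.float16},
    enable_weight_streaming=True,      # read from **kwargs by the constructor
    weight_streaming_budget=2 << 30,   # omit to fall back to the automatic budget
    min_block_size=1,                  # remaining settings pass through **kwargs to dynamo.compile
)
out = mutable_module(torch.randn(1, 3, 224, 224).cuda())

# save() now serializes dynamic-shape Dims and pickles with protocol 4
torch_tensorrt.MutableTorchTensorRTModule.save(mutable_module, "mutable_trt.pkl")
reloaded = torch_tensorrt.MutableTorchTensorRTModule.load("mutable_trt.pkl")
```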
diff --git a/docs/_modules/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.html b/docs/_modules/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.html
index ccba540284..74ab57ca78 100644
--- a/docs/_modules/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.html
+++ b/docs/_modules/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.html
@@ -9,7 +9,7 @@
- torch_tensorrt.dynamo.runtime._PythonTorchTensorRTModule — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ torch_tensorrt.dynamo.runtime._PythonTorchTensorRTModule — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -290,7 +290,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -321,7 +321,7 @@
Getting Started
User Guide
@@ -373,9 +373,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile frontend
-Compiling GPT2 using the dynamo backend
-Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
@@ -501,6 +500,8 @@ Source code for torch_tensorrt.dynamo.runtime._PythonTorchTensorRTModule
from torch_tensorrt._Device import Device
from torch_tensorrt._enums import Platform, dtype
from torch_tensorrt.dynamo._settings import CompilationSettings
+from torch_tensorrt.dynamo.debug._DebuggerConfig import DebuggerConfig
+from torch_tensorrt.dynamo.debug._supports_debugger import cls_supports_debugger
from torch_tensorrt.dynamo.utils import DYNAMIC_DIM
from torch_tensorrt.logging import TRT_LOGGER
from torch_tensorrt.runtime._utils import (
@@ -600,7 +601,8 @@ Source code for torch_tensorrt.dynamo.runtime._PythonTorchTensorRTModule
)
-[docs]class PythonTorchTensorRTModule(Module):  # type: ignore[misc]
+[docs]@cls_supports_debugger
+class PythonTorchTensorRTModule(Module):  # type: ignore[misc]
"""PythonTorchTensorRTModule is a PyTorch module which encompasses an arbitrary TensorRT Engine.
This module is backed by the Torch-TensorRT runtime and is only compatible with
@@ -617,6 +619,7 @@ Source code for torch_tensorrt.dynamo.runtime._PythonTorchTensorRTModule
        settings: CompilationSettings = CompilationSettings(),
        weight_name_map: Optional[dict[Any, Any]] = None,
        requires_output_allocator: bool = False,
+        _debugger_config: Optional[DebuggerConfig] = None,
):
"""Takes a name, target device, serialized TensorRT engine, and binding names / order and constructs
a PyTorch ``torch.nn.Module`` around it. Uses TensorRT Python APIs to run the engine
@@ -646,6 +649,7 @@ Source code for torch_tensorrt.dynamo.runtime._PythonTorchTensorRTModule
        """
        self.context: Any
+        self._debugger_config: Optional[DebuggerConfig] = _debugger_config
super(PythonTorchTensorRTModule, self).__init__()
self._register_state_dict_hook(PythonTorchTensorRTModule._on_state_dict)
@@ -682,7 +686,11 @@ Source code for torch_tensorrt.dynamo.runtime._PythonTorchTensorRTModule
        self.target_device_properties = torch.cuda.get_device_properties(
            self.target_device_id
        )
-        self.profiling_enabled = settings.debug if settings.debug is not None else False
+        self.profiling_enabled = (
+            _debugger_config.save_engine_profile
+            if _debugger_config is not None
+            else False
+        )
self.settings = settings
self.engine = None
self.weight_name_map = weight_name_map
@@ -1232,7 +1240,14 @@ Source code for torch_tensorrt.dynamo.runtime._PythonTorchTensorRTModule
        # Representation of input shapes to a given model
        # Shapes are concatenated as so:
        # x: (3, 4), y: (4, 5) --> Key: (3,4)(4,5)
-        new_shape_key = "".join(str(tuple(t.shape)).replace(" ", "") for t in inputs)
+        tensor_inputs = []
+        for t in inputs:
+            if not isinstance(t, torch.Tensor):
+                return True
+            tensor_inputs.append(t)
+        new_shape_key = "".join(
+            str(tuple(t.shape)).replace(" ", "") for t in tensor_inputs
+        )
# If the new shape key differs from the existing one,
# invalidate the old shape key and remove the CUDAGraph
diff --git a/docs/_modules/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.html b/docs/_modules/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.html
index 2d464c1e4d..33b2642ff8 100644
--- a/docs/_modules/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.html
+++ b/docs/_modules/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.html
@@ -9,7 +9,7 @@
- torch_tensorrt.dynamo.runtime._TorchTensorRTModule — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ torch_tensorrt.dynamo.runtime._TorchTensorRTModule — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -290,7 +290,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -321,7 +321,7 @@
Getting Started
User Guide
@@ -373,9 +373,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile frontend
-Compiling GPT2 using the dynamo backend
-Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
@@ -823,7 +822,11 @@
Source code for torch_tensorrt.dynamo.runtime._TorchTensorRTModule
        return tuple(outputs)

-    def enable_profiling(self, profiling_results_dir: Optional[str] = None) -> None:
+    def enable_profiling(
+        self,
+        profiling_results_dir: Optional[str] = None,
+        profile_format: str = "perfetto",
+    ) -> None:
"""Enable the profiler to collect latency information about the execution of the engine
Traces can be visualized using https://ui.perfetto.dev/ or compatible alternatives
@@ -836,7 +839,9 @@
Source code for torch_tensorrt.dynamo.runtime._TorchTensorRTModule
        if profiling_results_dir is not None:
            self.engine.profile_path_prefix = profiling_results_dir
+        assert profile_format in ["trex", "perfetto"]
        self.engine.enable_profiling()
+        self.engine.set_profile_format(profile_format)
def disable_profiling(self) -> None:
"""Disable the profiler"""
diff --git a/docs/_modules/torch_tensorrt/fx/fx2trt.html b/docs/_modules/torch_tensorrt/fx/fx2trt.html
index 33852bb751..10c70ec75a 100644
--- a/docs/_modules/torch_tensorrt/fx/fx2trt.html
+++ b/docs/_modules/torch_tensorrt/fx/fx2trt.html
@@ -9,7 +9,7 @@
- torch_tensorrt.fx.fx2trt — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ torch_tensorrt.fx.fx2trt — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -290,7 +290,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -321,7 +321,7 @@
Getting Started
User Guide
@@ -373,9 +373,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+ Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile frontend
- Compiling GPT2 using the dynamo backend
- Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
diff --git a/docs/_modules/torch_tensorrt/fx/input_tensor_spec.html b/docs/_modules/torch_tensorrt/fx/input_tensor_spec.html
index 344e6d8114..ad00adbcfb 100644
--- a/docs/_modules/torch_tensorrt/fx/input_tensor_spec.html
+++ b/docs/_modules/torch_tensorrt/fx/input_tensor_spec.html
@@ -9,7 +9,7 @@
- torch_tensorrt.fx.input_tensor_spec — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ torch_tensorrt.fx.input_tensor_spec — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -290,7 +290,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -321,7 +321,7 @@
Getting Started
User Guide
@@ -373,9 +373,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+ Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile frontend
- Compiling GPT2 using the dynamo backend
- Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
diff --git a/docs/_modules/torch_tensorrt/fx/lower.html b/docs/_modules/torch_tensorrt/fx/lower.html
index 8b3d02fdf3..96746055e6 100644
--- a/docs/_modules/torch_tensorrt/fx/lower.html
+++ b/docs/_modules/torch_tensorrt/fx/lower.html
@@ -9,7 +9,7 @@
- torch_tensorrt.fx.lower — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ torch_tensorrt.fx.lower — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -290,7 +290,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -321,7 +321,7 @@
Getting Started
User Guide
@@ -373,9 +373,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+ Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile frontend
- Compiling GPT2 using the dynamo backend
- Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
diff --git a/docs/_modules/torch_tensorrt/fx/trt_module.html b/docs/_modules/torch_tensorrt/fx/trt_module.html
index 1dae0d3b7d..62301e002c 100644
--- a/docs/_modules/torch_tensorrt/fx/trt_module.html
+++ b/docs/_modules/torch_tensorrt/fx/trt_module.html
@@ -9,7 +9,7 @@
- torch_tensorrt.fx.trt_module — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ torch_tensorrt.fx.trt_module — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -290,7 +290,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -321,7 +321,7 @@
Getting Started
User Guide
@@ -373,9 +373,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+ Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile frontend
- Compiling GPT2 using the dynamo backend
- Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
diff --git a/docs/_modules/torch_tensorrt/logging.html b/docs/_modules/torch_tensorrt/logging.html
index dba7615ea1..8c165b7236 100644
--- a/docs/_modules/torch_tensorrt/logging.html
+++ b/docs/_modules/torch_tensorrt/logging.html
@@ -9,7 +9,7 @@
- torch_tensorrt.logging — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ torch_tensorrt.logging — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -290,7 +290,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -321,7 +321,7 @@
Getting Started
User Guide
@@ -373,9 +373,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+ Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile frontend
- Compiling GPT2 using the dynamo backend
- Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
diff --git a/docs/_modules/torch_tensorrt/runtime/_cudagraphs.html b/docs/_modules/torch_tensorrt/runtime/_cudagraphs.html
index 1f96b5c517..c7c7c8e52c 100644
--- a/docs/_modules/torch_tensorrt/runtime/_cudagraphs.html
+++ b/docs/_modules/torch_tensorrt/runtime/_cudagraphs.html
@@ -9,7 +9,7 @@
- torch_tensorrt.runtime._cudagraphs — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ torch_tensorrt.runtime._cudagraphs — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -290,7 +290,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -321,7 +321,7 @@
Getting Started
User Guide
@@ -373,9 +373,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+ Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile frontend
- Compiling GPT2 using the dynamo backend
- Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
@@ -558,48 +557,16 @@
Source code for torch_tensorrt.runtime._cudagraphs
        self.old_mode = _PY_RT_CUDAGRAPHS
        self.compiled_module = compiled_module
        self.cudagraphs_module: Optional[CudaGraphsTorchTensorRTModule] = None
+        self.old_module = None

-    def __enter__(self) -> torch.nn.Module:
-        global _PY_RT_CUDAGRAPHS
-
-        num_torch_module = 0
-        num_trt_module = 0
-        for name, module in self.compiled_module.named_children():
-            # need to disable cudagraphs if any model requires output allocator
-            if (
-                hasattr(module, "requires_output_allocator")
-                and module.requires_output_allocator
-            ):
-                raise RuntimeError(
-                    "The model contains submodules that require a dynamic output allocator at runtime, which is incompatible with CUDA Graphs. Please disable CUDA Graphs."
-                )
-            if "_run_on_acc" in name:
-                num_trt_module += 1
-            elif "_run_on_gpu" in name:
-                num_torch_module += 1
-
-        if num_torch_module > 0:
-            # Set whole cudagraphs mode and returns wrapped module
-            _PY_RT_CUDAGRAPHS = CudaGraphsMode.WHOLE_GRAPH_CUDAGRAPHS
-            # Set new mode for C++
-            if torch_tensorrt.ENABLED_FEATURES.torch_tensorrt_runtime:
-                torch.ops.tensorrt.set_cudagraphs_mode(_PY_RT_CUDAGRAPHS)
+    def __enter__(self) -> Union[torch.nn.Module, torch.fx.GraphModule]:
-            logger.debug(
-                "Found pytorch subgraphs in module, wrapping module in CudaGraphsTorchTensorRTModule"
-            )
-            self.cudagraphs_module = CudaGraphsTorchTensorRTModule(self.compiled_module)
-            return self.cudagraphs_module
-        else:
-            if num_trt_module > 0:
-                logger.debug("No graph breaks detected, using runtime cudagraphs mode")
-            else:
-                logger.debug(
-                    "Please consider dynamo if there is graph breaks. Using runtime cudagraphs mode"
-                )
-            # Enable cudagraphs for TRT submodule
-            set_cudagraphs_mode(True)
+        if isinstance(self.compiled_module, torch_tensorrt.MutableTorchTensorRTModule):
+            self.old_module = self.compiled_module.gm
+            self.compiled_module.gm = get_cuda_graph_module(self.compiled_module.gm)
            return self.compiled_module
+        else:
+            return get_cuda_graph_module(self.compiled_module)
def __exit__(self, *args: Any) -> None:
# Set cudagraphs back to old mode
@@ -607,6 +574,52 @@
Source code for torch_tensorrt.runtime._cudagraphs
# __del__ is not entirely predictable, so we reset cudagraph here
if self.cudagraphs_module:
self.cudagraphs_module._reset_captured_graph()
+        if self.old_module: # MutableTorchTRTModule
+            self.compiled_module.gm = self.old_module
+
+
+def get_cuda_graph_module(
+    compiled_module: torch.fx.GraphModule,
+) -> Union[torch.nn.Module, torch.fx.GraphModule]:
+    global _PY_RT_CUDAGRAPHS
+
+    num_torch_module = 0
+    num_trt_module = 0
+    for name, module in compiled_module.named_children():
+        # need to disable cudagraphs if any model requires output allocator
+        if (
+            hasattr(module, "requires_output_allocator")
+            and module.requires_output_allocator
+        ):
+            raise RuntimeError(
+                "The model contains submodules that require a dynamic output allocator at runtime, which is incompatible with CUDA Graphs. Please disable CUDA Graphs."
+            )
+        if "_run_on_acc" in name:
+            num_trt_module += 1
+        elif "_run_on_gpu" in name:
+            num_torch_module += 1
+
+    if num_torch_module > 0:
+        # Set whole cudagraphs mode and returns wrapped module
+        _PY_RT_CUDAGRAPHS = CudaGraphsMode.WHOLE_GRAPH_CUDAGRAPHS
+        # Set new mode for C++
+        if torch_tensorrt.ENABLED_FEATURES.torch_tensorrt_runtime:
+            torch.ops.tensorrt.set_cudagraphs_mode(_PY_RT_CUDAGRAPHS)
+
+        logger.debug(
+            "Found pytorch subgraphs in module, wrapping module in CudaGraphsTorchTensorRTModule"
+        )
+        return CudaGraphsTorchTensorRTModule(compiled_module)
+    else:
+        if num_trt_module > 0:
+            logger.debug("No graph breaks detected, using runtime cudagraphs mode")
+        else:
+            logger.debug(
+                "Please consider dynamo if there is graph breaks. Using runtime cudagraphs mode"
+            )
+        # Enable cudagraphs for TRT submodule
+        set_cudagraphs_mode(True)
+        return compiled_module
[docs]def enable_cudagraphs(
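The rewrite above moves the submodule inspection and wrapping logic out of ``__enter__`` into a shared ``get_cuda_graph_module`` helper, so that plain compiled modules and ``MutableTorchTensorRTModule`` (via its ``gm`` attribute) take the same path. A usage sketch of the context manager built on top of it; the compiled module and input shape are illustrative:

.. code-block:: python

    import torch
    import torch_tensorrt

    # trt_module is assumed to be a Torch-TensorRT compiled module.
    example_input = torch.randn(1, 3, 224, 224, device="cuda")

    with torch_tensorrt.runtime.enable_cudagraphs(trt_module) as cudagraphs_module:
        out = cudagraphs_module(example_input)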
diff --git a/docs/_modules/torch_tensorrt/runtime/_multi_device_safe_mode.html b/docs/_modules/torch_tensorrt/runtime/_multi_device_safe_mode.html
index 78d79b9739..2bc5fd9873 100644
--- a/docs/_modules/torch_tensorrt/runtime/_multi_device_safe_mode.html
+++ b/docs/_modules/torch_tensorrt/runtime/_multi_device_safe_mode.html
@@ -9,7 +9,7 @@
- torch_tensorrt.runtime._multi_device_safe_mode — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ torch_tensorrt.runtime._multi_device_safe_mode — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -290,7 +290,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -321,7 +321,7 @@
Getting Started
User Guide
@@ -373,9 +373,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+ Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile frontend
- Compiling GPT2 using the dynamo backend
- Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
diff --git a/docs/_modules/torch_tensorrt/runtime/_output_allocator.html b/docs/_modules/torch_tensorrt/runtime/_output_allocator.html
index 139a5e0f99..0ea8ab600b 100644
--- a/docs/_modules/torch_tensorrt/runtime/_output_allocator.html
+++ b/docs/_modules/torch_tensorrt/runtime/_output_allocator.html
@@ -9,7 +9,7 @@
- torch_tensorrt.runtime._output_allocator — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ torch_tensorrt.runtime._output_allocator — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -290,7 +290,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -321,7 +321,7 @@
Getting Started
User Guide
@@ -373,9 +373,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+ Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile frontend
- Compiling GPT2 using the dynamo backend
- Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
diff --git a/docs/_modules/torch_tensorrt/runtime/_pre_allocated_outputs.html b/docs/_modules/torch_tensorrt/runtime/_pre_allocated_outputs.html
index 29df77921e..0bbc9d5f69 100644
--- a/docs/_modules/torch_tensorrt/runtime/_pre_allocated_outputs.html
+++ b/docs/_modules/torch_tensorrt/runtime/_pre_allocated_outputs.html
@@ -9,7 +9,7 @@
- torch_tensorrt.runtime._pre_allocated_outputs — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ torch_tensorrt.runtime._pre_allocated_outputs — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -290,7 +290,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -321,7 +321,7 @@
Getting Started
User Guide
@@ -373,9 +373,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+ Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile frontend
- Compiling GPT2 using the dynamo backend
- Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
diff --git a/docs/_modules/torch_tensorrt/runtime/_weight_streaming.html b/docs/_modules/torch_tensorrt/runtime/_weight_streaming.html
index ecfc6139bd..822b2fe9ed 100644
--- a/docs/_modules/torch_tensorrt/runtime/_weight_streaming.html
+++ b/docs/_modules/torch_tensorrt/runtime/_weight_streaming.html
@@ -9,7 +9,7 @@
- torch_tensorrt.runtime._weight_streaming — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ torch_tensorrt.runtime._weight_streaming — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -290,7 +290,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -321,7 +321,7 @@
Getting Started
User Guide
@@ -373,9 +373,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+ Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile frontend
- Compiling GPT2 using the dynamo backend
- Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
diff --git a/docs/_modules/torch_tensorrt/ts/_compile_spec.html b/docs/_modules/torch_tensorrt/ts/_compile_spec.html
index 36d1bd5763..1eda21d32a 100644
--- a/docs/_modules/torch_tensorrt/ts/_compile_spec.html
+++ b/docs/_modules/torch_tensorrt/ts/_compile_spec.html
@@ -9,7 +9,7 @@
- torch_tensorrt.ts._compile_spec — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ torch_tensorrt.ts._compile_spec — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -290,7 +290,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -321,7 +321,7 @@
Getting Started
User Guide
@@ -373,9 +373,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+ Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile frontend
- Compiling GPT2 using the dynamo backend
- Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
@@ -490,8 +489,9 @@
Source code for torch_tensorrt.ts._compile_spec
from __future__ import annotations
from copy import deepcopy
-from typing import Any, Dict, List, Optional, Set
+from typing import Any, Dict, List, Optional, Set, Union
+import tensorrt as trt
import torch
import torch_tensorrt._C.ts as _ts_C
from torch_tensorrt import _C
@@ -502,8 +502,6 @@
Source code for torch_tensorrt.ts._compile_spec
from torch_tensorrt.ts._Input import TorchScriptInput
from torch_tensorrt.ts.logging import Level, log
-import tensorrt as trt
-
def _internal_input_to_torch_class_input(i: _C.Input) -> torch.classes.tensorrt._Input:
clone = torch.classes.tensorrt._Input()
@@ -799,7 +797,7 @@
Source code for torch_tensorrt.ts._compile_spec
device: Optional[torch.device | Device] = None,
disable_tf32: bool = False,
sparse_weights: bool = False,
- enabled_precisions: Optional[Set[torch.dtype | dtype]] = None,
+ enabled_precisions: Optional[Set[Union[torch.dtype, dtype]]] = None,
refit: bool = False,
debug: bool = False,
capability: EngineCapability = EngineCapability.STANDARD,
diff --git a/docs/_modules/torch_tensorrt/ts/_compiler.html b/docs/_modules/torch_tensorrt/ts/_compiler.html
index ac653830ec..6e4eaa9a3d 100644
--- a/docs/_modules/torch_tensorrt/ts/_compiler.html
+++ b/docs/_modules/torch_tensorrt/ts/_compiler.html
@@ -9,7 +9,7 @@
- torch_tensorrt.ts._compiler — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ torch_tensorrt.ts._compiler — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -290,7 +290,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -321,7 +321,7 @@
Getting Started
User Guide
@@ -373,9 +373,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+ Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile frontend
- Compiling GPT2 using the dynamo backend
- Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
@@ -490,7 +489,7 @@
Source code for torch_tensorrt.ts._compiler
from __future__ import annotations
import warnings
-from typing import Any, List, Optional, Sequence, Set, Tuple
+from typing import Any, List, Optional, Sequence, Set, Tuple, Union
import torch
import torch_tensorrt._C.ts as _C
@@ -507,7 +506,7 @@ Source code for torch_tensorrt.ts._compiler
device: Device = Device._current_device(),
disable_tf32: bool = False,
sparse_weights: bool = False,
- enabled_precisions: Optional[Set[torch.dtype | dtype]] = None,
+ enabled_precisions: Optional[Set[Union[torch.dtype, dtype]]] = None,
refit: bool = False,
debug: bool = False,
capability: EngineCapability = EngineCapability.STANDARD,
@@ -661,7 +660,7 @@ Source code for torch_tensorrt.ts._compiler
device: Device = Device._current_device(),
disable_tf32: bool = False,
sparse_weights: bool = False,
- enabled_precisions: Optional[Set[torch.dtype | dtype]] = None,
+ enabled_precisions: Optional[Set[Union[torch.dtype, dtype]]] = None,
refit: bool = False,
debug: bool = False,
capability: EngineCapability = EngineCapability.STANDARD,
diff --git a/docs/_modules/torch_tensorrt/ts/ptq.html b/docs/_modules/torch_tensorrt/ts/ptq.html
index f5b2e78d2c..765bf42bde 100644
--- a/docs/_modules/torch_tensorrt/ts/ptq.html
+++ b/docs/_modules/torch_tensorrt/ts/ptq.html
@@ -9,7 +9,7 @@
- torch_tensorrt.ts.ptq — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ torch_tensorrt.ts.ptq — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -290,7 +290,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -321,7 +321,7 @@
Getting Started
User Guide
@@ -373,9 +373,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+ Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile frontend
- Compiling GPT2 using the dynamo backend
- Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
diff --git a/docs/_sources/_cpp_api/program_listing_file_cpp_include_torch_tensorrt_macros.h.rst.txt b/docs/_sources/_cpp_api/program_listing_file_cpp_include_torch_tensorrt_macros.h.rst.txt
index e9d90659f9..2763a898e7 100644
--- a/docs/_sources/_cpp_api/program_listing_file_cpp_include_torch_tensorrt_macros.h.rst.txt
+++ b/docs/_sources/_cpp_api/program_listing_file_cpp_include_torch_tensorrt_macros.h.rst.txt
@@ -36,7 +36,7 @@ Program Listing for File macros.h
#define STR(x) XSTR(x)
#define TORCH_TENSORRT_MAJOR_VERSION 2
- #define TORCH_TENSORRT_MINOR_VERSION 6
+ #define TORCH_TENSORRT_MINOR_VERSION 9
#define TORCH_TENSORRT_PATCH_VERSION 0
#define TORCH_TENSORRT_VERSION \
STR(TORCH_TENSORRT_MAJOR_VERSION) \
diff --git a/docs/_sources/contributors/partitioning.rst.txt b/docs/_sources/contributors/partitioning.rst.txt
index 8c83ddcadc..77880cef6a 100644
--- a/docs/_sources/contributors/partitioning.rst.txt
+++ b/docs/_sources/contributors/partitioning.rst.txt
@@ -239,3 +239,16 @@ In this example we will collect the arithmetic ops in a TensorRT segment and the
In some cases this approach may create adjacent segments in the partition which have the same target. As a clean-up step we can consolidate these adjacent segments to further reduce the number of segments in the final partition.
The merge segments step identifies a list of segments that are adjacent in the graph, have the same target, and are not marked as `do_not_merge`. The nodes from these segments will be combined into a single new segment that will replace the merged segments in the partition.
The `do_not_merge` marking is used to prevent merging of segments created for conditional nodes and loops that are handled as special cases in graph stitching and should not be merged with adjacent segments of the same type.
+
+
+Hierarchical Partitioner for Dynamo
+===================================
+
+The Hierarchical Partitioner is an extension to the standard TensorRT partitioner that allows for more sophisticated partitioning strategies by considering backend priority and operator support. This is particularly useful when you want to distribute different parts of your model across multiple backends based on their capabilities and priorities.
+
+We currently support the hierarchical adjacency partitioner, which extends the standard adjacency partitioner with the following capabilities:
+
+1. **Backend priority ordering**: Assign operators to backends based on a priority order, ensuring that operators are assigned to the highest-priority backend that supports them.
+2. **Multi-backend support**: Distribute model execution across multiple backends based on operator support.
+
+Please refer to `hierarchical_partitioner_example`_ for more details.
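A minimal sketch of the partitioning call, mirroring the ``hierarchical_partitioner_example`` tutorial later in this changeset (the backend names and op sets are illustrative, and ``gm`` is assumed to be an exported and lowered ``torch.fx.GraphModule``):

.. code-block:: python

    from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
        DYNAMO_CONVERTERS as CONVERTERS,
    )
    from torch_tensorrt.dynamo.partitioning._hierarchical_partitioner import (
        hierarchical_adjacency_partition,
    )

    # The highest-priority backend that supports an operator wins; unsupported ops stay in Torch.
    partitioned_model, op_support = hierarchical_adjacency_partition(
        gm,
        min_block_size=1,
        backend_priority=["inductor", "tensorrt"],
        backend_support_map={
            "inductor": {"torch.ops.aten.convolution.default"},
            "tensorrt": CONVERTERS.keys(),
        },
        require_full_compilation=False,
        skip_fusion=True,
    )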
diff --git a/docs/_sources/getting_started/jetpack.rst.txt b/docs/_sources/getting_started/jetpack.rst.txt
index ddbf89dc63..edfe1ae52e 100644
--- a/docs/_sources/getting_started/jetpack.rst.txt
+++ b/docs/_sources/getting_started/jetpack.rst.txt
@@ -1,119 +1,122 @@
-.. _Torch_TensorRT_in_JetPack_6.1
+.. _Torch_TensorRT_in_JetPack:
-Overview
-##################
-
-JetPack 6.1
----------------------
-Nvida JetPack 6.1 is the latest production release ofJetPack 6.
-With this release it incorporates:
-CUDA 12.6
-TensorRT 10.3
-cuDNN 9.3
-DLFW 24.09
+Torch-TensorRT in JetPack
+#############################
-You can find more details for the JetPack 6.1:
+Overview
+********
- * https://docs.nvidia.com/jetson/jetpack/release-notes/index.html
- * https://docs.nvidia.com/deeplearning/frameworks/install-pytorch-jetson-platform/index.html
+JetPack 6.2
+===========
+NVIDIA JetPack 6.2 is the latest production release for Jetson platforms, featuring:
+- CUDA 12.6
+- TensorRT 10.3
+- cuDNN 9.3
+For detailed information about JetPack 6.2, refer to:
+* `JetPack 6.2 Release Notes `_
+* `PyTorch for Jetson Platform `_
Prerequisites
-~~~~~~~~~~~~~~
+*************
+System Preparation
+==================
+1. **Flash your Jetson device**
-Ensure your jetson developer kit has been flashed with the latest JetPack 6.1. You can find more details on how to flash Jetson board via sdk-manager:
+ with JetPack 6.2 using SDK Manager:
+ - `SDK Manager Guide `_
- * https://developer.nvidia.com/sdk-manager
+2. **Verify JetPack installation**:
+ .. code-block:: sh
-check the current jetpack version using
+ apt show nvidia-jetpack
-.. code-block:: sh
+3. **Install development components**:
+ .. code-block:: sh
- apt show nvidia-jetpack
+ sudo apt-get update
+ sudo apt-get install nvidia-jetpack
-Ensure you have installed JetPack Dev components. This step is required if you need to build on jetson board.
+4. **Confirm CUDA 12.6 installation**:
-You can only install the dev components that you require: ex, tensorrt-dev would be the meta-package for all TRT development or install everthing.
+ .. code-block:: sh
-.. code-block:: sh
- # install all the nvidia-jetpack dev components
- sudo apt-get update
- sudo apt-get install nvidia-jetpack
+ nvcc --version
+ # If missing or incorrect version:
+ sudo apt-get install cuda-toolkit-12-6
-Ensure you have cuda 12.6 installed(this should be installed automatically from nvidia-jetpack)
+5. **Validate cuSPARSELt library**:
-.. code-block:: sh
+ .. code-block:: sh
- # check the cuda version
- nvcc --version
- # if not installed or the version is not 12.6, install via the below cmd:
- sudo apt-get update
- sudo apt-get install cuda-toolkit-12-6
+ # Check library presence
+ ls /usr/local/cuda/lib64/libcusparseLt.so
-Ensure libcusparseLt.so exists at /usr/local/cuda/lib64/:
+ # Install if missing
+ wget https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-sbsa/libcusparse_lt-linux-sbsa-0.5.2.1-archive.tar.xz
+ tar xf libcusparse_lt-linux-sbsa-0.5.2.1-archive.tar.xz
+ sudo cp -a libcusparse_lt-linux-sbsa-0.5.2.1-archive/include/* /usr/local/cuda/include/
+ sudo cp -a libcusparse_lt-linux-sbsa-0.5.2.1-archive/lib/* /usr/local/cuda/lib64/
-.. code-block:: sh
+Building Torch-TensorRT
+***********************
- # if not exist, download and copy to the directory
- wget https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-sbsa/libcusparse_lt-linux-sbsa-0.5.2.1-archive.tar.xz
- tar xf libcusparse_lt-linux-sbsa-0.5.2.1-archive.tar.xz
- sudo cp -a libcusparse_lt-linux-sbsa-0.5.2.1-archive/include/* /usr/local/cuda/include/
- sudo cp -a libcusparse_lt-linux-sbsa-0.5.2.1-archive/lib/* /usr/local/cuda/lib64/
+Build Environment Setup
+=======================
+1. **Install Build Dependencies**:
+ .. code-block:: sh
-Build torch_tensorrt
-~~~~~~~~~~~~~~
+ wget https://github.com/bazelbuild/bazelisk/releases/download/v1.26.0/bazelisk-linux-arm64
+ sudo mv bazelisk-linux-arm64 /usr/bin/bazel
+ sudo chmod +x /usr/bin/bazel
+ .. code-block:: sh
-Install bazel
+ apt-get install ninja-build vim libopenblas-dev git
-.. code-block:: sh
+2. **Install Python dependencies**:
- wget -v https://github.com/bazelbuild/bazelisk/releases/download/v1.20.0/bazelisk-linux-arm64
- sudo mv bazelisk-linux-arm64 /usr/bin/bazel
- chmod +x /usr/bin/bazel
+ .. code-block:: sh
-Install pip and required python packages:
- * https://pip.pypa.io/en/stable/installation/
+ wget https://bootstrap.pypa.io/get-pip.py
+ python get-pip.py
+ python -m pip install pyyaml
-.. code-block:: sh
+3. **Install PyTorch**:
- # install pip
- wget https://bootstrap.pypa.io/get-pip.py
- python get-pip.py
+ .. code-block:: sh
-.. code-block:: sh
-
- # install pytorch from nvidia jetson distribution: https://developer.download.nvidia.com/compute/redist/jp/v61/pytorch
- python -m pip install torch https://developer.download.nvidia.com/compute/redist/jp/v61/pytorch/torch-2.5.0a0+872d972e41.nv24.08.17622132-cp310-cp310-linux_aarch64.whl
-
-.. code-block:: sh
+ # Can only install the torch and torchvision wheel from the JPL repo which is built specifically for JetPack 6.2
+ python -m pip install torch==2.7.0 torchvision==0.22.0 --index-url=https://pypi.jetson-ai-lab.dev/jp6/cu126/
- # install required python packages
- python -m pip install -r toolchains/jp_workspaces/requirements.txt
- # if you want to run the test cases, then install the test required python packages
- python -m pip install -r toolchains/jp_workspaces/test_requirements.txt
+Building the Wheel
+==================
+.. code-block:: sh
+ python setup.py bdist_wheel
-Build and Install torch_tensorrt wheel file
-
+Installation
+============
-Since torch_tensorrt version has dependencies on torch version. torch version supported by JetPack6.1 is from DLFW 24.08/24.09(torch 2.5.0).
+.. code-block:: sh
+ # the wheel will be in the dist directory, with platform tag linux_tegra_aarch64
+ cd dist
+ python -m pip install torch_tensorrt-2.8.0.dev0+d8318d8fc-cp310-cp310-linux_tegra_aarch64.whl
-Please make sure to build torch_tensorrt wheel file from source release/2.5 branch
-(TODO: lanl to update the branch name once release/ngc branch is available)
+Post-Installation Verification
+==============================
-.. code-block:: sh
+Verify installation by importing in Python:
+.. code-block:: python
- cuda_version=$(nvcc --version | grep Cuda | grep release | cut -d ',' -f 2 | sed -e 's/ release //g')
- export TORCH_INSTALL_PATH=$(python -c "import torch, os; print(os.path.dirname(torch.__file__))")
- export SITE_PACKAGE_PATH=${TORCH_INSTALL_PATH::-6}
- export CUDA_HOME=/usr/local/cuda-${cuda_version}/
- # replace the MODULE.bazel with the jetpack one
- cat toolchains/jp_workspaces/MODULE.bazel.tmpl | envsubst > MODULE.bazel
- # build and install torch_tensorrt wheel file
- python setup.py install --user
+ # verify that torch_tensorrt can be imported
+ import torch
+ import torch_tensorrt
+ print(torch_tensorrt.__version__)
+ # optionally, run an example from a shell to confirm the install
+ python examples/dynamo/torch_compile_resnet_example.py
diff --git a/docs/_sources/index.rst.txt b/docs/_sources/index.rst.txt
index 67fbdc56f5..4d28d77640 100644
--- a/docs/_sources/index.rst.txt
+++ b/docs/_sources/index.rst.txt
@@ -140,11 +140,10 @@ Model Zoo
* :ref:`torch_compile_resnet`
* :ref:`torch_compile_transformer`
* :ref:`torch_compile_stable_diffusion`
+* :ref:`compile_hf_models`
* :ref:`torch_compile_gpt2`
* :ref:`torch_export_gpt2`
-* :ref:`torch_export_llama2`
* :ref:`torch_export_sam2`
-* :ref:`torch_export_flux_dev`
* :ref:`notebooks`
.. toctree::
@@ -155,11 +154,10 @@ Model Zoo
tutorials/_rendered_examples/dynamo/torch_compile_resnet_example
tutorials/_rendered_examples/dynamo/torch_compile_transformers_example
tutorials/_rendered_examples/dynamo/torch_compile_stable_diffusion
+ tutorials/compile_hf_models
tutorials/_rendered_examples/distributed_inference/data_parallel_gpt2
tutorials/_rendered_examples/distributed_inference/data_parallel_stable_diffusion
tutorials/_rendered_examples/dynamo/torch_compile_gpt2
- tutorials/_rendered_examples/dynamo/torch_export_gpt2
- tutorials/_rendered_examples/dynamo/torch_export_llama2
tutorials/_rendered_examples/dynamo/torch_export_sam2
tutorials/_rendered_examples/dynamo/torch_export_flux_dev
tutorials/notebooks
diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/aot_plugin.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/aot_plugin.rst.txt
new file mode 100644
index 0000000000..522a1fc707
--- /dev/null
+++ b/docs/_sources/tutorials/_rendered_examples/dynamo/aot_plugin.rst.txt
@@ -0,0 +1,227 @@
+
+.. DO NOT EDIT.
+.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
+.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
+.. "tutorials/_rendered_examples/dynamo/aot_plugin.py"
+.. LINE NUMBERS ARE GIVEN BELOW.
+
+.. only:: html
+
+ .. note::
+ :class: sphx-glr-download-link-note
+
+ :ref:`Go to the end `
+ to download the full example code
+
+.. rst-class:: sphx-glr-example-title
+
+.. _sphx_glr_tutorials__rendered_examples_dynamo_aot_plugin.py:
+
+
+.. _aot_plugin:
+Automatically Generate a TensorRT AOT Plugin
+===================================================================
+We are going to demonstrate how to automatically generate a plugin for a custom kernel with Torch-TensorRT,
+using the new Python-based plugin system in TensorRT 10.7.
+
+Torch-TensorRT supports falling back to PyTorch implementations of operations in the case that Torch-TensorRT
+does not know how to compile them in TensorRT. However, this comes at the cost of a graph break and will reduce the performance of the model.
+The easiest way to fix the lack of support for ops is by adding a decomposition (see:
+`Writing lowering passes for the Dynamo frontend `_) - which defines the operator
+in terms of PyTorch ops that are supported in Torch-TensorRT or a converter (see:
+`Writing converters for the Dynamo frontend `_) - which defines the operator in terms of TensorRT operators.
+
+In some cases there isn't a great way to do either of these, perhaps because the operator is a custom kernel that is not part of standard PyTorch or
+TensorRT cannot support it natively.
+
+For these cases, it is possible to use a TensorRT plugin to replace the operator **inside** the TensorRT engine, thereby avoiding
+the performance and resource overhead from a graph break.
+
+Previously this involved a complex process of not only building a performant kernel but also setting it up to run in TensorRT (see: `Using Custom Kernels within TensorRT Engines with Torch-TensorRT `_).
+As of TensorRT 10.7, there is a new Python native plugin system which greatly streamlines this process. This
+plugin system also allows Torch-TensorRT to automatically generate the necessary conversion code to convert the
+operation in PyTorch to TensorRT.
+
+In addition, Torch-TensorRT provides an automatic TensorRT plugin generation feature (see: `Automatically Generate a Plugin for a Custom Kernel `_).
+However, that method generates a JIT plugin, which might not satisfy the user's performance requirements.
+To address this, Torch-TensorRT also supports auto-generation of a TensorRT AOT plugin, which wraps a function to define an Ahead-of-Time (AOT) implementation for a plugin that is already registered.
+This provides a performance boost compared to the JIT plugin.
+
+.. GENERATED FROM PYTHON SOURCE LINES 31-175
+
+.. code-block:: python
+
+
+ import argparse
+ from typing import Tuple, Union
+
+ import tensorrt as trt
+ import tensorrt.plugin as trtp
+ import torch
+ import torch_tensorrt
+ import triton
+ import triton.language as tl
+
+ trt_logger = trt.Logger(trt.Logger.VERBOSE)
+
+
+ @triton.jit
+ def add_one_kernel(x_ptr, n_elements, y_ptr, BLOCK_SIZE: tl.constexpr):
+ pid = tl.program_id(0)
+ block_start = pid * BLOCK_SIZE
+ offsets = block_start + tl.arange(0, BLOCK_SIZE)
+ mask = offsets < n_elements
+ x = tl.load(x_ptr + offsets, mask=mask)
+ output = x + 1
+ tl.store(y_ptr + offsets, output, mask=mask)
+
+
+ @torch.library.custom_op("my::add_one", mutates_args=()) # type: ignore[misc]
+ def add_one(X: torch.Tensor) -> torch.Tensor:
+ # Ensure the tensors are on the GPU
+ assert X.is_cuda
+
+ # Create output tensor
+ Y = torch.empty_like(X)
+
+ # Define block size
+ BLOCK_SIZE = 256
+
+ # Grid of programs
+ grid = lambda meta: (triton.cdiv(X.numel(), meta["BLOCK_SIZE"]),)
+
+ # Launch the kernel
+ add_one_kernel[grid](X, X.numel(), Y, BLOCK_SIZE=BLOCK_SIZE)
+
+ return Y
+
+
+ @torch.library.register_fake("my::add_one")
+ def _(X: torch.Tensor) -> torch.Tensor:
+ return X
+
+
+ @trtp.register("my::add_one")
+ def add_plugin_desc(X: trtp.TensorDesc) -> Tuple[trtp.TensorDesc]:
+ return X.like()
+
+
+ @trtp.aot_impl("my::add_one")
+ def add_plugin_aot_impl(
+ X: trtp.TensorDesc, outputs: Tuple[trtp.TensorDesc], tactic: int
+ ) -> Tuple[
+ Union[str, bytes], Union[str, bytes], trtp.KernelLaunchParams, trtp.SymExprs
+ ]:
+ type_str = "fp32" if X.dtype == trt.float32 else "fp16"
+
+ block_size = 256
+ src = triton.compiler.ASTSource(
+ fn=add_one_kernel,
+ signature={
+ "x_ptr": f"*{type_str}",
+ "n_elements": "i32",
+ "y_ptr": f"*{type_str}",
+ "BLOCK_SIZE": "constexpr",
+ },
+ constants={
+ "BLOCK_SIZE": block_size,
+ },
+ )
+
+ compiled_kernel = triton.compile(src)
+
+ N = X.shape_expr.numel()
+ launch_params = trtp.KernelLaunchParams()
+
+ # grid dims
+ launch_params.grid_x = trtp.cdiv(N, block_size)
+ # block dims
+ launch_params.block_x = compiled_kernel.metadata.num_warps * 32
+ # shared memory
+ launch_params.shared_mem = compiled_kernel.metadata.shared
+
+ extra_args = trtp.SymIntExprs(1)
+ extra_args[0] = trtp.SymInt32(N)
+
+ return (
+ compiled_kernel.metadata.name,
+ compiled_kernel.asm["ptx"],
+ launch_params,
+ extra_args,
+ )
+
+
+ torch_tensorrt.dynamo.conversion.plugins.generate_plugin_converter(
+ "my::add_one",
+ supports_dynamic_shapes=False,
+ requires_output_allocator=False,
+ use_aot_if_available=True,
+ )
+
+
+ class MyModel(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+
+ def forward(self, X: torch.Tensor) -> torch.Tensor:
+ res = torch.ops.my.add_one.default(X)
+
+ return res
+
+
+ if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--aot", action="store_true", help="Try to use AOT compilation", default=False
+ )
+ args = parser.parse_args()
+
+ my_model = MyModel().to("cuda")
+ m = torch.full((64, 64), 2, device="cuda", dtype=torch.float)
+
+ assert my_model(X=m)[0][0] == 3.0
+
+ with torch_tensorrt.logging.debug():
+ trt_inputs = [m]
+ model_trt = torch_tensorrt.compile(
+ my_model,
+ inputs=trt_inputs,
+ min_block_size=1,
+ )
+ print("Model compiled successfully!")
+ print("Running inference with compiled model...")
+ for i in range(10):
+ res = model_trt(m)
+ assert torch.allclose(res, my_model(m)), "Results do not match!"
+
+ print("Inference successful!")
+
+
+.. rst-class:: sphx-glr-timing
+
+ **Total running time of the script:** ( 0 minutes 0.000 seconds)
+
+
+.. _sphx_glr_download_tutorials__rendered_examples_dynamo_aot_plugin.py:
+
+.. only:: html
+
+ .. container:: sphx-glr-footer sphx-glr-footer-example
+
+
+
+
+ .. container:: sphx-glr-download sphx-glr-download-python
+
+ :download:`Download Python source code: aot_plugin.py `
+
+ .. container:: sphx-glr-download sphx-glr-download-jupyter
+
+ :download:`Download Jupyter notebook: aot_plugin.ipynb `
+
+
+.. only:: html
+
+ .. rst-class:: sphx-glr-signature
+
+ `Gallery generated by Sphinx-Gallery `_
diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/auto_generate_converters.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/auto_generate_converters.rst.txt
index 2767c6856c..22754cb3f7 100644
--- a/docs/_sources/tutorials/_rendered_examples/dynamo/auto_generate_converters.rst.txt
+++ b/docs/_sources/tutorials/_rendered_examples/dynamo/auto_generate_converters.rst.txt
@@ -213,7 +213,7 @@ Now we can use our custom operator in a model and compile it with Torch-TensorRT
We can see that the custom operator is used as one of the operations in the forward pass of the model.
The process of compiling the model at this point is identical to standard Torch-TensorRT usage.
-.. GENERATED FROM PYTHON SOURCE LINES 161-185
+.. GENERATED FROM PYTHON SOURCE LINES 161-183
.. code-block:: python
@@ -233,9 +233,7 @@ The process of compiling the model at this point is identical to standard Torch-
n = torch.full((64, 64), 3, device="cuda", dtype=torch.float)
with torch_tensorrt.logging.errors():
- model_trt = torch_tensorrt.compile(
- my_model, inputs=[m, n], debug=True, min_block_size=1
- )
+ model_trt = torch_tensorrt.compile(my_model, inputs=[m, n], min_block_size=1)
for i in range(300):
res = model_trt(m, n)
assert torch.allclose(res, my_model(m, n))
diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/auto_generate_plugins.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/auto_generate_plugins.rst.txt
index 4ecc5f949a..eb7133c18a 100644
--- a/docs/_sources/tutorials/_rendered_examples/dynamo/auto_generate_plugins.rst.txt
+++ b/docs/_sources/tutorials/_rendered_examples/dynamo/auto_generate_plugins.rst.txt
@@ -169,7 +169,7 @@ Now we can use our custom operator in a model and compile it with Torch-TensorRT
We can see that the custom operator is used as one of the operations in the forward pass of the model.
The process of compiling the model at this point is identical to standard Torch-TensorRT usage.
-.. GENERATED FROM PYTHON SOURCE LINES 131-155
+.. GENERATED FROM PYTHON SOURCE LINES 131-153
.. code-block:: python
@@ -189,9 +189,7 @@ The process of compiling the model at this point is identical to standard Torch-
n = torch.randint(0, 5, (64, 64), device="cuda", dtype=torch.float)
with torch_tensorrt.logging.errors():
- model_trt = torch_tensorrt.compile(
- my_model, inputs=[m, n], debug=True, min_block_size=1
- )
+ model_trt = torch_tensorrt.compile(my_model, inputs=[m, n], min_block_size=1)
for i in range(300):
res = model_trt(m, n)
assert torch.allclose(res, my_model(m, n))
diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/cross_runtime_compilation_for_windows.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/cross_runtime_compilation_for_windows.rst.txt
index dfc8544c0c..3f566227e0 100644
--- a/docs/_sources/tutorials/_rendered_examples/dynamo/cross_runtime_compilation_for_windows.rst.txt
+++ b/docs/_sources/tutorials/_rendered_examples/dynamo/cross_runtime_compilation_for_windows.rst.txt
@@ -80,7 +80,7 @@ According to the argument, it is either cross compile and save resnet model for
or load the saved resnet model in Windows
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. GENERATED FROM PYTHON SOURCE LINES 57-83
+.. GENERATED FROM PYTHON SOURCE LINES 57-82
.. code-block:: python
@@ -101,7 +101,6 @@ or load the saved resnet model in Windows
"cross runtime compiled model for windows can only be compiled in Linux system"
)
compile_spec = {
- "debug": True,
"min_block_size": 1,
}
torchtrt.cross_compile_for_windows(
diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/custom_kernel_plugins.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/custom_kernel_plugins.rst.txt
index 09549cce33..f931372fe3 100644
--- a/docs/_sources/tutorials/_rendered_examples/dynamo/custom_kernel_plugins.rst.txt
+++ b/docs/_sources/tutorials/_rendered_examples/dynamo/custom_kernel_plugins.rst.txt
@@ -340,7 +340,7 @@ If we try to compile this model with Torch-TensorRT, we can see that (as of Torc
Node: torch.ops.torchtrt_ex.triton_circular_pad.default, with layer location: __/triton_circular_pad
Note: Some of the above nodes may be supported, but were not included in a TRT graph by the partitioner
- Compiled with: CompilationSettings(enabled_precisions={}, debug=False, workspace_size=0, min_block_size=1, torch_executed_ops=set(), pass_through_build_failures=False, max_aux_streams=None, version_compatible=False, optimization_level=None, use_python_runtime=False, truncate_double=False, use_fast_partitioner=True, enable_experimental_decompositions=False, device=Device(type=DeviceType.GPU, gpu_id=0), require_full_compilation=False, disable_tf32=False, sparse_weights=False, refit=False, engine_capability=, num_avg_timing_iters=1, dla_sram_size=1048576, dla_local_dram_size=1073741824, dla_global_dram_size=536870912, dryrun=True, hardware_compatible=False)
+ Compiled with: CompilationSettings(enabled_precisions={}, workspace_size=0, min_block_size=1, torch_executed_ops=set(), pass_through_build_failures=False, max_aux_streams=None, version_compatible=False, optimization_level=None, use_python_runtime=False, truncate_double=False, use_fast_partitioner=True, enable_experimental_decompositions=False, device=Device(type=DeviceType.GPU, gpu_id=0), require_full_compilation=False, disable_tf32=False, sparse_weights=False, refit=False, engine_capability=, num_avg_timing_iters=1, dla_sram_size=1048576, dla_local_dram_size=1073741824, dla_global_dram_size=536870912, dryrun=True, hardware_compatible=False)
Graph Structure:
@@ -666,7 +666,7 @@ Finally, we are now able to fully compile our model
The graph consists of 2 Total Operators, of which 2 operators are supported, 100.0% coverage
- Compiled with: CompilationSettings(enabled_precisions={}, debug=True, workspace_size=0, min_block_size=1, torch_executed_ops=set(), pass_through_build_failures=False, max_aux_streams=None, version_compatible=False, optimization_level=None, use_python_runtime=False, truncate_double=False, use_fast_partitioner=True, enable_experimental_decompositions=False, device=Device(type=DeviceType.GPU, gpu_id=0), require_full_compilation=False, disable_tf32=False, sparse_weights=False, refit=False, engine_capability=, num_avg_timing_iters=1, dla_sram_size=1048576, dla_local_dram_size=1073741824, dla_global_dram_size=536870912, dryrun=False, hardware_compatible=False)
+ Compiled with: CompilationSettings(enabled_precisions={}, workspace_size=0, min_block_size=1, torch_executed_ops=set(), pass_through_build_failures=False, max_aux_streams=None, version_compatible=False, optimization_level=None, use_python_runtime=False, truncate_double=False, use_fast_partitioner=True, enable_experimental_decompositions=False, device=Device(type=DeviceType.GPU, gpu_id=0), require_full_compilation=False, disable_tf32=False, sparse_weights=False, refit=False, engine_capability=, num_avg_timing_iters=1, dla_sram_size=1048576, dla_local_dram_size=1073741824, dla_global_dram_size=536870912, dryrun=False, hardware_compatible=False)
Graph Structure:
diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/engine_caching_bert_example.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/engine_caching_bert_example.rst.txt
index cc84e2b968..552204a47d 100644
--- a/docs/_sources/tutorials/_rendered_examples/dynamo/engine_caching_bert_example.rst.txt
+++ b/docs/_sources/tutorials/_rendered_examples/dynamo/engine_caching_bert_example.rst.txt
@@ -25,7 +25,7 @@ Engine Caching (BERT)
Small caching example on BERT.
-.. GENERATED FROM PYTHON SOURCE LINES 10-76
+.. GENERATED FROM PYTHON SOURCE LINES 10-75
.. code-block:: python
@@ -72,7 +72,6 @@ Small caching example on BERT.
"use_python_runtime": False,
"enabled_precisions": {torch.float},
"truncate_double": True,
- "debug": False,
"min_block_size": 1,
"immutable_weights": False,
"cache_built_engines": cache_built_engines,
diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/engine_caching_example.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/engine_caching_example.rst.txt
index a21b53f623..1b70b7430a 100644
--- a/docs/_sources/tutorials/_rendered_examples/dynamo/engine_caching_example.rst.txt
+++ b/docs/_sources/tutorials/_rendered_examples/dynamo/engine_caching_example.rst.txt
@@ -41,7 +41,7 @@ The example uses a pre-trained ResNet18 model and shows the
differences between compilation without caching, with caching enabled,
and when reusing cached engines.
-.. GENERATED FROM PYTHON SOURCE LINES 26-52
+.. GENERATED FROM PYTHON SOURCE LINES 26-51
.. code-block:: python
@@ -61,7 +61,6 @@ and when reusing cached engines.
model = models.resnet18(pretrained=True).eval().to("cuda")
enabled_precisions = {torch.float}
- debug = False
min_block_size = 1
use_python_runtime = False
@@ -72,7 +71,7 @@ and when reusing cached engines.
-.. GENERATED FROM PYTHON SOURCE LINES 53-67
+.. GENERATED FROM PYTHON SOURCE LINES 52-66
Engine Caching for JIT Compilation
----------------------------------
@@ -89,7 +88,7 @@ pull the built engine and **refit** the weights which can reduce compilation tim
As such, in order to insert a new engine into the cache (i.e. ``cache_built_engines=True``),
the engine must be refittable (``immutable_weights=False``). See :ref:`refit_engine_example` for more details.
-.. GENERATED FROM PYTHON SOURCE LINES 67-118
+.. GENERATED FROM PYTHON SOURCE LINES 66-116
.. code-block:: python
@@ -124,7 +123,6 @@ the engine must be refittable (``immutable_weights=False``). See :ref:`refit_eng
options={
"use_python_runtime": True,
"enabled_precisions": enabled_precisions,
- "debug": debug,
"min_block_size": min_block_size,
"immutable_weights": False,
"cache_built_engines": cache_built_engines,
@@ -145,7 +143,7 @@ the engine must be refittable (``immutable_weights=False``). See :ref:`refit_eng
torch_compile()
-.. GENERATED FROM PYTHON SOURCE LINES 119-124
+.. GENERATED FROM PYTHON SOURCE LINES 117-122
Engine Caching for AOT Compilation
----------------------------------
@@ -153,7 +151,7 @@ Similarly to the JIT workflow, AOT workflows can benefit from engine caching.
As the same architecture or common subgraphs get recompiled, the cache will pull
previously built engines and refit the weights.
-.. GENERATED FROM PYTHON SOURCE LINES 124-178
+.. GENERATED FROM PYTHON SOURCE LINES 122-175
.. code-block:: python
@@ -191,7 +189,6 @@ previously built engines and refit the weights.
tuple(inputs),
use_python_runtime=use_python_runtime,
enabled_precisions=enabled_precisions,
- debug=debug,
min_block_size=min_block_size,
immutable_weights=False,
cache_built_engines=cache_built_engines,
@@ -212,7 +209,7 @@ previously built engines and refit the weights.
dynamo_compile()
-.. GENERATED FROM PYTHON SOURCE LINES 179-195
+.. GENERATED FROM PYTHON SOURCE LINES 176-192
Custom Engine Cache
----------------------
@@ -231,7 +228,7 @@ The blob contains a serialized engine, calling spec data, and weight map informa
Below is an example of a custom engine cache implementation that implements a ``RAMEngineCache``.
-.. GENERATED FROM PYTHON SOURCE LINES 195-289
+.. GENERATED FROM PYTHON SOURCE LINES 192-285
.. code-block:: python
@@ -309,7 +306,6 @@ Below is an example of a custom engine cache implementation that implents a ``RA
options={
"use_python_runtime": True,
"enabled_precisions": enabled_precisions,
- "debug": debug,
"min_block_size": min_block_size,
"immutable_weights": False,
"cache_built_engines": cache_built_engines,
diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/hierarchical_partitioner_example.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/hierarchical_partitioner_example.rst.txt
new file mode 100644
index 0000000000..1f43b1b627
--- /dev/null
+++ b/docs/_sources/tutorials/_rendered_examples/dynamo/hierarchical_partitioner_example.rst.txt
@@ -0,0 +1,239 @@
+
+.. DO NOT EDIT.
+.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
+.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
+.. "tutorials/_rendered_examples/dynamo/hierarchical_partitioner_example.py"
+.. LINE NUMBERS ARE GIVEN BELOW.
+
+.. only:: html
+
+ .. note::
+ :class: sphx-glr-download-link-note
+
+ :ref:`Go to the end `
+ to download the full example code
+
+.. rst-class:: sphx-glr-example-title
+
+.. _sphx_glr_tutorials__rendered_examples_dynamo_hierarchical_partitioner_example.py:
+
+
+.. _hierarchical_partitioner_example:
+
+Hierarchical Partitioner Example
+================================
+
+Basic example on how to use the hierarchical adjacency partitioner function and manually compile the partitioned model.
+Not yet available in the compile API.
+
+.. GENERATED FROM PYTHON SOURCE LINES 11-188
+
+.. code-block:: python
+
+
+ from typing import Any, Callable
+
+ import torch
+ import torch.nn as nn
+ import torch_tensorrt
+ from torch_tensorrt._enums import dtype
+ from torch_tensorrt.dynamo import partitioning
+ from torch_tensorrt.dynamo._compiler import convert_module
+ from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
+ DYNAMO_CONVERTERS as CONVERTERS,
+ )
+ from torch_tensorrt.dynamo.lowering import (
+ get_decompositions,
+ pre_export_lowering,
+ )
+ from torch_tensorrt.dynamo.partitioning._hierarchical_partitioner import (
+ hierarchical_adjacency_partition,
+ )
+ from torch_tensorrt.dynamo.utils import (
+ get_output_metadata,
+ )
+ from torchvision import models
+
+
+ class InductorModule(torch.nn.Module): # type: ignore[misc]
+ """Wrapper module for inductor compiled function."""
+
+ def __init__(self, func: Callable[..., Any]) -> None:
+ super().__init__()
+ self.func = func
+
+ def forward(self, *args: Any, **kwargs: Any) -> Any:
+ return self.func(*args, **kwargs)
+
+
+ class SimpleModel(nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.conv1 = nn.Conv2d(3, 64, kernel_size=3, padding=1)
+ self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
+ self.bn1 = nn.BatchNorm2d(64)
+ self.bn2 = nn.BatchNorm2d(128)
+
+ def forward(self, x):
+ x = self.conv1(x)
+ x = self.bn1(x)
+ x = torch.relu(x)
+ x = self.conv2(x)
+ x = self.bn2(x)
+ x = torch.relu(x)
+ return x
+
+
+ def main():
+ # Create model
+ model = SimpleModel().cuda()
+ # model = models.efficientnet_b0(pretrained=True).cuda()
+ model = model.eval()
+
+ # Create example input
+ example_input = torch.randn(1, 3, 224, 224).cuda()
+
+ exported_program = torch.export.export(model, (example_input,))
+ exported_program = pre_export_lowering(exported_program)
+ exported_program = exported_program.run_decompositions(get_decompositions())
+
+ gm = exported_program.module()
+
+ print("Original Model Structure:\n", gm)
+
+ original_output = model(example_input)
+
+ # 1. Partition the model into blocks that can be executed by different backends
+ partitioned_model, op_support = hierarchical_adjacency_partition(
+ gm,
+ min_block_size=1,
+ backend_priority=["inductor", "tensorrt"],
+ backend_support_map={
+ "inductor": {
+ "torch.ops.aten.convolution.default",
+ },
+ "tensorrt": CONVERTERS.keys(),
+ },
+ torch_executed_ops={
+ "torch.ops.aten._native_batch_norm_legit_no_training.default"
+ },
+ require_full_compilation=False,
+ skip_fusion=True,
+ )
+
+ print("1. Partitioned Model Structure:\n", partitioned_model)
+
+ # 2. Compile each submodule with the corresponding backend
+ submodule_node_dict = {}
+ for node in partitioned_model.graph.nodes:
+ if "_run_on_acc" not in node.name:
+ continue
+ submodule_node_dict[node.name] = node
+
+ # Store compiled replicas of Torch subgraphs
+ compiled_modules = {}
+
+ for name, _ in partitioned_model.named_children():
+ submodule = getattr(partitioned_model, name)
+ if not isinstance(submodule, torch.fx.graph_module.GraphModule):
+ continue
+
+ if "_run_on_acc" not in name:
+ submodule.to("cuda")
+ continue
+
+ if name not in submodule_node_dict:
+ raise ValueError(
+ f"node_name: {name} does not exist in the submodule node dictionary"
+ )
+
+ # set the submodule metadata back to the parent module_node
+ metadata_list = get_output_metadata(submodule)
+ assert len(metadata_list) > 0
+ metadata_keys = ["val", "tensor_meta"]
+ for key in metadata_keys:
+ if key not in submodule_node_dict[name].meta:
+ meta_val_list = [
+ metadata[key] for metadata in metadata_list if key in metadata
+ ]
+ submodule_node_dict[name].meta[key] = meta_val_list
+ break
+
+ # Get the submodule inputs for min, opt, max shapes of the graph inputs
+ submodule_inputs = partitioning.construct_submodule_inputs(submodule)
+ assert submodule_inputs is not None
+
+ # compile submodule with pytorch inductor backend
+ if "_run_on_acc_inductor" in name:
+ sub_inputs = []
+ for input in submodule_inputs:
+ sub_input = input.torch_tensor.to(
+ dtype.to(input.dtype, t=torch.dtype)
+ ).cuda()
+ sub_inputs.append(sub_input)
+
+ compiled_func = torch._inductor.compile(
+ submodule,
+ sub_inputs,
+ )
+ # Wrap the compiled function to be a torch.nn.Module
+ compiled_submodule = InductorModule(compiled_func)
+
+ # compile submodule with tensorrt backend
+ elif "_run_on_acc_tensorrt" in name:
+ compiled_submodule = convert_module(
+ submodule,
+ submodule_inputs,
+ name=name,
+ )
+ else:
+ raise ValueError(f"Unknown backend for submodule: {name}")
+
+ compiled_modules[name] = compiled_submodule
+
+ # Replace all FX Modules with compiled Modules
+ for name, compiled_module in compiled_modules.items():
+ setattr(partitioned_model, name, compiled_module)
+
+ print("2. Compiled Model Structure:\n", partitioned_model)
+
+ with torch.no_grad():
+ partitioned_output = partitioned_model(example_input)
+ print(
+ "3. Verify that Partitioned output == Original output:",
+ torch.allclose(partitioned_output, original_output, 1e-2, 1e-2),
+ )
+
+
+ if __name__ == "__main__":
+ main()
+
+
+.. rst-class:: sphx-glr-timing
+
+ **Total running time of the script:** ( 0 minutes 0.000 seconds)
+
+
+.. _sphx_glr_download_tutorials__rendered_examples_dynamo_hierarchical_partitioner_example.py:
+
+.. only:: html
+
+ .. container:: sphx-glr-footer sphx-glr-footer-example
+
+
+
+
+ .. container:: sphx-glr-download sphx-glr-download-python
+
+ :download:`Download Python source code: hierarchical_partitioner_example.py `
+
+ .. container:: sphx-glr-download sphx-glr-download-jupyter
+
+ :download:`Download Jupyter notebook: hierarchical_partitioner_example.ipynb `
+
+
+.. only:: html
+
+ .. rst-class:: sphx-glr-signature
+
+ `Gallery generated by Sphinx-Gallery `_
diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/index.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/index.rst.txt
index 8ee9bea380..fb2709eaa1 100644
--- a/docs/_sources/tutorials/_rendered_examples/dynamo/index.rst.txt
+++ b/docs/_sources/tutorials/_rendered_examples/dynamo/index.rst.txt
@@ -138,35 +138,35 @@ Model Zoo
.. raw:: html
-
+
.. only:: html
- .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_torch_export_cudagraphs_thumb.png
+ .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_engine_caching_bert_example_thumb.png
:alt:
- :ref:`sphx_glr_tutorials__rendered_examples_dynamo_torch_export_cudagraphs.py`
+ :ref:`sphx_glr_tutorials__rendered_examples_dynamo_engine_caching_bert_example.py`
.. raw:: html
-
Torch Export with Cudagraphs
+
Engine Caching (BERT)
.. raw:: html
-
+
.. only:: html
- .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_engine_caching_bert_example_thumb.png
+ .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_torch_export_cudagraphs_thumb.png
:alt:
- :ref:`sphx_glr_tutorials__rendered_examples_dynamo_engine_caching_bert_example.py`
+ :ref:`sphx_glr_tutorials__rendered_examples_dynamo_torch_export_cudagraphs.py`
.. raw:: html
-
Engine Caching (BERT)
+
Torch Export with Cudagraphs
@@ -223,120 +223,120 @@ Model Zoo
.. raw:: html
-
+
.. only:: html
- .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_torch_export_gpt2_thumb.png
+ .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_auto_generate_converters_thumb.png
:alt:
- :ref:`sphx_glr_tutorials__rendered_examples_dynamo_torch_export_gpt2.py`
+ :ref:`sphx_glr_tutorials__rendered_examples_dynamo_auto_generate_converters.py`
.. raw:: html
-
Compiling GPT2 using the dynamo backend
+
Automatically Generate a Converter for a Custom Kernel
.. raw:: html
-
+
.. only:: html
- .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_torch_export_llama2_thumb.png
+ .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_auto_generate_plugins_thumb.png
:alt:
- :ref:`sphx_glr_tutorials__rendered_examples_dynamo_torch_export_llama2.py`
+ :ref:`sphx_glr_tutorials__rendered_examples_dynamo_auto_generate_plugins.py`
.. raw:: html
-
Compiling Llama2 using the dynamo backend
+
Automatically Generate a Plugin for a Custom Kernel
.. raw:: html
-
+
.. only:: html
- .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_auto_generate_converters_thumb.png
+ .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_converter_overloading_thumb.png
:alt:
- :ref:`sphx_glr_tutorials__rendered_examples_dynamo_auto_generate_converters.py`
+ :ref:`sphx_glr_tutorials__rendered_examples_dynamo_converter_overloading.py`
.. raw:: html
-
Automatically Generate a Converter for a Custom Kernel
+
Overloading Torch-TensorRT Converters with Custom Converters
.. raw:: html
-
+
.. only:: html
- .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_auto_generate_plugins_thumb.png
+ .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_aot_plugin_thumb.png
:alt:
- :ref:`sphx_glr_tutorials__rendered_examples_dynamo_auto_generate_plugins.py`
+ :ref:`sphx_glr_tutorials__rendered_examples_dynamo_aot_plugin.py`
.. raw:: html
-
Automatically Generate a Plugin for a Custom Kernel
+
Torch-TensorRT supports falling back to PyTorch implementations of operations in the case that Torch-TensorRT
.. raw:: html
-
+
.. only:: html
- .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_converter_overloading_thumb.png
+ .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_hierarchical_partitioner_example_thumb.png
:alt:
- :ref:`sphx_glr_tutorials__rendered_examples_dynamo_converter_overloading.py`
+ :ref:`sphx_glr_tutorials__rendered_examples_dynamo_hierarchical_partitioner_example.py`
.. raw:: html
-
Overloading Torch-TensorRT Converters with Custom Converters
+
Hierarchical Partitioner Example
.. raw:: html
-
+
.. only:: html
- .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_weight_streaming_example_thumb.png
+ .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_mutable_torchtrt_module_example_thumb.png
:alt:
- :ref:`sphx_glr_tutorials__rendered_examples_dynamo_weight_streaming_example.py`
+ :ref:`sphx_glr_tutorials__rendered_examples_dynamo_mutable_torchtrt_module_example.py`
.. raw:: html
-
Weight Streaming
+
Mutable Torch TensorRT Module
.. raw:: html
-
+
.. only:: html
- .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_mutable_torchtrt_module_example_thumb.png
+ .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_weight_streaming_example_thumb.png
:alt:
- :ref:`sphx_glr_tutorials__rendered_examples_dynamo_mutable_torchtrt_module_example.py`
+ :ref:`sphx_glr_tutorials__rendered_examples_dynamo_weight_streaming_example.py`
.. raw:: html
-
Mutable Torch TensorRT Module
+
Weight Streaming
@@ -439,18 +439,18 @@ Model Zoo
/tutorials/_rendered_examples/dynamo/torch_compile_transformers_example
/tutorials/_rendered_examples/dynamo/torch_compile_gpt2
/tutorials/_rendered_examples/dynamo/torch_compile_advanced_usage
- /tutorials/_rendered_examples/dynamo/torch_export_cudagraphs
/tutorials/_rendered_examples/dynamo/engine_caching_bert_example
+ /tutorials/_rendered_examples/dynamo/torch_export_cudagraphs
/tutorials/_rendered_examples/dynamo/pre_allocated_output_example
/tutorials/_rendered_examples/dynamo/torch_compile_resnet_example
/tutorials/_rendered_examples/dynamo/torch_export_flux_dev
- /tutorials/_rendered_examples/dynamo/torch_export_gpt2
- /tutorials/_rendered_examples/dynamo/torch_export_llama2
/tutorials/_rendered_examples/dynamo/auto_generate_converters
/tutorials/_rendered_examples/dynamo/auto_generate_plugins
/tutorials/_rendered_examples/dynamo/converter_overloading
- /tutorials/_rendered_examples/dynamo/weight_streaming_example
+ /tutorials/_rendered_examples/dynamo/aot_plugin
+ /tutorials/_rendered_examples/dynamo/hierarchical_partitioner_example
/tutorials/_rendered_examples/dynamo/mutable_torchtrt_module_example
+ /tutorials/_rendered_examples/dynamo/weight_streaming_example
/tutorials/_rendered_examples/dynamo/torch_export_sam2
/tutorials/_rendered_examples/dynamo/vgg16_ptq
/tutorials/_rendered_examples/dynamo/engine_caching_example
diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/llama2_flashinfer_rmsnorm.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/llama2_flashinfer_rmsnorm.rst.txt
index c7312ea5fd..740ed8f468 100644
--- a/docs/_sources/tutorials/_rendered_examples/dynamo/llama2_flashinfer_rmsnorm.rst.txt
+++ b/docs/_sources/tutorials/_rendered_examples/dynamo/llama2_flashinfer_rmsnorm.rst.txt
@@ -33,7 +33,7 @@ Key features:
This example illustrates advanced extensibility in Torch-TensorRT through automatic plugin generation and operator lowering customization.
-.. GENERATED FROM PYTHON SOURCE LINES 17-259
+.. GENERATED FROM PYTHON SOURCE LINES 17-258
.. code-block:: python
@@ -272,7 +272,6 @@ This example illustrates advanced extensibility in Torch-TensorRT through automa
disable_tf32=True,
use_explicit_typing=False,
use_fp32_acc=True,
- # debug=True,
)
input_ids = input_ids.to(DEVICE)
diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/mutable_torchtrt_module_example.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/mutable_torchtrt_module_example.rst.txt
index e3c00f2c64..52765a7bef 100644
--- a/docs/_sources/tutorials/_rendered_examples/dynamo/mutable_torchtrt_module_example.rst.txt
+++ b/docs/_sources/tutorials/_rendered_examples/dynamo/mutable_torchtrt_module_example.rst.txt
@@ -35,7 +35,7 @@ In this tutorial, we are going to walk through
3. Integration with Huggingface pipeline in LoRA use case
4. Usage of dynamic shape with Mutable Torch TensorRT Module
-.. GENERATED FROM PYTHON SOURCE LINES 21-30
+.. GENERATED FROM PYTHON SOURCE LINES 21-31
.. code-block:: python
@@ -43,23 +43,24 @@ In this tutorial, we are going to walk through
import torch
import torch_tensorrt as torch_trt
import torchvision.models as models
+ from diffusers import DiffusionPipeline
np.random.seed(5)
torch.manual_seed(5)
inputs = [torch.rand((1, 3, 224, 224)).to("cuda")]
-.. GENERATED FROM PYTHON SOURCE LINES 31-33
+.. GENERATED FROM PYTHON SOURCE LINES 32-34
Initialize the Mutable Torch TensorRT Module with settings.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. GENERATED FROM PYTHON SOURCE LINES 33-44
+.. GENERATED FROM PYTHON SOURCE LINES 34-44
.. code-block:: python
settings = {
- "use_python": False,
+ "use_python_runtime": False,
"enabled_precisions": {torch.float32},
"immutable_weights": False,
}
@@ -69,7 +70,6 @@ Initialize the Mutable Torch TensorRT Module with settings.
# You can use the mutable module just like the original pytorch module. The compilation happens while you first call the mutable module.
mutable_module(*inputs)
-
.. GENERATED FROM PYTHON SOURCE LINES 45-47
Make modifications to the mutable module.
@@ -118,18 +118,16 @@ Saving Mutable Torch TensorRT Module
Stable Diffusion with Huggingface
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. GENERATED FROM PYTHON SOURCE LINES 75-146
+.. GENERATED FROM PYTHON SOURCE LINES 75-144
.. code-block:: python
- from diffusers import DiffusionPipeline
with torch.no_grad():
settings = {
"use_python_runtime": True,
"enabled_precisions": {torch.float16},
- "debug": True,
"immutable_weights": False,
}
@@ -156,7 +154,7 @@ Stable Diffusion with Huggingface
"text_embeds": {0: BATCH},
"time_ids": {0: BATCH},
},
- "return_dict": False,
+ "return_dict": None,
}
pipe.unet.set_expected_dynamic_shape_range(
args_dynamic_shapes, kwargs_dynamic_shapes
@@ -194,7 +192,7 @@ Stable Diffusion with Huggingface
-.. GENERATED FROM PYTHON SOURCE LINES 147-153
+.. GENERATED FROM PYTHON SOURCE LINES 145-151
Use Mutable Torch TensorRT module with dynamic shape
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -203,7 +201,7 @@ and should not omit any entries (except None in the kwarg_inputs). If there is a
If the dynamic shape is not required for an input, an empty dictionary should be given as the shape hint for that input.
Note that you should exclude keyword arguments with value None as those will be filtered out.
-.. GENERATED FROM PYTHON SOURCE LINES 153-196
+.. GENERATED FROM PYTHON SOURCE LINES 151-194
.. code-block:: python
@@ -238,7 +236,7 @@ Note that you should exclude keyword arguments with value None as those will be
}, # a's shape does not change so we give it an empty dict
}
# Export the model first with custom dynamic shape constraints
- model = torch_trt.MutableTorchTensorRTModule(model, debug=True, min_block_size=1)
+ model = torch_trt.MutableTorchTensorRTModule(model, min_block_size=1)
model.set_expected_dynamic_shape_range(args_dynamic_shapes, kwarg_dynamic_shapes)
# Compile
model(*inputs, **kwargs)
@@ -251,13 +249,13 @@ Note that you should exclude keyword arguments with value None as those will be
model(*inputs_2, **kwargs_2)
-.. GENERATED FROM PYTHON SOURCE LINES 197-200
+.. GENERATED FROM PYTHON SOURCE LINES 195-198
Use Mutable Torch TensorRT module with persistent cache
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Leveraging engine caching, we are able to shortcut the engine compilation and save much time.
-.. GENERATED FROM PYTHON SOURCE LINES 200-245
+.. GENERATED FROM PYTHON SOURCE LINES 198-242
.. code-block:: python
@@ -276,7 +274,6 @@ Leveraging engine caching, we are able to shortcut the engine compilation and sa
model,
use_python_runtime=True,
enabled_precisions={torch.float},
- debug=True,
min_block_size=1,
immutable_weights=False,
cache_built_engines=True,
diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/refit_engine_example.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/refit_engine_example.rst.txt
index c0acbf4cb8..ffc06f716d 100644
--- a/docs/_sources/tutorials/_rendered_examples/dynamo/refit_engine_example.rst.txt
+++ b/docs/_sources/tutorials/_rendered_examples/dynamo/refit_engine_example.rst.txt
@@ -80,7 +80,7 @@ these setttings will not be able to be refit.
In this case we are going to compile a ResNet18 model with randomly initialized weights and save it.
-.. GENERATED FROM PYTHON SOURCE LINES 55-80
+.. GENERATED FROM PYTHON SOURCE LINES 55-78
.. code-block:: python
@@ -88,7 +88,6 @@ In this case we are going to compile a ResNet18 model with randomly initialized
model = models.resnet18(pretrained=False).eval().to("cuda")
exp_program = torch.export.export(model, tuple(inputs))
enabled_precisions = {torch.float}
- debug = False
workspace_size = 20 << 30
min_block_size = 0
use_python_runtime = False
@@ -98,7 +97,6 @@ In this case we are going to compile a ResNet18 model with randomly initialized
tuple(inputs),
use_python_runtime=use_python_runtime,
enabled_precisions=enabled_precisions,
- debug=debug,
min_block_size=min_block_size,
torch_executed_ops=torch_executed_ops,
immutable_weights=False,
@@ -110,7 +108,7 @@ In this case we are going to compile a ResNet18 model with randomly initialized
-.. GENERATED FROM PYTHON SOURCE LINES 81-88
+.. GENERATED FROM PYTHON SOURCE LINES 79-86
Refit the Program with Pretrained Weights
------------------------------------------
@@ -120,7 +118,7 @@ refit the model with the pretrained weights. This is done by setting up another
with the target weights and exporting it as an ExportedProgram. Then the ``refit_module_weights``
function is used to update the weights of the compiled module with the new weights.
-.. GENERATED FROM PYTHON SOURCE LINES 88-112
+.. GENERATED FROM PYTHON SOURCE LINES 86-111
.. code-block:: python
@@ -140,6 +138,7 @@ function is used to update the weights of the compiled module with the new weigh
)
# Check the output
+ model2.to("cuda")
expected_outputs, refitted_outputs = exp_program2.module()(*inputs), new_trt_gm(*inputs)
for expected_output, refitted_output in zip(expected_outputs, refitted_outputs):
assert torch.allclose(
@@ -149,7 +148,7 @@ function is used to update the weights of the compiled module with the new weigh
print("Refit successfully!")
-.. GENERATED FROM PYTHON SOURCE LINES 113-141
+.. GENERATED FROM PYTHON SOURCE LINES 112-140
Advanced Usage
-----------------------------
diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/torch_compile_advanced_usage.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/torch_compile_advanced_usage.rst.txt
index 1bcf8a50c5..132b701731 100644
--- a/docs/_sources/tutorials/_rendered_examples/dynamo/torch_compile_advanced_usage.rst.txt
+++ b/docs/_sources/tutorials/_rendered_examples/dynamo/torch_compile_advanced_usage.rst.txt
@@ -109,7 +109,7 @@ Compilation with `torch.compile` Using Custom Settings
model_half = Model().eval().cuda()
-.. GENERATED FROM PYTHON SOURCE LINES 66-92
+.. GENERATED FROM PYTHON SOURCE LINES 66-91
.. code-block:: python
@@ -123,7 +123,6 @@ Compilation with `torch.compile` Using Custom Settings
# py/torch_tensorrt/dynamo/_settings.py
backend_kwargs = {
"enabled_precisions": {torch.half},
- "debug": True,
"min_block_size": 2,
"torch_executed_ops": {"torch.ops.aten.sub.Tensor"},
"optimization_level": 4,
@@ -140,12 +139,12 @@ Compilation with `torch.compile` Using Custom Settings
optimized_model_custom(*sample_inputs_half)
-.. GENERATED FROM PYTHON SOURCE LINES 93-95
+.. GENERATED FROM PYTHON SOURCE LINES 92-94
Cleanup
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. GENERATED FROM PYTHON SOURCE LINES 95-99
+.. GENERATED FROM PYTHON SOURCE LINES 94-98
.. code-block:: python
@@ -154,7 +153,7 @@ Cleanup
torch._dynamo.reset()
-.. GENERATED FROM PYTHON SOURCE LINES 100-109
+.. GENERATED FROM PYTHON SOURCE LINES 99-108
Cuda Driver Error Note
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/torch_compile_resnet_example.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/torch_compile_resnet_example.rst.txt
index d57a490ff7..c83d0c9452 100644
--- a/docs/_sources/tutorials/_rendered_examples/dynamo/torch_compile_resnet_example.rst.txt
+++ b/docs/_sources/tutorials/_rendered_examples/dynamo/torch_compile_resnet_example.rst.txt
@@ -55,7 +55,7 @@ Imports and Model Definition
Optional Input Arguments to `torch_tensorrt.compile`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. GENERATED FROM PYTHON SOURCE LINES 27-44
+.. GENERATED FROM PYTHON SOURCE LINES 27-42
.. code-block:: python
@@ -63,8 +63,6 @@ Optional Input Arguments to `torch_tensorrt.compile`
# Enabled precision for TensorRT optimization
enabled_precisions = {torch.half}
- # Whether to print verbose logs
- debug = True
# Workspace size for TensorRT
workspace_size = 20 << 30
@@ -77,12 +75,12 @@ Optional Input Arguments to `torch_tensorrt.compile`
torch_executed_ops = {}
-.. GENERATED FROM PYTHON SOURCE LINES 45-47
+.. GENERATED FROM PYTHON SOURCE LINES 43-45
Compilation with `torch_tensorrt.compile`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. GENERATED FROM PYTHON SOURCE LINES 47-60
+.. GENERATED FROM PYTHON SOURCE LINES 45-57
.. code-block:: python
@@ -93,24 +91,23 @@ Compilation with `torch_tensorrt.compile`
ir="torch_compile",
inputs=inputs,
enabled_precisions=enabled_precisions,
- debug=debug,
workspace_size=workspace_size,
min_block_size=min_block_size,
torch_executed_ops=torch_executed_ops,
)
-.. GENERATED FROM PYTHON SOURCE LINES 61-63
+.. GENERATED FROM PYTHON SOURCE LINES 58-60
Equivalently, we could have run the above via the torch.compile frontend, as so:
`optimized_model = torch.compile(model, backend="torch_tensorrt", options={"enabled_precisions": enabled_precisions, ...}); optimized_model(*inputs)`
-.. GENERATED FROM PYTHON SOURCE LINES 65-67
+.. GENERATED FROM PYTHON SOURCE LINES 62-64
Inference
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. GENERATED FROM PYTHON SOURCE LINES 67-72
+.. GENERATED FROM PYTHON SOURCE LINES 64-69
.. code-block:: python
@@ -120,7 +117,7 @@ Inference
new_outputs = optimized_model(*new_inputs)
-.. GENERATED FROM PYTHON SOURCE LINES 73-78
+.. GENERATED FROM PYTHON SOURCE LINES 70-75
.. code-block:: python
@@ -130,12 +127,12 @@ Inference
new_batch_size_outputs = optimized_model(*new_batch_size_inputs)
-.. GENERATED FROM PYTHON SOURCE LINES 79-81
+.. GENERATED FROM PYTHON SOURCE LINES 76-78
Avoid recompilation by specifying dynamic shapes before Torch-TRT compilation
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. GENERATED FROM PYTHON SOURCE LINES 81-121
+.. GENERATED FROM PYTHON SOURCE LINES 78-117
.. code-block:: python
@@ -149,7 +146,6 @@ Avoid recompilation by specifying dynamic shapes before Torch-TRT compilation
ir="torch_compile",
inputs=inputs_bs8,
enabled_precisions=enabled_precisions,
- debug=debug,
workspace_size=workspace_size,
min_block_size=min_block_size,
torch_executed_ops=torch_executed_ops,
diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/torch_compile_transformers_example.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/torch_compile_transformers_example.rst.txt
index f7f6a67020..21a5902ea7 100644
--- a/docs/_sources/tutorials/_rendered_examples/dynamo/torch_compile_transformers_example.rst.txt
+++ b/docs/_sources/tutorials/_rendered_examples/dynamo/torch_compile_transformers_example.rst.txt
@@ -59,7 +59,7 @@ Imports and Model Definition
Optional Input Arguments to `torch_tensorrt.compile`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. GENERATED FROM PYTHON SOURCE LINES 31-48
+.. GENERATED FROM PYTHON SOURCE LINES 31-45
.. code-block:: python
@@ -67,9 +67,6 @@ Optional Input Arguments to `torch_tensorrt.compile`
# Enabled precision for TensorRT optimization
enabled_precisions = {torch.float}
- # Whether to print verbose logs
- debug = True
-
# Workspace size for TensorRT
workspace_size = 20 << 30
@@ -81,12 +78,12 @@ Optional Input Arguments to `torch_tensorrt.compile`
torch_executed_ops = {}
-.. GENERATED FROM PYTHON SOURCE LINES 49-51
+.. GENERATED FROM PYTHON SOURCE LINES 46-48
Compilation with `torch.compile`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. GENERATED FROM PYTHON SOURCE LINES 51-70
+.. GENERATED FROM PYTHON SOURCE LINES 48-66
.. code-block:: python
@@ -94,7 +91,6 @@ Compilation with `torch.compile`
# Define backend compilation keyword arguments
compilation_kwargs = {
"enabled_precisions": enabled_precisions,
- "debug": debug,
"workspace_size": workspace_size,
"min_block_size": min_block_size,
"torch_executed_ops": torch_executed_ops,
@@ -110,17 +106,17 @@ Compilation with `torch.compile`
optimized_model(*inputs)
-.. GENERATED FROM PYTHON SOURCE LINES 71-73
+.. GENERATED FROM PYTHON SOURCE LINES 67-69
Equivalently, we could have run the above via the convenience frontend, as so:
`torch_tensorrt.compile(model, ir="torch_compile", inputs=inputs, **compilation_kwargs)`
-.. GENERATED FROM PYTHON SOURCE LINES 75-77
+.. GENERATED FROM PYTHON SOURCE LINES 71-73
Inference
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. GENERATED FROM PYTHON SOURCE LINES 77-85
+.. GENERATED FROM PYTHON SOURCE LINES 73-81
.. code-block:: python
@@ -133,7 +129,7 @@ Inference
new_outputs = optimized_model(*new_inputs)
-.. GENERATED FROM PYTHON SOURCE LINES 86-94
+.. GENERATED FROM PYTHON SOURCE LINES 82-90
.. code-block:: python
@@ -146,12 +142,12 @@ Inference
new_outputs = optimized_model(*new_inputs)
-.. GENERATED FROM PYTHON SOURCE LINES 95-97
+.. GENERATED FROM PYTHON SOURCE LINES 91-93
Cleanup
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. GENERATED FROM PYTHON SOURCE LINES 97-101
+.. GENERATED FROM PYTHON SOURCE LINES 93-97
.. code-block:: python
@@ -160,7 +156,7 @@ Cleanup
torch._dynamo.reset()
-.. GENERATED FROM PYTHON SOURCE LINES 102-111
+.. GENERATED FROM PYTHON SOURCE LINES 98-107
Cuda Driver Error Note
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/torch_export_flux_dev.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/torch_export_flux_dev.rst.txt
index 1b21d941c9..9b6bb7e3e0 100644
--- a/docs/_sources/tutorials/_rendered_examples/dynamo/torch_export_flux_dev.rst.txt
+++ b/docs/_sources/tutorials/_rendered_examples/dynamo/torch_export_flux_dev.rst.txt
@@ -37,12 +37,20 @@ To run this demo, you need to have access to Flux model (request for access if y
There are different components of the ``FLUX.1-dev`` pipeline such as ``transformer``, ``vae``, ``text_encoder``, ``tokenizer`` and ``scheduler``. In this example,
we demonstrate optimizing the ``transformer`` component of the model (which typically consumes >95% of the e2e diffusion latency)
-.. GENERATED FROM PYTHON SOURCE LINES 23-25
+.. GENERATED FROM PYTHON SOURCE LINES 21-24
+
+.. code-block:: python
+
+
+ import register_sdpa # Register SDPA as a standalone operator
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 25-27
Import the following libraries
-----------------------------
-.. GENERATED FROM PYTHON SOURCE LINES 25-30
+.. GENERATED FROM PYTHON SOURCE LINES 27-32
.. code-block:: python
@@ -52,7 +60,7 @@ Import the following libraries
from torch.export._trace import _export
-.. GENERATED FROM PYTHON SOURCE LINES 31-36
+.. GENERATED FROM PYTHON SOURCE LINES 33-38
Define the FLUX-1.dev model
-----------------------------
@@ -60,7 +68,7 @@ Load the ``FLUX-1.dev`` pretrained pipeline using ``FluxPipeline`` class.
``FluxPipeline`` includes different components such as ``transformer``, ``vae``, ``text_encoder``, ``tokenizer`` and ``scheduler`` necessary
to generate an image. We load the weights in ``FP16`` precision using ``torch_dtype`` argument
-.. GENERATED FROM PYTHON SOURCE LINES 36-46
+.. GENERATED FROM PYTHON SOURCE LINES 38-48
.. code-block:: python
@@ -75,14 +83,14 @@ to generate an image. We load the weights in ``FP16`` precision using ``torch_dt
backbone = pipe.transformer.to(DEVICE)
-.. GENERATED FROM PYTHON SOURCE LINES 47-51
+.. GENERATED FROM PYTHON SOURCE LINES 49-53
Export the backbone using torch.export
--------------------------------------------------
Define the dummy inputs and their respective dynamic shapes. We export the transformer backbone with dynamic shapes with a ``batch_size=2``
due to `0/1 specialization
`_
-.. GENERATED FROM PYTHON SOURCE LINES 51-96
+.. GENERATED FROM PYTHON SOURCE LINES 53-98
.. code-block:: python
@@ -132,7 +140,7 @@ due to `0/1 specialization `_ and only float32 precision is allowed in enabled_precisions option
-.. GENERATED FROM PYTHON SOURCE LINES 93-108
+.. GENERATED FROM PYTHON SOURCE LINES 129-144
.. code-block:: python
@@ -137,7 +173,7 @@ the engine with weight streaming feature. use_explicit_typing=True option create
_ = time_generate(trt_model, input_tensors, osl, 3)
-.. GENERATED FROM PYTHON SOURCE LINES 109-115
+.. GENERATED FROM PYTHON SOURCE LINES 145-151
Running with automatic budget size
----------------------------------
@@ -146,7 +182,7 @@ Once you specify the enable_weight_streaming compile option, automatic budget si
This automatic size may not always provide the optimal solution because the automatically determined
budget lacks insight into the user's specific memory constraints and usage patterns
-.. GENERATED FROM PYTHON SOURCE LINES 115-128
+.. GENERATED FROM PYTHON SOURCE LINES 151-164
.. code-block:: python
@@ -164,7 +200,7 @@ budget lacks insight into the user's specific memory constraints and usage patte
)
-.. GENERATED FROM PYTHON SOURCE LINES 129-137
+.. GENERATED FROM PYTHON SOURCE LINES 165-173
Running with weight streaming context manager
----------------------------------
@@ -175,7 +211,7 @@ The permissible range for the budget size is from 0 to ctx.total_device_budget.
equal to ctx.total_device_budget will disable weight streaming.
If multiple trt engines are created, budgets are distributed proportionally
-.. GENERATED FROM PYTHON SOURCE LINES 137-175
+.. GENERATED FROM PYTHON SOURCE LINES 173-211
.. code-block:: python
diff --git a/docs/_sources/tutorials/_rendered_examples/index.rst.txt b/docs/_sources/tutorials/_rendered_examples/index.rst.txt
index bcdac2769d..c9b43f6a58 100644
--- a/docs/_sources/tutorials/_rendered_examples/index.rst.txt
+++ b/docs/_sources/tutorials/_rendered_examples/index.rst.txt
@@ -150,35 +150,35 @@ Model Zoo
.. raw:: html
-
+
.. only:: html
- .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_torch_export_cudagraphs_thumb.png
+ .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_engine_caching_bert_example_thumb.png
:alt:
- :ref:`sphx_glr_tutorials__rendered_examples_dynamo_torch_export_cudagraphs.py`
+ :ref:`sphx_glr_tutorials__rendered_examples_dynamo_engine_caching_bert_example.py`
.. raw:: html
-
Torch Export with Cudagraphs
+
Engine Caching (BERT)
.. raw:: html
-
+
.. only:: html
- .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_engine_caching_bert_example_thumb.png
+ .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_torch_export_cudagraphs_thumb.png
:alt:
- :ref:`sphx_glr_tutorials__rendered_examples_dynamo_engine_caching_bert_example.py`
+ :ref:`sphx_glr_tutorials__rendered_examples_dynamo_torch_export_cudagraphs.py`
.. raw:: html
-
Engine Caching (BERT)
+
Torch Export with Cudagraphs
@@ -235,120 +235,120 @@ Model Zoo
.. raw:: html
-
+
.. only:: html
- .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_torch_export_gpt2_thumb.png
+ .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_auto_generate_converters_thumb.png
:alt:
- :ref:`sphx_glr_tutorials__rendered_examples_dynamo_torch_export_gpt2.py`
+ :ref:`sphx_glr_tutorials__rendered_examples_dynamo_auto_generate_converters.py`
.. raw:: html
-
Compiling GPT2 using the dynamo backend
+
Automatically Generate a Converter for a Custom Kernel
.. raw:: html
-
+
.. only:: html
- .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_torch_export_llama2_thumb.png
+ .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_auto_generate_plugins_thumb.png
:alt:
- :ref:`sphx_glr_tutorials__rendered_examples_dynamo_torch_export_llama2.py`
+ :ref:`sphx_glr_tutorials__rendered_examples_dynamo_auto_generate_plugins.py`
.. raw:: html
-
Compiling Llama2 using the dynamo backend
+
Automatically Generate a Plugin for a Custom Kernel
.. raw:: html
-
+
.. only:: html
- .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_auto_generate_converters_thumb.png
+ .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_converter_overloading_thumb.png
:alt:
- :ref:`sphx_glr_tutorials__rendered_examples_dynamo_auto_generate_converters.py`
+ :ref:`sphx_glr_tutorials__rendered_examples_dynamo_converter_overloading.py`
.. raw:: html
-
Automatically Generate a Converter for a Custom Kernel
+
Overloading Torch-TensorRT Converters with Custom Converters
.. raw:: html
-
+
.. only:: html
- .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_auto_generate_plugins_thumb.png
+ .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_aot_plugin_thumb.png
:alt:
- :ref:`sphx_glr_tutorials__rendered_examples_dynamo_auto_generate_plugins.py`
+ :ref:`sphx_glr_tutorials__rendered_examples_dynamo_aot_plugin.py`
.. raw:: html
-
Automatically Generate a Plugin for a Custom Kernel
+
Torch-TensorRT supports falling back to PyTorch implementations of operations in the case that Torch-TensorRT
.. raw:: html
-
+
.. only:: html
- .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_converter_overloading_thumb.png
+ .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_hierarchical_partitioner_example_thumb.png
:alt:
- :ref:`sphx_glr_tutorials__rendered_examples_dynamo_converter_overloading.py`
+ :ref:`sphx_glr_tutorials__rendered_examples_dynamo_hierarchical_partitioner_example.py`
.. raw:: html
-
Overloading Torch-TensorRT Converters with Custom Converters
+
Hierarchical Partitioner Example
.. raw:: html
-
+
.. only:: html
- .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_weight_streaming_example_thumb.png
+ .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_mutable_torchtrt_module_example_thumb.png
:alt:
- :ref:`sphx_glr_tutorials__rendered_examples_dynamo_weight_streaming_example.py`
+ :ref:`sphx_glr_tutorials__rendered_examples_dynamo_mutable_torchtrt_module_example.py`
.. raw:: html
-
Weight Streaming
+
Mutable Torch TensorRT Module
.. raw:: html
-
+
.. only:: html
- .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_mutable_torchtrt_module_example_thumb.png
+ .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_weight_streaming_example_thumb.png
:alt:
- :ref:`sphx_glr_tutorials__rendered_examples_dynamo_mutable_torchtrt_module_example.py`
+ :ref:`sphx_glr_tutorials__rendered_examples_dynamo_weight_streaming_example.py`
.. raw:: html
-
Mutable Torch TensorRT Module
+
Weight Streaming
diff --git a/docs/_sources/tutorials/compile_hf_models.rst.txt b/docs/_sources/tutorials/compile_hf_models.rst.txt
new file mode 100644
index 0000000000..f6da87b145
--- /dev/null
+++ b/docs/_sources/tutorials/compile_hf_models.rst.txt
@@ -0,0 +1,218 @@
+.. _compile_hf_models:
+
+Compiling LLM models from Huggingface
+======================================
+
+This tutorial walks you through how to compile LLM models from Huggingface using Torch-TensorRT. We also introduce KV caching in Torch-TensorRT which can greatly improve the performance of LLM inference.
+The code is available in the `tools/llm
`_ directory. We use the ``run_llm.py`` script to compile the model, generate outputs, and measure the performance.
+
+.. note::
+ This is an **experimental release** and APIs may change in future versions.
+
+.. note::
+ The compilation scripts and tutorials for Llama-2-7b-chat-hf and gpt2 models have been consolidated into the unified ``run_llm.py`` script located in the `tools/llm `_ directory.
+
+Overview of tools/llm Directory
+-------------------------------
+
+The ``tools/llm`` directory provides the following tools to compile LLM models from Huggingface:
+
+* **run_llm.py**: Main entry point for compiling models, generating outputs, and benchmarking
+* **Static Cache Utilities**: ``static_cache_v1.py`` and ``static_cache_v2.py`` for KV cache optimization
+* **SDPA Attention**: ``sdpa_converter.py`` and ``register_sdpa.py`` for registering the scaled dot-product attention converter and lowering pass.
+* **Testing Components**: Model-specific test files for validation
+* **Utility Functions**: ``utils.py`` and ``cache_utils.py`` for common operations
+
+Supported Models
+----------------
+We have officially verified support for the following LLM families:
+
+.. list-table::
+ :widths: 20 40 20 20
+ :header-rows: 1
+
+ * - Model Series
+ - HuggingFace Model Card
+ - Precision
+ - KV Cache Support ?
+ * - GPT-2
+ - gpt2
+ - FP16, FP32
+ - Yes
+ * - LLaMA 2
+ - meta-llama/Llama-2-7b-chat-hf
+ - FP16, FP32
+ - Yes
+ * - LLaMA 3.1
+ - meta-llama/Llama-3.1-8B-Instruct
+ - FP16, FP32
+ - Yes
+ * - LLaMA 3.2
+ - | meta-llama/Llama-3.2-1B-Instruct
+ | meta-llama/Llama-3.2-3B-Instruct
+ - FP16, FP32
+ - Yes
+ * - Qwen 2.5
+ - | Qwen/Qwen2.5-0.5B-Instruct
+ | Qwen/Qwen2.5-1.5B-Instruct
+ | Qwen/Qwen2.5-3B-Instruct
+ | Qwen/Qwen2.5-7B-Instruct
+ - FP16, FP32
+ - Yes
+
+Getting Started with run_llm.py
+-------------------------------
+
+The main entry point is ``run_llm.py``, which provides a complete workflow for model compilation and benchmarking.
+
+Basic Usage
+^^^^^^^^^^^
+
+.. code-block:: bash
+
+ python tools/llm/run_llm.py \
+ --model meta-llama/Llama-3.2-1B-Instruct \
+ --prompt "What is parallel programming?" \
+ --precision FP16 \
+ --num_tokens 128 \
+ --cache static_v2 \
+ --benchmark
+
+Key Arguments
+^^^^^^^^^^^^^
+
+* ``--model``: Name or path of the HuggingFace LLM
+* ``--tokenizer``: (Optional) Tokenizer name; defaults to model name
+* ``--prompt``: Input prompt for text generation
+* ``--precision``: Precision mode (``FP16``, ``FP32``)
+* ``--num_tokens``: Number of output tokens to generate
+* ``--cache``: KV cache type (``static_v1``, ``static_v2``, or empty for no KV caching)
+* ``--benchmark``: Enable benchmarking mode for performance comparison
+* ``--enable_pytorch_run``: Also run and compare PyTorch baseline
+
+
+Other Usage Examples
+^^^^^^^^^^^^^^^^^^^^
+.. code-block:: bash
+
+ # Compare different models performance
+ python tools/llm/run_llm.py --model gpt2 --benchmark --enable_pytorch_run
+ python tools/llm/run_llm.py --model meta-llama/Llama-3.2-1B-Instruct --benchmark --enable_pytorch_run
+
+ # Generate the outputs (disable benchmarking) by specifying the number of tokens to generate. Default = 128
+ python tools/llm/run_llm.py --model gpt2 --prompt "What is parallel programming?" --num_tokens 128
+ python tools/llm/run_llm.py --model meta-llama/Llama-3.2-1B-Instruct --prompt "What is parallel programming?" --num_tokens 128
+
+ # Test different caching approaches
+ python tools/llm/run_llm.py --model meta-llama/Llama-3.2-1B-Instruct --cache static_v1
+ python tools/llm/run_llm.py --model meta-llama/Llama-3.2-1B-Instruct --cache static_v2
+
+ # Compare FP16 vs FP32 performance
+ python tools/llm/run_llm.py --model Qwen/Qwen2.5-1.5B-Instruct --precision FP16 --benchmark
+ python tools/llm/run_llm.py --model Qwen/Qwen2.5-1.5B-Instruct --precision FP32 --benchmark
+
+
+KV Caching in Torch-TensorRT
+---------------------------------
+
+We provide two versions of static KV caching: `static_cache_v1 `_ and `static_cache_v2 `_.
+In both implementations, static KV cache tensors are added as model inputs/outputs rather than stored as external memory.
+The length of the KV cache is the input sequence length plus the output sequence length (specified by ``--num_tokens``); the number of heads and the head dimension are determined by the model config.
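+
+As a purely illustrative sketch, the cache tensors could be sized as follows (the names and numbers below are hypothetical; the real values are derived from the model config and the CLI arguments):
+
+.. code-block:: python
+
+    import torch
+
+    # Hypothetical values; in practice these come from the model config and --num_tokens
+    batch_size, num_kv_heads, head_dim = 1, 8, 64
+    isl, osl = 16, 128                 # input and output sequence lengths
+    cache_len = isl + osl              # length of the static KV cache
+
+    # Zero-initialized key/value cache tensors, exposed as extra model inputs/outputs
+    key_cache = torch.zeros(batch_size, num_kv_heads, cache_len, head_dim, dtype=torch.float16)
+    value_cache = torch.zeros_like(key_cache)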
+
+Static Cache v1
+^^^^^^^^^^^^^^^^
+
+``static_cache_v1.py`` implements the KV cache in the model graph as follows:
+
+.. code-block:: python
+
+ class StaticCacheV1Model(nn.Module):
+ def __init__(self):
+ super().__init__()
+
+ def forward(self, q, k, v, key_cache, value_cache, start_idx, end_idx, is_causal=True):
+ # Concatenate new key/value pairs with existing cache
+ new_key_cache = torch.cat((key_cache[:, :, :start_idx, :], k, key_cache[:, :, end_idx:, :]), dim=2)
+ new_value_cache = torch.cat((value_cache[:, :, :start_idx, :], v, value_cache[:, :, end_idx:, :]), dim=2)
+
+ # Compute attention using the updated cache
+ attn_output = torch._C._nn.scaled_dot_product_attention(
+ q,
+ new_key_cache[:, :, :end_idx, :],
+ new_value_cache[:, :, :end_idx, :],
+ dropout_p=0.0,
+ is_causal=is_causal
+ )
+
+ return attn_output, new_key_cache, new_value_cache
+
+In the code above, the new key/value pairs are concatenated with the existing cache to update it. Attention is then computed against the updated cache, gathering the corresponding keys/values up to and including the current token index.
+This transformation is implemented as an FX graph transformation pass and registered as a Torch-TensorRT lowering pass via the ``@_aten_lowering_pass`` decorator when the ``static_cache_v1.py`` module is imported.
+
+.. note::
+    The ``start_idx`` and ``end_idx`` are the start and end indices of the current token in the cache. In the prefill phase, ``start_idx`` is 0 and ``end_idx`` is the input sequence length.
+    In the decode phase, ``start_idx`` begins at the input sequence length and ``end_idx`` equals ``start_idx + 1``. ``start_idx`` is then incremented by 1 on each step until the end of the sequence is reached or the maximum number of tokens has been generated.
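+
+The snippet below is illustrative only and shows how these indices advance (the lengths are hypothetical):
+
+.. code-block:: python
+
+    isl, num_new_tokens = 8, 4   # hypothetical prompt length and number of tokens to generate
+
+    # Prefill: the whole prompt is written into the cache in one step
+    start_idx, end_idx = 0, isl
+
+    # Decode: the cache window advances by one token per step
+    for step in range(num_new_tokens):
+        start_idx, end_idx = isl + step, isl + step + 1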
+
+
+Static Cache v2
+^^^^^^^^^^^^^^^^
+
+``static_cache_v2.py`` is similar to ``static_cache_v1.py`` but uses fewer slice operations. It implements the KV cache in the model graph as follows:
+
+.. code-block:: python
+
+ class StaticCacheV2Model(nn.Module):
+ def __init__(self):
+ super().__init__()
+
+ def forward(self, q, k, v, key_cache, value_cache, start_idx, end_idx, is_causal=True):
+ concat_keys = torch.cat((key_cache[:, :, :start_idx, :], k), dim=2)
+ concat_values = torch.cat((value_cache[:, :, :start_idx, :], v), dim=2)
+ new_key_cache = torch.cat((concat_keys, key_cache[:, :, end_idx:, :]), dim=2)
+ new_value_cache = torch.cat((concat_values, value_cache[:, :, end_idx:, :]), dim=2)
+ attn_output = torch._C._nn.scaled_dot_product_attention(
+ q, concat_keys, concat_values, dropout_p=0.0, is_causal=is_causal
+ )
+
+ return attn_output, new_key_cache, new_value_cache
+
+In the code above, the existing key/value cache is concatenated with the current token's key/value. The concatenated tensors are used directly to compute attention, and the key/value cache is updated by inserting the current key/value.
+As with v1, this is implemented as an FX graph transformation pass and registered as a Torch-TensorRT lowering pass via the ``@_aten_lowering_pass`` decorator when the ``static_cache_v2.py`` module is imported.
+The definitions of ``start_idx`` and ``end_idx`` are the same as in ``static_cache_v1.py``.
+
+After the model is compiled with a static KV cache, its input signature changes to ``(input_ids, position_ids, key_cache_0, value_cache_0, ..., start_idx, end_idx)``.
+The number of key/value cache tensors is equal to the number of attention heads in the model. The ``generate_with_static_cache`` function can be used to generate the outputs.
+
+Generating Outputs
+-------------------
+We use a custom `generate `_ function to generate the outputs. This function performs standard autoregressive decoding without KV caching.
+There is also a `generate_with_static_cache `_ function that performs autoregressive decoding with KV caching.
+
+The ``generate_with_static_cache`` function takes care of preparing the inputs for a model compiled with a static KV cache.
+The model inputs are ``input_ids``, ``position_ids``, ``key_cache_0``, ``value_cache_0``, ..., ``start_idx``, ``end_idx``.
+The key/value cache tensors are initialized with zeros; after every generated token, the model outputs updated key/value cache tensors, which are passed back in on the next step.
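+
+A heavily simplified sketch of such a decoding loop is shown below (a single key/value cache pair, greedy decoding; the argument types and the exact return layout are assumptions, and the actual implementation lives in the ``tools/llm`` utilities):
+
+.. code-block:: python
+
+    import torch
+
+    def greedy_generate_with_static_cache(trt_model, input_ids, key_cache, value_cache, osl):
+        # Sketch only: prefill the prompt, then decode one token at a time
+        isl = input_ids.shape[1]
+        position_ids = torch.arange(isl, device=input_ids.device).unsqueeze(0)
+        start_idx, end_idx = 0, isl
+        tokens, generated = input_ids, []
+
+        for _ in range(osl):
+            # The compiled model returns the updated caches alongside the logits
+            logits, key_cache, value_cache = trt_model(
+                tokens, position_ids, key_cache, value_cache, start_idx, end_idx
+            )
+            next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)
+            generated.append(next_token)
+
+            # After prefill, only the newly generated token and its position are fed in
+            tokens = next_token
+            position_ids = torch.tensor([[end_idx]], device=input_ids.device)
+            start_idx, end_idx = end_idx, end_idx + 1
+
+        return torch.cat(generated, dim=-1)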
+
+SDPA Converter (sdpa_converter.py)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+* Converts the scaled dot-product attention operation using the TensorRT Python API.
+* Supports causal and standard self-attention.
+
+SDPA Registration (register_sdpa.py)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+* This is a Torch-TensorRT lowering pass that replaces variants of SDPA with ``torch.nn.functional.scaled_dot_product_attention``.
+* Registers the SDPA converter, which is used to convert the ``torch.nn.functional.scaled_dot_product_attention`` operation.
+
+
+Limitations and Known Issues
+----------------------------
+
+* Sliding window attention (used in Gemma3 and Qwen 3 models) is not yet supported.
+* Some model architectures (e.g. Phi-4) have issues with exporting the torch model.
+
+Requirements
+^^^^^^^^^^^^
+
+* Torch-TensorRT 2.8.0 or later
+* Transformers v4.52.3
\ No newline at end of file
diff --git a/docs/_sources/user_guide/runtime.rst.txt b/docs/_sources/user_guide/runtime.rst.txt
index fc73aef8ac..8d2151a615 100644
--- a/docs/_sources/user_guide/runtime.rst.txt
+++ b/docs/_sources/user_guide/runtime.rst.txt
@@ -24,7 +24,7 @@ programs just as you would otherwise via PyTorch API.
.. note:: If you are linking ``libtorchtrt_runtime.so``, likely using the following flags will help ``-Wl,--no-as-needed -ltorchtrt -Wl,--as-needed`` as there's no direct symbol dependency to anything in the Torch-TensorRT runtime for most Torch-TensorRT runtime applications
-An example of how to use ``libtorchtrt_runtime.so`` can be found here: https://github.com/pytorch/TensorRT/tree/master/examples/torchtrt_runtime_example
+An example of how to use ``libtorchtrt_runtime.so`` can be found here: https://github.com/pytorch/TensorRT/tree/master/examples/torchtrt_aoti_example
Plugin Library
---------------
@@ -87,8 +87,8 @@ Cudagraphs can accelerate certain models by reducing kernel overheads, as docume
with torch_tensorrt.runtime.enable_cudagraphs(trt_module):
...
-In the current implementation, use of a new input shape (for instance in dynamic shape
-cases), will cause the cudagraph to be re-recorded. Cudagraph recording is generally
+In the current implementation, use of a new input shape (for instance in dynamic shape
+cases), will cause the cudagraph to be re-recorded. Cudagraph recording is generally
not latency intensive, and future improvements include caching cudagraphs for multiple input shapes.
Dynamic Output Allocation Mode
@@ -101,11 +101,11 @@ Without dynamic output allocation, the output buffer is allocated based on the i
There are two scenarios in which dynamic output allocation is enabled:
-1. The model has been identified at compile time to require dynamic output allocation for at least one TensorRT subgraph.
-These models will engage the runtime mode automatically (with logging) and are incompatible with other runtime modes
+1. The model has been identified at compile time to require dynamic output allocation for at least one TensorRT subgraph.
+These models will engage the runtime mode automatically (with logging) and are incompatible with other runtime modes
such as CUDA Graphs.
-Converters can declare that subgraphs that they produce will require the output allocator using `requires_output_allocator=True`
+Converters can declare that subgraphs that they produce will require the output allocator using `requires_output_allocator=True`
thereby forcing any model which utilizes the converter to automatically use the output allocator runtime mode. e.g.,
.. code-block:: python
@@ -131,3 +131,127 @@ there by forcing any model which utilizes the converter to automatically use the
# Enables Dynamic Output Allocation Mode, then resets the mode to its prior setting
with torch_tensorrt.runtime.enable_output_allocator(trt_module):
...
+
+Deploying Torch-TensorRT Programs without Python
+--------------------------------------------------------
+
+AOT-Inductor
+~~~~~~~~~~~~~~~~
+
+AOTInductor is a specialized version of TorchInductor, designed to process exported PyTorch models, optimize them, and produce shared
+libraries as well as other relevant artifacts. These compiled artifacts are specifically crafted for deployment in non-Python environments,
+which are frequently employed for inference deployments on the server side.
+
+Torch-TensorRT is able to accelerate subgraphs within AOTInductor exports in the same way it does in Python.
+
+.. code-block:: py
+
+ dynamo_model = torch_tensorrt.compile(model, ir="dynamo", arg_inputs=[...])
+ torch_tensorrt.save(
+ dynamo_model,
+ file_path=os.path.join(os.getcwd(), "model.pt2"),
+ output_format="aot_inductor",
+ retrace=True,
+ arg_inputs=[...],
+ )
+
+This artifact can then be loaded in a C++ application and executed without a Python dependency.
+
+.. code-block:: c++
+
+    #include <iostream>
+    #include <vector>
+
+ #include "torch/torch.h"
+ #include "torch/csrc/inductor/aoti_package/model_package_loader.h"
+
+ int main(int argc, const char* argv[]) {
+ // Check for correct number of command-line arguments
+ std::string trt_aoti_module_path = "model.pt2";
+
+ if (argc == 2) {
+ trt_aoti_module_path = argv[1];
+ }
+
+ std::cout << trt_aoti_module_path << std::endl;
+
+ // Get the path to the TRT AOTI model package from the command line
+ c10::InferenceMode mode;
+
+ torch::inductor::AOTIModelPackageLoader loader(trt_aoti_module_path);
+ // Assume running on CUDA
+        std::vector<torch::Tensor> inputs = {torch::randn({8, 10}, at::kCUDA)};
+        std::vector<torch::Tensor> outputs = loader.run(inputs);
+ std::cout << "Result from the first inference:"<< std::endl;
+ std::cout << outputs << std::endl;
+
+ // The second inference uses a different batch size and it works because we
+ // specified that dimension as dynamic when compiling model.pt2.
+ std::cout << "Result from the second inference:"<< std::endl;
+ // Assume running on CUDA
+ std::cout << loader.run({torch::randn({1, 10}, at::kCUDA)}) << std::endl;
+
+ return 0;
+ }
+
+Note: Similar to Python, at runtime no Torch-TensorRT APIs are used to operate the model, so additional linker
+flags are typically needed to make sure that ``libtorchtrt_runtime.so`` is not optimized out (see above).
+
+See: ``//examples/torchtrt_aoti_example`` for a full end-to-end demo of this workflow
+
+
+TorchScript
+~~~~~~~~~~~~~~
+
+TorchScript is a legacy compiler stack for PyTorch that includes a Python-less interpreter for TorchScript programs.
+It has historically been used by Torch-TensorRT to execute models without Python. Even after the transition to TorchDynamo,
+the TorchScript interpreter can continue to be used to run PyTorch models with TensorRT engines outside of Python.
+
+.. code-block:: py
+
+ dynamo_model = torch_tensorrt.compile(model, ir="dynamo", arg_inputs=[...])
+    ts_model = torch.jit.trace(dynamo_model, example_inputs=[...])
+ torch.jit.save(ts_model, os.path.join(os.getcwd(), "model.ts"),)
+
+This artifact can then be loaded in a C++ application and executed without a Python dependency.
+
+.. code-block:: c++
+
+    #include <fstream>
+    #include <iostream>
+    #include <memory>
+    #include <sstream>
+    #include <vector>
+ #include "torch/script.h"
+
+ int main(int argc, const char* argv[]) {
+ if (argc < 2) {
+          std::cerr << "usage: samplertapp <path-to-pre-built-trt-ts-module>\n";
+ return -1;
+ }
+
+ std::string trt_ts_module_path = argv[1];
+
+ torch::jit::Module trt_ts_mod;
+ try {
+ // Deserialize the ScriptModule from a file using torch::jit::load().
+ trt_ts_mod = torch::jit::load(trt_ts_module_path);
+ } catch (const c10::Error& e) {
+ std::cerr << "error loading the model from : " << trt_ts_module_path << std::endl;
+ return -1;
+ }
+
+ std::cout << "Running TRT engine" << std::endl;
+        std::vector<torch::jit::IValue> trt_inputs_ivalues;
+ trt_inputs_ivalues.push_back(at::randint(-5, 5, {1, 3, 5, 5}, {at::kCUDA}).to(torch::kFloat32));
+ torch::jit::IValue trt_results_ivalues = trt_ts_mod.forward(trt_inputs_ivalues);
+ std::cout << "==================TRT outputs================" << std::endl;
+ std::cout << trt_results_ivalues << std::endl;
+ std::cout << "=============================================" << std::endl;
+ std::cout << "TRT engine execution completed. " << std::endl;
+ }
+
+Note: Similar to Python, at runtime no Torch-TensorRT APIs are used to operate the model, so additional linker
+flags are typically needed to make sure that ``libtorchtrt_runtime.so`` is not optimized out (see above).
+
+See: ``//examples/torchtrt_runtime_example`` for a full end-to-end demo of this workflow
diff --git a/docs/_sources/user_guide/saving_models.rst.txt b/docs/_sources/user_guide/saving_models.rst.txt
index dc4b5da222..bef9b4dec3 100644
--- a/docs/_sources/user_guide/saving_models.rst.txt
+++ b/docs/_sources/user_guide/saving_models.rst.txt
@@ -14,12 +14,13 @@ Saving models compiled with Torch-TensorRT can be done using `torch_tensorrt.sav
Dynamo IR
-------------
-The output type of `ir=dynamo` compilation of Torch-TensorRT is `torch.fx.GraphModule` object by default.
-We can save this object in either `TorchScript` (`torch.jit.ScriptModule`) or `ExportedProgram` (`torch.export.ExportedProgram`) formats by
+The output type of `ir=dynamo` compilation of Torch-TensorRT is a `torch.fx.GraphModule` object by default.
+We can save this object in either `TorchScript` (`torch.jit.ScriptModule`), `ExportedProgram` (`torch.export.ExportedProgram`), or `PT2` format by
specifying the `output_format` flag. Here are the options `output_format` will accept
* `exported_program` : This is the default. We perform transformations on the graphmodule first and use `torch.export.save` to save the module.
* `torchscript` : We trace the graphmodule via `torch.jit.trace` and save it via `torch.jit.save`.
+* `PT2 Format` : This is a next-generation runtime format for PyTorch models, allowing them to run both in Python and in C++
a) ExportedProgram
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -52,8 +53,8 @@ b) Torchscript
model = MyModel().eval().cuda()
inputs = [torch.randn((1, 3, 224, 224)).cuda()]
# trt_gm is a torch.fx.GraphModule object
- trt_gm = torch_tensorrt.compile(model, ir="dynamo", inputs=inputs)
- torch_tensorrt.save(trt_gm, "trt.ts", output_format="torchscript", inputs=inputs)
+ trt_gm = torch_tensorrt.compile(model, ir="dynamo", arg_inputs=inputs)
+ torch_tensorrt.save(trt_gm, "trt.ts", output_format="torchscript", arg_inputs=inputs)
# Later, you can load it and run inference
model = torch.jit.load("trt.ts").cuda()
@@ -73,7 +74,7 @@ For `ir=ts`, this behavior stays the same in 2.X versions as well.
model = MyModel().eval().cuda()
inputs = [torch.randn((1, 3, 224, 224)).cuda()]
- trt_ts = torch_tensorrt.compile(model, ir="ts", inputs=inputs) # Output is a ScriptModule object
+ trt_ts = torch_tensorrt.compile(model, ir="ts", arg_inputs=inputs) # Output is a ScriptModule object
torch.jit.save(trt_ts, "trt_model.ts")
# Later, you can load it and run inference
@@ -98,3 +99,26 @@ Here's an example usage
inputs = [torch.randn((1, 3, 224, 224)).cuda()]
model = torch_tensorrt.load().module()
model(*inputs)
+
+b) PT2 Format
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+PT2 is a new format that allows models to be run outside of Python in the future. It utilizes `AOTInductor `_
+to generate kernels for components that will not be run in TensorRT.
+
+Here's an example of how to save and load a Torch-TensorRT module using AOTInductor in Python
+
+.. code-block:: python
+
+ import torch
+ import torch_tensorrt
+
+ model = MyModel().eval().cuda()
+ inputs = [torch.randn((1, 3, 224, 224)).cuda()]
+    # trt_gm is a torch.fx.GraphModule object
+ trt_gm = torch_tensorrt.compile(model, ir="dynamo", inputs=inputs)
+ torch_tensorrt.save(trt_gm, "trt.pt2", arg_inputs=inputs, output_format="aot_inductor", retrace=True)
+
+ # Later, you can load it and run inference
+ model = torch._inductor.aoti_load_package("trt.pt2")
+ model(*inputs)
diff --git a/docs/_static/documentation_options.js b/docs/_static/documentation_options.js
index 786e44a6f1..a2fde8fe2e 100644
--- a/docs/_static/documentation_options.js
+++ b/docs/_static/documentation_options.js
@@ -1,6 +1,6 @@
var DOCUMENTATION_OPTIONS = {
URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'),
- VERSION: 'v2.8.0.dev0+ee32da0',
+ VERSION: 'v2.9.0.dev0+92a6908',
LANGUAGE: 'en',
COLLAPSE_INDEX: false,
BUILDER: 'html',
diff --git a/docs/_static/js/theme.js b/docs/_static/js/theme.js
index ceee8f062c..1f071b8486 100644
--- a/docs/_static/js/theme.js
+++ b/docs/_static/js/theme.js
@@ -945,10 +945,19 @@ if (downloadNote.length >= 1) {
var tutorialUrlArray = $("#tutorial-type").text().split('/');
tutorialUrlArray[0] = tutorialUrlArray[0] + "_source"
- var githubLink = "https://github.com/pytorch/tutorials/blob/master/" + tutorialUrlArray.join("/") + ".py",
- notebookLink = $(".reference.download")[1].href,
- notebookDownloadPath = notebookLink.split('_downloads')[1],
- colabLink = "https://colab.research.google.com/github/pytorch/tutorials/blob/gh-pages/_downloads" + notebookDownloadPath;
+ var githubLink = "https://github.com/pytorch/tutorials/blob/master/" + tutorialUrlArray.join("/") + ".py";
+ var notebookLink = "";
+ // some versions of sphinx gallery have different orders of the download
+ // links so we need to check if the link ends with .ipynb to find the
+ // correct one
+ for (var i = 0; i < $(".reference.download").length; i++) {
+ notebookLink = $(".reference.download")[i].href;
+ if (notebookLink.endsWith(".ipynb")) {
+ break;
+ }
+ }
+ var notebookDownloadPath = notebookLink.split('_downloads')[1];
+ var colabLink = "https://colab.research.google.com/github/pytorch/tutorials/blob/gh-pages/_downloads" + notebookDownloadPath;
$("#google-colab-link").wrap("");
$("#download-notebook-link").wrap("");
diff --git a/docs/cli/torchtrtc.html b/docs/cli/torchtrtc.html
index 3878948003..add26a1144 100644
--- a/docs/cli/torchtrtc.html
+++ b/docs/cli/torchtrtc.html
@@ -10,7 +10,7 @@
- torchtrtc — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ torchtrtc — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -293,7 +293,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -324,7 +324,7 @@
Getting Started
User Guide
@@ -376,9 +376,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile
frontend
-Compiling GPT2 using the dynamo backend
-Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
diff --git a/docs/contributors/conversion.html b/docs/contributors/conversion.html
index 8f9e39e5d6..7afb5077ef 100644
--- a/docs/contributors/conversion.html
+++ b/docs/contributors/conversion.html
@@ -10,7 +10,7 @@
- Conversion Phase — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ Conversion Phase — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -293,7 +293,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -324,7 +324,7 @@
Getting Started
User Guide
@@ -376,9 +376,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile
frontend
-Compiling GPT2 using the dynamo backend
-Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
diff --git a/docs/contributors/dynamo_converters.html b/docs/contributors/dynamo_converters.html
index 077dccd7d2..20a4a73aa9 100644
--- a/docs/contributors/dynamo_converters.html
+++ b/docs/contributors/dynamo_converters.html
@@ -10,7 +10,7 @@
- Writing Dynamo Converters — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ Writing Dynamo Converters — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -293,7 +293,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -324,7 +324,7 @@
Getting Started
User Guide
@@ -376,9 +376,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile
frontend
-Compiling GPT2 using the dynamo backend
-Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
diff --git a/docs/contributors/lowering.html b/docs/contributors/lowering.html
index db68f640af..b4332cfca6 100644
--- a/docs/contributors/lowering.html
+++ b/docs/contributors/lowering.html
@@ -10,7 +10,7 @@
- Lowering Phase — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ Lowering Phase — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -293,7 +293,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -324,7 +324,7 @@
Getting Started
User Guide
@@ -376,9 +376,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile
frontend
-Compiling GPT2 using the dynamo backend
-Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
diff --git a/docs/contributors/partitioning.html b/docs/contributors/partitioning.html
index dcc0bee79e..ac6a13f0c3 100644
--- a/docs/contributors/partitioning.html
+++ b/docs/contributors/partitioning.html
@@ -10,7 +10,7 @@
- Partitioning Phase — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ Partitioning Phase — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -293,7 +293,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -324,7 +324,7 @@
Getting Started
User Guide
@@ -376,9 +376,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile
frontend
-Compiling GPT2 using the dynamo backend
-Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
@@ -711,6 +710,16 @@ Dependency Aware Partitioning
diff --git a/docs/contributors/phases.html b/docs/contributors/phases.html
index cbc7111d3d..3056cd57d5 100644
--- a/docs/contributors/phases.html
+++ b/docs/contributors/phases.html
@@ -10,7 +10,7 @@
- Compiler Phases — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ Compiler Phases — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -291,7 +291,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -322,7 +322,7 @@
Getting Started
User Guide
@@ -374,9 +374,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile
frontend
-Compiling GPT2 using the dynamo backend
-Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
diff --git a/docs/contributors/runtime.html b/docs/contributors/runtime.html
index ab1b911002..c34bfbbb5c 100644
--- a/docs/contributors/runtime.html
+++ b/docs/contributors/runtime.html
@@ -10,7 +10,7 @@
- Runtime Phase — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ Runtime Phase — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -293,7 +293,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -324,7 +324,7 @@
Getting Started
User Guide
@@ -376,9 +376,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile
frontend
-Compiling GPT2 using the dynamo backend
-Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
diff --git a/docs/contributors/system_overview.html b/docs/contributors/system_overview.html
index 43a147b2ab..42a4303b6f 100644
--- a/docs/contributors/system_overview.html
+++ b/docs/contributors/system_overview.html
@@ -10,7 +10,7 @@
- System Overview — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ System Overview — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -293,7 +293,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -324,7 +324,7 @@
Getting Started
User Guide
@@ -376,9 +376,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile
frontend
-Compiling GPT2 using the dynamo backend
-Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
diff --git a/docs/contributors/ts_converters.html b/docs/contributors/ts_converters.html
index 3b632fe43d..feacf295fc 100644
--- a/docs/contributors/ts_converters.html
+++ b/docs/contributors/ts_converters.html
@@ -10,7 +10,7 @@
- Writing TorchScript Converters — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ Writing TorchScript Converters — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -293,7 +293,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -324,7 +324,7 @@
Getting Started
User Guide
@@ -376,9 +376,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile
frontend
-Compiling GPT2 using the dynamo backend
-Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
diff --git a/docs/contributors/useful_links.html b/docs/contributors/useful_links.html
index fb82b34dc3..a2d8af54e9 100644
--- a/docs/contributors/useful_links.html
+++ b/docs/contributors/useful_links.html
@@ -10,7 +10,7 @@
- Useful Links for Torch-TensorRT Development — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ Useful Links for Torch-TensorRT Development — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -293,7 +293,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -324,7 +324,7 @@
Getting Started
User Guide
@@ -376,9 +376,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile
frontend
-Compiling GPT2 using the dynamo backend
-Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
diff --git a/docs/contributors/writing_dynamo_aten_lowering_passes.html b/docs/contributors/writing_dynamo_aten_lowering_passes.html
index a5d0fb5b24..7b713c0eb8 100644
--- a/docs/contributors/writing_dynamo_aten_lowering_passes.html
+++ b/docs/contributors/writing_dynamo_aten_lowering_passes.html
@@ -10,7 +10,7 @@
- Writing Dynamo ATen Lowering Passes — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ Writing Dynamo ATen Lowering Passes — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -293,7 +293,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -324,7 +324,7 @@
Getting Started
User Guide
@@ -376,9 +376,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile
frontend
-Compiling GPT2 using the dynamo backend
-Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
diff --git a/docs/dynamo/dynamo_export.html b/docs/dynamo/dynamo_export.html
index 00bf97e87f..7b14272b68 100644
--- a/docs/dynamo/dynamo_export.html
+++ b/docs/dynamo/dynamo_export.html
@@ -10,7 +10,7 @@
- Compiling Exported Programs with Torch-TensorRT — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ Compiling Exported Programs with Torch-TensorRT — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -293,7 +293,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -324,7 +324,7 @@
Getting Started
User Guide
@@ -376,9 +376,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile
frontend
-Compiling GPT2 using the dynamo backend
-Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
diff --git a/docs/dynamo/torch_compile.html b/docs/dynamo/torch_compile.html
index 346d514736..8ba8f59eb6 100644
--- a/docs/dynamo/torch_compile.html
+++ b/docs/dynamo/torch_compile.html
@@ -10,7 +10,7 @@
- TensorRT Backend for torch.compile — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ TensorRT Backend for torch.compile — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -293,7 +293,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -324,7 +324,7 @@
Getting Started
User Guide
@@ -376,9 +376,8 @@
Compiling ResNet with dynamic shapes using the torch.compile backend
Compiling BERT using the torch.compile backend
Compiling Stable Diffusion model using the torch.compile backend
+Compiling LLM models from Huggingface
Compiling GPT2 using the Torch-TensorRT torch.compile
frontend
-Compiling GPT2 using the dynamo backend
-Compiling Llama2 using the dynamo backend
Compiling SAM2 using the dynamo backend
Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
Legacy notebooks
@@ -512,7 +511,7 @@
Key Features
-
-class torch_tensorrt.dynamo.CompilationSettings(enabled_precisions: ~typing.Set[~torch_tensorrt._enums.dtype] = <factory>, debug: bool = False, workspace_size: int = 0, min_block_size: int = 5, torch_executed_ops: ~typing.Collection[~typing.Union[~typing.Callable[[...], ~typing.Any], str]] = <factory>, pass_through_build_failures: bool = False, max_aux_streams: ~typing.Optional[int] = None, version_compatible: bool = False, optimization_level: ~typing.Optional[int] = None, use_python_runtime: ~typing.Optional[bool] = False, truncate_double: bool = False, use_fast_partitioner: bool = True, enable_experimental_decompositions: bool = False, device: ~torch_tensorrt._Device.Device = <factory>, require_full_compilation: bool = False, disable_tf32: bool = False, assume_dynamic_shape_support: bool = False, sparse_weights: bool = False, engine_capability: ~torch_tensorrt._enums.EngineCapability = <factory>, num_avg_timing_iters: int = 1, dla_sram_size: int = 1048576, dla_local_dram_size: int = 1073741824, dla_global_dram_size: int = 536870912, dryrun: ~typing.Union[bool, str] = False, hardware_compatible: bool = False, timing_cache_path: str = '/tmp/torch_tensorrt_engine_cache/timing_cache.bin', lazy_engine_init: bool = False, cache_built_engines: bool = False, reuse_cached_engines: bool = False, use_explicit_typing: bool = False, use_fp32_acc: bool = False, refit_identical_engine_weights: bool = False, strip_engine_weights: bool = False, immutable_weights: bool = True, enable_weight_streaming: bool = False, enable_cross_compile_for_windows: bool = False, tiling_optimization_level: str = 'none', l2_limit_for_tiling: int = -1, use_distributed_mode_trace: bool = False, offload_module_to_cpu: bool = False)[source]
+class torch_tensorrt.dynamo.CompilationSettings(enabled_precisions: ~typing.Set[~torch_tensorrt._enums.dtype] = <factory>, workspace_size: int = 0, min_block_size: int = 5, torch_executed_ops: ~typing.Collection[~typing.Union[~typing.Callable[[...], ~typing.Any], str]] = <factory>, pass_through_build_failures: bool = False, max_aux_streams: ~typing.Optional[int] = None, version_compatible: bool = False, optimization_level: ~typing.Optional[int] = None, use_python_runtime: ~typing.Optional[bool] = False, truncate_double: bool = False, use_fast_partitioner: bool = True, enable_experimental_decompositions: bool = False, device: ~torch_tensorrt._Device.Device = <factory>, require_full_compilation: bool = False, disable_tf32: bool = False, assume_dynamic_shape_support: bool = False, sparse_weights: bool = False, engine_capability: ~torch_tensorrt._enums.EngineCapability = <factory>, num_avg_timing_iters: int = 1, dla_sram_size: int = 1048576, dla_local_dram_size: int = 1073741824, dla_global_dram_size: int = 536870912, dryrun: ~typing.Union[bool, str] = False, hardware_compatible: bool = False, timing_cache_path: str = '/tmp/torch_tensorrt_engine_cache/timing_cache.bin', lazy_engine_init: bool = False, cache_built_engines: bool = False, reuse_cached_engines: bool = False, use_explicit_typing: bool = False, use_fp32_acc: bool = False, refit_identical_engine_weights: bool = False, strip_engine_weights: bool = False, immutable_weights: bool = True, enable_weight_streaming: bool = False, enable_cross_compile_for_windows: bool = False, tiling_optimization_level: str = 'none', l2_limit_for_tiling: int = -1, use_distributed_mode_trace: bool = False, offload_module_to_cpu: bool = False)[source]
Compilation settings for Torch-TensorRT Dynamo Paths
- Parameters
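The only difference between the two long signatures above is the removal of the debug: bool = False field from CompilationSettings; every other field carries over unchanged. As a hedged sketch of constructing the dataclass against the v2.9.0 signature (field names are taken from the listing above, the values are illustrative only):

from torch_tensorrt import dtype
from torch_tensorrt.dynamo import CompilationSettings

# Illustrative values only; every field name used here appears in the
# v2.9.0 signature shown above. debug is no longer listed there, so
# passing it as a keyword would presumably be rejected on this build.
settings = CompilationSettings(
    enabled_precisions={dtype.f16},
    workspace_size=0,
    min_block_size=5,
    truncate_double=False,
    hardware_compatible=False,
)
print(settings)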
diff --git a/docs/fx/getting_started_with_fx_path.html b/docs/fx/getting_started_with_fx_path.html
index 8ed5eb46b4..111d7cdf21 100644
--- a/docs/fx/getting_started_with_fx_path.html
+++ b/docs/fx/getting_started_with_fx_path.html
@@ -10,7 +10,7 @@
- Torch-TensorRT (FX Frontend) User Guide — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ Torch-TensorRT (FX Frontend) User Guide — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -293,7 +293,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -324,7 +324,7 @@
Getting Started
User Guide
@@ -376,9 +376,8 @@
- Compiling ResNet with dynamic shapes using the torch.compile backend
- Compiling BERT using the torch.compile backend
- Compiling Stable Diffusion model using the torch.compile backend
+- Compiling LLM models from Huggingface
- Compiling GPT2 using the Torch-TensorRT
torch.compile
frontend
-- Compiling GPT2 using the dynamo backend
-- Compiling Llama2 using the dynamo backend
- Compiling SAM2 using the dynamo backend
- Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
- Legacy notebooks
diff --git a/docs/genindex.html b/docs/genindex.html
index 461c82c6e2..c4bb49d297 100644
--- a/docs/genindex.html
+++ b/docs/genindex.html
@@ -9,7 +9,7 @@
- Index — Torch-TensorRT v2.8.0.dev0+ee32da0 documentation
+ Index — Torch-TensorRT v2.9.0.dev0+92a6908 documentation
@@ -290,7 +290,7 @@
- v2.8.0.dev0+ee32da0
+ v2.9.0.dev0+92a6908
@@ -321,7 +321,7 @@
Getting Started
User Guide
@@ -373,9 +373,8 @@
- Compiling ResNet with dynamic shapes using the torch.compile backend
- Compiling BERT using the torch.compile backend
- Compiling Stable Diffusion model using the torch.compile backend
+- Compiling LLM models from Huggingface
- Compiling GPT2 using the Torch-TensorRT
torch.compile
frontend
-- Compiling GPT2 using the dynamo backend
-- Compiling Llama2 using the dynamo backend
- Compiling SAM2 using the dynamo backend
- Compiling FLUX.1-dev model using the Torch-TensorRT dynamo backend
- Legacy notebooks
@@ -671,6 +670,8 @@ F
- f16 (torch_tensorrt.dtype attribute)
- f32 (torch_tensorrt.dtype attribute)
+
+ - f4 (torch_tensorrt.dtype attribute)
- f64 (torch_tensorrt.dtype attribute)
@@ -853,13 +854,15 @@ S
- set_cudagraphs_mode() (in module torch_tensorrt.runtime)
-
- |
+ |
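The genindex hunk above also records a new f4 attribute on torch_tensorrt.dtype, next to the existing f16, f32, and f64 entries. A minimal sketch of referencing these enum members follows; whether f4 is accepted as an enabled precision on a given build and GPU is an assumption the index entry does not confirm:

import torch_tensorrt

# The index diff confirms these members exist on torch_tensorrt.dtype;
# which of them a particular target supports at compile time is
# hardware- and build-dependent.
for member in (torch_tensorrt.dtype.f4,
               torch_tensorrt.dtype.f16,
               torch_tensorrt.dtype.f32,
               torch_tensorrt.dtype.f64):
    print(member)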