
Commit b2fc9a2

[None][feat] AutoDeploy ONNX export
[none][feat] Add AutoDeploy export-onnx mode
Add a new mode "export-onnx" to AutoDeploy. The mode is almost identical to the default one, with two differences:
1. Fuse torch_rope_with_explicit_cos_sin & torch_cached_attention_with_cache into onnx_rope_attnetion.
2. The result is not a TRT engine but a .onnx file.
Files added:
- export_onnx.py: the transformation that fuses the ops
- graph_module_visualizer.py: convert a GraphModule to .dot
- examples/onnx_export_llm.py: example usage
- onnx_driveos_llm.yaml: the new mode config file
- onnx_attnetion.py: the definition of the fused op
[none][feat] Fix small graphviz bug, remove useless code
[none][feat] Rename mode from onnx_driveos_llm to export_driveos_llm_onnx
[none][feat] Rename export_onnx.py to fuse_rope_attention.py
[none][feat] Annotate .meta['val'] with add_graph_input()
[none][feat] Successfully export .onnx
[none][feat] Add set_kvcache_placeholder_metadata transform
[none][feat] Skip torch_cached_attention_prepare_metadata
[none][feat] Fix SetKVCachePlaceholderMetadata transform
[none][feat] Remove unused placeholder of prepare_metadata
[none][feat] Fix to run DeepSeek-R1
[none][feat] Add remove_graph_input, refactor remove_unused_placeholder()
[none][feat] Merge K&V cache placeholder
[none][feat] Replace sin_cos with input
[none][feat] Manually fuse rope & attn
[none][feat] Export torch_attention_bsnd_grouped_sdpa with dynamic shape
[none][feat] Manually match rope & attn, not replace yet
[none][feat] Successfully export ONNX with dynamic input
[none][feat] Hack out_spec to add graph output
[none][feat] Fix present_key_values shape
[none][feat] Fix input & output names
[none][feat] Change out_spec in add_graph_output
[none][feat] Fix export of torch_linear_simple
The original translation misses a transpose on the weight.
[none][feat] Fix present_key_values shape
[none][feat] Rewire reshape's new shape as a TRT-LLM edge
[none][feat] Fix non-text rebase conflicts
[none][feat] Fix AttentionPlugin domain; it should be "" not "ai.onnx"
[none][feat] Enhance visualize, use .meta["val"] instead of .meta["tensor_meta"]
[none][feat] Fix visualize tensor width calculation
When calculating the width of a tensor, check whether the dimension is an int or a SymInt. The original implementation accidentally introduced constraints on the symbolic int. I don't know exactly how that happens; it shouldn't introduce new constraints, but it does.
[none][feat] Fix output dynamic batch_size
Originally the max batch size was 2; however, for an unknown reason, when set to 2 the batch_size collapses to the literal static int 2 even though we explicitly mark it as a dynamic axis. Stranger still, when set to 13, the batch_size stays dynamic.
default=13,  # to enable dynamic batch_size, the batch size must be > 1
[none][feat] Rename fuse_rope_attention_manually to fuse_rope_attention
[none][feat] Remove fuse_rope_attention.py
[none][feat] Rewire reshape to make the graph like Luxiao's
[none][feat] Fix last_token_ids dtype from i32 to i64
[none][feat] Catch up with the up-to-date DriveOS LLM
- Add placeholder kvcache_start_index
- AttentionPlugin: add input kvcache_start_index
- Insert Unsqueeze -1 before GatherND
- rope_rotary_cos_sin dynamic axis name changed from rope_max_position_length to max_position_embeddings
- logits' dtype should be float32, insert a cast
- Insert cast to f16 before AttentionPlugin
- All casts to bf16 should be f16
[none][feat] Catch up with the up-to-date DriveOS LLM
- model.half() converts the whole model to f16, including weights
- Remove AttentionPlugin attributes kv_cache_capacity & max_batch_size
- AttentionPlugin output[1] shape inferred from seq_len + past_len
- AttentionPlugin domain changed from `onnx.ai` to `trt`
- Placeholder `kvcache_start_index` dynamic axes changed from `batch_size` to `kv_cache_start_batch_size`
[none][feat] Catch up with up-to-date main
[none][feat] Add test for fuse_rope_attention transform
- Add test for fuse_rope_attention
- Enhance run_test_transformed_gm to support Modules with multiple inputs
- Fix add_graph_output for graphs with only one _LEAF_SPEC
[none][feat] Add unit test for fuse_rope_attn
- Add a unit test
- Fix add_graph_output when out_spec is _LEAF_SPEC
[none][feat] Export .json files
[none][feat] Add AutoDeploy export-onnx end-to-end test
[none][feat] Export ONNX on CPU to reduce GPU memory footprint
[none][feat] Use model.config to get head_dim, instead of using a literal
Signed-off-by: Po-Han Huang <[email protected]>
Signed-off-by: yoco xiao <[email protected]>
[none][feat] Visualize graph only when env var AD_DEBUG_VISUALIZE_DIR is set
- We no longer visualize by default, only when AD_DEBUG_VISUALIZE_DIR is set.
- AD_DEBUG_VISUALIZE_DIR is also the output dir, so you can specify where the output goes.
- Simplify the logging messages; move many messages from info to debug.
- Add .cursor to .gitignore
Signed-off-by: yoco xiao <[email protected]>
1 parent 355e06d commit b2fc9a2
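The last item in the message above describes the new AD_DEBUG_VISUALIZE_DIR switch. Below is a minimal sketch of how it could be used, assuming only what the commit message states (the variable both enables visualization and names the output directory); the directory path is a placeholder:

import os

# Hypothetical usage: visualization is off by default; setting AD_DEBUG_VISUALIZE_DIR
# turns it on and selects where the GraphModule .dot files are written
# (per the commit message above).
dump_dir = "/tmp/ad_graph_dumps"  # placeholder path
os.makedirs(dump_dir, exist_ok=True)
os.environ["AD_DEBUG_VISUALIZE_DIR"] = dump_dir

# Any AutoDeploy export run started after this point (e.g. examples/onnx_export_llm.py)
# would then emit its graph visualizations into dump_dir.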

File tree: 22 files changed, +3931 / -16 lines changed


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -1,5 +1,6 @@
 __pycache__/
 .vscode
+.cursor
 *.engine
 *.engine.config
 *.cache

docker/common/install_base.sh

Lines changed: 1 addition & 0 deletions
@@ -53,6 +53,7 @@ init_ubuntu() {
 gdb \
 git-lfs \
 clang \
+graphviz \
 lld \
 llvm \
 libclang-rt-dev \
Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import argparse
+
+from tensorrt_llm._torch.auto_deploy import LLM, AutoDeployConfig
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        help="The HF model to use for ONNX export.",
+    )
+    parser.add_argument(
+        "--max_seq_len",
+        type=int,
+        default=4,
+        help="The max sequence length to use for the model.",
+    )
+    parser.add_argument(
+        "--max_batch_size",
+        type=int,
+        # NOTE(yoco): Originally this was 2. For an unknown reason, when set to 2 the
+        # batch_size collapses to a static int 2 even though we explicitly mark it as a
+        # dynamic axis. Stranger still, when set to 13 the batch_size stays dynamic.
+        default=13,  # to enable dynamic batch_size, the batch size must be > 1
+        help="The max batch size to use for the model.",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cpu",
+        help="The device to use for the model.",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default=None,
+        help="The directory to save the exported ONNX model.",
+    )
+    parser.add_argument(
+        "--output_name",
+        type=str,
+        default=None,
+        help="The name of the exported ONNX model.",
+    )
+    args = parser.parse_args()
+
+    print(f"Constructing model from {args.model}")
+
+    # Prepare the AutoDeploy config; the mode is export_driveos_llm_onnx.
+    ad_config = AutoDeployConfig(
+        model=args.model,
+        mode="export_driveos_llm_onnx",
+        max_batch_size=args.max_batch_size,
+        max_seq_len=args.max_seq_len,
+        device=args.device,
+    )
+    ad_config.attn_backend = "torch"
+    if args.output_dir is not None:
+        ad_config.transforms["export_to_onnx"]["output_dir"] = args.output_dir
+    if args.output_name is not None:
+        ad_config.transforms["export_to_onnx"]["output_name"] = args.output_name
+    _ = LLM(**ad_config.to_llm_kwargs())
+
+
+if __name__ == "__main__":
+    main()
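As a follow-up to the example script, here is a small sketch for sanity-checking the exported file. It assumes the defaults from the mode config further below (the model is written to ./model.onnx) and uses only the onnx package already pinned in requirements.txt:

import onnx

# Hypothetical post-export inspection; "model.onnx" is the default output_name
# of the export_to_onnx transform in the mode config shown below.
model = onnx.load("model.onnx")

# Print graph inputs/outputs to confirm the expected dynamic axes survived export.
for tensor in list(model.graph.input) + list(model.graph.output):
    dims = [d.dim_param or d.dim_value for d in tensor.type.tensor_type.shape.dim]
    print(tensor.name, dims)

# List custom-domain nodes (e.g. the fused attention op) separately from standard ONNX ops.
for node in model.graph.node:
    if node.domain not in ("", "ai.onnx"):
        print(node.domain, node.op_type)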

requirements.txt

Lines changed: 2 additions & 0 deletions
@@ -10,6 +10,8 @@ mpi4py
 numpy<2
 onnx>=1.18.0,<1.20.0
 onnx_graphsurgeon>=0.5.2
+onnxscript==0.5.4
+graphviz
 openai
 polygraphy
 psutil
Lines changed: 148 additions & 0 deletions
@@ -0,0 +1,148 @@
+# This is the set of transforms running in "graph" mode. In this mode, we capture the full graph
+# of the model and optimize it for inference.
+transforms:
+  ############################################################################################
+  # BUILD MODEL, EXPORT TO GRAPH MODULE, AND CLEAN UP
+  ############################################################################################
+  build_model:
+    stage: factory
+    run_per_gm: false
+    device: meta
+    requires_clean_graph: false
+  export_to_gm:
+    stage: export
+    clone_state_dict: false
+    strict: false
+    run_per_gm: false
+    requires_clean_graph: false
+  cleanup_noop_slice:
+    stage: post_export
+  cleanup_noop_add:
+    stage: post_export
+  cleanup_input_constraints:
+    stage: post_export
+  ############################################################################################
+  # RUN PATTERN MATCHER TRANSFORMATIONS TO STANDARDIZE GRAPH REPRESENTATION
+  ############################################################################################
+  match_moe_pattern:
+    stage: pattern_matcher
+  match_dense_moe_pattern:
+    stage: pattern_matcher
+  match_repeat_kv:
+    stage: pattern_matcher
+    run_shape_prop: true
+  match_eager_attention:
+    stage: pattern_matcher
+    requires_shape_prop: true
+  match_sdpa_to_torch_attention:
+    stage: pattern_matcher
+  match_grouped_attention:
+    stage: pattern_matcher
+  match_attention_layout:
+    stage: pattern_matcher
+    attn_layout: bsnd
+  match_rope_pattern:
+    stage: pattern_matcher
+  match_rope_layout:
+    stage: pattern_matcher
+    expected_layout: bsnd
+  ############################################################################################
+  # RUN TRANSFORMATIONS ON STANDARDIZED GRAPH REPRESENTATION
+  ############################################################################################
+  eliminate_redundant_transposes:
+    stage: pattern_matcher
+  # TODO (lucaslie): let's move this to perf optimization once TP sharding is improved
+  # see https://github.com/NVIDIA/TensorRT-LLM/pull/3668#discussion_r2052714528
+  # NOTE (yoco): To export ONNX for DriveOS LLM, we don't need this optimization,
+  # because the rope will be fused into the AttentionPlugin operation
+  # in the fuse_rope_attention transform.
+  # optimize_rope:
+  #   stage: pattern_matcher
+  quantize_int4_linear_from_config:
+    stage: pattern_matcher
+  quantize_fp8_linear_from_config:
+    stage: pattern_matcher
+  quantize_nvfp4_linear_from_config:
+    stage: pattern_matcher
+  quantize_fp8_bmm_from_config:
+    stage: pattern_matcher
+  quantize_fp8_from_graph:
+    stage: pattern_matcher
+  quantize_nvfp4_from_graph:
+    stage: pattern_matcher
+  quantize_fp8_moe:
+    stage: pattern_matcher
+  quantize_nvfp4_moe:
+    stage: pattern_matcher
+  quantize_mxfp4_moe:
+    stage: pattern_matcher
+  detect_sharding:
+    stage: sharding
+    simple_shard_only: false
+    sharding_source: ["manual", "factory", "heuristic"]
+    support_partial_config: true
+    sharding_dims: ["tp", "ep", "bmm"]
+    allreduce_strategy: "AUTO"
+    requires_shape_prop: true
+  sharding_transform_executor:
+    stage: sharding
+    run_shape_prop: true
+  ############################################################################################
+  # MOVE MODEL AND LOAD WEIGHTS
+  ############################################################################################
+  load_weights:
+    stage: weight_load
+    run_per_gm: false
+    checkpoint_device: cpu
+  move_inputs_to_device:
+    stage: weight_load
+    checkpoint_device: cpu
+    run_per_gm: false
+  ############################################################################################
+  # RUN POST-LOAD FUSION AND OPTIMIZATIONS
+  ############################################################################################
+  fuse_gemms:
+    stage: post_load_fusion
+    enabled: false  # TODO: https://github.com/NVIDIA/TensorRT-LLM/issues/4674 this is causing OOMs
+  fuse_fp4_gemms:
+    stage: post_load_fusion
+    enabled: false  # TODO: https://github.com/NVIDIA/TensorRT-LLM/issues/4674 this is causing OOMs
+  fuse_fp8_gemms:
+    stage: post_load_fusion
+    enabled: false  # TODO: https://github.com/NVIDIA/TensorRT-LLM/issues/4674 this is causing OOMs
+  fuse_fp8_linear:
+    stage: post_load_fusion
+    backend: trtllm
+  fuse_nvfp4_linear:
+    stage: post_load_fusion
+    backend: trtllm
+  fuse_moe:
+    stage: post_load_fusion
+    enabled: true
+    backend: trtllm
+  fuse_fp8_moe:
+    stage: post_load_fusion
+    enabled: true
+    backend: trtllm
+
+  ############################################################################################
+  # VISUALIZE GRAPH
+  ############################################################################################
+  visualize_namespace:
+    stage: visualize
+    enabled: false  # TODO: https://github.com/NVIDIA/TensorRT-LLM/issues/8460
+  ############################################################################################
+  # FUSE Rope Attention & export to ONNX
+  ############################################################################################
+  fuse_rope_attention:
+    stage: export_onnx
+  short_reshape_attention_output:
+    stage: export_onnx
+  gather_last_token_ids:
+    stage: export_onnx
+  adapt_to_driveos_llm:
+    stage: export_onnx
+  export_to_onnx:
+    stage: export_onnx
+    output_dir: "."
+    output_name: "model.onnx"
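
To connect this config back to the example script earlier in the commit: the output_dir and output_name defaults of the export_to_onnx entry are exactly what the script overrides via --output_dir/--output_name. A minimal programmatic sketch using the same calls as the example (the concrete model, sizes, and paths are placeholders):

from tensorrt_llm._torch.auto_deploy import LLM, AutoDeployConfig

# Same pattern as the example script: select the export_driveos_llm_onnx mode
# and redirect the ONNX artifact (placeholder values, adjust as needed).
ad_config = AutoDeployConfig(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    mode="export_driveos_llm_onnx",
    max_batch_size=13,  # >1 so the batch dimension stays dynamic (see the note in the example)
    max_seq_len=4,
    device="cpu",
)
ad_config.attn_backend = "torch"
ad_config.transforms["export_to_onnx"]["output_dir"] = "./onnx_out"  # placeholder
ad_config.transforms["export_to_onnx"]["output_name"] = "llm.onnx"   # placeholder

# Constructing the LLM runs the transform pipeline above and writes the ONNX file.
_ = LLM(**ad_config.to_llm_kwargs())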
