fix: set example models to eval mode and follow the convention #3770

Open · wants to merge 2 commits into base: main
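The convention applied across these examples is: move the model to the target device, call `.eval()` as the last step in the chain before compilation, and wrap every inference call in `torch.no_grad()`. Below is a minimal sketch of that pattern, not taken from the PR itself; the toy `MyModel` module and the input shapes are hypothetical stand-ins, while the `torch_tensorrt.compile(..., inputs=[...], min_block_size=1)` call mirrors the usage already present in the examples.

```python
import torch
import torch_tensorrt


# Hypothetical toy module standing in for the example models touched by this PR.
class MyModel(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.linear = torch.nn.Linear(64, 64)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.linear(x)


# Convention: move to the device first, then call .eval() last in the chain,
# so inference-mode behavior (dropout, batch norm, etc.) is fixed before tracing.
my_model = MyModel().to("cuda").eval()
x = torch.randn(64, 64, device="cuda")

model_trt = torch_tensorrt.compile(my_model, inputs=[x], min_block_size=1)

# Convention: run all inference under torch.no_grad() so no autograd state is recorded.
with torch.no_grad():
    res = model_trt(x)
    assert torch.allclose(res, my_model(x), rtol=1e-2, atol=1e-2)
```

The call-order changes in several files (for example `.eval().to("cuda")` becoming `.to("cuda").eval()`, and `.eval().half()` becoming `.half().eval()`) keep `.eval()` as the final call in the chain, matching this convention.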
Changes from all commits
9 changes: 5 additions & 4 deletions examples/dynamo/aot_plugin.py
@@ -153,7 +153,7 @@ def forward(self, X: torch.Tensor) -> torch.Tensor:
)
args = parser.parse_args()

my_model = MyModel().to("cuda")
my_model = MyModel().to("cuda").eval()
m = torch.full((64, 64), 2, device="cuda", dtype=torch.float)

assert my_model(X=m)[0][0] == 3.0
@@ -167,8 +167,9 @@ def forward(self, X: torch.Tensor) -> torch.Tensor:
)
print("Model compiled successfully!")
print("Running inference with compiled model...")
for i in range(10):
res = model_trt(m)
assert torch.allclose(res, my_model(m)), "Results do not match!"
with torch.no_grad():
for i in range(10):
res = model_trt(m)
assert torch.allclose(res, my_model(m)), "Results do not match!"

print("Inference successful!")
9 changes: 5 additions & 4 deletions examples/dynamo/auto_generate_converters.py
@@ -169,14 +169,15 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
return res


my_model = MyModel().to("cuda")
my_model = MyModel().to("cuda").eval()
m = torch.full((64, 64), 2, device="cuda", dtype=torch.float)
n = torch.full((64, 64), 3, device="cuda", dtype=torch.float)

with torch_tensorrt.logging.errors():
model_trt = torch_tensorrt.compile(my_model, inputs=[m, n], min_block_size=1)
for i in range(300):
res = model_trt(m, n)
assert torch.allclose(res, my_model(m, n))
with torch.no_grad():
for i in range(300):
res = model_trt(m, n)
assert torch.allclose(res, my_model(m, n))

print("Ran with custom plugin!")
9 changes: 5 additions & 4 deletions examples/dynamo/auto_generate_plugins.py
@@ -139,14 +139,15 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
return res


my_model = MyModel().to("cuda")
my_model = MyModel().to("cuda").eval()
m = torch.randint(0, 5, (64, 64), device="cuda", dtype=torch.float)
n = torch.randint(0, 5, (64, 64), device="cuda", dtype=torch.float)

with torch_tensorrt.logging.errors():
model_trt = torch_tensorrt.compile(my_model, inputs=[m, n], min_block_size=1)
for i in range(300):
res = model_trt(m, n)
assert torch.allclose(res, my_model(m, n))
with torch.no_grad():
for i in range(300):
res = model_trt(m, n)
assert torch.allclose(res, my_model(m, n))

print("Ran with custom plugin!")
16 changes: 8 additions & 8 deletions examples/dynamo/converter_overloading.py
@@ -34,7 +34,7 @@ def forward(self, x):
return torch.nn.functional.gelu(x, approximate=self.mode)


my_mod = GeLU(mode="tanh")
my_mod = GeLU(mode="tanh").to("cuda").eval()
ex_input = torch.randn(2, 5).to("cuda")


@@ -182,9 +182,9 @@ def get_op_count():
my_custom_gelu = torch_tensorrt.compile(
my_mod, arg_inputs=(ex_input,), min_block_size=1
)

print(my_custom_gelu.graph)
print(my_custom_gelu(ex_input))
with torch.no_grad():
print(my_custom_gelu.graph)
print(my_custom_gelu(ex_input))

# %%
#
@@ -198,7 +198,7 @@ def get_op_count():
#
# Finally, we want to verify that in the case that the ``approximate`` argument is not set to ``tanh``, our custom converter is not used.

my_mod_erf = GeLU(mode="none")
my_mod_erf = GeLU(mode="none").to("cuda").eval()
my_gelu_erf = torch_tensorrt.compile(
my_mod_erf, arg_inputs=(ex_input,), min_block_size=1
)
@@ -207,6 +207,6 @@ def get_op_count():
#
# Notice that we don't see the print statement from our custom converter, indicating that it was not used. However, looking at the graph, we can still see that a TensorRT engine was created to run the GeLU operation.
# In this case, the validator for our custom converter returned ``False``, so the conversion system moved on to the next converter in the list, the standard GeLU converter and used that one to convert the operation.

print(my_gelu_erf.graph)
print(my_gelu_erf(ex_input))
with torch.no_grad():
print(my_gelu_erf.graph)
print(my_gelu_erf(ex_input))
5 changes: 3 additions & 2 deletions examples/dynamo/cross_runtime_compilation_for_windows.py
@@ -46,7 +46,7 @@

args = PARSER.parse_args()
torch.manual_seed(0)
model = models.resnet18().eval().cuda()
model = models.resnet18().cuda().eval()
input = torch.rand((1, 3, 224, 224)).to("cuda")
inputs = [input]

@@ -63,7 +63,8 @@
loaded_model = torchtrt.load_cross_compiled_exported_program(args.path).module()
print(f"model has been successfully loaded from ${args.path}")
# inference
trt_output = loaded_model(input)
with torch.no_grad():
trt_output = loaded_model(input)
print(f"inference result: {trt_output}")
else:
if platform.system() != "Linux" or platform.architecture()[0] != "64bit":
11 changes: 7 additions & 4 deletions examples/dynamo/custom_kernel_plugins.py
@@ -217,8 +217,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
return y


my_model = MyModel((1, 1, 2, 0)).to("cuda")
my_model(ex_input)
my_model = MyModel((1, 1, 2, 0)).to("cuda").eval()
with torch.no_grad():
my_model(ex_input)

##############################################################################
# .. code-block:: none
@@ -607,7 +608,8 @@ def circular_padding_converter(
##############################################
# As you can see, now there is only one subgraph created for the TensorRT engine that contains both our custom kernel and the native convolution operator.

print(trt_model(ex_input))
with torch.no_grad():
print(trt_model(ex_input))

##############################################################################
# .. code-block:: none
@@ -636,7 +638,8 @@ def circular_padding_converter(
# %%
# We can verify our implementation is run correctly by both TensorRT and PyTorch

print(my_model(ex_input) - trt_model(ex_input))
with torch.no_grad():
print(my_model(ex_input) - trt_model(ex_input))

##############################################################################
# .. code-block:: none
3 changes: 2 additions & 1 deletion examples/dynamo/engine_caching_bert_example.py
@@ -62,7 +62,8 @@ def compile_bert(iterations=3):
backend="torch_tensorrt",
options=compilation_kwargs,
)
optimized_model(*inputs)
with torch.no_grad():
optimized_model(*inputs)
end.record()
torch.cuda.synchronize()
times.append(start.elapsed_time(end))
8 changes: 5 additions & 3 deletions examples/dynamo/engine_caching_example.py
@@ -37,7 +37,7 @@
np.random.seed(0)
torch.manual_seed(0)

model = models.resnet18(pretrained=True).eval().to("cuda")
model = models.resnet18(pretrained=True).to("cuda").eval()
enabled_precisions = {torch.float}
min_block_size = 1
use_python_runtime = False
@@ -100,7 +100,8 @@ def torch_compile(iterations=3):
"reuse_cached_engines": reuse_cached_engines,
},
)
compiled_model(*inputs) # trigger the compilation
with torch.no_grad():
compiled_model(*inputs) # trigger the compilation
end.record()
torch.cuda.synchronize()
times.append(start.elapsed_time(end))
@@ -270,7 +271,8 @@ def torch_compile_my_cache(iterations=3):
"custom_engine_cache": engine_cache,
},
)
compiled_model(*inputs) # trigger the compilation
with torch.no_grad():
compiled_model(*inputs) # trigger the compilation
end.record()
torch.cuda.synchronize()
times.append(start.elapsed_time(end))
3 changes: 2 additions & 1 deletion examples/dynamo/hierarchical_partitioner_example.py
@@ -79,7 +79,8 @@ def main():

print("Original Model Structure:\n", gm)

original_output = model(example_input)
with torch.no_grad():
original_output = model(example_input)

# 1. Partition the model into blocks that can be executed by different backends
partitioned_model, op_support = hierarchical_adjacency_partition(
5 changes: 3 additions & 2 deletions examples/dynamo/llama2_flashinfer_rmsnorm.py
@@ -220,7 +220,7 @@ def replace_rmsnorm(

# 2. Initialize model (random weights)
with torch.no_grad():
model = LlamaForCausalLM(config).eval().half()
model = LlamaForCausalLM(config).cuda().half().eval()

# 3. Export with static shapes
input_ids = torch.randint(0, 32000, (1, 64)) # Static [batch=1, seq=64]
@@ -253,5 +253,6 @@ def replace_rmsnorm(

input_ids = input_ids.to(DEVICE)

res = trt_model.forward(input_ids)
with torch.no_grad():
res = trt_model.forward(input_ids)
print(res)
48 changes: 26 additions & 22 deletions examples/dynamo/mutable_torchtrt_module_example.py
@@ -37,23 +37,25 @@
"immutable_weights": False,
}

model = models.resnet18(pretrained=True).eval().to("cuda")
model = models.resnet18(pretrained=True).to("cuda").eval()
mutable_module = torch_trt.MutableTorchTensorRTModule(model, **settings)
# You can use the mutable module just like the original pytorch module. The compilation happens while you first call the mutable module.
mutable_module(*inputs)
with torch.no_grad():
mutable_module(*inputs)
# %%
# Make modifications to the mutable module.
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

# %%
# Making changes to mutable module can trigger refit or re-compilation. For example, loading a different state_dict and setting new weight values will trigger refit, and adding a module to the model will trigger re-compilation.
model2 = models.resnet18(pretrained=False).eval().to("cuda")
model2 = models.resnet18(pretrained=False).to("cuda").eval()
mutable_module.load_state_dict(model2.state_dict())


# Check the output
# The refit happens while you call the mutable module again.
expected_outputs, refitted_outputs = model2(*inputs), mutable_module(*inputs)
with torch.no_grad():
expected_outputs, refitted_outputs = model2(*inputs), mutable_module(*inputs)
for expected_output, refitted_output in zip(expected_outputs, refitted_outputs):
assert torch.allclose(
expected_output, refitted_output, 1e-2, 1e-2
@@ -163,7 +165,7 @@ def forward(self, a, b, c={}):


device = "cuda:0"
model = Model().eval().to(device)
model = Model().to(device).eval()
inputs = (torch.rand(10, 3).to(device), torch.rand(3, 30).to(device))
kwargs = {
"c": {"a": torch.rand(10, 30).to(device), "b": torch.rand(10, 30).to(device)},
@@ -182,14 +184,15 @@ def forward(self, a, b, c={}):
model = torch_trt.MutableTorchTensorRTModule(model, min_block_size=1)
model.set_expected_dynamic_shape_range(args_dynamic_shapes, kwarg_dynamic_shapes)
# Compile
model(*inputs, **kwargs)
# Change input shape
inputs_2 = (torch.rand(10, 5).to(device), torch.rand(10, 30).to(device))
kwargs_2 = {
"c": {"a": torch.rand(10, 30).to(device), "b": torch.rand(5, 30).to(device)},
}
# Run without recompiling
model(*inputs_2, **kwargs_2)
with torch.no_grad():
model(*inputs, **kwargs)
# Change input shape
inputs_2 = (torch.rand(10, 5).to(device), torch.rand(10, 30).to(device))
kwargs_2 = {
"c": {"a": torch.rand(10, 30).to(device), "b": torch.rand(5, 30).to(device)},
}
# Run without recompiling
model(*inputs_2, **kwargs_2)

# %%
# Use Mutable Torch TensorRT module with persistent cache
@@ -199,7 +202,7 @@ def forward(self, a, b, c={}):

from torch_tensorrt.dynamo._defaults import TIMING_CACHE_PATH

model = models.resnet18(pretrained=True).eval().to("cuda")
model = models.resnet18(pretrained=True).to("cuda").eval()

times = []
start = torch.cuda.Event(enable_timing=True)
@@ -225,14 +228,15 @@ def remove_timing_cache(path=TIMING_CACHE_PATH):

remove_timing_cache()

for i in range(4):
inputs = [torch.rand((100 + i, 3, 224, 224)).to("cuda")]

start.record()
model(*inputs) # Recompile
end.record()
torch.cuda.synchronize()
times.append(start.elapsed_time(end))
with torch.no_grad():
for i in range(4):
inputs = [torch.rand((100 + i, 3, 224, 224)).to("cuda")]

start.record()
model(*inputs) # Recompile
end.record()
torch.cuda.synchronize()
times.append(start.elapsed_time(end))

print("----------------dynamo_compile----------------")
print("Without engine caching, used:", times[0], "ms")
48 changes: 24 additions & 24 deletions examples/dynamo/pre_allocated_output_example.py
@@ -43,10 +43,9 @@ def test_module_perf(model, *input):
with torch.no_grad():
for _ in range(3):
model(*input)
torch.cuda.synchronize()

# Timing phase to measure inference performance
with torch.no_grad():
torch.cuda.synchronize()
# Timing phase to measure inference performance
for i in range(10):
start_time = timeit.default_timer()
model(*input)
@@ -67,9 +66,9 @@ def test_module_perf(model, *input):
# Load bert model
model = (
BertModel.from_pretrained("bert-base-uncased", torchscript=True)
.eval()
.half()
.to("cuda")
.half()
.eval()
)
# Define sample inputs
inputs = [
@@ -89,25 +88,26 @@ def test_module_perf(model, *input):
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

# Enable pre-allocated output buffer using a context manager
with torch_tensorrt.runtime.enable_pre_allocated_outputs(optimized_model):
with torch.no_grad():
with torch_tensorrt.runtime.enable_pre_allocated_outputs(optimized_model):
out_trt = optimized_model(*inputs)
# Subsequent inferences can use the pre-allocated output buffer (no shape change)
out_trt = optimized_model(*inputs)

# Alternatively, we can enable the feature using a context object
pre_allocated_output_ctx = torch_tensorrt.runtime.enable_pre_allocated_outputs(
optimized_model
)
pre_allocated_output_ctx.set_pre_allocated_output(True)
time_opt = test_module_perf(optimized_model, *inputs)

# Disable the pre-allocated output buffer feature and perform inference normally
pre_allocated_output_ctx.set_pre_allocated_output(False)
out_trt = optimized_model(*inputs)
# Subsequent inferences can use the pre-allocated output buffer (no shape change)
out_trt = optimized_model(*inputs)

# Alternatively, we can enable the feature using a context object
pre_allocated_output_ctx = torch_tensorrt.runtime.enable_pre_allocated_outputs(
optimized_model
)
pre_allocated_output_ctx.set_pre_allocated_output(True)
time_opt = test_module_perf(optimized_model, *inputs)

# Disable the pre-allocated output buffer feature and perform inference normally
pre_allocated_output_ctx.set_pre_allocated_output(False)
out_trt = optimized_model(*inputs)
time_normal = test_module_perf(optimized_model, *inputs)
time_normal = test_module_perf(optimized_model, *inputs)

time_opt_ms = time_opt * 1000
time_normal_ms = time_normal * 1000
time_opt_ms = time_opt * 1000
time_normal_ms = time_normal * 1000

print(f"normal trt model time: {time_normal_ms:.3f} ms")
print(f"pre-allocated output buffer model time: {time_opt_ms:.3f} ms")
print(f"normal trt model time: {time_normal_ms:.3f} ms")
print(f"pre-allocated output buffer model time: {time_opt_ms:.3f} ms")