diff --git a/examples/dynamo/aot_plugin.py b/examples/dynamo/aot_plugin.py
index 86dccfddfc..c322d37401 100644
--- a/examples/dynamo/aot_plugin.py
+++ b/examples/dynamo/aot_plugin.py
@@ -153,7 +153,7 @@ def forward(self, X: torch.Tensor) -> torch.Tensor:
     )
     args = parser.parse_args()
 
-    my_model = MyModel().to("cuda")
+    my_model = MyModel().to("cuda").eval()
     m = torch.full((64, 64), 2, device="cuda", dtype=torch.float)
 
     assert my_model(X=m)[0][0] == 3.0
@@ -167,8 +167,9 @@ def forward(self, X: torch.Tensor) -> torch.Tensor:
     )
     print("Model compiled successfully!")
     print("Running inference with compiled model...")
-    for i in range(10):
-        res = model_trt(m)
-        assert torch.allclose(res, my_model(m)), "Results do not match!"
+    with torch.no_grad():
+        for i in range(10):
+            res = model_trt(m)
+            assert torch.allclose(res, my_model(m)), "Results do not match!"
 
     print("Inference successful!")
diff --git a/examples/dynamo/auto_generate_converters.py b/examples/dynamo/auto_generate_converters.py
index af9cffb8ff..5ab242443c 100644
--- a/examples/dynamo/auto_generate_converters.py
+++ b/examples/dynamo/auto_generate_converters.py
@@ -169,14 +169,15 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         return res
 
 
-my_model = MyModel().to("cuda")
+my_model = MyModel().to("cuda").eval()
 m = torch.full((64, 64), 2, device="cuda", dtype=torch.float)
 n = torch.full((64, 64), 3, device="cuda", dtype=torch.float)
 
 with torch_tensorrt.logging.errors():
     model_trt = torch_tensorrt.compile(my_model, inputs=[m, n], min_block_size=1)
-    for i in range(300):
-        res = model_trt(m, n)
-        assert torch.allclose(res, my_model(m, n))
+    with torch.no_grad():
+        for i in range(300):
+            res = model_trt(m, n)
+            assert torch.allclose(res, my_model(m, n))
 
 print("Ran with custom plugin!")
diff --git a/examples/dynamo/auto_generate_plugins.py b/examples/dynamo/auto_generate_plugins.py
index 68a8635454..57a4300779 100644
--- a/examples/dynamo/auto_generate_plugins.py
+++ b/examples/dynamo/auto_generate_plugins.py
@@ -139,14 +139,15 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         return res
 
 
-my_model = MyModel().to("cuda")
+my_model = MyModel().to("cuda").eval()
 m = torch.randint(0, 5, (64, 64), device="cuda", dtype=torch.float)
 n = torch.randint(0, 5, (64, 64), device="cuda", dtype=torch.float)
 
 with torch_tensorrt.logging.errors():
     model_trt = torch_tensorrt.compile(my_model, inputs=[m, n], min_block_size=1)
-    for i in range(300):
-        res = model_trt(m, n)
-        assert torch.allclose(res, my_model(m, n))
+    with torch.no_grad():
+        for i in range(300):
+            res = model_trt(m, n)
+            assert torch.allclose(res, my_model(m, n))
 
 print("Ran with custom plugin!")
diff --git a/examples/dynamo/converter_overloading.py b/examples/dynamo/converter_overloading.py
index e27c53cb50..011bed9307 100644
--- a/examples/dynamo/converter_overloading.py
+++ b/examples/dynamo/converter_overloading.py
@@ -34,7 +34,7 @@ def forward(self, x):
         return torch.nn.functional.gelu(x, approximate=self.mode)
 
 
-my_mod = GeLU(mode="tanh")
+my_mod = GeLU(mode="tanh").to("cuda").eval()
 
 ex_input = torch.randn(2, 5).to("cuda")
 
@@ -182,9 +182,9 @@ def get_op_count():
 my_custom_gelu = torch_tensorrt.compile(
     my_mod, arg_inputs=(ex_input,), min_block_size=1
 )
-
-print(my_custom_gelu.graph)
-print(my_custom_gelu(ex_input))
+with torch.no_grad():
+    print(my_custom_gelu.graph)
+    print(my_custom_gelu(ex_input))
 
 # %%
 #
@@ -198,7 +198,7 @@ def get_op_count():
 #
 # Finally, we want to verify that in the case that the ``approximate`` argument is not set to ``tanh``, our custom converter is not used.
 
-my_mod_erf = GeLU(mode="none")
+my_mod_erf = GeLU(mode="none").to("cuda").eval()
 my_gelu_erf = torch_tensorrt.compile(
     my_mod_erf, arg_inputs=(ex_input,), min_block_size=1
 )
@@ -207,6 +207,6 @@ def get_op_count():
 #
 # Notice that we don't see the print statement from our custom converter, indicating that it was not used. However, looking at the graph, we can still see that a TensorRT engine was created to run the GeLU operation.
 # In this case, the validator for our custom converter returned ``False``, so the conversion system moved on to the next converter in the list, the standard GeLU converter and used that one to convert the operation.
-
-print(my_gelu_erf.graph)
-print(my_gelu_erf(ex_input))
+with torch.no_grad():
+    print(my_gelu_erf.graph)
+    print(my_gelu_erf(ex_input))
diff --git a/examples/dynamo/cross_runtime_compilation_for_windows.py b/examples/dynamo/cross_runtime_compilation_for_windows.py
index 433df12d29..d3339f8f34 100644
--- a/examples/dynamo/cross_runtime_compilation_for_windows.py
+++ b/examples/dynamo/cross_runtime_compilation_for_windows.py
@@ -46,7 +46,7 @@
 
 args = PARSER.parse_args()
 torch.manual_seed(0)
-model = models.resnet18().eval().cuda()
+model = models.resnet18().cuda().eval()
 input = torch.rand((1, 3, 224, 224)).to("cuda")
 inputs = [input]
 
@@ -63,7 +63,8 @@
     loaded_model = torchtrt.load_cross_compiled_exported_program(args.path).module()
     print(f"model has been successfully loaded from ${args.path}")
     # inference
-    trt_output = loaded_model(input)
+    with torch.no_grad():
+        trt_output = loaded_model(input)
     print(f"inference result: {trt_output}")
 else:
     if platform.system() != "Linux" or platform.architecture()[0] != "64bit":
diff --git a/examples/dynamo/custom_kernel_plugins.py b/examples/dynamo/custom_kernel_plugins.py
index dccb0ff0cf..d58ce43378 100644
--- a/examples/dynamo/custom_kernel_plugins.py
+++ b/examples/dynamo/custom_kernel_plugins.py
@@ -217,8 +217,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return y
 
 
-my_model = MyModel((1, 1, 2, 0)).to("cuda")
-my_model(ex_input)
+my_model = MyModel((1, 1, 2, 0)).to("cuda").eval()
+with torch.no_grad():
+    my_model(ex_input)
 
 ##############################################################################
 # .. code-block:: none
@@ -607,7 +608,8 @@ def circular_padding_converter(
 ##############################################
 # As you can see, now there is only one subgraph created for the TensorRT engine that contains both our custom kernel and the native convolution operator.
 
-print(trt_model(ex_input))
+with torch.no_grad():
+    print(trt_model(ex_input))
 
 ##############################################################################
 # .. code-block:: none
@@ -636,7 +638,8 @@ def circular_padding_converter(
 # %%
 # We can verify our implementation is run correctly by both TensorRT and PyTorch
 
-print(my_model(ex_input) - trt_model(ex_input))
+with torch.no_grad():
+    print(my_model(ex_input) - trt_model(ex_input))
 
 ##############################################################################
 # .. code-block:: none
diff --git a/examples/dynamo/engine_caching_bert_example.py b/examples/dynamo/engine_caching_bert_example.py
index 66f5a69ac0..6aa90302e3 100644
--- a/examples/dynamo/engine_caching_bert_example.py
+++ b/examples/dynamo/engine_caching_bert_example.py
@@ -62,7 +62,8 @@ def compile_bert(iterations=3):
             backend="torch_tensorrt",
             options=compilation_kwargs,
         )
-        optimized_model(*inputs)
+        with torch.no_grad():
+            optimized_model(*inputs)
         end.record()
         torch.cuda.synchronize()
         times.append(start.elapsed_time(end))
diff --git a/examples/dynamo/engine_caching_example.py b/examples/dynamo/engine_caching_example.py
index 34fa56f9a1..45bcd363ab 100644
--- a/examples/dynamo/engine_caching_example.py
+++ b/examples/dynamo/engine_caching_example.py
@@ -37,7 +37,7 @@
 np.random.seed(0)
 torch.manual_seed(0)
 
-model = models.resnet18(pretrained=True).eval().to("cuda")
+model = models.resnet18(pretrained=True).to("cuda").eval()
 enabled_precisions = {torch.float}
 min_block_size = 1
 use_python_runtime = False
@@ -100,7 +100,8 @@ def torch_compile(iterations=3):
                 "reuse_cached_engines": reuse_cached_engines,
             },
         )
-        compiled_model(*inputs)  # trigger the compilation
+        with torch.no_grad():
+            compiled_model(*inputs)  # trigger the compilation
         end.record()
         torch.cuda.synchronize()
         times.append(start.elapsed_time(end))
@@ -270,7 +271,8 @@ def torch_compile_my_cache(iterations=3):
                 "custom_engine_cache": engine_cache,
             },
        )
-        compiled_model(*inputs)  # trigger the compilation
+        with torch.no_grad():
+            compiled_model(*inputs)  # trigger the compilation
         end.record()
         torch.cuda.synchronize()
         times.append(start.elapsed_time(end))
diff --git a/examples/dynamo/hierarchical_partitioner_example.py b/examples/dynamo/hierarchical_partitioner_example.py
index 73975e2453..4de370bd2c 100644
--- a/examples/dynamo/hierarchical_partitioner_example.py
+++ b/examples/dynamo/hierarchical_partitioner_example.py
@@ -79,7 +79,8 @@ def main():
 
     print("Original Model Structure:\n", gm)
 
-    original_output = model(example_input)
+    with torch.no_grad():
+        original_output = model(example_input)
 
     # 1. Partition the model into blocks that can be executed by different backends
     partitioned_model, op_support = hierarchical_adjacency_partition(
diff --git a/examples/dynamo/llama2_flashinfer_rmsnorm.py b/examples/dynamo/llama2_flashinfer_rmsnorm.py
index 7542a9a1b7..c724954a18 100644
--- a/examples/dynamo/llama2_flashinfer_rmsnorm.py
+++ b/examples/dynamo/llama2_flashinfer_rmsnorm.py
@@ -220,7 +220,7 @@ def replace_rmsnorm(
 
 # 2. Initialize model (random weights)
 with torch.no_grad():
-    model = LlamaForCausalLM(config).eval().half()
+    model = LlamaForCausalLM(config).cuda().half().eval()
 
 # 3. Export with static shapes
 input_ids = torch.randint(0, 32000, (1, 64))  # Static [batch=1, seq=64]
@@ -253,5 +253,6 @@ def replace_rmsnorm(
 
 input_ids = input_ids.to(DEVICE)
 
-res = trt_model.forward(input_ids)
+with torch.no_grad():
+    res = trt_model.forward(input_ids)
 print(res)
diff --git a/examples/dynamo/mutable_torchtrt_module_example.py b/examples/dynamo/mutable_torchtrt_module_example.py
index f422a6e629..84abcddf44 100644
--- a/examples/dynamo/mutable_torchtrt_module_example.py
+++ b/examples/dynamo/mutable_torchtrt_module_example.py
@@ -37,23 +37,25 @@
     "immutable_weights": False,
 }
 
-model = models.resnet18(pretrained=True).eval().to("cuda")
+model = models.resnet18(pretrained=True).to("cuda").eval()
 mutable_module = torch_trt.MutableTorchTensorRTModule(model, **settings)
 # You can use the mutable module just like the original pytorch module. The compilation happens while you first call the mutable module.
-mutable_module(*inputs)
+with torch.no_grad():
+    mutable_module(*inputs)
 
 # %%
 # Make modifications to the mutable module.
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 # %%
 # Making changes to mutable module can trigger refit or re-compilation. For example, loading a different state_dict and setting new weight values will trigger refit, and adding a module to the model will trigger re-compilation.
-model2 = models.resnet18(pretrained=False).eval().to("cuda")
+model2 = models.resnet18(pretrained=False).to("cuda").eval()
 mutable_module.load_state_dict(model2.state_dict())
 
 # Check the output
 # The refit happens while you call the mutable module again.
-expected_outputs, refitted_outputs = model2(*inputs), mutable_module(*inputs)
+with torch.no_grad():
+    expected_outputs, refitted_outputs = model2(*inputs), mutable_module(*inputs)
 for expected_output, refitted_output in zip(expected_outputs, refitted_outputs):
     assert torch.allclose(
         expected_output, refitted_output, 1e-2, 1e-2
@@ -163,7 +165,7 @@ def forward(self, a, b, c={}):
 
 device = "cuda:0"
-model = Model().eval().to(device)
+model = Model().to(device).eval()
 inputs = (torch.rand(10, 3).to(device), torch.rand(3, 30).to(device))
 kwargs = {
     "c": {"a": torch.rand(10, 30).to(device), "b": torch.rand(10, 30).to(device)},
 }
@@ -182,14 +184,15 @@ def forward(self, a, b, c={}):
 model = torch_trt.MutableTorchTensorRTModule(model, min_block_size=1)
 model.set_expected_dynamic_shape_range(args_dynamic_shapes, kwarg_dynamic_shapes)
 # Compile
-model(*inputs, **kwargs)
-# Change input shape
-inputs_2 = (torch.rand(10, 5).to(device), torch.rand(10, 30).to(device))
-kwargs_2 = {
-    "c": {"a": torch.rand(10, 30).to(device), "b": torch.rand(5, 30).to(device)},
-}
-# Run without recompiling
-model(*inputs_2, **kwargs_2)
+with torch.no_grad():
+    model(*inputs, **kwargs)
+    # Change input shape
+    inputs_2 = (torch.rand(10, 5).to(device), torch.rand(10, 30).to(device))
+    kwargs_2 = {
+        "c": {"a": torch.rand(10, 30).to(device), "b": torch.rand(5, 30).to(device)},
+    }
+    # Run without recompiling
+    model(*inputs_2, **kwargs_2)
 
 # %%
 # Use Mutable Torch TensorRT module with persistent cache
@@ -199,7 +202,7 @@ def forward(self, a, b, c={}):
 
 from torch_tensorrt.dynamo._defaults import TIMING_CACHE_PATH
 
-model = models.resnet18(pretrained=True).eval().to("cuda")
+model = models.resnet18(pretrained=True).to("cuda").eval()
 
 times = []
 start = torch.cuda.Event(enable_timing=True)
@@ -225,14 +228,15 @@ def remove_timing_cache(path=TIMING_CACHE_PATH):
 
 remove_timing_cache()
 
-for i in range(4):
-    inputs = [torch.rand((100 + i, 3, 224, 224)).to("cuda")]
-
-    start.record()
-    model(*inputs)  # Recompile
-    end.record()
-    torch.cuda.synchronize()
-    times.append(start.elapsed_time(end))
+with torch.no_grad():
+    for i in range(4):
+        inputs = [torch.rand((100 + i, 3, 224, 224)).to("cuda")]
+
+        start.record()
+        model(*inputs)  # Recompile
+        end.record()
+        torch.cuda.synchronize()
+        times.append(start.elapsed_time(end))
 
 print("----------------dynamo_compile----------------")
 print("Without engine caching, used:", times[0], "ms")
diff --git a/examples/dynamo/pre_allocated_output_example.py b/examples/dynamo/pre_allocated_output_example.py
index d938034758..2ad1b8f514 100644
--- a/examples/dynamo/pre_allocated_output_example.py
+++ b/examples/dynamo/pre_allocated_output_example.py
@@ -43,10 +43,9 @@ def test_module_perf(model, *input):
     with torch.no_grad():
         for _ in range(3):
             model(*input)
-    torch.cuda.synchronize()
-
-    # Timing phase to measure inference performance
-    with torch.no_grad():
+        torch.cuda.synchronize()
+
+        # Timing phase to measure inference performance
         for i in range(10):
             start_time = timeit.default_timer()
             model(*input)
@@ -67,9 +66,9 @@ def test_module_perf(model, *input):
 # Load bert model
 model = (
     BertModel.from_pretrained("bert-base-uncased", torchscript=True)
-    .eval()
-    .half()
     .to("cuda")
+    .half()
+    .eval()
 )
 # Define sample inputs
 inputs = [
@@ -89,25 +88,26 @@
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 # Enable pre-allocated output buffer using a context manager
-with torch_tensorrt.runtime.enable_pre_allocated_outputs(optimized_model):
+with torch.no_grad():
+    with torch_tensorrt.runtime.enable_pre_allocated_outputs(optimized_model):
+        out_trt = optimized_model(*inputs)
+        # Subsequent inferences can use the pre-allocated output buffer (no shape change)
+        out_trt = optimized_model(*inputs)
+
+    # Alternatively, we can enable the feature using a context object
+    pre_allocated_output_ctx = torch_tensorrt.runtime.enable_pre_allocated_outputs(
+        optimized_model
+    )
+    pre_allocated_output_ctx.set_pre_allocated_output(True)
+    time_opt = test_module_perf(optimized_model, *inputs)
+
+    # Disable the pre-allocated output buffer feature and perform inference normally
+    pre_allocated_output_ctx.set_pre_allocated_output(False)
     out_trt = optimized_model(*inputs)
-    # Subsequent inferences can use the pre-allocated output buffer (no shape change)
-    out_trt = optimized_model(*inputs)
-
-# Alternatively, we can enable the feature using a context object
-pre_allocated_output_ctx = torch_tensorrt.runtime.enable_pre_allocated_outputs(
-    optimized_model
-)
-pre_allocated_output_ctx.set_pre_allocated_output(True)
-time_opt = test_module_perf(optimized_model, *inputs)
-
-# Disable the pre-allocated output buffer feature and perform inference normally
-pre_allocated_output_ctx.set_pre_allocated_output(False)
-out_trt = optimized_model(*inputs)
-time_normal = test_module_perf(optimized_model, *inputs)
+    time_normal = test_module_perf(optimized_model, *inputs)
 
-time_opt_ms = time_opt * 1000
-time_normal_ms = time_normal * 1000
+    time_opt_ms = time_opt * 1000
+    time_normal_ms = time_normal * 1000
 
-print(f"normal trt model time: {time_normal_ms:.3f} ms")
-print(f"pre-allocated output buffer model time: {time_opt_ms:.3f} ms")
+    print(f"normal trt model time: {time_normal_ms:.3f} ms")
+    print(f"pre-allocated output buffer model time: {time_opt_ms:.3f} ms")
diff --git a/examples/dynamo/refit_engine_example.py b/examples/dynamo/refit_engine_example.py
index 2637b4fd7e..e0b0457a09 100644
--- a/examples/dynamo/refit_engine_example.py
+++ b/examples/dynamo/refit_engine_example.py
@@ -53,7 +53,7 @@
 #
 # In this case we are going to compile a ResNet18 model with randomly initialized weights and save it.
 
-model = models.resnet18(pretrained=False).eval().to("cuda")
+model = models.resnet18(pretrained=False).to("cuda").eval()
 exp_program = torch.export.export(model, tuple(inputs))
 enabled_precisions = {torch.float}
 workspace_size = 20 << 30
@@ -85,7 +85,7 @@
 # function is used to update the weights of the compiled module with the new weights.
 
 # Create and compile the updated model
-model2 = models.resnet18(pretrained=True).eval().to("cuda")
+model2 = models.resnet18(pretrained=True).to("cuda").eval()
 exp_program2 = torch.export.export(model2, tuple(inputs))
 
 
@@ -99,12 +99,14 @@
 )
 
 # Check the output
-model2.to("cuda")
-expected_outputs, refitted_outputs = exp_program2.module()(*inputs), new_trt_gm(*inputs)
-for expected_output, refitted_output in zip(expected_outputs, refitted_outputs):
-    assert torch.allclose(
-        expected_output, refitted_output, 1e-2, 1e-2
-    ), "Refit Result is not correct. Refit failed"
+with torch.no_grad():
+    expected_outputs, refitted_outputs = exp_program2.module()(*inputs), new_trt_gm(
+        *inputs
+    )
+    for expected_output, refitted_output in zip(expected_outputs, refitted_outputs):
+        assert torch.allclose(
+            expected_output, refitted_output, 1e-2, 1e-2
+        ), "Refit Result is not correct. Refit failed"
 
 print("Refit successfully!")
 
diff --git a/examples/dynamo/torch_compile_advanced_usage.py b/examples/dynamo/torch_compile_advanced_usage.py
index 71d0d77005..b366c292d9 100644
--- a/examples/dynamo/torch_compile_advanced_usage.py
+++ b/examples/dynamo/torch_compile_advanced_usage.py
@@ -36,7 +36,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor):
 
 # Define sample float inputs and initialize model
 sample_inputs = [torch.rand((5, 7)).cuda(), torch.rand((5, 7)).cuda()]
-model = Model().eval().cuda()
+model = Model().cuda().eval()
 
 
 # %%
@@ -45,7 +45,8 @@ def forward(self, x: torch.Tensor, y: torch.Tensor):
 # with the backend "torch_tensorrt", and run the model on an
 # input to cause compilation, as so:
 optimized_model = torch.compile(model, backend="torch_tensorrt", dynamic=False)
-optimized_model(*sample_inputs)
+with torch.no_grad():
+    optimized_model(*sample_inputs)
 
 # %%
 # Compilation with `torch.compile` Using Custom Settings
@@ -60,7 +61,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor):
     torch.rand((5, 7)).half().cuda(),
     torch.rand((5, 7)).half().cuda(),
 ]
-model_half = Model().eval().cuda()
+model_half = Model().cuda().eval()
 
 
 # %%
@@ -86,7 +87,8 @@ def forward(self, x: torch.Tensor, y: torch.Tensor):
     options=backend_kwargs,
     dynamic=False,
 )
-optimized_model_custom(*sample_inputs_half)
+with torch.no_grad():
+    optimized_model_custom(*sample_inputs_half)
 
 # %%
 # Cleanup
diff --git a/examples/dynamo/torch_compile_gpt2.py b/examples/dynamo/torch_compile_gpt2.py
index 5d41c3ed84..fd5f13df85 100644
--- a/examples/dynamo/torch_compile_gpt2.py
+++ b/examples/dynamo/torch_compile_gpt2.py
@@ -44,8 +44,8 @@
         use_cache=False,
         attn_implementation="eager",
     )
+    .to(DEVICE)
     .eval()
-    .cuda()
 )
 
 # %%
@@ -54,16 +54,17 @@
 # Tokenize a sample input prompt and get pytorch model outputs
 prompt = "I enjoy walking with my cute dog"
 model_inputs = tokenizer(prompt, return_tensors="pt")
-input_ids = model_inputs["input_ids"].cuda()
+input_ids = model_inputs["input_ids"].to(DEVICE)
 
 # %%
 # The ``generate()`` API of the ``AutoModelForCausalLM`` class is used for auto-regressive generation with greedy decoding.
-pyt_gen_tokens = model.generate(
-    input_ids,
-    max_length=MAX_LENGTH,
-    use_cache=False,
-    pad_token_id=tokenizer.eos_token_id,
-)
+with torch.no_grad():
+    pyt_gen_tokens = model.generate(
+        input_ids,
+        max_length=MAX_LENGTH,
+        use_cache=False,
+        pad_token_id=tokenizer.eos_token_id,
+    )
 
 # %%
 # Torch-TensorRT compilation and inference
@@ -87,12 +88,13 @@
 # Auto-regressive generation loop for greedy decoding using TensorRT model
 # The first token generation compiles the model using TensorRT and the second token
 # encounters recompilation (which is an issue currently that would be resolved in the future)
-trt_gen_tokens = model.generate(
-    inputs=input_ids,
-    max_length=MAX_LENGTH,
-    use_cache=False,
-    pad_token_id=tokenizer.eos_token_id,
-)
+with torch.no_grad():
+    trt_gen_tokens = model.generate(
+        inputs=input_ids,
+        max_length=MAX_LENGTH,
+        use_cache=False,
+        pad_token_id=tokenizer.eos_token_id,
+    )
 
 # %%
 # Decode the output sentences of PyTorch and TensorRT
diff --git a/examples/dynamo/torch_compile_resnet_example.py b/examples/dynamo/torch_compile_resnet_example.py
index 6a85de6fbc..506982c7f4 100644
--- a/examples/dynamo/torch_compile_resnet_example.py
+++ b/examples/dynamo/torch_compile_resnet_example.py
@@ -18,7 +18,7 @@
 
 # %%
 # Initialize model with half precision and sample inputs
-model = models.resnet18(pretrained=True).half().eval().to("cuda")
+model = models.resnet18(pretrained=True).to("cuda").half().eval()
 inputs = [torch.randn((1, 3, 224, 224)).to("cuda").half()]
 
 # %%
@@ -63,21 +63,23 @@
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 # Does not cause recompilation (same batch size as input)
-new_inputs = [torch.randn((1, 3, 224, 224)).half().to("cuda")]
-new_outputs = optimized_model(*new_inputs)
+new_inputs = [torch.randn((1, 3, 224, 224)).to("cuda").half()]
+with torch.no_grad():
+    new_outputs = optimized_model(*new_inputs)
 
 # %%
 
 # Does cause recompilation (new batch size)
-new_batch_size_inputs = [torch.randn((8, 3, 224, 224)).half().to("cuda")]
-new_batch_size_outputs = optimized_model(*new_batch_size_inputs)
+new_batch_size_inputs = [torch.randn((8, 3, 224, 224)).to("cuda").half()]
+with torch.no_grad():
+    new_batch_size_outputs = optimized_model(*new_batch_size_inputs)
 
 # %%
 # Avoid recompilation by specifying dynamic shapes before Torch-TRT compilation
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 # The following code illustrates the workflow using ir=torch_compile (which uses torch.compile under the hood)
-inputs_bs8 = torch.randn((8, 3, 224, 224)).half().to("cuda")
+inputs_bs8 = torch.randn((8, 3, 224, 224)).to("cuda").half()
 # This indicates dimension 0 of inputs_bs8 is dynamic whose range of values is [2, 16]
 torch._dynamo.mark_dynamic(inputs_bs8, 0, min=2, max=16)
 optimized_model = torch_tensorrt.compile(
@@ -89,11 +91,14 @@
     min_block_size=min_block_size,
     torch_executed_ops=torch_executed_ops,
 )
-outputs_bs8 = optimized_model(inputs_bs8)
+
+with torch.no_grad():
+    outputs_bs8 = optimized_model(inputs_bs8)
 
 # No recompilation happens for batch size = 12
-inputs_bs12 = torch.randn((12, 3, 224, 224)).half().to("cuda")
-outputs_bs12 = optimized_model(inputs_bs12)
+inputs_bs12 = torch.randn((12, 3, 224, 224)).to("cuda").half()
+with torch.no_grad():
+    outputs_bs12 = optimized_model(inputs_bs12)
 
 # The following code illustrates the workflow using ir=dynamo (which uses torch.export APIs under the hood)
 # dynamic shapes for any inputs are specified using torch_tensorrt.Input API
@@ -112,5 +117,6 @@
 trt_model = torch_tensorrt.compile(model, **compile_spec)
 
 # No recompilation happens for batch size = 12
-inputs_bs12 = torch.randn((12, 3, 224, 224)).half().to("cuda")
-outputs_bs12 = trt_model(inputs_bs12)
+inputs_bs12 = torch.randn((12, 3, 224, 224)).to("cuda").half()
+with torch.no_grad():
+    outputs_bs12 = trt_model(inputs_bs12)
diff --git a/examples/dynamo/torch_compile_stable_diffusion.py b/examples/dynamo/torch_compile_stable_diffusion.py
index fe49da74d1..b894dd9c3f 100644
--- a/examples/dynamo/torch_compile_stable_diffusion.py
+++ b/examples/dynamo/torch_compile_stable_diffusion.py
@@ -46,9 +46,9 @@
 # %%
 # Inference
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+with torch.no_grad():
+    prompt = "a majestic castle in the clouds"
+    image = pipe(prompt).images[0]
 
-prompt = "a majestic castle in the clouds"
-image = pipe(prompt).images[0]
-
-image.save("images/majestic_castle.png")
-image.show()
+    image.save("images/majestic_castle.png")
+    image.show()
diff --git a/examples/dynamo/torch_compile_transformers_example.py b/examples/dynamo/torch_compile_transformers_example.py
index 7737e95682..f73da79fd9 100644
--- a/examples/dynamo/torch_compile_transformers_example.py
+++ b/examples/dynamo/torch_compile_transformers_example.py
@@ -18,7 +18,7 @@
 
 # %%
 # Initialize model with float precision and sample inputs
-model = BertModel.from_pretrained("bert-base-uncased").eval().to("cuda")
+model = BertModel.from_pretrained("bert-base-uncased").to("cuda").eval()
 inputs = [
     torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"),
     torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"),
@@ -61,7 +61,8 @@
     dynamic=False,
     options=compilation_kwargs,
 )
-optimized_model(*inputs)
+with torch.no_grad():
+    optimized_model(*inputs)
 
 # %%
 # Equivalently, we could have run the above via the convenience frontend, as so:
@@ -76,7 +77,8 @@
     torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"),
     torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"),
 ]
-new_outputs = optimized_model(*new_inputs)
+with torch.no_grad():
+    new_outputs = optimized_model(*new_inputs)
 
 
 # %%
@@ -85,7 +87,8 @@
     torch.randint(0, 2, (4, 14), dtype=torch.int32).to("cuda"),
     torch.randint(0, 2, (4, 14), dtype=torch.int32).to("cuda"),
 ]
-new_outputs = optimized_model(*new_inputs)
+with torch.no_grad():
+    new_outputs = optimized_model(*new_inputs)
 
 # %%
 # Cleanup
diff --git a/examples/dynamo/torch_export_cudagraphs.py b/examples/dynamo/torch_export_cudagraphs.py
index e316dffc58..f19a78ea0f 100644
--- a/examples/dynamo/torch_export_cudagraphs.py
+++ b/examples/dynamo/torch_export_cudagraphs.py
@@ -25,7 +25,7 @@
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 # We begin by defining and initializing a model
 
-model = models.resnet18(pretrained=True).eval().to("cuda")
+model = models.resnet18(pretrained=True).cuda().eval()
 
 # Define sample inputs
 inputs = torch.randn((16, 3, 224, 224)).cuda()
@@ -52,16 +52,17 @@
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 # We can enable the cudagraphs API with a context manager
 
-with torch_tensorrt.runtime.enable_cudagraphs(opt) as cudagraphs_module:
-    out_trt = cudagraphs_module(inputs)
+with torch.no_grad():
+    with torch_tensorrt.runtime.enable_cudagraphs(opt) as cudagraphs_module:
+        out_trt = cudagraphs_module(inputs)
 
-# Alternatively, we can set the cudagraphs mode for the session
-torch_tensorrt.runtime.set_cudagraphs_mode(True)
-out_trt = opt(inputs)
+    # Alternatively, we can set the cudagraphs mode for the session
+    torch_tensorrt.runtime.set_cudagraphs_mode(True)
+    out_trt = opt(inputs)
 
-# We can also turn off cudagraphs mode and perform inference as normal
-torch_tensorrt.runtime.set_cudagraphs_mode(False)
-out_trt = opt(inputs)
+    # We can also turn off cudagraphs mode and perform inference as normal
+    torch_tensorrt.runtime.set_cudagraphs_mode(False)
+    out_trt = opt(inputs)
 
 
 # %%
@@ -69,9 +70,10 @@
 inputs_2 = torch.randn((8, 3, 224, 224)).cuda()
 inputs_3 = torch.randn((4, 3, 224, 224)).cuda()
 
-with torch_tensorrt.runtime.enable_cudagraphs(opt) as cudagraphs_module:
-    out_trt_2 = cudagraphs_module(inputs_2)
-    out_trt_3 = cudagraphs_module(inputs_3)
+with torch.no_grad():
+    with torch_tensorrt.runtime.enable_cudagraphs(opt) as cudagraphs_module:
+        out_trt_2 = cudagraphs_module(inputs_2)
+        out_trt_3 = cudagraphs_module(inputs_3)
 
 # %%
 # Cuda graphs with module that contains graph breaks
@@ -101,8 +103,8 @@ def forward(self, x):
         return torch.relu((x + 2) * 0.5)
 
 
-model = SampleModel().eval().cuda()
-input = torch.randn((1, 3, 224, 224)).to("cuda")
+model = SampleModel().cuda().eval()
+input = torch.randn((1, 3, 224, 224)).cuda()
 
 # The 'torch_executed_ops' compiler option is used in this example to intentionally introduce graph breaks within the module.
 # Note: The Dynamo backend is required for the CUDA Graph context manager to handle modules in an Ahead-Of-Time (AOT) manner.
@@ -117,7 +119,8 @@ def forward(self, x):
 # %%
 # If module has graph breaks, whole submodules are recorded and replayed by cuda graphs
 
-with torch_tensorrt.runtime.enable_cudagraphs(
-    opt_with_graph_break
-) as cudagraphs_module:
-    cudagraphs_module(input)
+with torch.no_grad():
+    with torch_tensorrt.runtime.enable_cudagraphs(
+        opt_with_graph_break
+    ) as cudagraphs_module:
+        cudagraphs_module(input)
diff --git a/examples/dynamo/torch_export_flux_dev.py b/examples/dynamo/torch_export_flux_dev.py
index 4a6d36a960..8f471668f1 100644
--- a/examples/dynamo/torch_export_flux_dev.py
+++ b/examples/dynamo/torch_export_flux_dev.py
@@ -140,14 +140,15 @@
 # Function which generates images from the flux pipeline
 def generate_image(pipe, prompt, image_name):
     seed = 42
-    image = pipe(
-        prompt,
-        output_type="pil",
-        num_inference_steps=20,
-        generator=torch.Generator("cuda").manual_seed(seed),
-    ).images[0]
-    image.save(f"{image_name}.png")
-    print(f"Image generated using {image_name} model saved as {image_name}.png")
+    with torch.no_grad():
+        image = pipe(
+            prompt,
+            output_type="pil",
+            num_inference_steps=20,
+            generator=torch.Generator("cuda").manual_seed(seed),
+        ).images[0]
+        image.save(f"{image_name}.png")
+        print(f"Image generated using {image_name} model saved as {image_name}.png")
 
 
 generate_image(pipe, ["A golden retriever holding a sign to code"], "dog_code")
diff --git a/examples/dynamo/torch_export_sam2.py b/examples/dynamo/torch_export_sam2.py
index 5a122bb836..bec1799432 100644
--- a/examples/dynamo/torch_export_sam2.py
+++ b/examples/dynamo/torch_export_sam2.py
@@ -110,8 +110,8 @@ def forward(self, image, point_coords, point_labels):
 # Initialize the ``SAM2FullModel`` with the pretrained weights. Since we already initialized
 # ``SAM2ImagePredictor``, we can directly use the model from it (``predictor.model``). We cast the model
 # to FP16 precision for faster performance.
-encoder = predictor.model.eval().cuda()
-sam_model = SAM2FullModel(encoder.half()).eval().cuda()
+encoder = predictor.model.cuda().eval()
+sam_model = SAM2FullModel(encoder.half()).cuda().eval()
 
 # %%
 # Load a sample image provided in the repository.
@@ -261,7 +261,9 @@ def visualize_masks(
     enabled_precisions={torch.float16},
     use_fp32_acc=True,
 )
-trt_out = trt_model(*torchtrt_inputs)
+
+with torch.no_grad():
+    trt_out = trt_model(*torchtrt_inputs)
 
 # %%
 # Output visualization
diff --git a/examples/dynamo/vgg16_ptq.py b/examples/dynamo/vgg16_ptq.py
index c72cf9281d..fc02f323ee 100644
--- a/examples/dynamo/vgg16_ptq.py
+++ b/examples/dynamo/vgg16_ptq.py
@@ -120,7 +120,7 @@ def vgg16(num_classes=1000, init_weights=False):
 args = PARSER.parse_args()
 
 model = vgg16(num_classes=10, init_weights=False)
-model = model.cuda()
+model = model.cuda().eval()
 
 # %%
 # Load the pre-trained model weights
@@ -254,18 +254,21 @@ def calibrate_loop(model):
     loss = 0.0
     class_probs = []
     class_preds = []
-    for data, labels in testing_dataloader:
-        data, labels = data.cuda(), labels.cuda(non_blocking=True)
-        out = trt_model(data)
-        loss += crit(out, labels)
-        preds = torch.max(out, 1)[1]
-        class_probs.append([F.softmax(i, dim=0) for i in out])
-        class_preds.append(preds)
-        total += labels.size(0)
-        correct += (preds == labels).sum().item()
-
-    test_probs = torch.cat([torch.stack(batch) for batch in class_probs])
-    test_preds = torch.cat(class_preds)
-    test_loss = loss / total
-    test_acc = correct / total
-    print("Test Loss: {:.5f} Test Acc: {:.2f}%".format(test_loss, 100 * test_acc))
+    with torch.no_grad():
+        for data, labels in testing_dataloader:
+            data, labels = data.cuda(), labels.cuda(non_blocking=True)
+            out = trt_model(data)
+            loss += crit(out, labels)
+            preds = torch.max(out, 1)[1]
+            class_probs.append([F.softmax(i, dim=0) for i in out])
+            class_preds.append(preds)
+            total += labels.size(0)
+            correct += (preds == labels).sum().item()
+
+        test_probs = torch.cat([torch.stack(batch) for batch in class_probs])
+        test_preds = torch.cat(class_preds)
+        test_loss = loss / total
+        test_acc = correct / total
+        print(
+            "Test Loss: {:.5f} Test Acc: {:.2f}%".format(test_loss, 100 * test_acc)
+        )
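For reference, the pattern applied across these examples is the same in every file: move the module to the GPU, switch it to inference mode with .eval() before compiling, and run all inference (including accuracy checks) under torch.no_grad(). The following is a minimal standalone sketch of that pattern, not taken from any one example; the ResNet18 model, input shape, and tolerances are illustrative, and it assumes the public torch_tensorrt.compile API used elsewhere in these scripts.

import torch
import torch_tensorrt
import torchvision.models as models

# Move the model to the GPU and put it in inference mode before compilation.
model = models.resnet18(pretrained=True).to("cuda").eval()
inputs = [torch.randn((1, 3, 224, 224)).to("cuda")]

# Compile with Torch-TensorRT (ir="dynamo" uses the torch.export-based path).
trt_model = torch_tensorrt.compile(model, ir="dynamo", inputs=inputs)

# Run inference and the comparison against the PyTorch baseline under no_grad
# so autograd does not record the forward passes.
with torch.no_grad():
    trt_out = trt_model(*inputs)
    torch_out = model(*inputs)
    assert torch.allclose(trt_out, torch_out, rtol=1e-2, atol=1e-2)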