diff --git a/examples/dynamo/aot_plugin.py b/examples/dynamo/aot_plugin.py
index 86dccfddfc..c322d37401 100644
--- a/examples/dynamo/aot_plugin.py
+++ b/examples/dynamo/aot_plugin.py
@@ -153,7 +153,7 @@ def forward(self, X: torch.Tensor) -> torch.Tensor:
     )
     args = parser.parse_args()
 
-    my_model = MyModel().to("cuda")
+    my_model = MyModel().to("cuda").eval()
     m = torch.full((64, 64), 2, device="cuda", dtype=torch.float)
 
     assert my_model(X=m)[0][0] == 3.0
@@ -167,8 +167,9 @@ def forward(self, X: torch.Tensor) -> torch.Tensor:
     )
     print("Model compiled successfully!")
     print("Running inference with compiled model...")
-    for i in range(10):
-        res = model_trt(m)
-        assert torch.allclose(res, my_model(m)), "Results do not match!"
+    with torch.no_grad():
+        for i in range(10):
+            res = model_trt(m)
+            assert torch.allclose(res, my_model(m)), "Results do not match!"
 
     print("Inference successful!")
diff --git a/examples/dynamo/auto_generate_converters.py b/examples/dynamo/auto_generate_converters.py
index af9cffb8ff..5ab242443c 100644
--- a/examples/dynamo/auto_generate_converters.py
+++ b/examples/dynamo/auto_generate_converters.py
@@ -169,14 +169,15 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         return res
 
 
-my_model = MyModel().to("cuda")
+my_model = MyModel().to("cuda").eval()
 m = torch.full((64, 64), 2, device="cuda", dtype=torch.float)
 n = torch.full((64, 64), 3, device="cuda", dtype=torch.float)
 
 with torch_tensorrt.logging.errors():
     model_trt = torch_tensorrt.compile(my_model, inputs=[m, n], min_block_size=1)
-    for i in range(300):
-        res = model_trt(m, n)
-        assert torch.allclose(res, my_model(m, n))
+    with torch.no_grad():
+        for i in range(300):
+            res = model_trt(m, n)
+            assert torch.allclose(res, my_model(m, n))
 
 print("Ran with custom plugin!")
diff --git a/examples/dynamo/auto_generate_plugins.py b/examples/dynamo/auto_generate_plugins.py
index 68a8635454..57a4300779 100644
--- a/examples/dynamo/auto_generate_plugins.py
+++ b/examples/dynamo/auto_generate_plugins.py
@@ -139,14 +139,15 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         return res
 
 
-my_model = MyModel().to("cuda")
+my_model = MyModel().to("cuda").eval()
 m = torch.randint(0, 5, (64, 64), device="cuda", dtype=torch.float)
 n = torch.randint(0, 5, (64, 64), device="cuda", dtype=torch.float)
 
 with torch_tensorrt.logging.errors():
     model_trt = torch_tensorrt.compile(my_model, inputs=[m, n], min_block_size=1)
-    for i in range(300):
-        res = model_trt(m, n)
-        assert torch.allclose(res, my_model(m, n))
+    with torch.no_grad():
+        for i in range(300):
+            res = model_trt(m, n)
+            assert torch.allclose(res, my_model(m, n))
 
 print("Ran with custom plugin!")
diff --git a/examples/dynamo/converter_overloading.py b/examples/dynamo/converter_overloading.py
index e27c53cb50..011bed9307 100644
--- a/examples/dynamo/converter_overloading.py
+++ b/examples/dynamo/converter_overloading.py
@@ -34,7 +34,7 @@ def forward(self, x):
         return torch.nn.functional.gelu(x, approximate=self.mode)
 
 
-my_mod = GeLU(mode="tanh")
+my_mod = GeLU(mode="tanh").to("cuda").eval()
 
 ex_input = torch.randn(2, 5).to("cuda")
 
@@ -182,9 +182,9 @@ def get_op_count():
 my_custom_gelu = torch_tensorrt.compile(
     my_mod, arg_inputs=(ex_input,), min_block_size=1
 )
-
-print(my_custom_gelu.graph)
-print(my_custom_gelu(ex_input))
+with torch.no_grad():
+    print(my_custom_gelu.graph)
+    print(my_custom_gelu(ex_input))
 
 # %%
 #
@@ -198,7 +198,7 @@ def get_op_count():
 #
 # Finally, we want to verify that in the case that the ``approximate`` argument is not set to ``tanh``, our custom converter is not used.
 
-my_mod_erf = GeLU(mode="none")
+my_mod_erf = GeLU(mode="none").to("cuda").eval()
 my_gelu_erf = torch_tensorrt.compile(
     my_mod_erf, arg_inputs=(ex_input,), min_block_size=1
 )
@@ -207,6 +207,6 @@ def get_op_count():
 #
 # Notice that we don't see the print statement from our custom converter, indicating that it was not used. However, looking at the graph, we can still see that a TensorRT engine was created to run the GeLU operation.
 # In this case, the validator for our custom converter returned ``False``, so the conversion system moved on to the next converter in the list, the standard GeLU converter and used that one to convert the operation.
-
-print(my_gelu_erf.graph)
-print(my_gelu_erf(ex_input))
+with torch.no_grad():
+    print(my_gelu_erf.graph)
+    print(my_gelu_erf(ex_input))
diff --git a/examples/dynamo/cross_runtime_compilation_for_windows.py b/examples/dynamo/cross_runtime_compilation_for_windows.py
index 433df12d29..d3339f8f34 100644
--- a/examples/dynamo/cross_runtime_compilation_for_windows.py
+++ b/examples/dynamo/cross_runtime_compilation_for_windows.py
@@ -46,7 +46,7 @@
 
 args = PARSER.parse_args()
 torch.manual_seed(0)
-model = models.resnet18().eval().cuda()
+model = models.resnet18().cuda().eval()
 input = torch.rand((1, 3, 224, 224)).to("cuda")
 inputs = [input]
 
@@ -63,7 +63,8 @@
     loaded_model = torchtrt.load_cross_compiled_exported_program(args.path).module()
     print(f"model has been successfully loaded from ${args.path}")
     # inference
-    trt_output = loaded_model(input)
+    with torch.no_grad():
+        trt_output = loaded_model(input)
     print(f"inference result: {trt_output}")
 else:
     if platform.system() != "Linux" or platform.architecture()[0] != "64bit":
diff --git a/examples/dynamo/custom_kernel_plugins.py b/examples/dynamo/custom_kernel_plugins.py
index dccb0ff0cf..d58ce43378 100644
--- a/examples/dynamo/custom_kernel_plugins.py
+++ b/examples/dynamo/custom_kernel_plugins.py
@@ -217,8 +217,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return y
 
 
-my_model = MyModel((1, 1, 2, 0)).to("cuda")
-my_model(ex_input)
+my_model = MyModel((1, 1, 2, 0)).to("cuda").eval()
+with torch.no_grad():
+    my_model(ex_input)
 
 ##############################################################################
 # .. code-block:: none
@@ -607,7 +608,8 @@ def circular_padding_converter(
 ##############################################
 # As you can see, now there is only one subgraph created for the TensorRT engine that contains both our custom kernel and the native convolution operator.
 
-print(trt_model(ex_input))
+with torch.no_grad():
+    print(trt_model(ex_input))
 
 ##############################################################################
 # .. code-block:: none
@@ -636,7 +638,8 @@ def circular_padding_converter(
 # %%
 # We can verify our implementation is run correctly by both TensorRT and PyTorch
 
-print(my_model(ex_input) - trt_model(ex_input))
+with torch.no_grad():
+    print(my_model(ex_input) - trt_model(ex_input))
 
 ##############################################################################
 # .. code-block:: none
diff --git a/examples/dynamo/engine_caching_bert_example.py b/examples/dynamo/engine_caching_bert_example.py
index 66f5a69ac0..6aa90302e3 100644
--- a/examples/dynamo/engine_caching_bert_example.py
+++ b/examples/dynamo/engine_caching_bert_example.py
@@ -62,7 +62,8 @@ def compile_bert(iterations=3):
             backend="torch_tensorrt",
             options=compilation_kwargs,
         )
-        optimized_model(*inputs)
+        with torch.no_grad():
+            optimized_model(*inputs)
         end.record()
         torch.cuda.synchronize()
         times.append(start.elapsed_time(end))
diff --git a/examples/dynamo/engine_caching_example.py b/examples/dynamo/engine_caching_example.py
index 34fa56f9a1..45bcd363ab 100644
--- a/examples/dynamo/engine_caching_example.py
+++ b/examples/dynamo/engine_caching_example.py
@@ -37,7 +37,7 @@
 np.random.seed(0)
 torch.manual_seed(0)
 
-model = models.resnet18(pretrained=True).eval().to("cuda")
+model = models.resnet18(pretrained=True).to("cuda").eval()
 enabled_precisions = {torch.float}
 min_block_size = 1
 use_python_runtime = False
@@ -100,7 +100,8 @@ def torch_compile(iterations=3):
                 "reuse_cached_engines": reuse_cached_engines,
             },
         )
-        compiled_model(*inputs)  # trigger the compilation
+        with torch.no_grad():
+            compiled_model(*inputs)  # trigger the compilation
         end.record()
         torch.cuda.synchronize()
         times.append(start.elapsed_time(end))
@@ -270,7 +271,8 @@ def torch_compile_my_cache(iterations=3):
                 "custom_engine_cache": engine_cache,
             },
        )
-        compiled_model(*inputs)  # trigger the compilation
+        with torch.no_grad():
+            compiled_model(*inputs)  # trigger the compilation
         end.record()
         torch.cuda.synchronize()
         times.append(start.elapsed_time(end))
diff --git a/examples/dynamo/hierarchical_partitioner_example.py b/examples/dynamo/hierarchical_partitioner_example.py
index 73975e2453..4de370bd2c 100644
--- a/examples/dynamo/hierarchical_partitioner_example.py
+++ b/examples/dynamo/hierarchical_partitioner_example.py
@@ -79,7 +79,8 @@ def main():
 
     print("Original Model Structure:\n", gm)
 
-    original_output = model(example_input)
+    with torch.no_grad():
+        original_output = model(example_input)
 
     # 1. Partition the model into blocks that can be executed by different backends
     partitioned_model, op_support = hierarchical_adjacency_partition(
diff --git a/examples/dynamo/llama2_flashinfer_rmsnorm.py b/examples/dynamo/llama2_flashinfer_rmsnorm.py
index 7542a9a1b7..c724954a18 100644
--- a/examples/dynamo/llama2_flashinfer_rmsnorm.py
+++ b/examples/dynamo/llama2_flashinfer_rmsnorm.py
@@ -220,7 +220,7 @@ def replace_rmsnorm(
 
 # 2. Initialize model (random weights)
 with torch.no_grad():
-    model = LlamaForCausalLM(config).eval().half()
+    model = LlamaForCausalLM(config).cuda().half().eval()
 
 # 3. Export with static shapes
 input_ids = torch.randint(0, 32000, (1, 64))  # Static [batch=1, seq=64]
@@ -253,5 +253,6 @@ def replace_rmsnorm(
 
 input_ids = input_ids.to(DEVICE)
 
-res = trt_model.forward(input_ids)
+with torch.no_grad():
+    res = trt_model.forward(input_ids)
 print(res)
diff --git a/examples/dynamo/mutable_torchtrt_module_example.py b/examples/dynamo/mutable_torchtrt_module_example.py
index f422a6e629..84abcddf44 100644
--- a/examples/dynamo/mutable_torchtrt_module_example.py
+++ b/examples/dynamo/mutable_torchtrt_module_example.py
@@ -37,23 +37,25 @@
     "immutable_weights": False,
 }
 
-model = models.resnet18(pretrained=True).eval().to("cuda")
+model = models.resnet18(pretrained=True).to("cuda").eval()
 mutable_module = torch_trt.MutableTorchTensorRTModule(model, **settings)
 # You can use the mutable module just like the original pytorch module. The compilation happens while you first call the mutable module.
-mutable_module(*inputs)
+with torch.no_grad():
+    mutable_module(*inputs)
 
 # %%
 # Make modifications to the mutable module.
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 # %%
 # Making changes to mutable module can trigger refit or re-compilation. For example, loading a different state_dict and setting new weight values will trigger refit, and adding a module to the model will trigger re-compilation.
-model2 = models.resnet18(pretrained=False).eval().to("cuda")
+model2 = models.resnet18(pretrained=False).to("cuda").eval()
 mutable_module.load_state_dict(model2.state_dict())
 
 # Check the output
 # The refit happens while you call the mutable module again.
-expected_outputs, refitted_outputs = model2(*inputs), mutable_module(*inputs)
+with torch.no_grad():
+    expected_outputs, refitted_outputs = model2(*inputs), mutable_module(*inputs)
 for expected_output, refitted_output in zip(expected_outputs, refitted_outputs):
     assert torch.allclose(
         expected_output, refitted_output, 1e-2, 1e-2
@@ -163,7 +165,7 @@ def forward(self, a, b, c={}):
 
 device = "cuda:0"
-model = Model().eval().to(device)
+model = Model().to(device).eval()
 inputs = (torch.rand(10, 3).to(device), torch.rand(3, 30).to(device))
 kwargs = {
     "c": {"a": torch.rand(10, 30).to(device), "b": torch.rand(10, 30).to(device)},
 }
@@ -182,14 +184,15 @@ def forward(self, a, b, c={}):
 model = torch_trt.MutableTorchTensorRTModule(model, min_block_size=1)
 model.set_expected_dynamic_shape_range(args_dynamic_shapes, kwarg_dynamic_shapes)
 # Compile
-model(*inputs, **kwargs)
-# Change input shape
-inputs_2 = (torch.rand(10, 5).to(device), torch.rand(10, 30).to(device))
-kwargs_2 = {
-    "c": {"a": torch.rand(10, 30).to(device), "b": torch.rand(5, 30).to(device)},
-}
-# Run without recompiling
-model(*inputs_2, **kwargs_2)
+with torch.no_grad():
+    model(*inputs, **kwargs)
+    # Change input shape
+    inputs_2 = (torch.rand(10, 5).to(device), torch.rand(10, 30).to(device))
+    kwargs_2 = {
+        "c": {"a": torch.rand(10, 30).to(device), "b": torch.rand(5, 30).to(device)},
+    }
+    # Run without recompiling
+    model(*inputs_2, **kwargs_2)
 
 # %%
 # Use Mutable Torch TensorRT module with persistent cache
@@ -199,7 +202,7 @@ def forward(self, a, b, c={}):
 
 from torch_tensorrt.dynamo._defaults import TIMING_CACHE_PATH
 
-model = models.resnet18(pretrained=True).eval().to("cuda")
+model = models.resnet18(pretrained=True).to("cuda").eval()
 
 times = []
 start = torch.cuda.Event(enable_timing=True)
@@ -225,14 +228,15 @@ def remove_timing_cache(path=TIMING_CACHE_PATH):
 
 remove_timing_cache()
 
-for i in range(4):
-    inputs = [torch.rand((100 + i, 3, 224, 224)).to("cuda")]
-
-    start.record()
-    model(*inputs)  # Recompile
-    end.record()
-    torch.cuda.synchronize()
-    times.append(start.elapsed_time(end))
+with torch.no_grad():
+    for i in range(4):
+        inputs = [torch.rand((100 + i, 3, 224, 224)).to("cuda")]
+
+        start.record()
+        model(*inputs)  # Recompile
+        end.record()
+        torch.cuda.synchronize()
+        times.append(start.elapsed_time(end))
 
 print("----------------dynamo_compile----------------")
 print("Without engine caching, used:", times[0], "ms")
diff --git a/examples/dynamo/pre_allocated_output_example.py b/examples/dynamo/pre_allocated_output_example.py
index d938034758..2ad1b8f514 100644
--- a/examples/dynamo/pre_allocated_output_example.py
+++ b/examples/dynamo/pre_allocated_output_example.py
@@ -43,10 +43,9 @@ def test_module_perf(model, *input):
     with torch.no_grad():
         for _ in range(3):
             model(*input)
-    torch.cuda.synchronize()
-
-    # Timing phase to measure inference performance
-    with torch.no_grad():
+        torch.cuda.synchronize()
+
+        # Timing phase to measure inference performance
         for i in range(10):
             start_time = timeit.default_timer()
             model(*input)
@@ -67,9 +66,9 @@ def test_module_perf(model, *input):
 # Load bert model
 model = (
     BertModel.from_pretrained("bert-base-uncased", torchscript=True)
-    .eval()
-    .half()
     .to("cuda")
+    .half()
+    .eval()
 )
 # Define sample inputs
 inputs = [
@@ -89,25 +88,26 @@
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 # Enable pre-allocated output buffer using a context manager
-with torch_tensorrt.runtime.enable_pre_allocated_outputs(optimized_model):
+with torch.no_grad():
+    with torch_tensorrt.runtime.enable_pre_allocated_outputs(optimized_model):
+        out_trt = optimized_model(*inputs)
+        # Subsequent inferences can use the pre-allocated output buffer (no shape change)
+        out_trt = optimized_model(*inputs)
+
+    # Alternatively, we can enable the feature using a context object
+    pre_allocated_output_ctx = torch_tensorrt.runtime.enable_pre_allocated_outputs(
+        optimized_model
+    )
+    pre_allocated_output_ctx.set_pre_allocated_output(True)
+    time_opt = test_module_perf(optimized_model, *inputs)
+
+    # Disable the pre-allocated output buffer feature and perform inference normally
+    pre_allocated_output_ctx.set_pre_allocated_output(False)
     out_trt = optimized_model(*inputs)
-    # Subsequent inferences can use the pre-allocated output buffer (no shape change)
-    out_trt = optimized_model(*inputs)
-
-# Alternatively, we can enable the feature using a context object
-pre_allocated_output_ctx = torch_tensorrt.runtime.enable_pre_allocated_outputs(
-    optimized_model
-)
-pre_allocated_output_ctx.set_pre_allocated_output(True)
-time_opt = test_module_perf(optimized_model, *inputs)
-
-# Disable the pre-allocated output buffer feature and perform inference normally
-pre_allocated_output_ctx.set_pre_allocated_output(False)
-out_trt = optimized_model(*inputs)
-time_normal = test_module_perf(optimized_model, *inputs)
+    time_normal = test_module_perf(optimized_model, *inputs)
 
-time_opt_ms = time_opt * 1000
-time_normal_ms = time_normal * 1000
+    time_opt_ms = time_opt * 1000
+    time_normal_ms = time_normal * 1000
 
-print(f"normal trt model time: {time_normal_ms:.3f} ms")
-print(f"pre-allocated output buffer model time: {time_opt_ms:.3f} ms")
+    print(f"normal trt model time: {time_normal_ms:.3f} ms")
+    print(f"pre-allocated output buffer model time: {time_opt_ms:.3f} ms")
diff --git a/examples/dynamo/refit_engine_example.py b/examples/dynamo/refit_engine_example.py
index 2637b4fd7e..e0b0457a09 100644
--- a/examples/dynamo/refit_engine_example.py
+++ b/examples/dynamo/refit_engine_example.py
@@ -53,7 +53,7 @@
 #
 # In this case we are going to compile a ResNet18 model with randomly initialized weights and save it.
 
-model = models.resnet18(pretrained=False).eval().to("cuda")
+model = models.resnet18(pretrained=False).to("cuda").eval()
 exp_program = torch.export.export(model, tuple(inputs))
 enabled_precisions = {torch.float}
 workspace_size = 20 << 30
@@ -85,7 +85,7 @@
 # function is used to update the weights of the compiled module with the new weights.
 
 # Create and compile the updated model
-model2 = models.resnet18(pretrained=True).eval().to("cuda")
+model2 = models.resnet18(pretrained=True).to("cuda").eval()
 exp_program2 = torch.export.export(model2, tuple(inputs))
 
 
@@ -99,12 +99,14 @@
 )
 
 # Check the output
-model2.to("cuda")
-expected_outputs, refitted_outputs = exp_program2.module()(*inputs), new_trt_gm(*inputs)
-for expected_output, refitted_output in zip(expected_outputs, refitted_outputs):
-    assert torch.allclose(
-        expected_output, refitted_output, 1e-2, 1e-2
-    ), "Refit Result is not correct. Refit failed"
+with torch.no_grad():
+    expected_outputs, refitted_outputs = exp_program2.module()(*inputs), new_trt_gm(
+        *inputs
+    )
+    for expected_output, refitted_output in zip(expected_outputs, refitted_outputs):
+        assert torch.allclose(
+            expected_output, refitted_output, 1e-2, 1e-2
+        ), "Refit Result is not correct. Refit failed"
 
 print("Refit successfully!")
 
diff --git a/examples/dynamo/torch_compile_advanced_usage.py b/examples/dynamo/torch_compile_advanced_usage.py
index 71d0d77005..b366c292d9 100644
--- a/examples/dynamo/torch_compile_advanced_usage.py
+++ b/examples/dynamo/torch_compile_advanced_usage.py
@@ -36,7 +36,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor):
 
 # Define sample float inputs and initialize model
 sample_inputs = [torch.rand((5, 7)).cuda(), torch.rand((5, 7)).cuda()]
-model = Model().eval().cuda()
+model = Model().cuda().eval()
 
 
 # %%
@@ -45,7 +45,8 @@ def forward(self, x: torch.Tensor, y: torch.Tensor):
 # with the backend "torch_tensorrt", and run the model on an
 # input to cause compilation, as so:
 optimized_model = torch.compile(model, backend="torch_tensorrt", dynamic=False)
-optimized_model(*sample_inputs)
+with torch.no_grad():
+    optimized_model(*sample_inputs)
 
 # %%
 # Compilation with `torch.compile` Using Custom Settings
@@ -60,7 +61,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor):
     torch.rand((5, 7)).half().cuda(),
     torch.rand((5, 7)).half().cuda(),
 ]
-model_half = Model().eval().cuda()
+model_half = Model().cuda().eval()
 
 
 # %%
@@ -86,7 +87,8 @@ def forward(self, x: torch.Tensor, y: torch.Tensor):
     options=backend_kwargs,
     dynamic=False,
 )
-optimized_model_custom(*sample_inputs_half)
+with torch.no_grad():
+    optimized_model_custom(*sample_inputs_half)
 
 # %%
 # Cleanup
diff --git a/examples/dynamo/torch_compile_gpt2.py b/examples/dynamo/torch_compile_gpt2.py
index 5d41c3ed84..fd5f13df85 100644
--- a/examples/dynamo/torch_compile_gpt2.py
+++ b/examples/dynamo/torch_compile_gpt2.py
@@ -44,8 +44,8 @@
         use_cache=False,
         attn_implementation="eager",
     )
+    .to(DEVICE)
     .eval()
-    .cuda()
 )
 
 # %%
@@ -54,16 +54,17 @@
 # Tokenize a sample input prompt and get pytorch model outputs
 prompt = "I enjoy walking with my cute dog"
 model_inputs = tokenizer(prompt, return_tensors="pt")
-input_ids = model_inputs["input_ids"].cuda()
+input_ids = model_inputs["input_ids"].to(DEVICE)
 
 # %%
 # The ``generate()`` API of the ``AutoModelForCausalLM`` class is used for auto-regressive generation with greedy decoding.
-pyt_gen_tokens = model.generate(
-    input_ids,
-    max_length=MAX_LENGTH,
-    use_cache=False,
-    pad_token_id=tokenizer.eos_token_id,
-)
+with torch.no_grad():
+    pyt_gen_tokens = model.generate(
+        input_ids,
+        max_length=MAX_LENGTH,
+        use_cache=False,
+        pad_token_id=tokenizer.eos_token_id,
+    )
 
 # %%
 # Torch-TensorRT compilation and inference
@@ -87,12 +88,13 @@
 # Auto-regressive generation loop for greedy decoding using TensorRT model
 # The first token generation compiles the model using TensorRT and the second token
 # encounters recompilation (which is an issue currently that would be resolved in the future)
-trt_gen_tokens = model.generate(
-    inputs=input_ids,
-    max_length=MAX_LENGTH,
-    use_cache=False,
-    pad_token_id=tokenizer.eos_token_id,
-)
+with torch.no_grad():
+    trt_gen_tokens = model.generate(
+        inputs=input_ids,
+        max_length=MAX_LENGTH,
+        use_cache=False,
+        pad_token_id=tokenizer.eos_token_id,
+    )
 
 # %%
 # Decode the output sentences of PyTorch and TensorRT
diff --git a/examples/dynamo/torch_compile_resnet_example.py b/examples/dynamo/torch_compile_resnet_example.py
index 6a85de6fbc..506982c7f4 100644
--- a/examples/dynamo/torch_compile_resnet_example.py
+++ b/examples/dynamo/torch_compile_resnet_example.py
@@ -18,7 +18,7 @@
 
 # %%
 # Initialize model with half precision and sample inputs
-model = models.resnet18(pretrained=True).half().eval().to("cuda")
+model = models.resnet18(pretrained=True).to("cuda").half().eval()
 inputs = [torch.randn((1, 3, 224, 224)).to("cuda").half()]
 
 # %%
@@ -63,21 +63,23 @@
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 # Does not cause recompilation (same batch size as input)
-new_inputs = [torch.randn((1, 3, 224, 224)).half().to("cuda")]
-new_outputs = optimized_model(*new_inputs)
+new_inputs = [torch.randn((1, 3, 224, 224)).to("cuda").half()]
+with torch.no_grad():
+    new_outputs = optimized_model(*new_inputs)
 
 # %%
 
 # Does cause recompilation (new batch size)
-new_batch_size_inputs = [torch.randn((8, 3, 224, 224)).half().to("cuda")]
-new_batch_size_outputs = optimized_model(*new_batch_size_inputs)
+new_batch_size_inputs = [torch.randn((8, 3, 224, 224)).to("cuda").half()]
+with torch.no_grad():
+    new_batch_size_outputs = optimized_model(*new_batch_size_inputs)
 
 # %%
 # Avoid recompilation by specifying dynamic shapes before Torch-TRT compilation
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 # The following code illustrates the workflow using ir=torch_compile (which uses torch.compile under the hood)
-inputs_bs8 = torch.randn((8, 3, 224, 224)).half().to("cuda")
+inputs_bs8 = torch.randn((8, 3, 224, 224)).to("cuda").half()
 # This indicates dimension 0 of inputs_bs8 is dynamic whose range of values is [2, 16]
 torch._dynamo.mark_dynamic(inputs_bs8, 0, min=2, max=16)
 optimized_model = torch_tensorrt.compile(
@@ -89,11 +91,14 @@
     min_block_size=min_block_size,
     torch_executed_ops=torch_executed_ops,
 )
-outputs_bs8 = optimized_model(inputs_bs8)
+
+with torch.no_grad():
+    outputs_bs8 = optimized_model(inputs_bs8)
 
 # No recompilation happens for batch size = 12
-inputs_bs12 = torch.randn((12, 3, 224, 224)).half().to("cuda")
-outputs_bs12 = optimized_model(inputs_bs12)
+inputs_bs12 = torch.randn((12, 3, 224, 224)).to("cuda").half()
+with torch.no_grad():
+    outputs_bs12 = optimized_model(inputs_bs12)
 
 # The following code illustrates the workflow using ir=dynamo (which uses torch.export APIs under the hood)
 # dynamic shapes for any inputs are specified using torch_tensorrt.Input API
@@ -112,5 +117,6 @@
 trt_model = torch_tensorrt.compile(model, **compile_spec)
 
 # No recompilation happens for batch size = 12
-inputs_bs12 = torch.randn((12, 3, 224, 224)).half().to("cuda")
-outputs_bs12 = trt_model(inputs_bs12)
+inputs_bs12 = torch.randn((12, 3, 224, 224)).to("cuda").half()
+with torch.no_grad():
+    outputs_bs12 = trt_model(inputs_bs12)
diff --git a/examples/dynamo/torch_compile_stable_diffusion.py b/examples/dynamo/torch_compile_stable_diffusion.py
index fe49da74d1..b894dd9c3f 100644
--- a/examples/dynamo/torch_compile_stable_diffusion.py
+++ b/examples/dynamo/torch_compile_stable_diffusion.py
@@ -46,9 +46,9 @@
 # %%
 # Inference
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+with torch.no_grad():
+    prompt = "a majestic castle in the clouds"
+    image = pipe(prompt).images[0]
 
-prompt = "a majestic castle in the clouds"
-image = pipe(prompt).images[0]
-
-image.save("images/majestic_castle.png")
-image.show()
+    image.save("images/majestic_castle.png")
+    image.show()
diff --git a/examples/dynamo/torch_compile_transformers_example.py b/examples/dynamo/torch_compile_transformers_example.py
index 7737e95682..f73da79fd9 100644
--- a/examples/dynamo/torch_compile_transformers_example.py
+++ b/examples/dynamo/torch_compile_transformers_example.py
@@ -18,7 +18,7 @@
 
 # %%
 # Initialize model with float precision and sample inputs
-model = BertModel.from_pretrained("bert-base-uncased").eval().to("cuda")
+model = BertModel.from_pretrained("bert-base-uncased").to("cuda").eval()
 inputs = [
     torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"),
     torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"),
@@ -61,7 +61,8 @@
     dynamic=False,
     options=compilation_kwargs,
 )
-optimized_model(*inputs)
+with torch.no_grad():
+    optimized_model(*inputs)
 
 # %%
 # Equivalently, we could have run the above via the convenience frontend, as so:
@@ -76,7 +77,8 @@
     torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"),
     torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"),
 ]
-new_outputs = optimized_model(*new_inputs)
+with torch.no_grad():
+    new_outputs = optimized_model(*new_inputs)
 
 
 # %%
@@ -85,7 +87,8 @@
     torch.randint(0, 2, (4, 14), dtype=torch.int32).to("cuda"),
     torch.randint(0, 2, (4, 14), dtype=torch.int32).to("cuda"),
 ]
-new_outputs = optimized_model(*new_inputs)
+with torch.no_grad():
+    new_outputs = optimized_model(*new_inputs)
 
 # %%
 # Cleanup
diff --git a/examples/dynamo/torch_export_cudagraphs.py b/examples/dynamo/torch_export_cudagraphs.py
index e316dffc58..f19a78ea0f 100644
--- a/examples/dynamo/torch_export_cudagraphs.py
+++ b/examples/dynamo/torch_export_cudagraphs.py
@@ -25,7 +25,7 @@
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 # We begin by defining and initializing a model
 
-model = models.resnet18(pretrained=True).eval().to("cuda")
+model = models.resnet18(pretrained=True).cuda().eval()
 
 # Define sample inputs
 inputs = torch.randn((16, 3, 224, 224)).cuda()
@@ -52,16 +52,17 @@
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 # We can enable the cudagraphs API with a context manager
 
-with torch_tensorrt.runtime.enable_cudagraphs(opt) as cudagraphs_module:
-    out_trt = cudagraphs_module(inputs)
+with torch.no_grad():
+    with torch_tensorrt.runtime.enable_cudagraphs(opt) as cudagraphs_module:
+        out_trt = cudagraphs_module(inputs)
 
-# Alternatively, we can set the cudagraphs mode for the session
-torch_tensorrt.runtime.set_cudagraphs_mode(True)
-out_trt = opt(inputs)
+    # Alternatively, we can set the cudagraphs mode for the session
+    torch_tensorrt.runtime.set_cudagraphs_mode(True)
+    out_trt = opt(inputs)
 
-# We can also turn off cudagraphs mode and perform inference as normal
-torch_tensorrt.runtime.set_cudagraphs_mode(False)
-out_trt = opt(inputs)
+    # We can also turn off cudagraphs mode and perform inference as normal
+    torch_tensorrt.runtime.set_cudagraphs_mode(False)
+    out_trt = opt(inputs)
 
 
 # %%
@@ -69,9 +70,10 @@
 inputs_2 = torch.randn((8, 3, 224, 224)).cuda()
 inputs_3 = torch.randn((4, 3, 224, 224)).cuda()
 
-with torch_tensorrt.runtime.enable_cudagraphs(opt) as cudagraphs_module:
-    out_trt_2 = cudagraphs_module(inputs_2)
-    out_trt_3 = cudagraphs_module(inputs_3)
+with torch.no_grad():
+    with torch_tensorrt.runtime.enable_cudagraphs(opt) as cudagraphs_module:
+        out_trt_2 = cudagraphs_module(inputs_2)
+        out_trt_3 = cudagraphs_module(inputs_3)
 
 # %%
 # Cuda graphs with module that contains graph breaks
@@ -101,8 +103,8 @@ def forward(self, x):
         return torch.relu((x + 2) * 0.5)
 
 
-model = SampleModel().eval().cuda()
-input = torch.randn((1, 3, 224, 224)).to("cuda")
+model = SampleModel().cuda().eval()
+input = torch.randn((1, 3, 224, 224)).cuda()
 
 # The 'torch_executed_ops' compiler option is used in this example to intentionally introduce graph breaks within the module.
 # Note: The Dynamo backend is required for the CUDA Graph context manager to handle modules in an Ahead-Of-Time (AOT) manner.
@@ -117,7 +119,8 @@ def forward(self, x):
 # %%
 # If module has graph breaks, whole submodules are recorded and replayed by cuda graphs
 
-with torch_tensorrt.runtime.enable_cudagraphs(
-    opt_with_graph_break
-) as cudagraphs_module:
-    cudagraphs_module(input)
+with torch.no_grad():
+    with torch_tensorrt.runtime.enable_cudagraphs(
+        opt_with_graph_break
+    ) as cudagraphs_module:
+        cudagraphs_module(input)
diff --git a/examples/dynamo/torch_export_flux_dev.py b/examples/dynamo/torch_export_flux_dev.py
index 4a6d36a960..8f471668f1 100644
--- a/examples/dynamo/torch_export_flux_dev.py
+++ b/examples/dynamo/torch_export_flux_dev.py
@@ -140,14 +140,15 @@
 # Function which generates images from the flux pipeline
 def generate_image(pipe, prompt, image_name):
     seed = 42
-    image = pipe(
-        prompt,
-        output_type="pil",
-        num_inference_steps=20,
-        generator=torch.Generator("cuda").manual_seed(seed),
-    ).images[0]
-    image.save(f"{image_name}.png")
-    print(f"Image generated using {image_name} model saved as {image_name}.png")
+    with torch.no_grad():
+        image = pipe(
+            prompt,
+            output_type="pil",
+            num_inference_steps=20,
+            generator=torch.Generator("cuda").manual_seed(seed),
+        ).images[0]
+        image.save(f"{image_name}.png")
+        print(f"Image generated using {image_name} model saved as {image_name}.png")
 
 
 generate_image(pipe, ["A golden retriever holding a sign to code"], "dog_code")
diff --git a/examples/dynamo/torch_export_sam2.py b/examples/dynamo/torch_export_sam2.py
index 5a122bb836..bec1799432 100644
--- a/examples/dynamo/torch_export_sam2.py
+++ b/examples/dynamo/torch_export_sam2.py
@@ -110,8 +110,8 @@ def forward(self, image, point_coords, point_labels):
 # Initialize the ``SAM2FullModel`` with the pretrained weights. Since we already initialized
 # ``SAM2ImagePredictor``, we can directly use the model from it (``predictor.model``). We cast the model
 # to FP16 precision for faster performance.
-encoder = predictor.model.eval().cuda()
-sam_model = SAM2FullModel(encoder.half()).eval().cuda()
+encoder = predictor.model.cuda().eval()
+sam_model = SAM2FullModel(encoder.half()).cuda().eval()
 
 # %%
 # Load a sample image provided in the repository.
@@ -261,7 +261,9 @@ def visualize_masks(
     enabled_precisions={torch.float16},
     use_fp32_acc=True,
 )
-trt_out = trt_model(*torchtrt_inputs)
+
+with torch.no_grad():
+    trt_out = trt_model(*torchtrt_inputs)
 
 # %%
 # Output visualization
diff --git a/examples/dynamo/vgg16_ptq.py b/examples/dynamo/vgg16_ptq.py
index c72cf9281d..fc02f323ee 100644
--- a/examples/dynamo/vgg16_ptq.py
+++ b/examples/dynamo/vgg16_ptq.py
@@ -120,7 +120,7 @@ def vgg16(num_classes=1000, init_weights=False):
 args = PARSER.parse_args()
 
 model = vgg16(num_classes=10, init_weights=False)
-model = model.cuda()
+model = model.cuda().eval()
 
 # %%
 # Load the pre-trained model weights
@@ -254,18 +254,21 @@ def calibrate_loop(model):
     loss = 0.0
     class_probs = []
     class_preds = []
-    for data, labels in testing_dataloader:
-        data, labels = data.cuda(), labels.cuda(non_blocking=True)
-        out = trt_model(data)
-        loss += crit(out, labels)
-        preds = torch.max(out, 1)[1]
-        class_probs.append([F.softmax(i, dim=0) for i in out])
-        class_preds.append(preds)
-        total += labels.size(0)
-        correct += (preds == labels).sum().item()
-
-    test_probs = torch.cat([torch.stack(batch) for batch in class_probs])
-    test_preds = torch.cat(class_preds)
-    test_loss = loss / total
-    test_acc = correct / total
-    print("Test Loss: {:.5f} Test Acc: {:.2f}%".format(test_loss, 100 * test_acc))
+    with torch.no_grad():
+        for data, labels in testing_dataloader:
+            data, labels = data.cuda(), labels.cuda(non_blocking=True)
+            out = trt_model(data)
+            loss += crit(out, labels)
+            preds = torch.max(out, 1)[1]
+            class_probs.append([F.softmax(i, dim=0) for i in out])
+            class_preds.append(preds)
+            total += labels.size(0)
+            correct += (preds == labels).sum().item()
+
+        test_probs = torch.cat([torch.stack(batch) for batch in class_probs])
+        test_preds = torch.cat(class_preds)
+        test_loss = loss / total
+        test_acc = correct / total
+        print(
+            "Test Loss: {:.5f} Test Acc: {:.2f}%".format(test_loss, 100 * test_acc)
+        )
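For reference, the pattern applied across these examples is the same in every file: move the module to the GPU, switch it to inference mode with .eval() before compiling, and run all inference (including accuracy checks) under torch.no_grad(). The following is a minimal standalone sketch of that pattern, not taken from any one example; the ResNet18 model, input shape, and tolerances are illustrative, and it assumes the public torch_tensorrt.compile API used elsewhere in these scripts.

import torch
import torch_tensorrt
import torchvision.models as models

# Move the model to the GPU and put it in inference mode before compilation.
model = models.resnet18(pretrained=True).to("cuda").eval()
inputs = [torch.randn((1, 3, 224, 224)).to("cuda")]

# Compile with Torch-TensorRT (ir="dynamo" uses the torch.export-based path).
trt_model = torch_tensorrt.compile(model, ir="dynamo", inputs=inputs)

# Run inference and the comparison against the PyTorch baseline under no_grad
# so autograd does not record the forward passes.
with torch.no_grad():
    trt_out = trt_model(*inputs)
    torch_out = model(*inputs)
    assert torch.allclose(trt_out, torch_out, rtol=1e-2, atol=1e-2)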