fix: set example models to eval mode and follow the convention #3770

Open · wants to merge 2 commits into base: main
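The convention applied across these examples is: move the model to the target device, call `.eval()` as the last step in the chain before compilation, and wrap every inference call in `torch.no_grad()`. Below is a minimal sketch of that pattern, not taken from the PR itself; the toy `MyModel` module and the input shapes are hypothetical stand-ins, while the `torch_tensorrt.compile(..., inputs=[...], min_block_size=1)` call mirrors the usage already present in the examples.

```python
import torch
import torch_tensorrt


# Hypothetical toy module standing in for the example models touched by this PR.
class MyModel(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.linear = torch.nn.Linear(64, 64)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.linear(x)


# Convention: move to the device first, then call .eval() last in the chain,
# so inference-mode behavior (dropout, batch norm, etc.) is fixed before tracing.
my_model = MyModel().to("cuda").eval()
x = torch.randn(64, 64, device="cuda")

model_trt = torch_tensorrt.compile(my_model, inputs=[x], min_block_size=1)

# Convention: run all inference under torch.no_grad() so no autograd state is recorded.
with torch.no_grad():
    res = model_trt(x)
    assert torch.allclose(res, my_model(x), rtol=1e-2, atol=1e-2)
```

The call-order changes in several files (for example `.eval().to("cuda")` becoming `.to("cuda").eval()`, and `.eval().half()` becoming `.half().eval()`) keep `.eval()` as the final call in the chain, matching this convention.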
Changes from all commits
9 changes: 5 additions & 4 deletions examples/dynamo/aot_plugin.py
@@ -153,7 +153,7 @@ def forward(self, X: torch.Tensor) -> torch.Tensor:
)
args = parser.parse_args()

my_model = MyModel().to("cuda")
my_model = MyModel().to("cuda").eval()
m = torch.full((64, 64), 2, device="cuda", dtype=torch.float)

assert my_model(X=m)[0][0] == 3.0
@@ -167,8 +167,9 @@ def forward(self, X: torch.Tensor) -> torch.Tensor:
)
print("Model compiled successfully!")
print("Running inference with compiled model...")
for i in range(10):
res = model_trt(m)
assert torch.allclose(res, my_model(m)), "Results do not match!"
with torch.no_grad():
for i in range(10):
res = model_trt(m)
assert torch.allclose(res, my_model(m)), "Results do not match!"

print("Inference successful!")
9 changes: 5 additions & 4 deletions examples/dynamo/auto_generate_converters.py
@@ -169,14 +169,15 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
return res


my_model = MyModel().to("cuda")
my_model = MyModel().to("cuda").eval()
m = torch.full((64, 64), 2, device="cuda", dtype=torch.float)
n = torch.full((64, 64), 3, device="cuda", dtype=torch.float)

with torch_tensorrt.logging.errors():
model_trt = torch_tensorrt.compile(my_model, inputs=[m, n], min_block_size=1)
for i in range(300):
res = model_trt(m, n)
assert torch.allclose(res, my_model(m, n))
with torch.no_grad():
for i in range(300):
res = model_trt(m, n)
assert torch.allclose(res, my_model(m, n))

print("Ran with custom plugin!")
9 changes: 5 additions & 4 deletions examples/dynamo/auto_generate_plugins.py
@@ -139,14 +139,15 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
return res


my_model = MyModel().to("cuda")
my_model = MyModel().to("cuda").eval()
m = torch.randint(0, 5, (64, 64), device="cuda", dtype=torch.float)
n = torch.randint(0, 5, (64, 64), device="cuda", dtype=torch.float)

with torch_tensorrt.logging.errors():
model_trt = torch_tensorrt.compile(my_model, inputs=[m, n], min_block_size=1)
for i in range(300):
res = model_trt(m, n)
assert torch.allclose(res, my_model(m, n))
with torch.no_grad():
for i in range(300):
res = model_trt(m, n)
assert torch.allclose(res, my_model(m, n))

print("Ran with custom plugin!")
16 changes: 8 additions & 8 deletions examples/dynamo/converter_overloading.py
@@ -34,7 +34,7 @@ def forward(self, x):
return torch.nn.functional.gelu(x, approximate=self.mode)


my_mod = GeLU(mode="tanh")
my_mod = GeLU(mode="tanh").to("cuda").eval()
ex_input = torch.randn(2, 5).to("cuda")


@@ -182,9 +182,9 @@ def get_op_count():
my_custom_gelu = torch_tensorrt.compile(
my_mod, arg_inputs=(ex_input,), min_block_size=1
)

print(my_custom_gelu.graph)
print(my_custom_gelu(ex_input))
with torch.no_grad():
print(my_custom_gelu.graph)
print(my_custom_gelu(ex_input))

# %%
#
@@ -198,7 +198,7 @@ def get_op_count():
#
# Finally, we want to verify that in the case that the ``approximate`` argument is not set to ``tanh``, our custom converter is not used.

my_mod_erf = GeLU(mode="none")
my_mod_erf = GeLU(mode="none").to("cuda").eval()
my_gelu_erf = torch_tensorrt.compile(
my_mod_erf, arg_inputs=(ex_input,), min_block_size=1
)
@@ -207,6 +207,6 @@ def get_op_count():
#
# Notice that we don't see the print statement from our custom converter, indicating that it was not used. However, looking at the graph, we can still see that a TensorRT engine was created to run the GeLU operation.
# In this case, the validator for our custom converter returned ``False``, so the conversion system moved on to the next converter in the list, the standard GeLU converter and used that one to convert the operation.

print(my_gelu_erf.graph)
print(my_gelu_erf(ex_input))
with torch.no_grad():
print(my_gelu_erf.graph)
print(my_gelu_erf(ex_input))
5 changes: 3 additions & 2 deletions examples/dynamo/cross_runtime_compilation_for_windows.py
@@ -46,7 +46,7 @@

args = PARSER.parse_args()
torch.manual_seed(0)
model = models.resnet18().eval().cuda()
model = models.resnet18().cuda().eval()
input = torch.rand((1, 3, 224, 224)).to("cuda")
inputs = [input]

@@ -63,7 +63,8 @@
loaded_model = torchtrt.load_cross_compiled_exported_program(args.path).module()
print(f"model has been successfully loaded from ${args.path}")
# inference
trt_output = loaded_model(input)
with torch.no_grad():
trt_output = loaded_model(input)
print(f"inference result: {trt_output}")
else:
if platform.system() != "Linux" or platform.architecture()[0] != "64bit":
11 changes: 7 additions & 4 deletions examples/dynamo/custom_kernel_plugins.py
@@ -217,8 +217,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
return y


my_model = MyModel((1, 1, 2, 0)).to("cuda")
my_model(ex_input)
my_model = MyModel((1, 1, 2, 0)).to("cuda").eval()
with torch.no_grad():
my_model(ex_input)

##############################################################################
# .. code-block:: none
@@ -607,7 +608,8 @@ def circular_padding_converter(
##############################################
# As you can see, now there is only one subgraph created for the TensorRT engine that contains both our custom kernel and the native convolution operator.

print(trt_model(ex_input))
with torch.no_grad():
print(trt_model(ex_input))

##############################################################################
# .. code-block:: none
@@ -636,7 +638,8 @@ def circular_padding_converter(
# %%
# We can verify our implementation is run correctly by both TensorRT and PyTorch

print(my_model(ex_input) - trt_model(ex_input))
with torch.no_grad():
print(my_model(ex_input) - trt_model(ex_input))

##############################################################################
# .. code-block:: none
3 changes: 2 additions & 1 deletion examples/dynamo/engine_caching_bert_example.py
@@ -62,7 +62,8 @@ def compile_bert(iterations=3):
backend="torch_tensorrt",
options=compilation_kwargs,
)
optimized_model(*inputs)
with torch.no_grad():
optimized_model(*inputs)
end.record()
torch.cuda.synchronize()
times.append(start.elapsed_time(end))
8 changes: 5 additions & 3 deletions examples/dynamo/engine_caching_example.py
@@ -37,7 +37,7 @@
np.random.seed(0)
torch.manual_seed(0)

model = models.resnet18(pretrained=True).eval().to("cuda")
model = models.resnet18(pretrained=True).to("cuda").eval()
enabled_precisions = {torch.float}
min_block_size = 1
use_python_runtime = False
@@ -100,7 +100,8 @@ def torch_compile(iterations=3):
"reuse_cached_engines": reuse_cached_engines,
},
)
compiled_model(*inputs) # trigger the compilation
with torch.no_grad():
compiled_model(*inputs) # trigger the compilation
end.record()
torch.cuda.synchronize()
times.append(start.elapsed_time(end))
@@ -270,7 +271,8 @@ def torch_compile_my_cache(iterations=3):
"custom_engine_cache": engine_cache,
},
)
compiled_model(*inputs) # trigger the compilation
with torch.no_grad():
compiled_model(*inputs) # trigger the compilation
end.record()
torch.cuda.synchronize()
times.append(start.elapsed_time(end))
3 changes: 2 additions & 1 deletion examples/dynamo/hierarchical_partitioner_example.py
@@ -79,7 +79,8 @@ def main():

print("Original Model Structure:\n", gm)

original_output = model(example_input)
with torch.no_grad():
original_output = model(example_input)

# 1. Partition the model into blocks that can be executed by different backends
partitioned_model, op_support = hierarchical_adjacency_partition(
5 changes: 3 additions & 2 deletions examples/dynamo/llama2_flashinfer_rmsnorm.py
@@ -220,7 +220,7 @@ def replace_rmsnorm(

# 2. Initialize model (random weights)
with torch.no_grad():
model = LlamaForCausalLM(config).eval().half()
model = LlamaForCausalLM(config).cuda().half().eval()

# 3. Export with static shapes
input_ids = torch.randint(0, 32000, (1, 64)) # Static [batch=1, seq=64]
@@ -253,5 +253,6 @@ def replace_rmsnorm(

input_ids = input_ids.to(DEVICE)

res = trt_model.forward(input_ids)
with torch.no_grad():
res = trt_model.forward(input_ids)
print(res)
48 changes: 26 additions & 22 deletions examples/dynamo/mutable_torchtrt_module_example.py
@@ -37,23 +37,25 @@
"immutable_weights": False,
}

model = models.resnet18(pretrained=True).eval().to("cuda")
model = models.resnet18(pretrained=True).to("cuda").eval()
mutable_module = torch_trt.MutableTorchTensorRTModule(model, **settings)
# You can use the mutable module just like the original pytorch module. The compilation happens while you first call the mutable module.
mutable_module(*inputs)
with torch.no_grad():
mutable_module(*inputs)
# %%
# Make modifications to the mutable module.
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

# %%
# Making changes to mutable module can trigger refit or re-compilation. For example, loading a different state_dict and setting new weight values will trigger refit, and adding a module to the model will trigger re-compilation.
model2 = models.resnet18(pretrained=False).eval().to("cuda")
model2 = models.resnet18(pretrained=False).to("cuda").eval()
mutable_module.load_state_dict(model2.state_dict())


# Check the output
# The refit happens while you call the mutable module again.
expected_outputs, refitted_outputs = model2(*inputs), mutable_module(*inputs)
with torch.no_grad():
expected_outputs, refitted_outputs = model2(*inputs), mutable_module(*inputs)
for expected_output, refitted_output in zip(expected_outputs, refitted_outputs):
assert torch.allclose(
expected_output, refitted_output, 1e-2, 1e-2
@@ -163,7 +165,7 @@ def forward(self, a, b, c={}):


device = "cuda:0"
model = Model().eval().to(device)
model = Model().to(device).eval()
inputs = (torch.rand(10, 3).to(device), torch.rand(3, 30).to(device))
kwargs = {
"c": {"a": torch.rand(10, 30).to(device), "b": torch.rand(10, 30).to(device)},
@@ -182,14 +184,15 @@ def forward(self, a, b, c={}):
model = torch_trt.MutableTorchTensorRTModule(model, min_block_size=1)
model.set_expected_dynamic_shape_range(args_dynamic_shapes, kwarg_dynamic_shapes)
# Compile
model(*inputs, **kwargs)
# Change input shape
inputs_2 = (torch.rand(10, 5).to(device), torch.rand(10, 30).to(device))
kwargs_2 = {
"c": {"a": torch.rand(10, 30).to(device), "b": torch.rand(5, 30).to(device)},
}
# Run without recompiling
model(*inputs_2, **kwargs_2)
with torch.no_grad():
model(*inputs, **kwargs)
# Change input shape
inputs_2 = (torch.rand(10, 5).to(device), torch.rand(10, 30).to(device))
kwargs_2 = {
"c": {"a": torch.rand(10, 30).to(device), "b": torch.rand(5, 30).to(device)},
}
# Run without recompiling
model(*inputs_2, **kwargs_2)

# %%
# Use Mutable Torch TensorRT module with persistent cache
@@ -199,7 +202,7 @@ def forward(self, a, b, c={}):

from torch_tensorrt.dynamo._defaults import TIMING_CACHE_PATH

model = models.resnet18(pretrained=True).eval().to("cuda")
model = models.resnet18(pretrained=True).to("cuda").eval()

times = []
start = torch.cuda.Event(enable_timing=True)
@@ -225,14 +228,15 @@ def remove_timing_cache(path=TIMING_CACHE_PATH):

remove_timing_cache()

for i in range(4):
inputs = [torch.rand((100 + i, 3, 224, 224)).to("cuda")]

start.record()
model(*inputs) # Recompile
end.record()
torch.cuda.synchronize()
times.append(start.elapsed_time(end))
with torch.no_grad():
for i in range(4):
inputs = [torch.rand((100 + i, 3, 224, 224)).to("cuda")]

start.record()
model(*inputs) # Recompile
end.record()
torch.cuda.synchronize()
times.append(start.elapsed_time(end))

print("----------------dynamo_compile----------------")
print("Without engine caching, used:", times[0], "ms")
48 changes: 24 additions & 24 deletions examples/dynamo/pre_allocated_output_example.py
@@ -43,10 +43,9 @@ def test_module_perf(model, *input):
with torch.no_grad():
for _ in range(3):
model(*input)
torch.cuda.synchronize()

# Timing phase to measure inference performance
with torch.no_grad():
torch.cuda.synchronize()
# Timing phase to measure inference performance
for i in range(10):
start_time = timeit.default_timer()
model(*input)
@@ -67,9 +66,9 @@ def test_module_perf(model, *input):
# Load bert model
model = (
BertModel.from_pretrained("bert-base-uncased", torchscript=True)
.eval()
.half()
.to("cuda")
.half()
.eval()
)
# Define sample inputs
inputs = [
@@ -89,25 +88,26 @@ def test_module_perf(model, *input):
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

# Enable pre-allocated output buffer using a context manager
with torch_tensorrt.runtime.enable_pre_allocated_outputs(optimized_model):
with torch.no_grad():
with torch_tensorrt.runtime.enable_pre_allocated_outputs(optimized_model):
out_trt = optimized_model(*inputs)
# Subsequent inferences can use the pre-allocated output buffer (no shape change)
out_trt = optimized_model(*inputs)

# Alternatively, we can enable the feature using a context object
pre_allocated_output_ctx = torch_tensorrt.runtime.enable_pre_allocated_outputs(
optimized_model
)
pre_allocated_output_ctx.set_pre_allocated_output(True)
time_opt = test_module_perf(optimized_model, *inputs)

# Disable the pre-allocated output buffer feature and perform inference normally
pre_allocated_output_ctx.set_pre_allocated_output(False)
out_trt = optimized_model(*inputs)
# Subsequent inferences can use the pre-allocated output buffer (no shape change)
out_trt = optimized_model(*inputs)

# Alternatively, we can enable the feature using a context object
pre_allocated_output_ctx = torch_tensorrt.runtime.enable_pre_allocated_outputs(
optimized_model
)
pre_allocated_output_ctx.set_pre_allocated_output(True)
time_opt = test_module_perf(optimized_model, *inputs)

# Disable the pre-allocated output buffer feature and perform inference normally
pre_allocated_output_ctx.set_pre_allocated_output(False)
out_trt = optimized_model(*inputs)
time_normal = test_module_perf(optimized_model, *inputs)
time_normal = test_module_perf(optimized_model, *inputs)

time_opt_ms = time_opt * 1000
time_normal_ms = time_normal * 1000
time_opt_ms = time_opt * 1000
time_normal_ms = time_normal * 1000

print(f"normal trt model time: {time_normal_ms:.3f} ms")
print(f"pre-allocated output buffer model time: {time_opt_ms:.3f} ms")
print(f"normal trt model time: {time_normal_ms:.3f} ms")
print(f"pre-allocated output buffer model time: {time_opt_ms:.3f} ms")