diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..cd56c03b --- /dev/null +++ b/.gitignore @@ -0,0 +1,19 @@ +.ninja_deps +.ninja_log +build.ninja +tags +*.o +*.pb.o +torch2trt.egg-info +build/ +dist/ +__pycache__/ +*.so +*.pb.h +*.pb.cc +*_pb2.py +*.pyc +*.ipynb_checkpoints +*.pth +docs/converters.md +site \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..1e4af331 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,20 @@ +# Changes + +## [Master] + +## [0.2.0] - 03/02/2021 + +- Added converter for ``torch.Tensor.flatten`` +- Added converter for ``torch.nn.functional.conv2d`` and ``torch.nn.functional.conv3d`` + +### Added + +- Added converter for ``torch.Tensor.expand`` +- Added support for custom converters for methods defined outside of ``torch`` module +- Added names for TensorRT layers +- Added GroupNorm plugin which internally uses PyTorch aten::group_norm +- Replaced Tensor.ndim references with len(tensor.shape) to support older PyTorch versions +- Added reduced precision documentation page +- Added converters for ``floordiv``, ``mod``, ``ne``, and ``torch.tensor`` operations +- Extended ``relu`` converter to support ``Tensor.relu`` operation +- Extended ``sigmoid`` converter to support ``Tensor.sigmoid`` operation diff --git a/CLA.md b/CLA.md new file mode 100644 index 00000000..389a4746 --- /dev/null +++ b/CLA.md @@ -0,0 +1,58 @@ +## Individual Contributor License Agreement (CLA) + +**Thank you for submitting your contributions to this project.** + +By signing this CLA, you agree that the following terms apply to all of your past, present and future contributions +to the project. + +### License. + +You hereby represent that all present, past and future contributions are governed by the +[MIT License](https://opensource.org/licenses/MIT) +copyright statement. + +This entails that to the extent possible under law, you transfer all copyright and related or neighboring rights +of the code or documents you contribute to the project itself or its maintainers. +Furthermore, you also represent that you have the authority to perform the above waiver +with respect to the entirety of your contributions. + +### Moral Rights. + +To the fullest extent permitted under applicable law, you hereby waive, and agree not to +assert, all of your “moral rights” in or relating to your contributions for the benefit of the project. + +### Third Party Content. + +If your Contribution includes or is based on any source code, object code, bug fixes, configuration changes, tools, +specifications, documentation, data, materials, feedback, information or other works of authorship that were not +authored by you (“Third Party Content”) or if you are aware of any third party intellectual property or proprietary +rights associated with your Contribution (“Third Party Rights”), +then you agree to include with the submission of your Contribution full details respecting such Third Party +Content and Third Party Rights, including, without limitation, identification of which aspects of your +Contribution contain Third Party Content or are associated with Third Party Rights, the owner/author of the +Third Party Content and Third Party Rights, where you obtained the Third Party Content, and any applicable +third party license terms or restrictions respecting the Third Party Content and Third Party Rights.
For greater +certainty, the foregoing obligations respecting the identification of Third Party Content and Third Party Rights +do not apply to any portion of a Project that is incorporated into your Contribution to that same Project. + +### Representations. + +You represent that, other than the Third Party Content and Third Party Rights identified by +you in accordance with this Agreement, you are the sole author of your Contributions and are legally entitled +to grant the foregoing licenses and waivers in respect of your Contributions. If your Contributions were +created in the course of your employment with your past or present employer(s), you represent that such +employer(s) has authorized you to make your Contributions on behalf of such employer(s), or such employer(s) +has waived all of their right, title or interest in or to your Contributions. + +### Disclaimer. + +To the fullest extent permitted under applicable law, your Contributions are provided on an "as is" +basis, without any warranties or conditions, express or implied, including, without limitation, any implied +warranties or conditions of non-infringement, merchantability or fitness for a particular purpose. You are not +required to provide support for your Contributions, except to the extent you desire to provide support. + +### No Obligation. + +You acknowledge that the maintainers of this project are under no obligation to use or incorporate your contributions +into the project. The decision to use or incorporate your contributions into the project will be made at the +sole discretion of the maintainers or their authorized delegates. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..6479c3a6 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,104 @@ +# Contributing + +## Forms of contribution + +### Submit an Issue + +torch2trt is use case driven. We originally created it to solve +use cases related to NVIDIA Jetson, but the layer support has grown +considerably since its release, and we've found that it has +helped many other developers as well. + +The growth of torch2trt has been largely driven by issues submitted on [GitHub](https://github.com/NVIDIA-AI-IOT/torch2trt/issues). +We learn a lot from the reported issues. Submitting an issue is one of the best ways to begin contributing to torch2trt. + +The reported issues are typically one of the following: + +* A bug or unexpected result +* A model with unsupported layers + +If you report an issue, we typically find the following information helpful: + +* PyTorch version +* TensorRT version +* Platform (e.g. Jetson Nano) +* The PyTorch Module you're attempting to convert +* The steps taken to convert the PyTorch module + +If you're not sure how to provide any of these pieces of information, don't worry. Just open the issue +and we're happy to discuss and help work out the details. + +### Ask a Question + +Another great way to contribute is to ask a question on [GitHub](https://github.com/NVIDIA-AI-IOT/torch2trt/issues). +There are often other developers who share your question, and they may find the discussion helpful. This also +helps us gauge feature interest and identify gaps in documentation. + +### Submit a Pull Request + +torch2trt is use case driven and has limited maintenance; for this reason, we value community contributions greatly. +Another great way to contribute is by submitting a pull request.
Pull requests which are most likely to be accepted are + +* A new converter +* A test case +* A bug fix + +If you add a new converter, it is best to include a few test +cases that cross-validate the converter against the original PyTorch. We provide a utility function to do this, +as described in the [Custom Converter](usage/custom_converter.md) usage guide. + +Ideally, pull requests solve one thing at a time. This makes it easy +to evaluate the impact that the changes have on the project step-by-step. The more confident we are that +the changes will not adversely impact the experience of other developers, the more likely we are to accept them. + +## Running module test cases + +Before any change is accepted, we run the test cases on at least one platform. This performs a large number +of cross validation checks against PyTorch. To do this, run + +```bash +python3 -m torch2trt.test --name=converters --tolerance=1e-2 +``` + +This will not hard-fail, but it will highlight any build errors or failed max-error checks. It is helpful if you include +the status of this command in any pull request, as well as system information like + +* PyTorch version +* TensorRT version +* Platform (e.g. Jetson Nano) + +## Testing documentation + +If you have a change that modifies the documentation, it is relatively straightforward to test. We +use ``mkdocs-material`` for documentation, which parses markdown files in the ``docs`` folder. + +To view the docs, simply call + +``` +./scripts/test_docs.sh +``` + +and then navigate to ``https://<ip_address>:8000``. + +Please note, this will not include dynamically generated documentation pages like the converters page. +These contain cross-reference links to the GitHub source code. If you want to test these, +you can call + +```bash +./scripts/build_docs.sh +``` + +pointing to the public reflection +of your local repository. For example, if we're working off the upstream master branch, we +would call + +```bash +./scripts/build_docs.sh https://github.com/NVIDIA-AI-IOT/torch2trt master +``` + +If your changes are pushed to your fork, you would do + +```bash +./scripts/build_docs.sh https://github.com/<your_username>/torch2trt my_branch +``` + diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 00000000..3e950018 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,7 @@ +Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
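To make the test-case guidance in CONTRIBUTING.md above concrete, here is a minimal sketch of what a converter test case can look like. It assumes the ``add_module_test`` registration utility described in the [Custom Converter](usage/custom_converter.md) guide; the module, dtype, and input shape are illustrative only.

```python
import torch
from torch2trt.module_test import add_module_test


# registers a test case that the torch2trt.test runner picks up and
# cross-validates against PyTorch at the given dtype and input shape
@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224)])
def test_relu_basic():
    return torch.nn.ReLU()
```

A test registered this way is executed by the ``python3 -m torch2trt.test --name=converters`` command shown above, which reports the max error between the PyTorch and TensorRT outputs.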
diff --git a/README.md b/README.md index b36cb075..b770bd88 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,29 @@ -# torch2trt - A PyTorch -> TensorRT Converter +# torch2trt -This is an experimental PyTorch to TensorRT converter which utilizes the -TensorRT Python API. We've found it useful in some examples, but it is -not comprehensive. +> What models are you using, or hoping to use, with TensorRT? Feel free to join the discussion [here](https://github.com/NVIDIA-AI-IOT/torch2trt/discussions/531). + + -### Setup +torch2trt is a PyTorch to TensorRT converter which utilizes the +TensorRT Python API. The converter is -```bash -python setup.py install --user -``` +* Easy to use - Convert modules with a single function call ``torch2trt`` + +* Easy to extend - Write your own layer converter in Python and register it with ``@tensorrt_converter`` + +If you find an issue, please [let us know](../../issues)! + +> Please note, this converter has limited coverage of TensorRT / PyTorch. We created it primarily +> to easily optimize the models used in the [JetBot](https://github.com/NVIDIA-AI-IOT/jetbot) project. If you find the converter helpful with other models, please [let us know](../../issues). + +## Usage + +Below are some usage examples; for more, check out the [notebooks](notebooks). -### Usage +### Convert ```python +import torch from torch2trt import torch2trt from torchvision.models.alexnet import alexnet @@ -26,43 +37,112 @@ x = torch.ones((1, 3, 224, 224)).cuda() model_trt = torch2trt(model, [x]) ``` -We can then test the output of the regular and TensorRT optimized models +### Execute -``` +We can execute the returned ``TRTModule`` just like the original PyTorch model + +```python y = model(x) y_trt = model_trt(x) +# check the output against PyTorch print(torch.max(torch.abs(y - y_trt))) ``` -### Tested models - -| Model | PyTorch FP16 (Jetson Nano) | TensorRT FP16 (Jetson Nano) | -|-------|--------------|-----------------| -| alexnet | 18.3s | 13.2 | -| squeezenet1_0 | 0.021 | 0.008 | -| squeezenet1_1 | | | -| resnet18 | | | -| resnet50 | | | -| resnet101 | | | -| resnet152 | | | -| densenet121 | | | -| densenet169 | | | -| densenet201 | | | -| densenet161 | | | -| vgg11 | | | -| vgg13 | | | -| vgg16 | | | -| vgg19 | | | -| vgg11_bn | | | -| vgg13_bn | | | -| vgg16_bn | | | -| vgg19_bn | | | - - -### Add (or override) a converter - -Here we show how to add an example converter using the TensorRT +### Save and load + +We can save the model as a ``state_dict``. + +```python +torch.save(model_trt.state_dict(), 'alexnet_trt.pth') +``` + +We can load the saved model into a ``TRTModule`` + +```python +from torch2trt import TRTModule + +model_trt = TRTModule() + +model_trt.load_state_dict(torch.load('alexnet_trt.pth')) +``` + +## Models + +We tested the converter against these models using the [test.sh](test.sh) script. You can generate the results by calling + +```bash +./test.sh TEST_OUTPUT.md +``` + +> The results below show the throughput in FPS. You can find the raw output, which includes latency, in the [benchmarks folder](benchmarks).
+ +| Model | Nano (PyTorch) | Nano (TensorRT) | Xavier (PyTorch) | Xavier (TensorRT) | +|-------|:--------------:|:---------------:|:----------------:|:-----------------:| +| alexnet | 46.4 | 69.9 | 250 | 580 | +| squeezenet1_0 | 44 | 137 | 130 | 890 | +| squeezenet1_1 | 76.6 | 248 | 132 | 1390 | +| resnet18 | 29.4 | 90.2 | 140 | 712 | +| resnet34 | 15.5 | 50.7 | 79.2 | 393 | +| resnet50 | 12.4 | 34.2 | 55.5 | 312 | +| resnet101 | 7.18 | 19.9 | 28.5 | 170 | +| resnet152 | 4.96 | 14.1 | 18.9 | 121 | +| densenet121 | 11.5 | 41.9 | 23.0 | 168 | +| densenet169 | 8.25 | 33.2 | 16.3 | 118 | +| densenet201 | 6.84 | 25.4 | 13.3 | 90.9 | +| densenet161 | 4.71 | 15.6 | 17.2 | 82.4 | +| vgg11 | 8.9 | 18.3 | 85.2 | 201 | +| vgg13 | 6.53 | 14.7 | 71.9 | 166 | +| vgg16 | 5.09 | 11.9 | 61.7 | 139 | +| vgg19 | | | 54.1 | 121 | +| vgg11_bn | 8.74 | 18.4 | 81.8 | 201 | +| vgg13_bn | 6.31 | 14.8 | 68.0 | 166 | +| vgg16_bn | 4.96 | 12.0 | 58.5 | 140 | +| vgg19_bn | | | 51.4 | 121 | + + +## Setup + +> torch2trt depends on the TensorRT Python API. On Jetson, this is included with the latest JetPack. For desktop, please follow the [TensorRT Installation Guide](https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html). You may also try installing torch2trt inside one of the NGC PyTorch docker containers for [Desktop](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) or [Jetson](https://ngc.nvidia.com/catalog/containers/nvidia:l4t-pytorch). + +### Option 1 - Without plugins + +To install without compiling plugins, call the following + +```bash +git clone https://github.com/NVIDIA-AI-IOT/torch2trt +cd torch2trt +python setup.py install +``` + +### Option 2 - With plugins (experimental) + +To install with plugins to support some operations in PyTorch that are not natively supported with TensorRT, call the following + +> Please note, this currently only includes the interpolate plugin. This plugin requires PyTorch 1.3+ for serialization. + +```bash +git clone https://github.com/NVIDIA-AI-IOT/torch2trt +cd torch2trt +sudo python setup.py install --plugins +``` + +## How does it work? + +This converter works by attaching conversion functions (like ``convert_ReLU``) to the original +PyTorch functional calls (like ``torch.nn.ReLU.forward``). The sample input data is passed +through the network, just as before, except now whenever a registered function (``torch.nn.ReLU.forward``) +is encountered, the corresponding converter (``convert_ReLU``) is also called afterwards. The converter +is passed the arguments and return value of the original PyTorch function, as well as the TensorRT +network that is being constructed. The input tensors to the original PyTorch function are modified to +have an attribute ``_trt``, which is the TensorRT counterpart to the PyTorch tensor. The conversion function +uses this ``_trt`` to add layers to the TensorRT network, and then sets the ``_trt`` attribute for +relevant output tensors. Once the model is fully executed, the final tensors returned are marked as outputs +of the TensorRT network, and the optimized TensorRT engine is built. + +## How to add (or override) a converter + +Here we show how to add a converter for the ``ReLU`` module using the TensorRT Python API.
```python @@ -71,23 +151,27 @@ from torch2trt import tensorrt_converter @tensorrt_converter('torch.nn.ReLU.forward') def convert_ReLU(ctx): - input_tensor = ctx.method_args[1] - output_tensor = ctx.method_return - trt_input = ctx.trt_tensors[input_tensor.__hash__()] - layer = ctx.network.add_activation(input=trt_input, type=trt.ActivationType.RELU) - ctx.trt_tensors[output_tensor.__hash__()] = layer.get_output(0) + input = ctx.method_args[1] + output = ctx.method_return + layer = ctx.network.add_activation(input=input._trt, type=trt.ActivationType.RELU) + output._trt = layer.get_output(0) ``` The converter takes one argument, a ``ConversionContext``, which will contain the following -* network: The TensorRT network that is being constructed. -* method_args: Positional arguments that were passed to the specified Torch function. -* method_kwargs: Keyword arguments that were passed to the specified Torch function. -* method_return: The value returned by the specified Torch function. -* trt_tensors: A dictionary mapping Torch tensors (by hash value) to TensorRT tensors. The - converter must the set values for any output Tensors. Otherwise, if a later function uses - the Torch tensor, and there is not an associated TensorRT tensor in the map, results - may be unexpected. +* ``ctx.network`` - The TensorRT network that is being constructed. + +* ``ctx.method_args`` - Positional arguments that were passed to the specified PyTorch function. The ``_trt`` attribute is set for relevant input tensors. +* ``ctx.method_kwargs`` - Keyword arguments that were passed to the specified PyTorch function. +* ``ctx.method_return`` - The value returned by the specified PyTorch function. The converter must set the ``_trt`` attribute where relevant. + +Please see [this folder](torch2trt/converters) for more examples. + +## See also + +- [JetBot](http://github.com/NVIDIA-AI-IOT/jetbot) - An educational AI robot based on NVIDIA Jetson Nano -Please see the ``torch2trt.py`` module for more examples. 
+- [JetRacer](http://github.com/NVIDIA-AI-IOT/jetracer) - An educational AI racecar using NVIDIA Jetson Nano +- [JetCam](http://github.com/NVIDIA-AI-IOT/jetcam) - An easy to use Python camera interface for NVIDIA Jetson +- [JetCard](http://github.com/NVIDIA-AI-IOT/jetcard) - An SD card image for web programming AI projects with NVIDIA Jetson Nano diff --git a/benchmarks/JETSON_NANO.md b/benchmarks/JETSON_NANO.md new file mode 100644 index 00000000..0dab2486 --- /dev/null +++ b/benchmarks/JETSON_NANO.md @@ -0,0 +1,20 @@ +| Name | Data Type | Input Shapes | torch2trt kwargs | Max Error | Throughput (PyTorch) | Throughput (TensorRT) | Latency (PyTorch) | Latency (TensorRT) | +|------|-----------|--------------|------------------|-----------|----------------------|-----------------------|-------------------|--------------------| +| torchvision.models.alexnet.alexnet | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.29E-05 | 46.4 | 69.9 | 22.1 | 14.7 | +| torchvision.models.squeezenet.squeezenet1_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.20E-02 | 44 | 137 | 24.2 | 7.6 | +| torchvision.models.squeezenet.squeezenet1_1 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-04 | 76.6 | 248 | 14 | 4.34 | +| torchvision.models.resnet.resnet18 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 5.86E-03 | 29.4 | 90.2 | 34.7 | 11.4 | +| torchvision.models.resnet.resnet34 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.56E-01 | 15.5 | 50.7 | 64.8 | 20.2 | +| torchvision.models.resnet.resnet50 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 6.45E-02 | 12.4 | 34.2 | 81.7 | 29.8 | +| torchvision.models.resnet.resnet101 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.01E+03 | 7.18 | 19.9 | 141 | 51.1 | +| torchvision.models.resnet.resnet152 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 4.96 | 14.1 | 204 | 72.3 | +| torchvision.models.densenet.densenet121 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.42E-03 | 11.5 | 41.9 | 84.5 | 24.8 | +| torchvision.models.densenet.densenet169 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 5.86E-03 | 8.25 | 33.2 | 118 | 31.2 | +| torchvision.models.densenet.densenet201 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.42E-03 | 6.84 | 25.4 | 141 | 40.8 | +| torchvision.models.densenet.densenet161 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 4.15E-03 | 4.71 | 15.6 | 247 | 65.8 | +| torchvision.models.vgg.vgg11 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.51E-04 | 8.9 | 18.3 | 114 | 55.1 | +| torchvision.models.vgg.vgg13 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.07E-04 | 6.53 | 14.7 | 156 | 68.7 | +| torchvision.models.vgg.vgg16 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 4.58E-04 | 5.09 | 11.9 | 201 | 85.1 | +| torchvision.models.vgg.vgg11_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.81E-04 | 8.74 | 18.4 | 117 | 54.8 | +| torchvision.models.vgg.vgg13_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 5.19E-04 | 6.31 | 14.8 | 162 | 68.5 | +| torchvision.models.vgg.vgg16_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-04 | 4.96 | 12 | 207 | 84.3 | diff --git a/benchmarks/JETSON_XAVIER.md b/benchmarks/JETSON_XAVIER.md new file mode 100644 index 00000000..63ece334 --- /dev/null +++ b/benchmarks/JETSON_XAVIER.md @@ -0,0 +1,31 @@ +| Name | Data Type | Input Shapes | torch2trt kwargs | Max Error | Throughput (PyTorch) | Throughput (TensorRT) | Latency (PyTorch) | Latency (TensorRT) | 
+|------|-----------|--------------|------------------|-----------|----------------------|-----------------------|-------------------|--------------------| +| torch2trt.tests.torchvision.classification.alexnet | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 7.63E-05 | 251 | 565 | 4.96 | 2.02 | +| torch2trt.tests.torchvision.classification.squeezenet1_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-04 | 121 | 834 | 8.04 | 1.49 | +| torch2trt.tests.torchvision.classification.squeezenet1_1 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-04 | 125 | 1.29e+03 | 8.01 | 1.02 | +| torch2trt.tests.torchvision.classification.resnet18 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-03 | 136 | 722 | 7.33 | 1.64 | +| torch2trt.tests.torchvision.classification.resnet34 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.50E-01 | 77.8 | 396 | 12.9 | 2.79 | +| torch2trt.tests.torchvision.classification.resnet50 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.09E-01 | 55.8 | 326 | 17.9 | 3.37 | +| torch2trt.tests.torchvision.classification.resnet101 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 28.3 | 175 | 35.1 | 6.04 | +| torch2trt.tests.torchvision.classification.resnet152 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 18.8 | 122 | 53.2 | 8.57 | +| torch2trt.tests.torchvision.classification.densenet121 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 7.81E-03 | 20.9 | 76.6 | 47.5 | 13 | +| torch2trt.tests.torchvision.classification.densenet169 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.91E-03 | 14.8 | 41.7 | 66.7 | 23.7 | +| torch2trt.tests.torchvision.classification.densenet201 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 4.88E-03 | 12.6 | 30.2 | 79.1 | 33 | +| torch2trt.tests.torchvision.classification.densenet161 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 4.88E-03 | 16.1 | 43.7 | 62.1 | 23 | +| torch2trt.tests.torchvision.classification.vgg11 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.56E-03 | 84.8 | 201 | 12.1 | 5.24 | +| torch2trt.tests.torchvision.classification.vgg13 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.24E-03 | 71.1 | 165 | 14.3 | 6.34 | +| torch2trt.tests.torchvision.classification.vgg16 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.78E-03 | 61.5 | 139 | 16.5 | 7.46 | +| torch2trt.tests.torchvision.classification.vgg19 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.81E-03 | 54.1 | 120 | 18.7 | 8.61 | +| torch2trt.tests.torchvision.classification.vgg11_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.20E-03 | 81.5 | 200 | 12.5 | 5.27 | +| torch2trt.tests.torchvision.classification.vgg13_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.71E-03 | 67.5 | 165 | 15.1 | 6.33 | +| torch2trt.tests.torchvision.classification.vgg16_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.87E-03 | 58.3 | 139 | 17.4 | 7.48 | +| torch2trt.tests.torchvision.classification.vgg19_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.44E-03 | 51.4 | 120 | 19.7 | 8.61 | +| torch2trt.tests.torchvision.classification.mobilenet_v2 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 64.8 | 723 | 15.4 | 1.67 | +| torch2trt.tests.torchvision.classification.shufflenet_v2_x0_5 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.53E-05 | 51.2 | 463 | 19.4 | 2.17 | +| torch2trt.tests.torchvision.classification.shufflenet_v2_x1_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} 
| 1.53E-05 | 49.4 | 419 | 20.4 | 2.43 | +| torch2trt.tests.torchvision.classification.shufflenet_v2_x1_5 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.53E-05 | 51.4 | 426 | 19.6 | 2.37 | +| torch2trt.tests.torchvision.classification.shufflenet_v2_x2_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.53E-05 | 48.2 | 419 | 20.8 | 2.48 | +| torch2trt.tests.torchvision.classification.mnasnet0_5 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.03E-06 | 67.8 | 883 | 14.9 | 1.4 | +| torch2trt.tests.torchvision.classification.mnasnet0_75 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 67.6 | 751 | 14.8 | 1.6 | +| torch2trt.tests.torchvision.classification.mnasnet1_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 65.7 | 667 | 15.2 | 1.77 | +| torch2trt.tests.torchvision.classification.mnasnet1_3 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 67.4 | 573 | 15 | 2.02 | \ No newline at end of file diff --git a/bug.ipynb b/bug.ipynb deleted file mode 100644 index a74a3619..00000000 --- a/bug.ipynb +++ /dev/null @@ -1,93 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import torchvision.models\n", - "import torch\n", - "import tensorrt as trt\n", - "from torch2trt import torch2trt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model = torchvision.models.resnet18(pretrained=True).cuda().eval().half()\n", - "\n", - "input = torch.ones((1, 3, 224, 224)).cuda().half()\n", - "\n", - "model_trt = torch2trt(model, [input], fp16_mode=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "input = input.half()\n", - "\n", - "model = model.half()\n", - "output = model(input)\n", - "print(output.flatten()[0:10])\n", - "\n", - "t0 = time.time()\n", - "with torch.no_grad():\n", - " for i in range(50):\n", - " output = model(input)\n", - " #output = model_trt(input)\n", - "t1 = time.time()\n", - "\n", - "print((t1 - t0) / 50.0)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "input = input.float()\n", - "output = model_trt(input)\n", - "print(output.flatten()[0:10])\n", - "\n", - "t0 = time.time()\n", - "with torch.no_grad():\n", - " for i in range(50):\n", - " output = model_trt(input)\n", - "t1 = time.time()\n", - "\n", - "print((t1 - t0) / 50.0)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/build.py b/build.py new file mode 100644 index 00000000..3bfcf1df --- /dev/null +++ b/build.py @@ -0,0 +1,64 @@ +import imp +import subprocess +import os +from string import Template + +PLUGINS = [ + 'interpolate', + 'group_norm', +] + +BASE_FOLDER = 'torch2trt/converters' + +NINJA_TEMPLATE = Template(( + "rule link\n" + " command = g++ -shared -o $$out $$in -L$torch_dir/lib -L$cuda_dir/lib64 -L$trt_lib_dir -lc10 -lc10_cuda -ltorch -lcudart -lprotobuf -lprotobuf-lite -pthread -lpthread -lnvinfer\n" + "rule protoc\n" + " command = protoc $$in 
--cpp_out=. --python_out=.\n" + "rule cxx\n" + " command = g++ -c -fPIC $$in -I$cuda_dir/include -I$torch_dir/include -I$torch_dir/include/torch/csrc/api/include -I. -std=c++11 -I$trt_inc_dir\n" +)) + +PLUGIN_TEMPLATE = Template(( + "build $plugin_dir/$plugin.pb.h $plugin_dir/$plugin.pb.cc $plugin_dir/${plugin}_pb2.py: protoc $plugin_dir/$plugin.proto\n" + "build $plugin.pb.o: cxx $plugin_dir/$plugin.pb.cc\n" + "build $plugin.o: cxx $plugin_dir/$plugin.cpp\n" +)) + + +def build(cuda_dir="/usr/local/cuda", + torch_dir=imp.find_module('torch')[1], + trt_inc_dir="/usr/include/aarch64-linux-gnu", + trt_lib_dir="/usr/lib/aarch64-linux-gnu"): + + global PLUGINS, BASE_FOLDER, NINJA_TEMPLATE, PLUGIN_TEMPLATE + + NINJA_STR = NINJA_TEMPLATE.substitute({ + 'torch_dir': torch_dir, + 'cuda_dir': cuda_dir, + 'trt_inc_dir': trt_inc_dir, + 'trt_lib_dir': trt_lib_dir, + }) + + + plugin_o_files = [] + for plugin in PLUGINS: + NINJA_STR += \ + PLUGIN_TEMPLATE.substitute({ + 'plugin': plugin, + 'plugin_dir': os.path.join(BASE_FOLDER, plugin), + }) + plugin_o_files += [plugin + '.pb.o', plugin + '.o'] + + NINJA_STR += Template(( + "build torch2trt/libtorch2trt.so: link $o_files\n" + )).substitute({'o_files': ' '.join(plugin_o_files)}) + + with open('build.ninja', 'w') as f: + f.write(NINJA_STR) + + subprocess.call(['ninja']) + + +if __name__ == '__main__': + build() diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md new file mode 120000 index 00000000..04c99a55 --- /dev/null +++ b/docs/CHANGELOG.md @@ -0,0 +1 @@ +../CHANGELOG.md \ No newline at end of file diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md new file mode 120000 index 00000000..44fcc634 --- /dev/null +++ b/docs/CONTRIBUTING.md @@ -0,0 +1 @@ +../CONTRIBUTING.md \ No newline at end of file diff --git a/docs/benchmarks/jetson_nano.md b/docs/benchmarks/jetson_nano.md new file mode 100644 index 00000000..f58c9a3c --- /dev/null +++ b/docs/benchmarks/jetson_nano.md @@ -0,0 +1,22 @@ +# Jetson Nano + +| Name | Data Type | Input Shapes | torch2trt kwargs | Max Error | Throughput (PyTorch) | Throughput (TensorRT) | Latency (PyTorch) | Latency (TensorRT) | +|------|-----------|--------------|------------------|-----------|----------------------|-----------------------|-------------------|--------------------| +| torchvision.models.alexnet.alexnet | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.29E-05 | 46.4 | 69.9 | 22.1 | 14.7 | +| torchvision.models.squeezenet.squeezenet1_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.20E-02 | 44 | 137 | 24.2 | 7.6 | +| torchvision.models.squeezenet.squeezenet1_1 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-04 | 76.6 | 248 | 14 | 4.34 | +| torchvision.models.resnet.resnet18 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 5.86E-03 | 29.4 | 90.2 | 34.7 | 11.4 | +| torchvision.models.resnet.resnet34 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.56E-01 | 15.5 | 50.7 | 64.8 | 20.2 | +| torchvision.models.resnet.resnet50 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 6.45E-02 | 12.4 | 34.2 | 81.7 | 29.8 | +| torchvision.models.resnet.resnet101 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.01E+03 | 7.18 | 19.9 | 141 | 51.1 | +| torchvision.models.resnet.resnet152 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 4.96 | 14.1 | 204 | 72.3 | +| torchvision.models.densenet.densenet121 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.42E-03 | 11.5 | 41.9 | 84.5 | 24.8 | +| torchvision.models.densenet.densenet169 | float16 | [(1, 3, 
224, 224)] | {'fp16_mode': True} | 5.86E-03 | 8.25 | 33.2 | 118 | 31.2 | +| torchvision.models.densenet.densenet201 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.42E-03 | 6.84 | 25.4 | 141 | 40.8 | +| torchvision.models.densenet.densenet161 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 4.15E-03 | 4.71 | 15.6 | 247 | 65.8 | +| torchvision.models.vgg.vgg11 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.51E-04 | 8.9 | 18.3 | 114 | 55.1 | +| torchvision.models.vgg.vgg13 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.07E-04 | 6.53 | 14.7 | 156 | 68.7 | +| torchvision.models.vgg.vgg16 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 4.58E-04 | 5.09 | 11.9 | 201 | 85.1 | +| torchvision.models.vgg.vgg11_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.81E-04 | 8.74 | 18.4 | 117 | 54.8 | +| torchvision.models.vgg.vgg13_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 5.19E-04 | 6.31 | 14.8 | 162 | 68.5 | +| torchvision.models.vgg.vgg16_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-04 | 4.96 | 12 | 207 | 84.3 | diff --git a/docs/benchmarks/jetson_xavier.md b/docs/benchmarks/jetson_xavier.md new file mode 100644 index 00000000..1c3cb2c6 --- /dev/null +++ b/docs/benchmarks/jetson_xavier.md @@ -0,0 +1,33 @@ +# Jetson Xavier + +| Name | Data Type | Input Shapes | torch2trt kwargs | Max Error | Throughput (PyTorch) | Throughput (TensorRT) | Latency (PyTorch) | Latency (TensorRT) | +|------|-----------|--------------|------------------|-----------|----------------------|-----------------------|-------------------|--------------------| +| torch2trt.tests.torchvision.classification.alexnet | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 7.63E-05 | 251 | 565 | 4.96 | 2.02 | +| torch2trt.tests.torchvision.classification.squeezenet1_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-04 | 121 | 834 | 8.04 | 1.49 | +| torch2trt.tests.torchvision.classification.squeezenet1_1 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-04 | 125 | 1.29e+03 | 8.01 | 1.02 | +| torch2trt.tests.torchvision.classification.resnet18 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 9.77E-03 | 136 | 722 | 7.33 | 1.64 | +| torch2trt.tests.torchvision.classification.resnet34 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.50E-01 | 77.8 | 396 | 12.9 | 2.79 | +| torch2trt.tests.torchvision.classification.resnet50 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.09E-01 | 55.8 | 326 | 17.9 | 3.37 | +| torch2trt.tests.torchvision.classification.resnet101 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 28.3 | 175 | 35.1 | 6.04 | +| torch2trt.tests.torchvision.classification.resnet152 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 18.8 | 122 | 53.2 | 8.57 | +| torch2trt.tests.torchvision.classification.densenet121 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 7.81E-03 | 20.9 | 76.6 | 47.5 | 13 | +| torch2trt.tests.torchvision.classification.densenet169 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.91E-03 | 14.8 | 41.7 | 66.7 | 23.7 | +| torch2trt.tests.torchvision.classification.densenet201 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 4.88E-03 | 12.6 | 30.2 | 79.1 | 33 | +| torch2trt.tests.torchvision.classification.densenet161 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 4.88E-03 | 16.1 | 43.7 | 62.1 | 23 | +| torch2trt.tests.torchvision.classification.vgg11 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.56E-03 | 84.8 | 
201 | 12.1 | 5.24 | +| torch2trt.tests.torchvision.classification.vgg13 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.24E-03 | 71.1 | 165 | 14.3 | 6.34 | +| torch2trt.tests.torchvision.classification.vgg16 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 3.78E-03 | 61.5 | 139 | 16.5 | 7.46 | +| torch2trt.tests.torchvision.classification.vgg19 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.81E-03 | 54.1 | 120 | 18.7 | 8.61 | +| torch2trt.tests.torchvision.classification.vgg11_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.20E-03 | 81.5 | 200 | 12.5 | 5.27 | +| torch2trt.tests.torchvision.classification.vgg13_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.71E-03 | 67.5 | 165 | 15.1 | 6.33 | +| torch2trt.tests.torchvision.classification.vgg16_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.87E-03 | 58.3 | 139 | 17.4 | 7.48 | +| torch2trt.tests.torchvision.classification.vgg19_bn | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.44E-03 | 51.4 | 120 | 19.7 | 8.61 | +| torch2trt.tests.torchvision.classification.mobilenet_v2 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 64.8 | 723 | 15.4 | 1.67 | +| torch2trt.tests.torchvision.classification.shufflenet_v2_x0_5 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.53E-05 | 51.2 | 463 | 19.4 | 2.17 | +| torch2trt.tests.torchvision.classification.shufflenet_v2_x1_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.53E-05 | 49.4 | 419 | 20.4 | 2.43 | +| torch2trt.tests.torchvision.classification.shufflenet_v2_x1_5 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.53E-05 | 51.4 | 426 | 19.6 | 2.37 | +| torch2trt.tests.torchvision.classification.shufflenet_v2_x2_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 1.53E-05 | 48.2 | 419 | 20.8 | 2.48 | +| torch2trt.tests.torchvision.classification.mnasnet0_5 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 2.03E-06 | 67.8 | 883 | 14.9 | 1.4 | +| torch2trt.tests.torchvision.classification.mnasnet0_75 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 67.6 | 751 | 14.8 | 1.6 | +| torch2trt.tests.torchvision.classification.mnasnet1_0 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 65.7 | 667 | 15.2 | 1.77 | +| torch2trt.tests.torchvision.classification.mnasnet1_3 | float16 | [(1, 3, 224, 224)] | {'fp16_mode': True} | 0.00E+00 | 67.4 | 573 | 15 | 2.02 | diff --git a/docs/css/version-select.css b/docs/css/version-select.css new file mode 100644 index 00000000..49079bf4 --- /dev/null +++ b/docs/css/version-select.css @@ -0,0 +1,5 @@ +@media only screen and (max-width:76.1875em) { + #version-selector { + padding: .6rem .8rem; + } +} diff --git a/docs/getting_started.md b/docs/getting_started.md new file mode 100644 index 00000000..75ce7335 --- /dev/null +++ b/docs/getting_started.md @@ -0,0 +1,32 @@ +# Getting Started + +Follow these steps to get started using torch2trt. + +!!! note + + torch2trt depends on the TensorRT Python API. On Jetson, this is included with the latest JetPack. For desktop, please follow the [TensorRT Installation Guide](https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html). You may also try installing torch2trt inside one of the NGC PyTorch docker containers for [Desktop](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) or [Jetson](https://ngc.nvidia.com/catalog/containers/nvidia:l4t-pytorch). 
+ +### Install Without plugins + +To install without compiling plugins, call the following + +```bash +git clone https://github.com/NVIDIA-AI-IOT/torch2trt +cd torch2trt +python setup.py install +``` + +### Install With plugins + +To install with plugins to support some operations in PyTorch that are not natively supported with TensorRT, call the following + +!!! note + + Please note, this currently only includes the interpolate plugin. This plugin requires PyTorch 1.3+ for serialization. + +```bash +git clone https://github.com/NVIDIA-AI-IOT/torch2trt +cd torch2trt +sudo python setup.py install --plugins +``` + diff --git a/docs/images/chart.svg b/docs/images/chart.svg new file mode 100644 index 00000000..28f0b295 --- /dev/null +++ b/docs/images/chart.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/images/check.svg b/docs/images/check.svg new file mode 100644 index 00000000..cf59f02c --- /dev/null +++ b/docs/images/check.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 00000000..6114cdc9 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,12 @@ +# torch2trt + + + +torch2trt is a PyTorch to TensorRT converter which utilizes the +TensorRT Python API. The converter is + +* Easy to use - Convert modules with a single function call ``torch2trt`` + +* Easy to extend - Write your own layer converter in Python and register it with ``@tensorrt_converter`` + +If you find an issue, please [let us know](https://github.com/NVIDIA-AI-IOT/torch2trt/issues)! \ No newline at end of file diff --git a/docs/js/version-select.js b/docs/js/version-select.js new file mode 100644 index 00000000..794b5cc8 --- /dev/null +++ b/docs/js/version-select.js @@ -0,0 +1,49 @@ +window.addEventListener("DOMContentLoaded", function() { + // This is a bit hacky. Figure out the base URL from a known CSS file the + // template refers to... + var ex = new RegExp("/?css/version-select.css$"); + var sheet = document.querySelector('link[href$="version-select.css"]'); + + var ABS_BASE_URL = sheet.href.replace(ex, ""); + var CURRENT_VERSION = ABS_BASE_URL.split("/").pop(); + + function makeSelect(options, selected) { + var select = document.createElement("select"); + select.classList.add("form-control"); + + options.forEach(function(i) { + var option = new Option(i.text, i.value, undefined, + i.value === selected); + select.add(option); + }); + + return select; + } + + var xhr = new XMLHttpRequest(); + xhr.open("GET", ABS_BASE_URL + "/../versions.json"); + xhr.onload = function() { + var versions = JSON.parse(this.responseText); + + var realVersion = versions.find(function(i) { + return i.version === CURRENT_VERSION || + i.aliases.includes(CURRENT_VERSION); + }).version; + + var select = makeSelect(versions.map(function(i) { + return {text: i.title, value: i.version}; + }), realVersion); + select.addEventListener("change", function(event) { + window.location.href = ABS_BASE_URL + "/../" + this.value; + }); + + var container = document.createElement("div"); + container.id = "version-selector"; + container.className = "md-nav__item"; + container.appendChild(select); + + var sidebar = document.querySelector(".md-nav--primary > .md-nav__list"); + sidebar.parentNode.insertBefore(container, sidebar); + }; + xhr.send(); +}); diff --git a/docs/see_also.md b/docs/see_also.md new file mode 100644 index 00000000..5217f1c5 --- /dev/null +++ b/docs/see_also.md @@ -0,0 +1,55 @@ +# See Also + +!!! note + + The state of these converters may change over time.
We provide this information here with the hope that it will help shed light on the landscape of tools available for optimizing PyTorch models with TensorRT. + If you find this information helpful, or if it becomes outdated or misleading, please let us know. + +In addition to torch2trt, there are other workflows for optimizing your PyTorch model with TensorRT. + +The other converters we are aware of are + +* [ONNX to TensorRT](https://github.com/onnx/onnx-tensorrt) + +!!! tip + + Since the ONNX parser ships with TensorRT, we have included a convenience method for using this + workflow with torch2trt. If you want to quickly try the ONNX method using the torch2trt interface, just call ``torch2trt(..., use_onnx=True)``. + This will perform conversion on the module by exporting the model using PyTorch's JIT tracer, + and parsing with TensorRT's ONNX parser. + +* [TRTorch](https://github.com/NVIDIA/TRTorch) + +Which one you use depends largely on your use case. The differences often come down to the following. + +## Layer support + +Modern deep learning frameworks are large, and caveats often arise when +converting between frameworks using a given workflow. These could include +limitations in serialization or parsing formats. In some instances, it may be possible +to support a layer, but the work simply has not been done yet. TRTorch is strong +in the sense that it will default to the original PyTorch method for layers +that are not converted to TensorRT. The best way to know +which conversion method works for you is to try converting your model. + +## Feature support + +TensorRT is evolving, and the conversion workflows may have varying levels +of feature support. In some instances, you may wish to use one of the latest features of TensorRT, like dynamic shapes, +that is not supported in torch2trt, or whose interface has not yet been exposed. In this +instance, we recommend checking to see if it is supported by one of the other workflows. The ONNX +converter is typically strong in this regard, since the parser is distributed with TensorRT. + +!!! note + + If there is a TensorRT feature you wish to see in torch2trt, please let us know. We cannot guarantee this will be done, but it helps us gauge interest. + +## Extensibility / Ease of Use + +If none of the converters satisfies your use case, you may find it necessary to adapt +one to fit your needs. This is very intuitive with torch2trt, +since it is done inline with Python, and there are many [examples](converters.md) to reference. If you know +how the original PyTorch method works, and have the TensorRT Python API on hand, it is relatively straightforward to adapt torch2trt to your needs. +The extensibility is often helpful when you want to implement a converter that is specific to the +context the layer appears in. + diff --git a/docs/usage/basic_usage.md b/docs/usage/basic_usage.md new file mode 100644 index 00000000..f4787656 --- /dev/null +++ b/docs/usage/basic_usage.md @@ -0,0 +1,59 @@ +# Basic Usage + +This page demonstrates basic torch2trt usage. + +## Conversion + +You can easily convert a PyTorch module by calling ``torch2trt``, passing example data as input. For example, to convert ``alexnet`` we call + +```python +import torch +from torch2trt import torch2trt +from torchvision.models.alexnet import alexnet + +# create some regular pytorch model...
+model = alexnet(pretrained=True).eval().cuda() + +# create example data +x = torch.ones((1, 3, 224, 224)).cuda() + +# convert to TensorRT feeding sample data as input +model_trt = torch2trt(model, [x]) +``` + +!!! note + + Currently with torch2trt, once the model is converted, you must use the same input shapes during + execution. The exception is + the batch size, which can vary up to the value specified by the ``max_batch_size`` parameter. + +## Execution + +We can execute the returned ``TRTModule`` just like the original PyTorch model. Here we +execute the model and print the maximum absolute error. + +```python +y = model(x) +y_trt = model_trt(x) + +# check the output against PyTorch +print(torch.max(torch.abs(y - y_trt))) +``` + +## Saving and loading + +We can save the model as a ``state_dict``. + +```python +torch.save(model_trt.state_dict(), 'alexnet_trt.pth') +``` + +We can load the saved model into a ``TRTModule`` + +```python +from torch2trt import TRTModule + +model_trt = TRTModule() + +model_trt.load_state_dict(torch.load('alexnet_trt.pth')) +``` diff --git a/docs/usage/custom_converter.md b/docs/usage/custom_converter.md new file mode 100644 index 00000000..8270c79d --- /dev/null +++ b/docs/usage/custom_converter.md @@ -0,0 +1,46 @@ +# Custom Converter + +This page details how to extend or modify the behavior of torch2trt by implementing and registering +custom converters. + +## Background + +torch2trt works by attaching conversion functions (like ``convert_ReLU``) to the original +PyTorch functional calls (like ``torch.nn.ReLU.forward``). The sample input data is passed +through the network, just as before, except now whenever a registered function (``torch.nn.ReLU.forward``) +is encountered, the corresponding converter (``convert_ReLU``) is also called afterwards. The converter +is passed the arguments and return value of the original PyTorch function, as well as the TensorRT +network that is being constructed. The input tensors to the original PyTorch function are modified to +have an attribute ``_trt``, which is the TensorRT counterpart to the PyTorch tensor. The conversion function +uses this ``_trt`` to add layers to the TensorRT network, and then sets the ``_trt`` attribute for +relevant output tensors. Once the model is fully executed, the final tensors returned are marked as outputs +of the TensorRT network, and the optimized TensorRT engine is built. + +## Add a custom converter + +Here we show how to add a converter for the ``ReLU`` module using the TensorRT +Python API. + +```python +import tensorrt as trt +from torch2trt import tensorrt_converter + +@tensorrt_converter('torch.nn.ReLU.forward') +def convert_ReLU(ctx): + input = ctx.method_args[1] + output = ctx.method_return + layer = ctx.network.add_activation(input=input._trt, type=trt.ActivationType.RELU) + output._trt = layer.get_output(0) +``` + +The converter takes one argument, a ``ConversionContext``, which will contain +the following + +* ``ctx.network`` - The TensorRT network that is being constructed. + +* ``ctx.method_args`` - Positional arguments that were passed to the specified PyTorch function. The ``_trt`` attribute is set for relevant input tensors. +* ``ctx.method_kwargs`` - Keyword arguments that were passed to the specified PyTorch function. +* ``ctx.method_return`` - The value returned by the specified PyTorch function. The converter must set the ``_trt`` attribute where relevant.
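As a further illustration of how ``ctx.method_args`` and ``ctx.method_kwargs`` work together, below is a sketch of a converter for ``torch.nn.functional.leaky_relu``. This is illustrative only (torch2trt may already ship its own converter for this function), and it assumes your TensorRT version exposes ``trt.ActivationType.LEAKY_RELU``.

```python
import tensorrt as trt
from torch2trt import tensorrt_converter


@tensorrt_converter('torch.nn.functional.leaky_relu')
def convert_leaky_relu(ctx):
    input = ctx.method_args[0]
    # negative_slope may arrive positionally or as a keyword argument
    if len(ctx.method_args) > 1:
        negative_slope = ctx.method_args[1]
    else:
        negative_slope = ctx.method_kwargs.get('negative_slope', 0.01)
    output = ctx.method_return
    layer = ctx.network.add_activation(input=input._trt, type=trt.ActivationType.LEAKY_RELU)
    layer.alpha = negative_slope  # slope applied to negative inputs
    output._trt = layer.get_output(0)
```

Note that for a free function like ``torch.nn.functional.leaky_relu`` the input tensor is ``ctx.method_args[0]``, whereas for a bound method like ``torch.nn.ReLU.forward`` it is ``ctx.method_args[1]``, since ``ctx.method_args[0]`` is ``self``.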
+ +Please see the [converters](../converters.md) page for a list of implemented converters and links to their source code. These may help +in learning how to write converters. diff --git a/docs/usage/reduced_precision.md b/docs/usage/reduced_precision.md new file mode 100644 index 00000000..1d288525 --- /dev/null +++ b/docs/usage/reduced_precision.md @@ -0,0 +1,152 @@ +# Reduced Precision + +For certain platforms, reduced precision can result in substantial improvements in throughput, +often with little impact on model accuracy. + +## Support Matrix + +Below is a table of layer precision support for various NVIDIA platforms. + +| Platform | FP16 | INT8 | +|----------|------|------| +| Jetson Nano | ![X](../images/check.svg) | | +| Jetson TX2 | ![X](../images/check.svg) | ![X](../images/check.svg) | +| Jetson Xavier NX | ![X](../images/check.svg) | ![X](../images/check.svg) | +| Jetson AGX Xavier | ![X](../images/check.svg) | ![X](../images/check.svg) | + +!!! note + + If the platform you're using is missing from this table, or you spot anything incorrect, + please [let us know](https://github.com/NVIDIA-AI-IOT/torch2trt). + +## FP16 Precision + +To enable support for fp16 precision with TensorRT, torch2trt exposes the ``fp16_mode`` parameter. +Converting a model with ``fp16_mode=True`` allows the TensorRT optimizer to select layers with fp16 +precision. + + +```python +model_trt = torch2trt(model, [data], fp16_mode=True) +``` + +!!! note + + When ``fp16_mode=True``, this does not necessarily mean that TensorRT will select FP16 layers. + The optimizer attempts to automatically select tactics which result in the best performance. + +## INT8 Precision + +torch2trt also supports int8 precision with TensorRT via the ``int8_mode`` parameter. Unlike fp16 and fp32 precision, switching +to int8 precision often requires calibration to avoid a significant drop in accuracy. + +### Input Data Calibration + +By default, +torch2trt will calibrate using the input data provided. For example, if you wanted +to calibrate on a set of 64 random normal images, you could do the following. + +```python +data = torch.randn(64, 3, 224, 224).cuda() + +model_trt = torch2trt(model, [data], int8_mode=True) +``` + +### Dataset Calibration + +In many instances, you may want to calibrate on more data than fits in memory. For this reason, +torch2trt exposes the ``int8_calib_dataset`` parameter. This parameter takes an input +dataset that is used for calibration. If this parameter is specified, the input data is +ignored during calibration. You create an input dataset by defining +a class which implements the ``__len__`` and ``__getitem__`` methods. + +* The ``__len__`` method should return the number of calibration samples +* The ``__getitem__`` method must return a single calibration sample. This is a list of input tensors to the model. Each tensor should match the shape +you provide to the ``inputs`` parameter when calling ``torch2trt``. + +For example, say you trained an image classification network using the PyTorch [``ImageFolder``](https://pytorch.org/docs/stable/torchvision/datasets.html#imagefolder) dataset. +You could wrap this dataset for calibration by defining a new dataset which returns only the images, without labels, in list format.
+ +```python +from torchvision.datasets import ImageFolder +from torchvision.transforms import ToTensor, Compose, Normalize, Resize + + +class ImageFolderCalibDataset(): + + def __init__(self, root): + self.dataset = ImageFolder( + root=root, + transform=Compose([ + Resize((224, 224)), + ToTensor(), + Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + ) + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, idx): + image, _ = self.dataset[idx] + image = image[None, ...] # add batch dimension + return [image] +``` + +You would then provide this calibration dataset to torch2trt as follows + +```python +dataset = ImageFolderCalibDataset('images') + +model_trt = torch2trt(model, [data], int8_mode=True, int8_calib_dataset=dataset) +``` + +### Calibration Algorithm + +To override the default calibration algorithm that torch2trt uses, you can set the ``int8_calib_algorithm`` parameter +to the [``tensorrt.CalibrationAlgoType``](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Int8/Calibrator.html#iint8calibrator) +that you wish to use. For example, to use the minmax calibration algorithm you would do + +```python +import tensorrt as trt + +model_trt = torch2trt(model, [data], int8_mode=True, int8_calib_algorithm=trt.CalibrationAlgoType.MINMAX_CALIBRATION) +``` + +### Calibration Batch Size + +During calibration, torch2trt pulls data in batches for the TensorRT calibrator. In some instances +[developers have found](https://github.com/NVIDIA-AI-IOT/torch2trt/pull/398) that the calibration batch size can impact the calibrated model accuracy. To set the calibration batch size, you can set the ``int8_calib_batch_size`` +parameter. For example, to use a calibration batch size of 32 you could do + +```python +model_trt = torch2trt(model, [data], int8_mode=True, int8_calib_batch_size=32) +``` + +## Binding Data Types + +The data types of input and output bindings in TensorRT are determined by the original +PyTorch module input and output data types. +This does not directly impact whether the TensorRT optimizer will internally use fp16 or int8 precision. + +For example, to create a model with fp32 precision bindings, you would do the following + +```python +model = model.float() +data = data.float() + +model_trt = torch2trt(model, [data], fp16_mode=True) +``` + +In this instance, the optimizer may choose to use fp16 precision layers internally, but the +input and output data types are fp32. To use fp16 precision input and output bindings you would do + +```python +model = model.half() +data = data.half() + +model_trt = torch2trt(model, [data], fp16_mode=True) +``` + +Now, the input and output bindings of the model are half precision, and internally the optimizer may +choose to select fp16 layers as well.
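As a quick sanity check of the binding behavior described above, you can inspect the dtype of the tensor returned by the converted module. This is a minimal sketch, assuming ``model`` and ``data`` are defined as in the preceding examples.

```python
import torch
from torch2trt import torch2trt

# half precision bindings: the module and the sample data are both fp16
model_trt = torch2trt(model.half(), [data.half()], fp16_mode=True)

y_trt = model_trt(data.half())
print(y_trt.dtype)  # expect torch.float16, matching the output binding
```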
diff --git a/example.ipynb b/example.ipynb deleted file mode 100644 index e54775c4..00000000 --- a/example.ipynb +++ /dev/null @@ -1,123 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "import torchvision.models\n", - "import torch\n", - "import tensorrt as trt\n", - "from torch2trt import torch2trt" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "def cross_validate(model, model_trt, data, tolerance=0.01):\n", - " y = model(data)\n", - " y_trt = model_trt(data)\n", - " max_error = torch.max(torch.abs(y - y_trt))\n", - " if max_error > tolerance:\n", - " raise RuntimeError('Model exceeds tolerance')" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "net = torch.nn.Sequential(*[\n", - " torch.nn.Conv2d(3, 32, 3),\n", - " torch.nn.ReLU(),\n", - " torch.nn.BatchNorm2d(32)\n", - "]).eval().half().cuda()\n", - "\n", - "data = torch.randn((1, 3, 224, 224)).cuda().half()" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "net_trt = torch2trt(net, [data], fp16_mode=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Help on instancemethod in module tensorrt.tensorrt:\n", - "\n", - "mark_output(...)\n", - " mark_output(self: tensorrt.tensorrt.INetworkDefinition, tensor: tensorrt.tensorrt.ITensor) -> None\n", - " \n", - " \n", - " Mark a tensor as an output.\n", - " \n", - " :arg tensor: The tensor to mark.\n", - "\n" - ] - } - ], - "source": [ - "help(trt.INetworkDefinition.mark_output)" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "ename": "RuntimeError", - "evalue": "expected type torch.cuda.FloatTensor but got torch.cuda.HalfTensor", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcross_validate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnet\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnet_trt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m\u001b[0m in \u001b[0;36mcross_validate\u001b[0;34m(model, model_trt, data, tolerance)\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0my_trt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel_trt\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mmax_error\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mabs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0my_trt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m 
\u001b[0;32mif\u001b[0m \u001b[0mmax_error\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0mtolerance\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Model exceeds tolerance'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mRuntimeError\u001b[0m: expected type torch.cuda.FloatTensor but got torch.cuda.HalfTensor" - ] - } - ], - "source": [ - "cross_validate(net, net_trt, data)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 00000000..27e4d2ed --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,55 @@ +site_name: torch2trt +theme: + name: "material" + palette: + primary: green + secondary: light green + +repo_url: https://github.com/NVIDIA-AI-IOT/torch2trt + +plugins: + - search + +use_directory_urls: False + +edit_uri: blob/master/docs +markdown_extensions: + - pymdownx.tabbed + - pymdownx.keys + - pymdownx.snippets + - pymdownx.inlinehilite + - pymdownx.highlight: + use_pygments: true + - admonition + - pymdownx.details + - pymdownx.superfences + - attr_list + +# use_directory_urls - False to fix broken raw html image links +# https://github.com/mkdocs/mkdocs/issues/991 + + +nav: + + - Home: index.md + - Getting Started: getting_started.md + - Usage: + - Basic Usage: usage/basic_usage.md + - Reduced Precision: usage/reduced_precision.md + - Custom Converter: usage/custom_converter.md + - Converters: converters.md + - Benchmarks: + - Jetson Nano: benchmarks/jetson_nano.md + - Jetson Xavier: benchmarks/jetson_xavier.md + - Contributing: CONTRIBUTING.md + - See Also: see_also.md + +extra_css: + - css/version-select.css +extra_javascript: + - js/version-select.js + +google_analytics: + - UA-135919510-3 + - auto + diff --git a/notebooks/image_classification/conversion.ipynb b/notebooks/image_classification/conversion.ipynb new file mode 100644 index 00000000..7d84b088 --- /dev/null +++ b/notebooks/image_classification/conversion.ipynb @@ -0,0 +1,171 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, we create the pre-trained ImageNet model. We'll use ``resnet18`` from the torchvision package. Make sure to set the device to ``cuda``, since the inputs and parameter devices are inferred from model. Also make sure to set ``eval()`` to fix batch norm statistics." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import torchvision\n", + "\n", + "model = torchvision.models.resnet18(pretrained=True).cuda().half().eval()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we create some sample input that will be used to infer the shape and data types of our TensorRT engine" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "data = torch.randn((1, 3, 224, 224)).cuda().half()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, create the optimized TensorRT engine." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from torch2trt import torch2trt\n", + "\n", + "model_trt = torch2trt(model, [data], fp16_mode=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can execute the network like this" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "output_trt = model_trt(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And check against the original output" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([ 0.7231, 3.0195, 3.1016, 3.1152, 4.7539, 3.8301, 3.9180, 0.3086,\n", + " -0.8726, -0.2261], device='cuda:0', dtype=torch.float16,\n", + " grad_fn=)\n", + "tensor([ 0.7202, 3.0234, 3.1074, 3.1133, 4.7539, 3.8340, 3.9141, 0.3081,\n", + " -0.8716, -0.2227], device='cuda:0', dtype=torch.float16)\n", + "max error: 0.011719\n" + ] + } + ], + "source": [ + "output = model(data)\n", + "\n", + "print(output.flatten()[0:10])\n", + "print(output_trt.flatten()[0:10])\n", + "print('max error: %f' % float(torch.max(torch.abs(output - output_trt))))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can save the model like this" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "torch.save(model_trt.state_dict(), 'resnet18_trt.pth')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And load the model like this." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from torch2trt import TRTModule\n", + "\n", + "model_trt = TRTModule()\n", + "\n", + "model_trt.load_state_dict(torch.load('resnet18_trt.pth'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it for this notebook! Try out the live demo to see real-time classification on a video feed." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/image_classification/imagenet_labels.json b/notebooks/image_classification/imagenet_labels.json new file mode 100644 index 00000000..d3314cfa --- /dev/null +++ b/notebooks/image_classification/imagenet_labels.json @@ -0,0 +1,1002 @@ +[ +"tench, Tinca tinca", +"goldfish, Carassius auratus", +"great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias", +"tiger shark, Galeocerdo cuvieri", +"hammerhead, hammerhead shark", +"electric ray, crampfish, numbfish, torpedo", +"stingray", +"cock", +"hen", +"ostrich, Struthio camelus", +"brambling, Fringilla montifringilla", +"goldfinch, Carduelis carduelis", +"house finch, linnet, Carpodacus mexicanus", +"junco, snowbird", +"indigo bunting, indigo finch, indigo bird, Passerina cyanea", +"robin, American robin, Turdus migratorius", +"bulbul", +"jay", +"magpie", +"chickadee", +"water ouzel, dipper", +"kite", +"bald eagle, American eagle, Haliaeetus leucocephalus", +"vulture", +"great grey owl, great gray owl, Strix nebulosa", +"European fire salamander, Salamandra salamandra", +"common newt, Triturus vulgaris", +"eft", +"spotted salamander, Ambystoma maculatum", +"axolotl, mud puppy, Ambystoma mexicanum", +"bullfrog, Rana catesbeiana", +"tree frog, tree-frog", +"tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui", +"loggerhead, loggerhead turtle, Caretta caretta", +"leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea", +"mud turtle", +"terrapin", +"box turtle, box tortoise", +"banded gecko", +"common iguana, iguana, Iguana iguana", +"American chameleon, anole, Anolis carolinensis", +"whiptail, whiptail lizard", +"agama", +"frilled lizard, Chlamydosaurus kingi", +"alligator lizard", +"Gila monster, Heloderma suspectum", +"green lizard, Lacerta viridis", +"African chameleon, Chamaeleo chamaeleon", +"Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis", +"African crocodile, Nile crocodile, Crocodylus niloticus", +"American alligator, Alligator mississipiensis", +"triceratops", +"thunder snake, worm snake, Carphophis amoenus", +"ringneck snake, ring-necked snake, ring snake", +"hognose snake, puff adder, sand viper", +"green snake, grass snake", +"king snake, kingsnake", +"garter snake, grass snake", +"water snake", +"vine snake", +"night snake, Hypsiglena torquata", +"boa constrictor, Constrictor constrictor", +"rock python, rock snake, Python sebae", +"Indian cobra, Naja naja", +"green mamba", +"sea snake", +"horned viper, cerastes, sand viper, horned asp, Cerastes cornutus", +"diamondback, diamondback rattlesnake, Crotalus adamanteus", +"sidewinder, horned rattlesnake, Crotalus cerastes", +"trilobite", +"harvestman, daddy longlegs, Phalangium opilio", +"scorpion", +"black and gold garden spider, Argiope aurantia", +"barn spider, Araneus cavaticus", +"garden spider, Aranea diademata", +"black widow, Latrodectus mactans", +"tarantula", +"wolf spider, hunting spider", +"tick", +"centipede", +"black grouse", +"ptarmigan", +"ruffed grouse, partridge, Bonasa umbellus", +"prairie chicken, prairie grouse, prairie fowl", +"peacock", 
+"quail", +"partridge", +"African grey, African gray, Psittacus erithacus", +"macaw", +"sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita", +"lorikeet", +"coucal", +"bee eater", +"hornbill", +"hummingbird", +"jacamar", +"toucan", +"drake", +"red-breasted merganser, Mergus serrator", +"goose", +"black swan, Cygnus atratus", +"tusker", +"echidna, spiny anteater, anteater", +"platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus", +"wallaby, brush kangaroo", +"koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus", +"wombat", +"jellyfish", +"sea anemone, anemone", +"brain coral", +"flatworm, platyhelminth", +"nematode, nematode worm, roundworm", +"conch", +"snail", +"slug", +"sea slug, nudibranch", +"chiton, coat-of-mail shell, sea cradle, polyplacophore", +"chambered nautilus, pearly nautilus, nautilus", +"Dungeness crab, Cancer magister", +"rock crab, Cancer irroratus", +"fiddler crab", +"king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica", +"American lobster, Northern lobster, Maine lobster, Homarus americanus", +"spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish", +"crayfish, crawfish, crawdad, crawdaddy", +"hermit crab", +"isopod", +"white stork, Ciconia ciconia", +"black stork, Ciconia nigra", +"spoonbill", +"flamingo", +"little blue heron, Egretta caerulea", +"American egret, great white heron, Egretta albus", +"bittern", +"crane", +"limpkin, Aramus pictus", +"European gallinule, Porphyrio porphyrio", +"American coot, marsh hen, mud hen, water hen, Fulica americana", +"bustard", +"ruddy turnstone, Arenaria interpres", +"red-backed sandpiper, dunlin, Erolia alpina", +"redshank, Tringa totanus", +"dowitcher", +"oystercatcher, oyster catcher", +"pelican", +"king penguin, Aptenodytes patagonica", +"albatross, mollymawk", +"grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus", +"killer whale, killer, orca, grampus, sea wolf, Orcinus orca", +"dugong, Dugong dugon", +"sea lion", +"Chihuahua", +"Japanese spaniel", +"Maltese dog, Maltese terrier, Maltese", +"Pekinese, Pekingese, Peke", +"Shih-Tzu", +"Blenheim spaniel", +"papillon", +"toy terrier", +"Rhodesian ridgeback", +"Afghan hound, Afghan", +"basset, basset hound", +"beagle", +"bloodhound, sleuthhound", +"bluetick", +"black-and-tan coonhound", +"Walker hound, Walker foxhound", +"English foxhound", +"redbone", +"borzoi, Russian wolfhound", +"Irish wolfhound", +"Italian greyhound", +"whippet", +"Ibizan hound, Ibizan Podenco", +"Norwegian elkhound, elkhound", +"otterhound, otter hound", +"Saluki, gazelle hound", +"Scottish deerhound, deerhound", +"Weimaraner", +"Staffordshire bullterrier, Staffordshire bull terrier", +"American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier", +"Bedlington terrier", +"Border terrier", +"Kerry blue terrier", +"Irish terrier", +"Norfolk terrier", +"Norwich terrier", +"Yorkshire terrier", +"wire-haired fox terrier", +"Lakeland terrier", +"Sealyham terrier, Sealyham", +"Airedale, Airedale terrier", +"cairn, cairn terrier", +"Australian terrier", +"Dandie Dinmont, Dandie Dinmont terrier", +"Boston bull, Boston terrier", +"miniature schnauzer", +"giant schnauzer", +"standard schnauzer", +"Scotch terrier, Scottish terrier, Scottie", +"Tibetan terrier, chrysanthemum dog", +"silky terrier, Sydney silky", +"soft-coated wheaten terrier", +"West Highland white terrier", +"Lhasa, Lhasa apso", +"flat-coated retriever", +"curly-coated 
retriever", +"golden retriever", +"Labrador retriever", +"Chesapeake Bay retriever", +"German short-haired pointer", +"vizsla, Hungarian pointer", +"English setter", +"Irish setter, red setter", +"Gordon setter", +"Brittany spaniel", +"clumber, clumber spaniel", +"English springer, English springer spaniel", +"Welsh springer spaniel", +"cocker spaniel, English cocker spaniel, cocker", +"Sussex spaniel", +"Irish water spaniel", +"kuvasz", +"schipperke", +"groenendael", +"malinois", +"briard", +"kelpie", +"komondor", +"Old English sheepdog, bobtail", +"Shetland sheepdog, Shetland sheep dog, Shetland", +"collie", +"Border collie", +"Bouvier des Flandres, Bouviers des Flandres", +"Rottweiler", +"German shepherd, German shepherd dog, German police dog, alsatian", +"Doberman, Doberman pinscher", +"miniature pinscher", +"Greater Swiss Mountain dog", +"Bernese mountain dog", +"Appenzeller", +"EntleBucher", +"boxer", +"bull mastiff", +"Tibetan mastiff", +"French bulldog", +"Great Dane", +"Saint Bernard, St Bernard", +"Eskimo dog, husky", +"malamute, malemute, Alaskan malamute", +"Siberian husky", +"dalmatian, coach dog, carriage dog", +"affenpinscher, monkey pinscher, monkey dog", +"basenji", +"pug, pug-dog", +"Leonberg", +"Newfoundland, Newfoundland dog", +"Great Pyrenees", +"Samoyed, Samoyede", +"Pomeranian", +"chow, chow chow", +"keeshond", +"Brabancon griffon", +"Pembroke, Pembroke Welsh corgi", +"Cardigan, Cardigan Welsh corgi", +"toy poodle", +"miniature poodle", +"standard poodle", +"Mexican hairless", +"timber wolf, grey wolf, gray wolf, Canis lupus", +"white wolf, Arctic wolf, Canis lupus tundrarum", +"red wolf, maned wolf, Canis rufus, Canis niger", +"coyote, prairie wolf, brush wolf, Canis latrans", +"dingo, warrigal, warragal, Canis dingo", +"dhole, Cuon alpinus", +"African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus", +"hyena, hyaena", +"red fox, Vulpes vulpes", +"kit fox, Vulpes macrotis", +"Arctic fox, white fox, Alopex lagopus", +"grey fox, gray fox, Urocyon cinereoargenteus", +"tabby, tabby cat", +"tiger cat", +"Persian cat", +"Siamese cat, Siamese", +"Egyptian cat", +"cougar, puma, catamount, mountain lion, painter, panther, Felis concolor", +"lynx, catamount", +"leopard, Panthera pardus", +"snow leopard, ounce, Panthera uncia", +"jaguar, panther, Panthera onca, Felis onca", +"lion, king of beasts, Panthera leo", +"tiger, Panthera tigris", +"cheetah, chetah, Acinonyx jubatus", +"brown bear, bruin, Ursus arctos", +"American black bear, black bear, Ursus americanus, Euarctos americanus", +"ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus", +"sloth bear, Melursus ursinus, Ursus ursinus", +"mongoose", +"meerkat, mierkat", +"tiger beetle", +"ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle", +"ground beetle, carabid beetle", +"long-horned beetle, longicorn, longicorn beetle", +"leaf beetle, chrysomelid", +"dung beetle", +"rhinoceros beetle", +"weevil", +"fly", +"bee", +"ant, emmet, pismire", +"grasshopper, hopper", +"cricket", +"walking stick, walkingstick, stick insect", +"cockroach, roach", +"mantis, mantid", +"cicada, cicala", +"leafhopper", +"lacewing, lacewing fly", +"dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk", +"damselfly", +"admiral", +"ringlet, ringlet butterfly", +"monarch, monarch butterfly, milkweed butterfly, Danaus plexippus", +"cabbage butterfly", +"sulphur butterfly, sulfur butterfly", +"lycaenid, lycaenid butterfly", +"starfish, sea star", +"sea urchin", 
+"sea cucumber, holothurian", +"wood rabbit, cottontail, cottontail rabbit", +"hare", +"Angora, Angora rabbit", +"hamster", +"porcupine, hedgehog", +"fox squirrel, eastern fox squirrel, Sciurus niger", +"marmot", +"beaver", +"guinea pig, Cavia cobaya", +"sorrel", +"zebra", +"hog, pig, grunter, squealer, Sus scrofa", +"wild boar, boar, Sus scrofa", +"warthog", +"hippopotamus, hippo, river horse, Hippopotamus amphibius", +"ox", +"water buffalo, water ox, Asiatic buffalo, Bubalus bubalis", +"bison", +"ram, tup", +"bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis", +"ibex, Capra ibex", +"hartebeest", +"impala, Aepyceros melampus", +"gazelle", +"Arabian camel, dromedary, Camelus dromedarius", +"llama", +"weasel", +"mink", +"polecat, fitch, foulmart, foumart, Mustela putorius", +"black-footed ferret, ferret, Mustela nigripes", +"otter", +"skunk, polecat, wood pussy", +"badger", +"armadillo", +"three-toed sloth, ai, Bradypus tridactylus", +"orangutan, orang, orangutang, Pongo pygmaeus", +"gorilla, Gorilla gorilla", +"chimpanzee, chimp, Pan troglodytes", +"gibbon, Hylobates lar", +"siamang, Hylobates syndactylus, Symphalangus syndactylus", +"guenon, guenon monkey", +"patas, hussar monkey, Erythrocebus patas", +"baboon", +"macaque", +"langur", +"colobus, colobus monkey", +"proboscis monkey, Nasalis larvatus", +"marmoset", +"capuchin, ringtail, Cebus capucinus", +"howler monkey, howler", +"titi, titi monkey", +"spider monkey, Ateles geoffroyi", +"squirrel monkey, Saimiri sciureus", +"Madagascar cat, ring-tailed lemur, Lemur catta", +"indri, indris, Indri indri, Indri brevicaudatus", +"Indian elephant, Elephas maximus", +"African elephant, Loxodonta africana", +"lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens", +"giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca", +"barracouta, snoek", +"eel", +"coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch", +"rock beauty, Holocanthus tricolor", +"anemone fish", +"sturgeon", +"gar, garfish, garpike, billfish, Lepisosteus osseus", +"lionfish", +"puffer, pufferfish, blowfish, globefish", +"abacus", +"abaya", +"academic gown, academic robe, judge's robe", +"accordion, piano accordion, squeeze box", +"acoustic guitar", +"aircraft carrier, carrier, flattop, attack aircraft carrier", +"airliner", +"airship, dirigible", +"altar", +"ambulance", +"amphibian, amphibious vehicle", +"analog clock", +"apiary, bee house", +"apron", +"ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin", +"assault rifle, assault gun", +"backpack, back pack, knapsack, packsack, rucksack, haversack", +"bakery, bakeshop, bakehouse", +"balance beam, beam", +"balloon", +"ballpoint, ballpoint pen, ballpen, Biro", +"Band Aid", +"banjo", +"bannister, banister, balustrade, balusters, handrail", +"barbell", +"barber chair", +"barbershop", +"barn", +"barometer", +"barrel, cask", +"barrow, garden cart, lawn cart, wheelbarrow", +"baseball", +"basketball", +"bassinet", +"bassoon", +"bathing cap, swimming cap", +"bath towel", +"bathtub, bathing tub, bath, tub", +"beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon", +"beacon, lighthouse, beacon light, pharos", +"beaker", +"bearskin, busby, shako", +"beer bottle", +"beer glass", +"bell cote, bell cot", +"bib", +"bicycle-built-for-two, tandem bicycle, tandem", +"bikini, two-piece", +"binder, ring-binder", +"binoculars, field glasses, opera glasses", +"birdhouse", +"boathouse", 
+"bobsled, bobsleigh, bob", +"bolo tie, bolo, bola tie, bola", +"bonnet, poke bonnet", +"bookcase", +"bookshop, bookstore, bookstall", +"bottlecap", +"bow", +"bow tie, bow-tie, bowtie", +"brass, memorial tablet, plaque", +"brassiere, bra, bandeau", +"breakwater, groin, groyne, mole, bulwark, seawall, jetty", +"breastplate, aegis, egis", +"broom", +"bucket, pail", +"buckle", +"bulletproof vest", +"bullet train, bullet", +"butcher shop, meat market", +"cab, hack, taxi, taxicab", +"caldron, cauldron", +"candle, taper, wax light", +"cannon", +"canoe", +"can opener, tin opener", +"cardigan", +"car mirror", +"carousel, carrousel, merry-go-round, roundabout, whirligig", +"carpenter's kit, tool kit", +"carton", +"car wheel", +"cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM", +"cassette", +"cassette player", +"castle", +"catamaran", +"CD player", +"cello, violoncello", +"cellular telephone, cellular phone, cellphone, cell, mobile phone", +"chain", +"chainlink fence", +"chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour", +"chain saw, chainsaw", +"chest", +"chiffonier, commode", +"chime, bell, gong", +"china cabinet, china closet", +"Christmas stocking", +"church, church building", +"cinema, movie theater, movie theatre, movie house, picture palace", +"cleaver, meat cleaver, chopper", +"cliff dwelling", +"cloak", +"clog, geta, patten, sabot", +"cocktail shaker", +"coffee mug", +"coffeepot", +"coil, spiral, volute, whorl, helix", +"combination lock", +"computer keyboard, keypad", +"confectionery, confectionary, candy store", +"container ship, containership, container vessel", +"convertible", +"corkscrew, bottle screw", +"cornet, horn, trumpet, trump", +"cowboy boot", +"cowboy hat, ten-gallon hat", +"cradle", +"crane", +"crash helmet", +"crate", +"crib, cot", +"Crock Pot", +"croquet ball", +"crutch", +"cuirass", +"dam, dike, dyke", +"desk", +"desktop computer", +"dial telephone, dial phone", +"diaper, nappy, napkin", +"digital clock", +"digital watch", +"dining table, board", +"dishrag, dishcloth", +"dishwasher, dish washer, dishwashing machine", +"disk brake, disc brake", +"dock, dockage, docking facility", +"dogsled, dog sled, dog sleigh", +"dome", +"doormat, welcome mat", +"drilling platform, offshore rig", +"drum, membranophone, tympan", +"drumstick", +"dumbbell", +"Dutch oven", +"electric fan, blower", +"electric guitar", +"electric locomotive", +"entertainment center", +"envelope", +"espresso maker", +"face powder", +"feather boa, boa", +"file, file cabinet, filing cabinet", +"fireboat", +"fire engine, fire truck", +"fire screen, fireguard", +"flagpole, flagstaff", +"flute, transverse flute", +"folding chair", +"football helmet", +"forklift", +"fountain", +"fountain pen", +"four-poster", +"freight car", +"French horn, horn", +"frying pan, frypan, skillet", +"fur coat", +"garbage truck, dustcart", +"gasmask, respirator, gas helmet", +"gas pump, gasoline pump, petrol pump, island dispenser", +"goblet", +"go-kart", +"golf ball", +"golfcart, golf cart", +"gondola", +"gong, tam-tam", +"gown", +"grand piano, grand", +"greenhouse, nursery, glasshouse", +"grille, radiator grille", +"grocery store, grocery, food market, market", +"guillotine", +"hair slide", +"hair spray", +"half track", +"hammer", +"hamper", +"hand blower, blow dryer, blow drier, hair dryer, hair drier", +"hand-held computer, hand-held microcomputer", +"handkerchief, hankie, hanky, hankey", +"hard disc, hard disk, fixed disk", 
+"harmonica, mouth organ, harp, mouth harp", +"harp", +"harvester, reaper", +"hatchet", +"holster", +"home theater, home theatre", +"honeycomb", +"hook, claw", +"hoopskirt, crinoline", +"horizontal bar, high bar", +"horse cart, horse-cart", +"hourglass", +"iPod", +"iron, smoothing iron", +"jack-o'-lantern", +"jean, blue jean, denim", +"jeep, landrover", +"jersey, T-shirt, tee shirt", +"jigsaw puzzle", +"jinrikisha, ricksha, rickshaw", +"joystick", +"kimono", +"knee pad", +"knot", +"lab coat, laboratory coat", +"ladle", +"lampshade, lamp shade", +"laptop, laptop computer", +"lawn mower, mower", +"lens cap, lens cover", +"letter opener, paper knife, paperknife", +"library", +"lifeboat", +"lighter, light, igniter, ignitor", +"limousine, limo", +"liner, ocean liner", +"lipstick, lip rouge", +"Loafer", +"lotion", +"loudspeaker, speaker, speaker unit, loudspeaker system, speaker system", +"loupe, jeweler's loupe", +"lumbermill, sawmill", +"magnetic compass", +"mailbag, postbag", +"mailbox, letter box", +"maillot", +"maillot, tank suit", +"manhole cover", +"maraca", +"marimba, xylophone", +"mask", +"matchstick", +"maypole", +"maze, labyrinth", +"measuring cup", +"medicine chest, medicine cabinet", +"megalith, megalithic structure", +"microphone, mike", +"microwave, microwave oven", +"military uniform", +"milk can", +"minibus", +"miniskirt, mini", +"minivan", +"missile", +"mitten", +"mixing bowl", +"mobile home, manufactured home", +"Model T", +"modem", +"monastery", +"monitor", +"moped", +"mortar", +"mortarboard", +"mosque", +"mosquito net", +"motor scooter, scooter", +"mountain bike, all-terrain bike, off-roader", +"mountain tent", +"mouse, computer mouse", +"mousetrap", +"moving van", +"muzzle", +"nail", +"neck brace", +"necklace", +"nipple", +"notebook, notebook computer", +"obelisk", +"oboe, hautboy, hautbois", +"ocarina, sweet potato", +"odometer, hodometer, mileometer, milometer", +"oil filter", +"organ, pipe organ", +"oscilloscope, scope, cathode-ray oscilloscope, CRO", +"overskirt", +"oxcart", +"oxygen mask", +"packet", +"paddle, boat paddle", +"paddlewheel, paddle wheel", +"padlock", +"paintbrush", +"pajama, pyjama, pj's, jammies", +"palace", +"panpipe, pandean pipe, syrinx", +"paper towel", +"parachute, chute", +"parallel bars, bars", +"park bench", +"parking meter", +"passenger car, coach, carriage", +"patio, terrace", +"pay-phone, pay-station", +"pedestal, plinth, footstall", +"pencil box, pencil case", +"pencil sharpener", +"perfume, essence", +"Petri dish", +"photocopier", +"pick, plectrum, plectron", +"pickelhaube", +"picket fence, paling", +"pickup, pickup truck", +"pier", +"piggy bank, penny bank", +"pill bottle", +"pillow", +"ping-pong ball", +"pinwheel", +"pirate, pirate ship", +"pitcher, ewer", +"plane, carpenter's plane, woodworking plane", +"planetarium", +"plastic bag", +"plate rack", +"plow, plough", +"plunger, plumber's helper", +"Polaroid camera, Polaroid Land camera", +"pole", +"police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria", +"poncho", +"pool table, billiard table, snooker table", +"pop bottle, soda bottle", +"pot, flowerpot", +"potter's wheel", +"power drill", +"prayer rug, prayer mat", +"printer", +"prison, prison house", +"projectile, missile", +"projector", +"puck, hockey puck", +"punching bag, punch bag, punching ball, punchball", +"purse", +"quill, quill pen", +"quilt, comforter, comfort, puff", +"racer, race car, racing car", +"racket, racquet", +"radiator", +"radio, wireless", +"radio telescope, radio reflector", +"rain barrel", 
+"recreational vehicle, RV, R.V.", +"reel", +"reflex camera", +"refrigerator, icebox", +"remote control, remote", +"restaurant, eating house, eating place, eatery", +"revolver, six-gun, six-shooter", +"rifle", +"rocking chair, rocker", +"rotisserie", +"rubber eraser, rubber, pencil eraser", +"rugby ball", +"rule, ruler", +"running shoe", +"safe", +"safety pin", +"saltshaker, salt shaker", +"sandal", +"sarong", +"sax, saxophone", +"scabbard", +"scale, weighing machine", +"school bus", +"schooner", +"scoreboard", +"screen, CRT screen", +"screw", +"screwdriver", +"seat belt, seatbelt", +"sewing machine", +"shield, buckler", +"shoe shop, shoe-shop, shoe store", +"shoji", +"shopping basket", +"shopping cart", +"shovel", +"shower cap", +"shower curtain", +"ski", +"ski mask", +"sleeping bag", +"slide rule, slipstick", +"sliding door", +"slot, one-armed bandit", +"snorkel", +"snowmobile", +"snowplow, snowplough", +"soap dispenser", +"soccer ball", +"sock", +"solar dish, solar collector, solar furnace", +"sombrero", +"soup bowl", +"space bar", +"space heater", +"space shuttle", +"spatula", +"speedboat", +"spider web, spider's web", +"spindle", +"sports car, sport car", +"spotlight, spot", +"stage", +"steam locomotive", +"steel arch bridge", +"steel drum", +"stethoscope", +"stole", +"stone wall", +"stopwatch, stop watch", +"stove", +"strainer", +"streetcar, tram, tramcar, trolley, trolley car", +"stretcher", +"studio couch, day bed", +"stupa, tope", +"submarine, pigboat, sub, U-boat", +"suit, suit of clothes", +"sundial", +"sunglass", +"sunglasses, dark glasses, shades", +"sunscreen, sunblock, sun blocker", +"suspension bridge", +"swab, swob, mop", +"sweatshirt", +"swimming trunks, bathing trunks", +"swing", +"switch, electric switch, electrical switch", +"syringe", +"table lamp", +"tank, army tank, armored combat vehicle, armoured combat vehicle", +"tape player", +"teapot", +"teddy, teddy bear", +"television, television system", +"tennis ball", +"thatch, thatched roof", +"theater curtain, theatre curtain", +"thimble", +"thresher, thrasher, threshing machine", +"throne", +"tile roof", +"toaster", +"tobacco shop, tobacconist shop, tobacconist", +"toilet seat", +"torch", +"totem pole", +"tow truck, tow car, wrecker", +"toyshop", +"tractor", +"trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi", +"tray", +"trench coat", +"tricycle, trike, velocipede", +"trimaran", +"tripod", +"triumphal arch", +"trolleybus, trolley coach, trackless trolley", +"trombone", +"tub, vat", +"turnstile", +"typewriter keyboard", +"umbrella", +"unicycle, monocycle", +"upright, upright piano", +"vacuum, vacuum cleaner", +"vase", +"vault", +"velvet", +"vending machine", +"vestment", +"viaduct", +"violin, fiddle", +"volleyball", +"waffle iron", +"wall clock", +"wallet, billfold, notecase, pocketbook", +"wardrobe, closet, press", +"warplane, military plane", +"washbasin, handbasin, washbowl, lavabo, wash-hand basin", +"washer, automatic washer, washing machine", +"water bottle", +"water jug", +"water tower", +"whiskey jug", +"whistle", +"wig", +"window screen", +"window shade", +"Windsor tie", +"wine bottle", +"wing", +"wok", +"wooden spoon", +"wool, woolen, woollen", +"worm fence, snake fence, snake-rail fence, Virginia fence", +"wreck", +"yawl", +"yurt", +"web site, website, internet site, site", +"comic book", +"crossword puzzle, crossword", +"street sign", +"traffic light, traffic signal, stoplight", +"book jacket, dust cover, dust jacket, dust wrapper", +"menu", +"plate", +"guacamole", +"consomme", 
+"hot pot, hotpot", +"trifle", +"ice cream, icecream", +"ice lolly, lolly, lollipop, popsicle", +"French loaf", +"bagel, beigel", +"pretzel", +"cheeseburger", +"hotdog, hot dog, red hot", +"mashed potato", +"head cabbage", +"broccoli", +"cauliflower", +"zucchini, courgette", +"spaghetti squash", +"acorn squash", +"butternut squash", +"cucumber, cuke", +"artichoke, globe artichoke", +"bell pepper", +"cardoon", +"mushroom", +"Granny Smith", +"strawberry", +"orange", +"lemon", +"fig", +"pineapple, ananas", +"banana", +"jackfruit, jak, jack", +"custard apple", +"pomegranate", +"hay", +"carbonara", +"chocolate sauce, chocolate syrup", +"dough", +"meat loaf, meatloaf", +"pizza, pizza pie", +"potpie", +"burrito", +"red wine", +"espresso", +"cup", +"eggnog", +"alp", +"bubble", +"cliff, drop, drop-off", +"coral reef", +"geyser", +"lakeside, lakeshore", +"promontory, headland, head, foreland", +"sandbar, sand bar", +"seashore, coast, seacoast, sea-coast", +"valley, vale", +"volcano", +"ballplayer, baseball player", +"groom, bridegroom", +"scuba diver", +"rapeseed", +"daisy", +"yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum", +"corn", +"acorn", +"hip, rose hip, rosehip", +"buckeye, horse chestnut, conker", +"coral fungus", +"agaric", +"gyromitra", +"stinkhorn, carrion fungus", +"earthstar", +"hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa", +"bolete", +"ear, spike, capitulum", +"toilet tissue, toilet paper, bathroom tissue" +] \ No newline at end of file diff --git a/notebooks/image_classification/live_demo.ipynb b/notebooks/image_classification/live_demo.ipynb new file mode 100644 index 00000000..8f217f1f --- /dev/null +++ b/notebooks/image_classification/live_demo.ipynb @@ -0,0 +1,188 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook will run a live demo on Jetson Nano using [JetCam](https://github.com/NVIDIA-AI-IOT/jetcam) to acquire images from the camera. First,\n", + "let's start the camera. See the JetCam examples for details." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from jetcam.csi_camera import CSICamera\n", + "# from jetcam.usb_camera import USBCamera\n", + "\n", + "camera = CSICamera(width=224, height=224)\n", + "# camera = USBCamera(width=224, height=224)\n", + "\n", + "camera.running = True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's connect the camera's value to a widget to display." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from jetcam.utils import bgr8_to_jpeg\n", + "import traitlets\n", + "import ipywidgets\n", + "\n", + "image_w = ipywidgets.Image()\n", + "\n", + "traitlets.dlink((camera, 'value'), (image_w, 'value'), transform=bgr8_to_jpeg)\n", + "\n", + "display(image_w)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we'll load the TensorRT model. 
(We assume you followed the conversion notebook and saved the model to the path ``resnet18_trt.pth``.)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "from torch2trt import TRTModule\n",
+ "\n",
+ "model_trt = TRTModule()\n",
+ "model_trt.load_state_dict(torch.load('resnet18_trt.pth'))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The following function will be used to pre-process images from the camera."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import cv2\n",
+ "import numpy as np\n",
+ "import torchvision\n",
+ "\n",
+ "device = torch.device('cuda')\n",
+ "mean = 255.0 * np.array([0.485, 0.456, 0.406])\n",
+ "stdev = 255.0 * np.array([0.229, 0.224, 0.225])\n",
+ "\n",
+ "normalize = torchvision.transforms.Normalize(mean, stdev)\n",
+ "\n",
+ "def preprocess(camera_value):\n",
+ "    global device, normalize\n",
+ "    x = camera_value\n",
+ "    x = cv2.cvtColor(x, cv2.COLOR_BGR2RGB)\n",
+ "    x = x.transpose((2, 0, 1))\n",
+ "    x = torch.from_numpy(x).float()\n",
+ "    x = normalize(x)\n",
+ "    x = x.to(device)\n",
+ "    x = x[None, ...]\n",
+ "    return x"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This text area will be used to display the class predictions."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "text = ipywidgets.Textarea()\n",
+ "display(text)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We load the ImageNet labels to associate the neural network output with a class name."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "\n",
+ "with open('imagenet_labels.json', 'r') as f:\n",
+ "    labels = json.load(f)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Finally, we create our execution function, which we attach as a callback to the camera's ``value`` attribute.\n",
+ "\n",
+ "Whenever the camera's value is updated (which it will be for each frame, since we set ``camera.running = True``), this function will be called with a\n",
+ "``change`` dictionary describing how the value changed. The new camera value will be stored in ``change['new']``."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def execute(change):\n", + " image = change['new']\n", + " output = model_trt(preprocess(image).half()).detach().cpu().numpy().flatten()\n", + " idx = output.argmax()\n", + " text.value = labels[idx]\n", + "\n", + "camera.observe(execute, names='value')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/image_segmentation/conversion.ipynb b/notebooks/image_segmentation/conversion.ipynb new file mode 100644 index 00000000..b540b34c --- /dev/null +++ b/notebooks/image_segmentation/conversion.ipynb @@ -0,0 +1,226 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torchvision\n", + "import torch2trt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = torchvision.models.segmentation.deeplabv3_resnet101(pretrained=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = model.cuda().eval().half()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class ModelWrapper(torch.nn.Module):\n", + " def __init__(self, model):\n", + " super(ModelWrapper, self).__init__()\n", + " self.model = model\n", + " def forward(self, x):\n", + " return self.model(x)['out']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_w = ModelWrapper(model).half()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = torch.ones((1, 3, 224, 224)).cuda().half()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_trt = torch2trt.torch2trt(model_w, [data], fp16_mode=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Live demo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# from jetcam.csi_camera import CSICamera\n", + "from jetcam.usb_camera import USBCamera\n", + "\n", + "# camera = CSICamera(width=224, height=224)\n", + "camera = USBCamera(width=224, height=224)\n", + "\n", + "camera.running = True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from jetcam.utils import bgr8_to_jpeg\n", + "import traitlets\n", + "import ipywidgets\n", + "\n", + "image_w = ipywidgets.Image()\n", + "\n", + "traitlets.dlink((camera, 'value'), (image_w, 'value'), transform=bgr8_to_jpeg)\n", + "\n", + "display(image_w)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import cv2\n", + "import numpy as np\n", + "import torchvision\n", + "\n", + "device = torch.device('cuda')\n", + "mean = 255.0 * np.array([0.485, 0.456, 0.406])\n", + "stdev = 255.0 * np.array([0.229, 0.224, 0.225])\n", + "\n", + 
"normalize = torchvision.transforms.Normalize(mean, stdev)\n", + "\n", + "def preprocess(camera_value):\n", + " global device, normalize\n", + " x = camera_value\n", + " x = cv2.cvtColor(x, cv2.COLOR_BGR2RGB)\n", + " x = x.transpose((2, 0, 1))\n", + " x = torch.from_numpy(x).float()\n", + " x = normalize(x)\n", + " x = x.to(device)\n", + " x = x[None, ...]\n", + " return x" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "seg_image = ipywidgets.Image()\n", + "\n", + "display(seg_image)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def execute(change):\n", + " image = change['new']\n", + " output = model_trt(preprocess(camera.value).half())[0].detach().cpu().float().numpy()\n", + " mask = 1.0 * (output.argmax(0) == 15)\n", + " seg_image.value = bgr8_to_jpeg(mask[:, :, None] * image)\n", + " \n", + " \n", + "mask = execute({'new': camera.value})\n", + "# camera.observe(execute, names='value')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "camera.observe(execute, names='value')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "camera.unobserve(execute, names='value')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "\n", + "torch.cuda.current_stream().synchronize()\n", + "t0 = time.time()\n", + "for i in range(100):\n", + " output = model_w(preprocess(camera.value).half())\n", + "torch.cuda.current_stream().synchronize()\n", + "t1 = time.time()\n", + "\n", + "print(100.0 / (t1 - t0))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/scripts/build_docs.sh b/scripts/build_docs.sh new file mode 100755 index 00000000..52a8486a --- /dev/null +++ b/scripts/build_docs.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +GITHUB=$1 +TAG=$2 + +python3 scripts/dump_converters.py --github=$GITHUB --tag=$TAG > docs/converters.md diff --git a/scripts/dump_converters.py b/scripts/dump_converters.py new file mode 100644 index 00000000..cd01465b --- /dev/null +++ b/scripts/dump_converters.py @@ -0,0 +1,54 @@ +import argparse +import sys +import subprocess +import os +from importlib.machinery import SourceFileLoader + +torch2trt = SourceFileLoader("torch2trt", "torch2trt/__init__.py").load_module() # to load relative to root + +HEADER = """ +# Converters + +This table contains a list of supported PyTorch methods and their associated converters. + +If your model is not converting, a good start in debugging would be to see if it contains a method not listed +in this table. You may also find these a useful reference when writing your own converters. 
+ +| Method | Converter | +|--------|-----------|""" + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser.add_argument('--github', + type=str, + default='https://github.com/NVIDIA-AI-IOT/torch2trt') + parser.add_argument('--tag', type=str, default='master') + args = parser.parse_args() + + print(HEADER) + + for method, entry in torch2trt.CONVERTERS.items(): + + if not entry['is_real']: + continue + + converter = entry['converter'] + + # get commit hash +# p = subprocess.Popen(['git', 'rev-parse', 'HEAD'], +# stdout=subprocess.PIPE, +# stderr=subprocess.PIPE) +# commit, err = p.communicate() +# commit = commit.decode('utf-8').strip('\n') + + # get github URL + url = '{github}/blob/{commit}/{relpath}#L{lineno}'.format( + github=args.github, + commit=args.tag, + relpath=os.path.relpath(converter.__code__.co_filename, + os.path.abspath('.')), + lineno=converter.__code__.co_firstlineno) + + print('| ``{method}`` | [``{converter}``]({url}) |'.format( + method=method, converter=converter.__name__, url=url)) diff --git a/scripts/push_docs.sh b/scripts/push_docs.sh new file mode 100755 index 00000000..9d5fc870 --- /dev/null +++ b/scripts/push_docs.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +TAG=$1 + +python3 scripts/dump_converters.py > docs/converters.md + +mike deploy $TAG --push diff --git a/scripts/release_build_docs.sh b/scripts/release_build_docs.sh new file mode 100755 index 00000000..0edc28d8 --- /dev/null +++ b/scripts/release_build_docs.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +TAG=$1 + +python3 scripts/dump_converters.py --tag=$TAG > docs/converters.md + +mike deploy $TAG \ No newline at end of file diff --git a/scripts/release_push_docs.sh b/scripts/release_push_docs.sh new file mode 100755 index 00000000..da008c2c --- /dev/null +++ b/scripts/release_push_docs.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +TAG=$1 + +python3 scripts/dump_converters.py --tag=$TAG > docs/converters.md + +mike deploy $TAG --push diff --git a/scripts/release_test_docs.sh b/scripts/release_test_docs.sh new file mode 100755 index 00000000..b9be6305 --- /dev/null +++ b/scripts/release_test_docs.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +TAG=$1 + +mike set-default $TAG +mike serve --dev-addr=0.0.0.0:8000 \ No newline at end of file diff --git a/scripts/test_docs.sh b/scripts/test_docs.sh new file mode 100755 index 00000000..aef325f9 --- /dev/null +++ b/scripts/test_docs.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +mkdocs serve --dev-addr=0.0.0.0:8000 \ No newline at end of file diff --git a/setup.py b/setup.py index 93d09e83..7967835e 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,47 @@ -from setuptools import setup +import sys +import torch +from setuptools import setup, find_packages +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +def trt_inc_dir(): + return "/usr/include/aarch64-linux-gnu" + +def trt_lib_dir(): + return "/usr/lib/aarch64-linux-gnu" + +ext_modules = [] + +plugins_ext_module = CUDAExtension( + name='plugins', + sources=[ + 'torch2trt/plugins/plugins.cpp' + ], + include_dirs=[ + trt_inc_dir() + ], + library_dirs=[ + trt_lib_dir() + ], + libraries=[ + 'nvinfer' + ], + extra_compile_args={ + 'cxx': ['-DUSE_DEPRECATED_INTLIST'] if torch.__version__ < "1.5" else [], + 'nvcc': [] + } + ) +if '--plugins' in sys.argv: + ext_modules.append(plugins_ext_module) + sys.argv.remove('--plugins') + setup( name='torch2trt', - version='0.0', - description='PyTorch to TensorRT converter', - py_modules=['torch2trt'], + version='0.2.0', + description='An easy to use PyTorch to TensorRT converter', + 
packages=find_packages(), + ext_package='torch2trt', + ext_modules=ext_modules, + install_requires=["torch"], + cmdclass={'build_ext': BuildExtension} ) diff --git a/test.py b/test.py deleted file mode 100644 index b6c1f8fd..00000000 --- a/test.py +++ /dev/null @@ -1,41 +0,0 @@ -import torch -import argparse -import torchvision.models -from torch2trt import torch2trt -import time - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('model') - args = parser.parse_args() - - input = torch.randn((1, 3, 224, 224)).cuda().half() - - with torch.no_grad(): - model = getattr(torchvision.models, str(args.model))(pretrained=True).cuda().half().eval() - model_trt = torch2trt(model, [input], fp16_mode=True) - - # run pytorch - output = model(input) - t0 = time.time() - for i in range(100): - output = model(input) - t1 = time.time() - - dt_pytorch = (t1 - t0) / 100.0 - - output = model_trt(input) - t0 = time.time() - for i in range(100): - output = model_trt(input) - t1 = time.time() - - dt_tensorrt = (t1 - t0) / 100.0 - - line = '%s\t%f\t%f' % (args.model, dt_pytorch, dt_tensorrt) - - print(line) - - with open('timings.txt', 'a') as f: - f.write(line + '\n') \ No newline at end of file diff --git a/test.sh b/test.sh old mode 100644 new mode 100755 index 9ef3a9e8..3bf4a4d8 --- a/test.sh +++ b/test.sh @@ -1,22 +1,30 @@ -#!/bin/sh +#!/bin/bash -python3 test.py alexnet -python3 test.py squeezenet1_0 -python3 test.py squeezenet1_1 -python3 test.py resnet18 -python3 test.py resnet34 -python3 test.py resnet50 -python3 test.py resnet101 -python3 test.py resnet152 -python3 test.py densenet121 -python3 test.py densenet169 -python3 test.py densenet201 -python3 test.py densenet161 -python3 test.py vgg11 -python3 test.py vgg13 -python3 test.py vgg16 -python3 test.py vgg19 -python3 test.py vgg11_bn -python3 test.py vgg13_bn -python3 test.py vgg16_bn -python3 test.py vgg19_bn \ No newline at end of file +OUTPUT_FILE=$1 + +touch $OUTPUT_FILE + +echo "| Name | Data Type | Input Shapes | torch2trt kwargs | Max Error | Throughput (PyTorch) | Throughput (TensorRT) | Latency (PyTorch) | Latency (TensorRT) |" >> $OUTPUT_FILE +echo "|------|-----------|--------------|------------------|-----------|----------------------|-----------------------|-------------------|--------------------|" >> $OUTPUT_FILE + +python3 -m torch2trt.test -o $OUTPUT_FILE --name alexnet --include=torch2trt.tests.torchvision.classification +python3 -m torch2trt.test -o $OUTPUT_FILE --name squeezenet1_0 --include=torch2trt.tests.torchvision.classification +python3 -m torch2trt.test -o $OUTPUT_FILE --name squeezenet1_1 --include=torch2trt.tests.torchvision.classification +python3 -m torch2trt.test -o $OUTPUT_FILE --name resnet18 --include=torch2trt.tests.torchvision.classification +python3 -m torch2trt.test -o $OUTPUT_FILE --name resnet34 --include=torch2trt.tests.torchvision.classification +python3 -m torch2trt.test -o $OUTPUT_FILE --name resnet50 --include=torch2trt.tests.torchvision.classification +python3 -m torch2trt.test -o $OUTPUT_FILE --name resnet101 --include=torch2trt.tests.torchvision.classification +python3 -m torch2trt.test -o $OUTPUT_FILE --name resnet152 --include=torch2trt.tests.torchvision.classification +python3 -m torch2trt.test -o $OUTPUT_FILE --name densenet121 --include=torch2trt.tests.torchvision.classification +python3 -m torch2trt.test -o $OUTPUT_FILE --name densenet169 --include=torch2trt.tests.torchvision.classification +python3 -m torch2trt.test -o $OUTPUT_FILE --name densenet201 
--include=torch2trt.tests.torchvision.classification +python3 -m torch2trt.test -o $OUTPUT_FILE --name densenet161 --include=torch2trt.tests.torchvision.classification +python3 -m torch2trt.test -o $OUTPUT_FILE --name vgg11$ --include=torch2trt.tests.torchvision.classification +python3 -m torch2trt.test -o $OUTPUT_FILE --name vgg13$ --include=torch2trt.tests.torchvision.classification +python3 -m torch2trt.test -o $OUTPUT_FILE --name vgg16$ --include=torch2trt.tests.torchvision.classification +python3 -m torch2trt.test -o $OUTPUT_FILE --name vgg19$ --include=torch2trt.tests.torchvision.classification +python3 -m torch2trt.test -o $OUTPUT_FILE --name vgg11_bn --include=torch2trt.tests.torchvision.classification +python3 -m torch2trt.test -o $OUTPUT_FILE --name vgg13_bn --include=torch2trt.tests.torchvision.classification +python3 -m torch2trt.test -o $OUTPUT_FILE --name vgg16_bn --include=torch2trt.tests.torchvision.classification +python3 -m torch2trt.test -o $OUTPUT_FILE --name vgg19_bn --include=torch2trt.tests.torchvision.classification +python3 -m torch2trt.test -o $OUTPUT_FILE --name mobilenet_v2 --include=torch2trt.tests.torchvision.classification diff --git a/torch2trt.py b/torch2trt.py deleted file mode 100644 index 97c2a0be..00000000 --- a/torch2trt.py +++ /dev/null @@ -1,481 +0,0 @@ -import torch -import tensorrt as trt -from copy import copy -import numpy as np - - -# UTILITY FUNCTIONS - - -def torch_dtype_to_trt(dtype): - if dtype == torch.int8: - return trt.int8 - elif dtype == torch.int32: - return trt.int32 - elif dtype == torch.float16: - return trt.float16 - elif dtype == torch.float32: - return trt.float32 - else: - raise TypeError('%s is not supported by tensorrt' % dtype) - - -def torch_dtype_from_trt(dtype): - if dtype == trt.int8: - return torch.int8 - elif dtype == trt.int32: - return torch.int32 - elif dtype == trt.float16: - return torch.float16 - elif dtype == trt.float32: - return torch.float32 - else: - raise TypeError('%s is not supported by torch' % dtype) - - -def torch_device_to_trt(device): - if device.type == torch.device('cuda').type: - return trt.TensorLocation.DEVICE - elif device.type == torch.device('cpu').type: - return trt.TensorLocation.HOST - else: - return TypeError('%s is not supported by tensorrt' % device) - - -def torch_device_from_trt(device): - if device == trt.TensorLocation.DEVICE: - return torch.device('cuda') - elif device == trt.TensorLocation.HOST: - return torch.device('cpu') - else: - return TypeError('%s is not supported by torch' % device) - - - -# CONVERSION REGISTRY AND HOOKS - - -CONVERTERS = {} - - -def attach_converter(ctx, method, converter): - """Gets a function that executes PyTorch method and TensorRT converter""" - - def wrapper(*args, **kwargs): - # run original method - outputs = method(*args, **kwargs) - - # call conversion hook - ctx.method_args = args - ctx.method_kwargs = kwargs - ctx.method_return = outputs - #print('%s : %s' % (method.__qualname__, converter.__name__)) - converter(ctx) - - # convert to None so conversion will fail for unsupported layers - ctx.method_args = None - ctx.method_kwargs = None - ctx.method_return = None - - return outputs - - return wrapper - - -class ConversionHook(object): - """Attaches TensorRT converter to PyTorch method call""" - - def __init__(self, ctx, method, converter): - self.ctx = ctx - self.method_str = method - self.method_impl = copy(eval(method)) - self.converter = converter - - def _set_method(self, method): - exec('%s = method' % self.method_str) - - def 
__enter__(self): - self._set_method(attach_converter(self.ctx, self.method_impl, self.converter)) - - def __exit__(self, type, val, tb): - self._set_method(self.method_impl) - - -class ConversionContext(object): - def __init__(self, network, converters=CONVERTERS): - self.network = network - self.trt_tensors = {} - self.method_args = None - self.method_kwargs = None - self.method_return = None - self.hooks = [ - ConversionHook(self, method, converter) - for method, converter in converters.items() - ] - - def __enter__(self): - for hook in self.hooks: - hook.__enter__() - return self - - def __exit__(self, type, val, tb): - for hook in self.hooks: - hook.__exit__(type, val, tb) - - def add_inputs(self, torch_inputs, names=None): - if names is None: - names = ['input_%d' % i for i in range(len(torch_inputs))] - self.input_names = names - - for i, torch_input in enumerate(torch_inputs): - if torch_input.__hash__() not in self.trt_tensors: - trt_tensor = self.network.add_input( - name=names[i], - shape=tuple(torch_input.shape)[1:], - dtype=torch_dtype_to_trt(torch_input.dtype), - ) - trt_tensor.location = torch_device_to_trt(torch_input.device) - self.trt_tensors[torch_input.__hash__()] = trt_tensor - - def mark_outputs(self, torch_outputs, names=None): - if names is None: - names = ['output_%d' % i for i in range(len(torch_outputs))] - self.output_names = names - - for i, torch_output in enumerate(torch_outputs): - trt_tensor = self.trt_tensors[torch_output.__hash__()] - trt_tensor.name = names[i] - trt_tensor.location = torch_device_to_trt(torch_output.device) - trt_tensor.dtype = torch_dtype_to_trt(torch_output.dtype) - self.network.mark_output(trt_tensor) - - -class TRTModule(torch.nn.Module): - def __init__(self, engine, input_names, output_names, final_shapes=None): - self.input_names = input_names - self.output_names = output_names - self._trt_engine = engine - self._trt_context = self._trt_engine.create_execution_context() - super(TRTModule, self).__init__() - self.final_shapes = final_shapes - - def forward(self, *inputs): - batch_size = inputs[0].shape[0] - bindings = [None] * (len(self.input_names) + len(self.output_names)) - - # create output tensors - outputs = [None] * len(self.output_names) - for i, output_name in enumerate(self.output_names): - idx = self._trt_engine.get_binding_index(output_name) - dtype = torch_dtype_from_trt(self._trt_engine.get_binding_dtype(idx)) - if self.final_shapes is not None: - shape = (batch_size, ) + self.final_shapes[i] - else: - shape = (batch_size, ) + tuple(self._trt_engine.get_binding_shape(idx)) - device = torch_device_from_trt(self._trt_engine.get_location(idx)) - output = torch.empty(size=shape, dtype=dtype, device=device) - outputs[i] = output - bindings[idx] = output.data_ptr() - - for i, input_name in enumerate(self.input_names): - idx = self._trt_engine.get_binding_index(input_name) - bindings[idx] = inputs[i].data_ptr() - - self._trt_context.execute_async(batch_size, bindings, torch.cuda.current_stream().cuda_stream) - - outputs = tuple(outputs) - if len(outputs) == 1: - outputs = outputs[0] - - return outputs - - -def torch2trt(module, inputs, input_names=None, output_names=None, max_batch_size=1, - fp16_mode=False, max_workspace_size=0): - with trt.Logger(trt.Logger.INFO) as logger, trt.Builder(logger) as builder,\ - builder.create_network() as network, ConversionContext(network) as ctx: - - if isinstance(inputs, list): - inputs = tuple(inputs) - if not isinstance(inputs, tuple): - inputs = (inputs, ) - ctx.add_inputs(inputs, 
input_names) - - outputs = module(*inputs) - - if not isinstance(outputs, tuple): - outputs = (outputs, ) - ctx.mark_outputs(outputs, output_names) - - final_shapes = [tuple(output.shape)[1:] for output in list(outputs)] - - builder.max_workspace_size = max_workspace_size - builder.fp16_mode = fp16_mode - builder.max_batch_size = max_batch_size - - engine = builder.build_cuda_engine(network) - - return TRTModule(engine, ctx.input_names, ctx.output_names, final_shapes) - - -# DEFINE ALL CONVERSION FUNCTIONS - - -def tensorrt_converter(method): - def register_converter(converter): - CONVERTERS[method] = converter - return converter - return register_converter - - -# MODULE CONVERTERS - - -@tensorrt_converter('torch.nn.Linear.forward') -def convert_Linear(ctx): - module = ctx.method_args[0] - input = ctx.method_args[1] - output = ctx.method_return - trt_input = ctx.trt_tensors[input.__hash__()] - - layer = ctx.network.add_fully_connected( - input=trt_input, - num_outputs=module.out_features, - kernel=module.weight.detach().cpu().numpy(), - bias=module.bias.detach().cpu().numpy()) - - ctx.trt_tensors[output.__hash__()] = layer.get_output(0) - - -@tensorrt_converter('torch.nn.Conv2d.forward') -def convert_Conv2d(ctx): - module = ctx.method_args[0] - input = ctx.method_args[1] - output = ctx.method_return - trt_input = ctx.trt_tensors[input.__hash__()] - - kernel_size = module.kernel_size - if not isinstance(kernel_size, tuple): - kernel_size = (kernel_size, ) * 2 - - stride = module.stride - if not isinstance(stride, tuple): - stride = (stride, ) * 2 - - padding = module.padding - if not isinstance(padding, tuple): - padding = (padding, ) * 2 - - bias = trt.Weights() - if module.bias is not None: - bias = module.bias.detach().cpu().numpy() - - layer = ctx.network.add_convolution( - input=trt_input, - num_output_maps=module.out_channels, - kernel_shape=kernel_size, - kernel=module.weight.detach().cpu().numpy(), - bias=bias) - layer.stride = stride - layer.padding = padding - - if module.groups is not None: - layer.num_groups = module.groups - - ctx.trt_tensors[output.__hash__()] = layer.get_output(0) - - -@tensorrt_converter('torch.nn.MaxPool2d.forward') -def convert_MaxPool2d(ctx): - module = ctx.method_args[0] - input = ctx.method_args[1] - output = ctx.method_return - trt_input = ctx.trt_tensors[input.__hash__()] - - kernel_size = module.kernel_size - if not isinstance(kernel_size, tuple): - kernel_size = (kernel_size, ) * 2 - - stride = module.stride - if not isinstance(stride, tuple): - stride = (stride, ) * 2 - - padding = module.padding - if not isinstance(padding, tuple): - padding = (padding, ) * 2 - - layer = ctx.network.add_pooling( - input=trt_input, type=trt.PoolingType.MAX, window_size=kernel_size) - layer.stride = stride - layer.padding = padding - - ctx.trt_tensors[output.__hash__()] = layer.get_output(0) - - -@tensorrt_converter('torch.nn.AvgPool2d.forward') -def convert_AvgPool2d(ctx): - module = ctx.method_args[0] - input = ctx.method_args[1] - output = ctx.method_return - trt_input = ctx.trt_tensors[input.__hash__()] - - kernel_size = module.kernel_size - if not isinstance(kernel_size, tuple): - kernel_size = (kernel_size, ) * 2 - stride = module.stride - if not isinstance(stride, tuple): - stride = (stride, ) * 2 - padding = module.padding - if not isinstance(padding, tuple): - padding = (padding, ) * 2 - - layer = ctx.network.add_pooling( - input=trt_input, type=trt.PoolingType.AVERAGE, window_size=kernel_size) - layer.stride = stride - layer.padding = padding - 
layer.average_count_excludes_padding = not module.count_include_pad - - ctx.trt_tensors[output.__hash__()] = layer.get_output(0) - - -@tensorrt_converter('torch.nn.AdaptiveAvgPool2d.forward') -def convert_AdaptiveAvgPool2d(ctx): - module = ctx.method_args[0] - input = ctx.method_args[1] - output = ctx.method_return - trt_input = ctx.trt_tensors[input.__hash__()] - - output_size = module.output_size - if not isinstance(output_size, tuple): - output_size = (output_size, ) * 2 - - stride = (trt_input.shape[-2] // output_size[-2], trt_input.shape[-1] // output_size[-1]) - - kernel_size = stride - layer = ctx.network.add_pooling( - input=trt_input, type=trt.PoolingType.AVERAGE, window_size=kernel_size) - layer.stride = stride - - ctx.trt_tensors[output.__hash__()] = layer.get_output(0) - - -@tensorrt_converter('torch.nn.functional.adaptive_avg_pool2d') -def convert_adaptive_avg_pool2d(ctx): - ctx.method_args = (torch.nn.AdaptiveAvgPool2d(ctx.method_args[1]), ctx.method_args[0]) - convert_AdaptiveAvgPool2d(ctx) - - -@tensorrt_converter('torch.nn.ReLU.forward') -def convert_ReLU(ctx): - input = ctx.method_args[1] - output = ctx.method_return - trt_input = ctx.trt_tensors[input.__hash__()] - layer = ctx.network.add_activation( - input=trt_input, type=trt.ActivationType.RELU) - ctx.trt_tensors[output.__hash__()] = layer.get_output(0) - - -@tensorrt_converter('torch.nn.functional.relu') -def convert_relu(ctx): - ctx.method_args = (torch.nn.ReLU(),) + ctx.method_args - convert_ReLU(ctx) - - -@tensorrt_converter('torch.nn.ReLU6.forward') -def convert_ReLU6(ctx): - input = ctx.method_args[1] - output = ctx.method_return - trt_input = ctx.trt_tensors[input.__hash__()] - - layer = ctx.network.add_activation( - input=trt_input, type=trt.ActivationType.RELU) - shape = (1, ) * len(trt_input.shape) # broadcast all dimensions - tensor = 6.0 * torch.ones(shape, dtype=torch_dtype_from_trt(trt_input.dtype)).cpu().numpy() - trt_6 = ctx.network.add_constant(shape, tensor) - layer = ctx.network.add_elementwise( - layer.get_output(0), trt_6.get_output(0), trt.ElementWiseOperation.MIN) - - ctx.trt_tensors[output.__hash__()] = layer.get_output(0) - - -@tensorrt_converter('torch.nn.functional.relu6') -def convert_relu6(ctx): - ctx.method_args = (torch.nn.ReLU6(),) + ctx.method_args - convert_ReLU6(ctx) - - -@tensorrt_converter('torch.nn.LogSoftmax.forward') -def convert_LogSoftmax(ctx): - input = ctx.method_args[1] - output = ctx.method_return - trt_input = ctx.trt_tensors[input.__hash__()] - layer = ctx.network.add_softmax(input=trt_input) - layer = ctx.network.add_unary(input=layer.get_output(0), - op=trt.UnaryOperation.LOG) - ctx.trt_tensors[output.__hash__()] = layer.get_output(0) - - -@tensorrt_converter('torch.nn.Dropout.forward') -@tensorrt_converter('torch.nn.Dropout2d.forward') -@tensorrt_converter('torch.nn.Dropout3d.forward') -def convert_Identity(ctx): - input = ctx.method_args[1] - output = ctx.method_return - ctx.trt_tensors[output.__hash__()] = ctx.trt_tensors[input.__hash__()] - - -@tensorrt_converter('torch.Tensor.view') -@tensorrt_converter('torch.nn.functional.dropout') -@tensorrt_converter('torch.nn.functional.dropout2d') -@tensorrt_converter('torch.nn.functional.dropout3d') -def convert_identity(ctx): - input = ctx.method_args[0] - output = ctx.method_return - ctx.trt_tensors[output.__hash__()] = ctx.trt_tensors[input.__hash__()] - - -@tensorrt_converter('torch.nn.BatchNorm2d.forward') -def convert_BatchNorm2d(ctx): - module = ctx.method_args[0] - input = ctx.method_args[1] - output = 
ctx.method_return - trt_input = ctx.trt_tensors[input.__hash__()] - - scale = module.weight.detach().cpu().numpy() / np.sqrt(module.running_var.detach().cpu().numpy() + module.eps) - bias = module.bias.detach().cpu().numpy() - module.running_mean.detach().cpu().numpy() * scale - power = np.ones_like(scale) - layer = ctx.network.add_scale(trt_input, trt.ScaleMode.CHANNEL, bias, scale, power) - - ctx.trt_tensors[output.__hash__()] = layer.get_output(0) - - -# TENSOR METHOD CONVERTERS - - -@tensorrt_converter('torch.cat') -def convert_cat(ctx): - inputs = ctx.method_args[0] - - if 'dim' in ctx.method_kwargs: - dim = ctx.method_kwargs['dim'] - else: - dim = ctx.method_args[1] - - output = ctx.method_return - trt_inputs = [ctx.trt_tensors[i.__hash__()] for i in inputs] - - layer = ctx.network.add_concatenation(inputs=trt_inputs) - layer.axis = dim - 1 - ctx.trt_tensors[output.__hash__()] = layer.get_output(0) - - -@tensorrt_converter('torch.Tensor.__iadd__') -@tensorrt_converter('torch.Tensor.__add__') -def convert_add(ctx): - input_a = ctx.method_args[0] - input_b = ctx.method_args[1] - output = ctx.method_return - trt_input_a = ctx.trt_tensors[input_a.__hash__()] - trt_input_b = ctx.trt_tensors[input_b.__hash__()] - layer = ctx.network.add_elementwise(trt_input_a, trt_input_b, trt.ElementWiseOperation.SUM) - ctx.trt_tensors[output.__hash__()] = layer.get_output(0) diff --git a/torch2trt/__init__.py b/torch2trt/__init__.py new file mode 100644 index 00000000..fa80d89b --- /dev/null +++ b/torch2trt/__init__.py @@ -0,0 +1,16 @@ +from .torch2trt import * +from .converters import * +import tensorrt as trt + + +def load_plugins(): + import torch2trt.plugins + registry = trt.get_plugin_registry() + torch2trt_creators = [c for c in registry.plugin_creator_list if c.plugin_namespace == 'torch2trt'] + for c in torch2trt_creators: + registry.register_creator(c, 'torch2trt') + +try: + load_plugins() +except: + pass diff --git a/torch2trt/calibration.py b/torch2trt/calibration.py new file mode 100644 index 00000000..7506ea4e --- /dev/null +++ b/torch2trt/calibration.py @@ -0,0 +1,69 @@ +import torch +import tensorrt as trt + + +if trt.__version__ >= '5.1': + DEFAULT_CALIBRATION_ALGORITHM = trt.CalibrationAlgoType.ENTROPY_CALIBRATION_2 +else: + DEFAULT_CALIBRATION_ALGORITHM = trt.CalibrationAlgoType.ENTROPY_CALIBRATION + + +class TensorBatchDataset(): + + def __init__(self, tensors): + self.tensors = tensors + + def __len__(self): + return len(self.tensors[0]) + + def __getitem__(self, idx): + return [t[idx] for t in self.tensors] + + +class DatasetCalibrator(trt.IInt8Calibrator): + + def __init__(self, inputs, dataset, batch_size=1, algorithm=DEFAULT_CALIBRATION_ALGORITHM): + super(DatasetCalibrator, self).__init__() + + self.dataset = dataset + self.batch_size = batch_size + self.algorithm = algorithm + + # create buffers that will hold data batches + self.buffers = [] + for tensor in inputs: + size = (batch_size,) + tuple(tensor.shape[1:]) + buf = torch.zeros(size=size, dtype=tensor.dtype, device=tensor.device).contiguous() + self.buffers.append(buf) + + self.count = 0 + + def get_batch(self, *args, **kwargs): + if self.count < len(self.dataset): + + for i in range(self.batch_size): + + idx = self.count % len(self.dataset) # roll around if not multiple of dataset + inputs = self.dataset[idx] + + # copy data for (input_idx, dataset_idx) into buffer + for buffer, tensor in zip(self.buffers, inputs): + buffer[i].copy_(tensor) + + self.count += 1 + + return [int(buf.data_ptr()) for buf in self.buffers] + 
else: + return [] + + def get_algorithm(self): + return self.algorithm + + def get_batch_size(self): + return self.batch_size + + def read_calibration_cache(self, *args, **kwargs): + return None + + def write_calibration_cache(self, cache, *args, **kwargs): + pass \ No newline at end of file diff --git a/torch2trt/converters/AdaptiveAvgPool2d.py b/torch2trt/converters/AdaptiveAvgPool2d.py new file mode 100644 index 00000000..41ad141d --- /dev/null +++ b/torch2trt/converters/AdaptiveAvgPool2d.py @@ -0,0 +1,39 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + +@tensorrt_converter('torch.nn.AdaptiveAvgPool2d.forward') +def convert_AdaptiveAvgPool2d(ctx): + module = ctx.method_args[0] + input = ctx.method_args[1] + output = ctx.method_return + + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + + output_size = module.output_size + if not isinstance(output_size, tuple): + output_size = (output_size, ) * 2 + + stride = (input_trt.shape[-2] // output_size[-2], input_trt.shape[-1] // output_size[-1]) + + kernel_size = stride + layer = ctx.network.add_pooling( + input=input_trt, type=trt.PoolingType.AVERAGE, window_size=kernel_size) + layer.stride = stride + + output._trt = layer.get_output(0) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224)]) +def test_AdaptiveAvgPool2d_1x1(): + return torch.nn.AdaptiveAvgPool2d((1, 1)) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224)]) +def test_AdaptiveAvgPool2d_2x2(): + return torch.nn.AdaptiveAvgPool2d((2, 2)) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224)]) +def test_AdaptiveAvgPool2d_3x3(): + return torch.nn.AdaptiveAvgPool2d((3, 3)) diff --git a/torch2trt/converters/BatchNorm1d.py b/torch2trt/converters/BatchNorm1d.py new file mode 100644 index 00000000..52249db1 --- /dev/null +++ b/torch2trt/converters/BatchNorm1d.py @@ -0,0 +1,36 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + +@tensorrt_converter('torch.nn.BatchNorm1d.forward') +def convert_BatchNorm2d(ctx): + module = ctx.method_args[0] + input = ctx.method_args[1] + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + output = ctx.method_return + + scale = module.weight.detach().cpu().numpy() / np.sqrt(module.running_var.detach().cpu().numpy() + module.eps) + bias = module.bias.detach().cpu().numpy() - module.running_mean.detach().cpu().numpy() * scale + power = np.ones_like(scale) + + # reshape to 2D + layer = ctx.network.add_shuffle(input_trt) + + if len(input.shape) == 2: + layer.reshape_dims = (input.shape[1], 1, 1) + else: + layer.reshape_dims = (input.shape[1], input.shape[2], 1) + + layer = ctx.network.add_scale(layer.get_output(0), trt.ScaleMode.CHANNEL, bias, scale, power) + + # reshape back to 1D + layer = ctx.network.add_shuffle(layer.get_output(0)) + layer.reshape_dims = tuple(output.shape[1:]) + + output._trt = layer.get_output(0) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 10)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 3)]) +def test_BatchNorm1d_basic(): + return torch.nn.BatchNorm1d(10) \ No newline at end of file diff --git a/torch2trt/converters/BatchNorm2d.py b/torch2trt/converters/BatchNorm2d.py new file mode 100644 index 00000000..77d5af31 --- /dev/null +++ b/torch2trt/converters/BatchNorm2d.py @@ -0,0 +1,23 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + 
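+# BatchNorm2d folds into a single TensorRT scale layer, y = scale * x + bias,
+# with scale = weight / sqrt(running_var + eps) and
+# bias = bias - running_mean * scale (computed below).  This converter is only
+# registered for TensorRT < 7.0; on TensorRT >= 7.0 batch norm is handled by
+# the ``torch.nn.functional.batch_norm`` converter in batch_norm.py.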
+@tensorrt_converter("torch.nn.BatchNorm2d.forward", enabled=trt_version() < '7.0') +def convert_BatchNorm2d(ctx): + module = ctx.method_args[0] + input = ctx.method_args[1] + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + output = ctx.method_return + + scale = module.weight.detach().cpu().numpy() / np.sqrt( + module.running_var.detach().cpu().numpy() + module.eps + ) + bias = ( + module.bias.detach().cpu().numpy() + - module.running_mean.detach().cpu().numpy() * scale + ) + power = np.ones_like(scale) + + layer = ctx.network.add_scale(input_trt, trt.ScaleMode.CHANNEL, bias, scale, power) + + output._trt = layer.get_output(0) diff --git a/torch2trt/converters/Conv.py b/torch2trt/converters/Conv.py new file mode 100644 index 00000000..512cddbc --- /dev/null +++ b/torch2trt/converters/Conv.py @@ -0,0 +1,91 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + +@tensorrt_converter('torch.nn.Conv2d.forward', enabled=trt_version() >= '7.0') +@tensorrt_converter('torch.nn.Conv3d.forward', enabled=trt_version() >= '7.0') +def convert_Conv_trt7(ctx): + module = ctx.method_args[0] + input = ctx.method_args[1] + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + output = ctx.method_return + + input_dim = input.dim() - 2 + + kernel_size = module.kernel_size + if not isinstance(kernel_size, tuple): + kernel_size = (kernel_size, ) * input_dim + + stride = module.stride + if not isinstance(stride, tuple): + stride = (stride, ) * input_dim + + padding = module.padding + if not isinstance(padding, tuple): + padding = (padding, ) * input_dim + + dilation = module.dilation + if not isinstance(dilation, tuple): + dilation = (dilation, ) * input_dim + + kernel = module.weight.detach().cpu().numpy() + + bias = None #trt.Weights(torch_dtype_to_trt(module.weight.dtype)) + if module.bias is not None: + bias = module.bias.detach().cpu().numpy() + + layer = ctx.network.add_convolution_nd( + input=input_trt, + num_output_maps=module.out_channels, + kernel_shape=kernel_size, + kernel=kernel, + bias=bias) + layer.stride_nd = stride + layer.padding_nd = padding + layer.dilation_nd = dilation + + if module.groups is not None: + layer.num_groups = module.groups + + output._trt = layer.get_output(0) + + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 224, 224)], enabled=trt_version() >= '7.0') +def test_Conv2d_basic_trt7(): + return torch.nn.Conv2d(10, 5, kernel_size=1, stride=1, padding=0) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 224, 224)], enabled=trt_version() >= '7.0') +def test_Conv2d_stride2_trt7(): + return torch.nn.Conv2d(10, 5, kernel_size=1, stride=2, padding=0) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 224, 224)], enabled=trt_version() >= '7.0') +def test_Conv2d_kernel3_trt7(): + return torch.nn.Conv2d(10, 5, kernel_size=3, stride=2, padding=1) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 224, 224)], enabled=trt_version() >= '7.0') +def test_Conv2d_dilation2_trt7(): + return torch.nn.Conv2d(10, 5, kernel_size=3, stride=1, padding=1, dilation=2) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 64, 64, 64)], enabled=trt_version() >= '7.0') +def test_Conv3d_basic_trt7(): + return torch.nn.Conv3d(10, 5, kernel_size=1, stride=1, padding=0) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 64, 64, 64)], enabled=trt_version() >= '7.0') +def test_Conv3d_stride2_trt7(): + return torch.nn.Conv3d(10, 5, kernel_size=1, 
stride=2, padding=0) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 64, 64, 64)], enabled=trt_version() >= '7.0') +def test_Conv3d_kernel3_trt7(): + return torch.nn.Conv3d(10, 5, kernel_size=3, stride=2, padding=1) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 64, 64, 64)], enabled=trt_version() >= '7.0') +def test_Conv3d_dilation2_trt7(): + return torch.nn.Conv3d(10, 5, kernel_size=3, stride=1, padding=1, dilation=2) diff --git a/torch2trt/converters/Conv1d.py b/torch2trt/converters/Conv1d.py new file mode 100644 index 00000000..2f54b695 --- /dev/null +++ b/torch2trt/converters/Conv1d.py @@ -0,0 +1,64 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + +@tensorrt_converter('torch.nn.Conv1d.forward') +def convert_Conv1d(ctx): + module = ctx.method_args[0] + input = ctx.method_args[1] + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + output = ctx.method_return + + kernel_size = (module.kernel_size[0], 1) + stride = (module.stride[0], 1) + padding = (module.padding[0], 0) + dilation = (module.dilation[0], 1) + + kernel = module.weight.detach().cpu().numpy()[..., None] + + bias = trt.Weights(torch_dtype_to_trt(module.weight.dtype)) + if module.bias is not None: + bias = module.bias.detach().cpu().numpy() + + # reshape to 2D + layer = ctx.network.add_shuffle(input_trt) + layer.reshape_dims = (-1, input.shape[-1], 1) + + layer = ctx.network.add_convolution( + input=layer.get_output(0), + num_output_maps=module.out_channels, + kernel_shape=kernel_size, + kernel=kernel, + bias=bias) + layer.stride = stride + layer.padding = padding + layer.dilation = dilation + + if module.groups is not None: + layer.num_groups = module.groups + + # reshape back to 1D + layer = ctx.network.add_shuffle(layer.get_output(0)) + layer.reshape_dims = (-1, output.shape[-1]) + + output._trt = layer.get_output(0) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 224)]) +def test_Conv1d_basic(): + return torch.nn.Conv1d(10, 5, kernel_size=1, stride=1, padding=0) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 224)]) +def test_Conv1d_stride2(): + return torch.nn.Conv1d(10, 5, kernel_size=1, stride=2, padding=0) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 224)]) +def test_Conv1d_kernel3(): + return torch.nn.Conv1d(10, 5, kernel_size=3, stride=2, padding=1) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 224)]) +def test_Conv1d_dilation2(): + return torch.nn.Conv1d(10, 5, kernel_size=3, stride=1, padding=1, dilation=2) diff --git a/torch2trt/converters/Conv2d.py b/torch2trt/converters/Conv2d.py new file mode 100644 index 00000000..d37f3de2 --- /dev/null +++ b/torch2trt/converters/Conv2d.py @@ -0,0 +1,68 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + +@tensorrt_converter("torch.nn.Conv2d.forward", enabled=trt_version() < '7.0') +def convert_Conv2d(ctx): + module = ctx.method_args[0] + input = ctx.method_args[1] + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + output = ctx.method_return + + kernel_size = module.kernel_size + if not isinstance(kernel_size, tuple): + kernel_size = (kernel_size,) * 2 + + stride = module.stride + if not isinstance(stride, tuple): + stride = (stride,) * 2 + + padding = module.padding + if not isinstance(padding, tuple): + padding = (padding,) * 2 + + dilation = module.dilation + if not isinstance(dilation, tuple): + dilation = (dilation,) * 2 + + kernel = 
module.weight.detach().cpu().numpy() + + bias = trt.Weights(torch_dtype_to_trt(module.weight.dtype)) + if module.bias is not None: + bias = module.bias.detach().cpu().numpy() + + layer = ctx.network.add_convolution( + input=input_trt, + num_output_maps=module.out_channels, + kernel_shape=kernel_size, + kernel=kernel, + bias=bias, + ) + layer.stride = stride + layer.padding = padding + layer.dilation = dilation + + if module.groups is not None: + layer.num_groups = module.groups + + output._trt = layer.get_output(0) + + +@add_module_test(torch.float32, torch.device("cuda"), [(1, 10, 224, 224)], enabled=trt_version() < '7.0') +def test_Conv2d_basic(): + return torch.nn.Conv2d(10, 5, kernel_size=1, stride=1, padding=0) + + +@add_module_test(torch.float32, torch.device("cuda"), [(1, 10, 224, 224)], enabled=trt_version() < '7.0') +def test_Conv2d_stride2(): + return torch.nn.Conv2d(10, 5, kernel_size=1, stride=2, padding=0) + + +@add_module_test(torch.float32, torch.device("cuda"), [(1, 10, 224, 224)], enabled=trt_version() < '7.0') +def test_Conv2d_kernel3(): + return torch.nn.Conv2d(10, 5, kernel_size=3, stride=2, padding=1) + + +@add_module_test(torch.float32, torch.device("cuda"), [(1, 10, 224, 224)], enabled=trt_version() < '7.0') +def test_Conv2d_dilation2(): + return torch.nn.Conv2d(10, 5, kernel_size=3, stride=1, padding=1, dilation=2) diff --git a/torch2trt/converters/ConvTranspose.py b/torch2trt/converters/ConvTranspose.py new file mode 100644 index 00000000..4def33ce --- /dev/null +++ b/torch2trt/converters/ConvTranspose.py @@ -0,0 +1,79 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + +@tensorrt_converter('torch.nn.ConvTranspose2d.forward', enabled=trt_version() >= '7.0') +@tensorrt_converter('torch.nn.ConvTranspose3d.forward', enabled=trt_version() >= '7.0') +def convert_ConvTranspose2d_trt7(ctx): + module = ctx.method_args[0] + input = ctx.method_args[1] + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + output = ctx.method_return + + input_dim = input.dim() - 2 + + kernel_size = module.kernel_size + if not isinstance(kernel_size, tuple): + kernel_size = (kernel_size, ) * input_dim + + stride = module.stride + if not isinstance(stride, tuple): + stride = (stride, ) * input_dim + + padding = module.padding + if not isinstance(padding, tuple): + padding = (padding, ) * input_dim + + assert module.dilation == 1 or all([d == 1 for d in module.dilation]), \ + "Transposed convolution dilation is not supported in TensorRT" + + kernel = module.weight.detach().cpu().numpy() + + bias = trt.Weights(torch_dtype_to_trt(module.weight.dtype)) + if module.bias is not None: + bias = module.bias.detach().cpu().numpy() + + layer = ctx.network.add_deconvolution_nd( + input=input_trt, + num_output_maps=module.out_channels, + kernel_shape=kernel_size, + kernel=kernel, + bias=bias) + layer.stride_nd = stride + layer.padding_nd = padding + + if module.groups is not None: + layer.num_groups = module.groups + + output._trt = layer.get_output(0) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 7, 7)], enabled=trt_version() >= '7.0') +def test_ConvTranspose2d_basic_trt7(): + return torch.nn.ConvTranspose2d(10, 5, kernel_size=1, stride=1, padding=0) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 8, 8)], enabled=trt_version() >= '7.0') +def test_ConvTranspose2d_stride2_trt7(): + return torch.nn.ConvTranspose2d(10, 5, kernel_size=1, stride=2, padding=0) + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 
10, 9, 9)], enabled=trt_version() >= '7.0') +def test_ConvTranspose2d_kernel3_trt7(): + return torch.nn.ConvTranspose2d(10, 5, kernel_size=3, stride=2, padding=1) + + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 7, 7, 7)], enabled=trt_version() >= '7.0') +def test_ConvTranspose3d_basic_trt7(): + return torch.nn.ConvTranspose3d(10, 5, kernel_size=1, stride=1, padding=0) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 7, 7, 7)], enabled=trt_version() >= '7.0') +def test_ConvTranspose3d_stride2_trt7(): + return torch.nn.ConvTranspose3d(10, 5, kernel_size=1, stride=2, padding=0) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 6, 6, 6)], enabled=trt_version() >= '7.0') +def test_ConvTranspose3d_kernel3_trt7(): + return torch.nn.ConvTranspose3d(10, 5, kernel_size=3, stride=2, padding=1) + diff --git a/torch2trt/converters/ConvTranspose2d.py b/torch2trt/converters/ConvTranspose2d.py new file mode 100644 index 00000000..719b608d --- /dev/null +++ b/torch2trt/converters/ConvTranspose2d.py @@ -0,0 +1,68 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + +@tensorrt_converter("torch.nn.ConvTranspose2d.forward", enabled=trt_version() < '7.0') +def convert_ConvTranspose2d(ctx): + module = ctx.method_args[0] + input = ctx.method_args[1] + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + output = ctx.method_return + + kernel_size = module.kernel_size + if not isinstance(kernel_size, tuple): + kernel_size = (kernel_size,) * 2 + + stride = module.stride + if not isinstance(stride, tuple): + stride = (stride,) * 2 + + padding = module.padding + if not isinstance(padding, tuple): + padding = (padding,) * 2 + + kernel = module.weight.detach().cpu().numpy() + + bias = trt.Weights(torch_dtype_to_trt(module.weight.dtype)) + if module.bias is not None: + bias = module.bias.detach().cpu().numpy() + + layer = ctx.network.add_deconvolution( + input=input_trt, + num_output_maps=module.out_channels, + kernel_shape=kernel_size, + kernel=kernel, + bias=bias, + ) + layer.stride = stride + + # if output_padding in original pytorch layer is not 0, pre_padding and post_padding should be set respectively. Otherwise the output dimension of pytorch and tensorrt may be different. 
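+    # For example (illustrative values): a ConvTranspose2d with padding=(1, 1)
+    # and output_padding=(1, 1) keeps pre_padding at (1, 1) but reduces
+    # post_padding to (0, 0), which enlarges the TensorRT output by one row and
+    # one column and so reproduces PyTorch's output size.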
+ output_padding = module.output_padding + if output_padding[0] + output_padding[1] > 0: + layer.pre_padding = padding + layer.post_padding = trt.tensorrt.DimsHW(padding[0] - output_padding[0], padding[1] - output_padding[1]) + else: + layer.padding = padding + + if module.groups is not None: + layer.num_groups = module.groups + + output._trt = layer.get_output(0) + + +@add_module_test(torch.float32, torch.device("cuda"), [(1,3,224,224)], enabled=trt_version() < '7.0') +def test_square_kernel_equal_stride_mode(): + return torch.nn.ConvTranspose2d(3,3,3,stride=2) + +@add_module_test(torch.float32, torch.device("cuda"), [(1,3,224,224)], enabled=trt_version() < '7.0') +def test_square_kernel_equal_stride_mode_unequal_op_size(): + return torch.nn.ConvTranspose2d(3,6,3,stride=2) + +@add_module_test(torch.float32, torch.device("cuda"), [(1,3,224,224)], enabled=trt_version() < '7.0') +def test_unequal_stride_mode(): + return torch.nn.ConvTranspose2d(3,3,3, stride=(2,1), padding=(4,2)) + +@add_module_test(torch.float32, torch.device("cuda"), [(1,3,112,112)], enabled=trt_version() < '7.0') +@add_module_test(torch.float32, torch.device("cuda"), [(1,3,7,7)], enabled=trt_version() < '7.0') +def test_kernelsize_4(): + return torch.nn.ConvTranspose2d(3,3,4, stride=2, padding=1) + diff --git a/torch2trt/converters/Linear.py b/torch2trt/converters/Linear.py new file mode 100644 index 00000000..323a474f --- /dev/null +++ b/torch2trt/converters/Linear.py @@ -0,0 +1,45 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + +@tensorrt_converter('torch.nn.Linear.forward') +def convert_Linear(ctx): + module = ctx.method_args[0] + input = ctx.method_args[1] + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + output = ctx.method_return + + # reshape to ...xNx1x1 + layer = ctx.network.add_shuffle(input_trt) + layer.reshape_dims = tuple(input_trt.shape) + (1, 1) + + bias = trt.Weights(torch_dtype_to_trt(module.weight.dtype)) + if module.bias is not None: + bias = module.bias.detach().cpu().numpy() + + # add fully connected + layer = ctx.network.add_fully_connected( + input=layer.get_output(0), + num_outputs=module.out_features, + kernel=module.weight.detach().cpu().numpy(), + bias=bias) + + # reshape back to N + layer = ctx.network.add_shuffle(layer.get_output(0)) + layer.reshape_dims = tuple(output.shape[1:]) + + output._trt = layer.get_output(0) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 10)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 10)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 4, 10)]) +def test_Linear_basic(): + return torch.nn.Linear(10, 5) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 10)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 10)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 4, 10)]) +def test_Linear_no_bias(): + return torch.nn.Linear(10, 5, bias=False) \ No newline at end of file diff --git a/torch2trt/converters/LogSoftmax.py b/torch2trt/converters/LogSoftmax.py new file mode 100644 index 00000000..83ac4a95 --- /dev/null +++ b/torch2trt/converters/LogSoftmax.py @@ -0,0 +1,12 @@ +from torch2trt.torch2trt import * + + +@tensorrt_converter('torch.nn.LogSoftmax.forward') +def convert_LogSoftmax(ctx): + input = ctx.method_args[1] + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + output = ctx.method_return + layer = ctx.network.add_softmax(input=input_trt) + layer = ctx.network.add_unary(input=layer.get_output(0), + 
op=trt.UnaryOperation.LOG)
+    output._trt = layer.get_output(0)
\ No newline at end of file
diff --git a/torch2trt/converters/__init__.py b/torch2trt/converters/__init__.py
new file mode 100644
index 00000000..c1ab931b
--- /dev/null
+++ b/torch2trt/converters/__init__.py
@@ -0,0 +1,62 @@
+# dummy converters throw warnings when a method is encountered
+import tensorrt as trt
+from .dummy_converters import *
+
+# supported converters will override dummy converters
+
+from .AdaptiveAvgPool2d import *
+from .BatchNorm1d import *
+from .BatchNorm2d import *
+from .conv_functional import *
+from .Conv import *
+from .Conv1d import *
+from .Conv2d import *
+from .ConvTranspose import *
+from .ConvTranspose2d import *
+from .Linear import *
+from .LogSoftmax import *
+from .activation import *
+from .adaptive_avg_pool2d import *
+from .adaptive_max_pool2d import *
+from .add import *
+from .avg_pool import *
+from .batch_norm import *
+from .cat import *
+from .chunk import *
+from .clamp import *
+from .compare import *
+from .div import *
+from .expand import *
+from .floordiv import *
+from .getitem import *
+from .identity import *
+from .instance_norm import *
+from .interpolate import *
+from .group_norm import *
+from .max import *
+from .max_pool2d import *
+from .mean import *
+from .min import *
+from .mod import *
+from .mul import *
+from .normalize import *
+from .ne import *
+from .narrow import *
+from .pad import *
+from .permute import *
+from .pow import *
+from .prelu import *
+from .prod import *
+from .relu import *
+from .relu6 import *
+from .sigmoid import *
+from .softmax import *
+from .split import *
+from .stack import *
+from .sub import *
+from .sum import *
+from .tanh import *
+from .tensor import *
+from .transpose import *
+from .unary import *
+from .view import *
diff --git a/torch2trt/converters/activation.py b/torch2trt/converters/activation.py
new file mode 100644
index 00000000..63da5ae8
--- /dev/null
+++ b/torch2trt/converters/activation.py
@@ -0,0 +1,121 @@
+from torch2trt.torch2trt import *
+from torch2trt.module_test import add_module_test
+from .unary import UnaryModule
+
+
+# | RELU : Rectified Linear activation (impl in relu.py)
+# | SIGMOID : Sigmoid activation (impl in sigmoid.py)
+# | TANH : Hyperbolic Tangent activation (impl in tanh.py)
+
+
+# | LEAKY_RELU : Leaky Relu activation: f(x) = x if x >= 0, f(x) = alpha * x if x < 0
+
+
+@tensorrt_converter('torch.nn.functional.leaky_relu')
+@tensorrt_converter('torch.nn.functional.leaky_relu_')
+def convert_leaky_relu(ctx):
+    input = get_arg(ctx, 'input', pos=0, default=None)
+    negative_slope = get_arg(ctx, 'negative_slope', pos=1, default=0.01)
+    output = ctx.method_return
+
+    input_trt = add_missing_trt_tensors(ctx.network, [input])[0]
+    layer = ctx.network.add_activation(input_trt, trt.ActivationType.LEAKY_RELU)
+    layer.alpha = negative_slope
+
+    output._trt = layer.get_output(0)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 3)])
+def test_leaky_relu():
+    return UnaryModule(lambda x: torch.nn.functional.leaky_relu(x))
+
+
+# | ELU : Elu activation: f(x) = x if x >= 0, f(x) = alpha * (exp(x) - 1) if x < 0
+
+
+@tensorrt_converter('torch.nn.functional.elu')
+@tensorrt_converter('torch.nn.functional.elu_')
+def convert_elu(ctx):
+    input = get_arg(ctx, 'input', pos=0, default=None)
+    alpha = get_arg(ctx, 'alpha', pos=1, default=1.0)
+    output = ctx.method_return
+
+    input_trt = add_missing_trt_tensors(ctx.network, [input])[0]
+    layer = ctx.network.add_activation(input_trt,
trt.ActivationType.ELU) + layer.alpha = alpha + + output._trt = layer.get_output(0) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 3)]) +def test_elu(): + return UnaryModule(lambda x: torch.nn.functional.elu(x)) + + +# | SELU : Selu activation: f(x) = beta * x if x > 0, f(x) = beta * (alpha * exp(x) - alpha) if x <= 0 + +@tensorrt_converter('torch.selu') +@tensorrt_converter('torch.selu_') +@tensorrt_converter('torch.nn.functional.selu') +@tensorrt_converter('torch.nn.functional.selu_') +def convert_selu(ctx): + input = get_arg(ctx, 'input', pos=0, default=None) + alpha = get_arg(ctx, 'alpha', pos=1, default=1.0) + output = ctx.method_return + + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + layer = ctx.network.add_activation(input_trt, trt.ActivationType.SELU) + layer.alpha = 1.6732632423543772848170429916717 + layer.beta = 1.0507009873554804934193349852946 + + output._trt = layer.get_output(0) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 3)]) +def test_selu(): + return UnaryModule(lambda x: torch.nn.functional.selu(x)) + + +# | SOFTSIGN : Softsign activation: f(x) = x / (1 + \|x\|) + + +@tensorrt_converter('torch.nn.functional.softsign') +def convert_softsign(ctx): + input = get_arg(ctx, 'input', pos=0, default=None) + output = ctx.method_return + + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + layer = ctx.network.add_activation(input_trt, trt.ActivationType.SOFTSIGN) + + output._trt = layer.get_output(0) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 3)]) +def test_softsign(): + return UnaryModule(lambda x: torch.nn.functional.softsign(x)) + + +# | SOFTPLUS : Softplus activation: f(x) = alpha * log(exp(beta * x) + 1) + + +@tensorrt_converter('torch.nn.functional.softplus') +def convert_softplus(ctx): + input = get_arg(ctx, 'input', pos=0, default=None) + output = ctx.method_return + + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + layer = ctx.network.add_activation(input_trt, trt.ActivationType.SOFTPLUS) + + output._trt = layer.get_output(0) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 3)]) +def test_softplus(): + return UnaryModule(lambda x: torch.nn.functional.softplus(x)) + + +# | CLIP : Clip activation: f(x) = max(alpha, min(beta, x)) (impl in clamp.py) + +# | HARD_SIGMOID : Hard sigmoid activation: f(x) = max(0, min(1, alpha * x + beta)) (not sure if there is this in Pytorch?) +# | SCALED_TANH : Scaled Tanh activation: f(x) = alpha * tanh(beta * x) (not sure if there is this in Pytorch?) +# | THRESHOLDED_RELU : Thresholded Relu activation: f(x) = x if x > alpha, f(x) = 0 if x <= alpha (not sure if there is this in Pytorch?) 
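On the question raised in the comments above: recent PyTorch releases do ship a hard sigmoid, `torch.nn.functional.hardsigmoid`, defined as max(0, min(1, x / 6 + 1 / 2)), which lines up exactly with TensorRT's HARD_SIGMOID activation when alpha = 1/6 and beta = 0.5. A minimal converter sketch in the style of this file, assuming a PyTorch build that provides `F.hardsigmoid`; this converter is not part of the diff:

```python
from torch2trt.torch2trt import *

# Hypothetical converter, not included in this changeset: maps
# F.hardsigmoid(x) = max(0, min(1, x / 6 + 1 / 2)) onto TensorRT's
# HARD_SIGMOID activation, f(x) = max(0, min(1, alpha * x + beta)).
@tensorrt_converter('torch.nn.functional.hardsigmoid')
def convert_hardsigmoid(ctx):
    input = get_arg(ctx, 'input', pos=0, default=None)
    output = ctx.method_return
    input_trt = add_missing_trt_tensors(ctx.network, [input])[0]
    layer = ctx.network.add_activation(input_trt, trt.ActivationType.HARD_SIGMOID)
    layer.alpha = 1.0 / 6.0  # slope of the linear segment
    layer.beta = 0.5         # offset at x = 0
    output._trt = layer.get_output(0)
```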
\ No newline at end of file diff --git a/torch2trt/converters/adaptive_avg_pool2d.py b/torch2trt/converters/adaptive_avg_pool2d.py new file mode 100644 index 00000000..30710eb8 --- /dev/null +++ b/torch2trt/converters/adaptive_avg_pool2d.py @@ -0,0 +1,8 @@ +from torch2trt.torch2trt import * +from .AdaptiveAvgPool2d import * + + +@tensorrt_converter('torch.nn.functional.adaptive_avg_pool2d') +def convert_adaptive_avg_pool2d(ctx): + ctx.method_args = (torch.nn.AdaptiveAvgPool2d(ctx.method_args[1]), ctx.method_args[0]) + convert_AdaptiveAvgPool2d(ctx) diff --git a/torch2trt/converters/adaptive_max_pool2d.py b/torch2trt/converters/adaptive_max_pool2d.py new file mode 100644 index 00000000..78956ba2 --- /dev/null +++ b/torch2trt/converters/adaptive_max_pool2d.py @@ -0,0 +1,36 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + +@tensorrt_converter('torch.nn.functional.adaptive_max_pool2d') +def convert_adaptive_max_pool2d(ctx): + input = ctx.method_args[0] + output = ctx.method_return + + output_size = ctx.method_args[1] + if isinstance(output_size, int): + output_size = (output_size, ) * 2 + + stride = (input._trt.shape[-2] // output_size[-2], input._trt.shape[-1] // output_size[-1]) + + kernel_size = stride + layer = ctx.network.add_pooling( + input=input._trt, type=trt.PoolingType.MAX, window_size=kernel_size) + layer.stride = stride + + output._trt = layer.get_output(0) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224)]) +def test_adaptive_max_pool2d_1x1(): + return torch.nn.AdaptiveMaxPool2d((1, 1)) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224)]) +def test_adaptive_max_pool2d_2x2(): + return torch.nn.AdaptiveMaxPool2d((2, 2)) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224)]) +def test_adaptive_max_pool2d_3x3(): + return torch.nn.AdaptiveMaxPool2d((3, 3)) diff --git a/torch2trt/converters/add.py b/torch2trt/converters/add.py new file mode 100644 index 00000000..99118a21 --- /dev/null +++ b/torch2trt/converters/add.py @@ -0,0 +1,109 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + +@tensorrt_converter('torch.add') +@tensorrt_converter('torch.Tensor.__iadd__') +@tensorrt_converter('torch.Tensor.__add__') +@tensorrt_converter('torch.Tensor.__radd__') +def convert_add(ctx): + input_a = ctx.method_args[0] + input_b = ctx.method_args[1] + output = ctx.method_return + input_a_trt, input_b_trt = add_missing_trt_tensors(ctx.network, [input_a, input_b]) + input_a_trt, input_b_trt = broadcast_trt_tensors(ctx.network, [input_a_trt, input_b_trt], len(output.shape) - 1) + layer = ctx.network.add_elementwise(input_a_trt, input_b_trt, trt.ElementWiseOperation.SUM) + output._trt = layer.get_output(0) + + +class Add(torch.nn.Module): + def __init__(self): + super(Add, self).__init__() + + def forward(self, x, y): + return x + y + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224), (1, 3, 224, 224)]) +def test_add_basic(): + return Add() + + +class IAdd(torch.nn.Module): + def __init__(self): + super(IAdd, self).__init__() + + def forward(self, x, y): + x += y + return x + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224), (1, 3, 224, 224)]) +def test_add_iadd(): + return IAdd() + + +class TorchAdd(torch.nn.Module): + def __init__(self): + super(TorchAdd, self).__init__() + + def forward(self, x, y): + return torch.add(x, y) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 
224, 224), (1, 3, 224, 224)]) +def test_add_torchadd(): + return TorchAdd() + + +class RAddInt(torch.nn.Module): + def __init__(self): + super(RAddInt, self).__init__() + + def forward(self, x): + return 1 + x + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224)]) +def test_add_radd_int(): + return RAddInt() + + +class RAddFloat(torch.nn.Module): + def __init__(self): + super(RAddFloat, self).__init__() + + def forward(self, x): + return 1.0 + x + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224)]) +def test_add_radd_float(): + return RAddFloat() + + +class AddConstantNoBatch(torch.nn.Module): + def __init__(self): + super(AddConstantNoBatch, self).__init__() + self.register_buffer('y', torch.ones((3, 10, 10))) + + def forward(self, x): + return x + self.y + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 10, 10)]) +def test_add_constant_nobatch(): + return AddConstantNoBatch() + + +class AddConstantBatch(torch.nn.Module): + def __init__(self): + super(AddConstantBatch, self).__init__() + self.register_buffer('y', torch.ones((1, 3, 10, 10))) + + def forward(self, x): + return x + self.y + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 10, 10)]) +def test_add_constant_batch(): + return AddConstantBatch() diff --git a/torch2trt/converters/avg_pool.py b/torch2trt/converters/avg_pool.py new file mode 100644 index 00000000..185af508 --- /dev/null +++ b/torch2trt/converters/avg_pool.py @@ -0,0 +1,111 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + +@tensorrt_converter("torch.nn.functional.avg_pool2d", enabled=trt_version() < '7.0') +def convert_avg_pool2d(ctx): + # parse args + input = get_arg(ctx, "input", pos=0, default=None) + kernel_size = get_arg(ctx, "kernel_size", pos=1, default=None) + stride = get_arg(ctx, "stride", pos=2, default=None) + padding = get_arg(ctx, "padding", pos=3, default=0) + ceil_mode = get_arg(ctx, "ceil_mode", pos=4, default=False) + count_include_pad = get_arg(ctx, "count_include_pad", pos=5, default=True) + + # get input trt tensor (or create constant if it doesn't exist) + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + + output = ctx.method_return + + # get kernel size + if not isinstance(kernel_size, tuple): + kernel_size = (kernel_size,) * 2 + + # get stride + if not isinstance(stride, tuple): + stride = (stride,) * 2 + + # get padding + if not isinstance(padding, tuple): + padding = (padding,) * 2 + + layer = ctx.network.add_pooling( + input=input_trt, type=trt.PoolingType.AVERAGE, window_size=kernel_size + ) + + layer.stride = stride + layer.padding = padding + layer.average_count_excludes_padding = not count_include_pad + + if ceil_mode: + layer.padding_mode = trt.PaddingMode.EXPLICIT_ROUND_UP + + output._trt = layer.get_output(0) + + +@tensorrt_converter('torch.nn.functional.avg_pool2d', enabled=trt_version() >= '7.0') +@tensorrt_converter('torch.nn.functional.avg_pool3d', enabled=trt_version() >= '7.0') +def convert_avg_pool_trt7(ctx): + # parse args + input = get_arg(ctx, 'input', pos=0, default=None) + kernel_size = get_arg(ctx, 'kernel_size', pos=1, default=None) + stride = get_arg(ctx, 'stride', pos=2, default=None) + padding = get_arg(ctx, 'padding', pos=3, default=0) + ceil_mode = get_arg(ctx, 'ceil_mode', pos=4, default=False) + count_include_pad = get_arg(ctx, 'count_include_pad', pos=5, default=True) + + # get input trt tensor (or create constant if it doesn't exist) + input_trt = 
add_missing_trt_tensors(ctx.network, [input])[0] + output = ctx.method_return + + input_dim = input.dim() - 2 + + # get kernel size + if not isinstance(kernel_size, tuple): + kernel_size = (kernel_size, ) * input_dim + + # get stride + if not isinstance(stride, tuple): + stride = (stride, ) * input_dim + + # get padding + if not isinstance(padding, tuple): + padding = (padding, ) * input_dim + + layer = ctx.network.add_pooling_nd( + input=input_trt, type=trt.PoolingType.AVERAGE, window_size=kernel_size) + + layer.stride_nd = stride + layer.padding_nd = padding + layer.average_count_excludes_padding = not count_include_pad + + if ceil_mode: + layer.padding_mode = trt.PaddingMode.EXPLICIT_ROUND_UP + + output._trt = layer.get_output(0) + + +@add_module_test(torch.float32, torch.device("cuda"), [(1, 3, 4, 6)]) +@add_module_test(torch.float32, torch.device("cuda"), [(1, 3, 5, 7)]) +def test_avg_pool2d_without_ceil_mode(): + return torch.nn.AvgPool2d(kernel_size=3, stride=2, padding=1, ceil_mode=False) + + +@add_module_test(torch.float32, torch.device("cuda"), [(1, 3, 4, 6)]) +@add_module_test(torch.float32, torch.device("cuda"), [(1, 3, 5, 7)]) +def test_avg_pool2d_with_ceil_mode(): + return torch.nn.AvgPool2d( + kernel_size=3, stride=2, padding=1, ceil_mode=True, count_include_pad=False + ) # TRT does not support ceil_mode=True && count_include_pad=True + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 4, 4, 6)], enabled=trt_version() >= '7.0') +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 5, 7)], enabled=trt_version() >= '7.0') +def test_avg_pool3d_without_ceil_mode_trt7(): + return torch.nn.AvgPool3d(kernel_size=3, stride=2, padding=1, ceil_mode=False) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 4, 4, 6)], enabled=trt_version() >= '7.0') +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 5, 7)], enabled=trt_version() >= '7.0') +def test_avg_pool3d_with_ceil_mode_trt7(): + return torch.nn.AvgPool3d(kernel_size=3, stride=2, padding=1, ceil_mode=True, count_include_pad=False) # TRT does not support ceil_mode=True && count_include_pad=True diff --git a/torch2trt/converters/batch_norm.py b/torch2trt/converters/batch_norm.py new file mode 100644 index 00000000..30e7582f --- /dev/null +++ b/torch2trt/converters/batch_norm.py @@ -0,0 +1,40 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + +@tensorrt_converter('torch.nn.functional.batch_norm', enabled=trt_version() >= '7.0') +def convert_batch_norm_trt7(ctx): + + input = get_arg(ctx, 'input', pos=0, default=None) + running_mean = get_arg(ctx, 'running_mean', pos=1, default=None) + running_var = get_arg(ctx, 'running_var', pos=2, default=None) + + weight = get_arg(ctx, 'weight', pos=3, default=None) + bias = get_arg(ctx, 'bias', pos=4, default=None) + eps = get_arg(ctx, 'eps', pos=7, default=10e-6) + + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + output = ctx.method_return + + scale = weight.detach().cpu().numpy() / np.sqrt(running_var.detach().cpu().numpy() + eps) + bias = bias.detach().cpu().numpy() - running_mean.detach().cpu().numpy() * scale + power = np.ones_like(scale) + + layer = ctx.network.add_scale_nd(input_trt, trt.ScaleMode.CHANNEL, bias, scale, power, 0) + output._trt = layer.get_output(0) + + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 3, 3)], enabled=trt_version() >= '7.0') +def test_batch_norm_2d_trt7(): + return torch.nn.BatchNorm2d(10) + + +@add_module_test(torch.float32, 
torch.device('cuda'), [(1, 10, 3, 3, 3)], enabled=trt_version() >= '7.0') +def test_batch_norm_3d_2_trt7(): + return torch.nn.BatchNorm3d(10) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 32, 2, 36, 47)], enabled=trt_version() >= '7.0') +def test_batch_norm_3d_trt7(): + return torch.nn.BatchNorm3d(32) + diff --git a/torch2trt/converters/cat.py b/torch2trt/converters/cat.py new file mode 100644 index 00000000..23f039e8 --- /dev/null +++ b/torch2trt/converters/cat.py @@ -0,0 +1,28 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + +@tensorrt_converter('torch.cat') +def convert_cat(ctx): + inputs = get_arg(ctx, 'input', pos=0, default=None) + dim = get_arg(ctx, 'dim', pos=1, default=0) + + output = ctx.method_return + trt_inputs = add_missing_trt_tensors(ctx.network, inputs) + trt_inputs = broadcast_trt_tensors(ctx.network, trt_inputs, len(output.shape) - 1) + + layer = ctx.network.add_concatenation(inputs=trt_inputs) + layer.axis = dim - 1 + output._trt = layer.get_output(0) + +class Cat(torch.nn.Module): + def __init__(self, dim): + super(Cat, self).__init__() + self.dim = dim + + def forward(self, *x): + return torch.cat(x, dim=self.dim) + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 4, 4), (1, 3, 4), (1, 17, 4)]) +def test_Cat_basic(): + return Cat(1) diff --git a/torch2trt/converters/chunk.py b/torch2trt/converters/chunk.py new file mode 100644 index 00000000..eb870103 --- /dev/null +++ b/torch2trt/converters/chunk.py @@ -0,0 +1,60 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test +from .split import convert_split + + +@tensorrt_converter('torch.chunk') +@tensorrt_converter('torch.Tensor.chunk') +def convert_chunk(ctx): + convert_split(ctx) + + +class TorchChunk(torch.nn.Module): + + def __init__(self, *args, **kwargs): + super(TorchChunk, self).__init__() + self.args = args + self.kwargs = kwargs + + def forward(self, x): + return torch.chunk(x, *self.args, **self.kwargs) + + +class TensorChunk(torch.nn.Module): + + def __init__(self, *args, **kwargs): + super(TensorChunk, self).__init__() + self.args = args + self.kwargs = kwargs + + def forward(self, x): + return x.chunk(*self.args, **self.kwargs) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)]) +def test_torch_chunk_1_1(): + return TorchChunk(1, 1) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)]) +def test_torch_chunk_2_1(): + return TorchChunk(2, 1) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)]) +def test_torch_chunk_3_1(): + return TorchChunk(3, 1) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)]) +def test_torch_chunk_3_2(): + return TorchChunk(3, 2) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)]) +def test_tensor_chunk_3_2(): + return TensorChunk(3, 2) \ No newline at end of file diff --git a/torch2trt/converters/clamp.py b/torch2trt/converters/clamp.py new file mode 100644 index 00000000..5cc22577 --- /dev/null +++ b/torch2trt/converters/clamp.py @@ -0,0 +1,191 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + +def __add_clamp(network, trt_input, val, op): + + # create TensorRT 
constant for minimum value + val_shape = (1, ) * len(trt_input.shape) # broadcast all dimensions + val_tensor = val * torch.ones(val_shape, dtype=torch_dtype_from_trt(trt_input.dtype)).cpu().numpy() + val_trt = network.add_constant(val_shape, val_tensor) + layer = network.add_elementwise(trt_input, val_trt.get_output(0), op) + + return layer + + +# CLAMP_MIN + + +@tensorrt_converter('torch.clamp_min') +@tensorrt_converter('torch.Tensor.clamp_min') +def convert_clamp_min(ctx): + input = ctx.method_args[0] + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + val = ctx.method_args[1] + output = ctx.method_return + + layer = __add_clamp(ctx.network, input_trt, val, trt.ElementWiseOperation.MAX) + + output._trt = layer.get_output(0) + + +class TorchClampMin(torch.nn.Module): + def forward(self, x): + return torch.clamp_min(x, -0.1) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224)]) +def test_torch_clamp_min(): + return TorchClampMin() + + +class TensorClampMin(torch.nn.Module): + def forward(self, x): + return x.clamp_min(-0.1) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224)]) +def test_tensor_clamp_min(): + return TensorClampMin() + + +# CLAMP_MAX + + +@tensorrt_converter('torch.clamp_max') +@tensorrt_converter('torch.Tensor.clamp_max') +def convert_clamp_max(ctx): + input = ctx.method_args[0] + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + val = ctx.method_args[1] + output = ctx.method_return + + layer = __add_clamp(ctx.network, input_trt, val, trt.ElementWiseOperation.MIN) + + output._trt = layer.get_output(0) + + +class TorchClampMax(torch.nn.Module): + def forward(self, x): + return torch.clamp_max(x, 0.1) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224)]) +def test_torch_clamp_max(): + return TorchClampMax() + + +class TensorClampMax(torch.nn.Module): + def forward(self, x): + return x.clamp_max(0.1) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224)]) +def test_tensor_clamp_max(): + return TensorClampMax() + + +# CLAMP + +@tensorrt_converter('torch.clamp') +@tensorrt_converter('torch.Tensor.clamp') +def convert_clamp(ctx): + input = ctx.method_args[0] + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + output = ctx.method_return + if "min" in ctx.method_kwargs and "max" in ctx.method_kwargs: + min_val = ctx.method_kwargs["min"] + max_val = ctx.method_kwargs["max"] + layer = __add_clamp(ctx.network, input_trt, min_val, trt.ElementWiseOperation.MAX) + layer = __add_clamp(ctx.network, layer.get_output(0), max_val, trt.ElementWiseOperation.MIN) + elif "min" in ctx.method_kwargs: + min_val = ctx.method_kwargs["min"] + layer = __add_clamp(ctx.network, input_trt, min_val, trt.ElementWiseOperation.MAX) + elif "max" in ctx.method_kwargs: + max_val = ctx.method_kwargs["max"] + layer = __add_clamp(ctx.network, input_trt, max_val, trt.ElementWiseOperation.MIN) + else: + min_val = ctx.method_args[1] + max_val = ctx.method_args[2] + layer = __add_clamp(ctx.network, input_trt, min_val, trt.ElementWiseOperation.MAX) + layer = __add_clamp(ctx.network, layer.get_output(0), max_val, trt.ElementWiseOperation.MIN) + + output._trt = layer.get_output(0) + + +class TorchClamp(torch.nn.Module): + def forward(self, x): + return torch.clamp(x, -0.1, 0.1) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224)]) +def test_torch_clamp(): + return TorchClamp() + + +class TensorClamp(torch.nn.Module): + def forward(self, x): + return x.clamp(-0.1, 
0.1)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224)])
+def test_tensor_clamp():
+    return TensorClamp()
+
+
+class TorchClampOptionMax(torch.nn.Module):
+    def forward(self, x):
+        return torch.clamp(x, max=0.1)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224)])
+def test_torch_clamp_option_max():
+    return TorchClampOptionMax()
+
+class TorchClampOptionMin(torch.nn.Module):
+    def forward(self, x):
+        return torch.clamp(x, min=-0.1)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224)])
+def test_torch_clamp_option_min():
+    return TorchClampOptionMin()
+
+
+class TorchClampOptionMaxMin(torch.nn.Module):
+    def forward(self, x):
+        return torch.clamp(x, min=-0.1, max=0.1)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224)])
+def test_torch_clamp_option_max_min():
+    return TorchClampOptionMaxMin()
+
+
+class TensorClampOptionMax(torch.nn.Module):
+    def forward(self, x):
+        return x.clamp(max=0.1)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224)])
+def test_tensor_clamp_option_max():
+    return TensorClampOptionMax()
+
+class TensorClampOptionMin(torch.nn.Module):
+    def forward(self, x):
+        return x.clamp(min=-0.1)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224)])
+def test_tensor_clamp_option_min():
+    return TensorClampOptionMin()
+
+
+class TensorClampOptionMaxMin(torch.nn.Module):
+    def forward(self, x):
+        return x.clamp(min=-0.1, max=0.1)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224)])
+def test_tensor_clamp_option_max_min():
+    return TensorClampOptionMaxMin()
\ No newline at end of file
diff --git a/torch2trt/converters/compare.py b/torch2trt/converters/compare.py
new file mode 100644
index 00000000..fc2024ed
--- /dev/null
+++ b/torch2trt/converters/compare.py
@@ -0,0 +1,60 @@
+from torch2trt.torch2trt import *
+from torch2trt.module_test import add_module_test
+
+def convert_elementwise(ctx, op):
+    input_a = ctx.method_args[0]
+    input_b = ctx.method_args[1]
+    output = ctx.method_return
+    input_a_trt, input_b_trt = add_missing_trt_tensors(ctx.network, [input_a, input_b])
+    input_a_trt, input_b_trt = broadcast_trt_tensors(ctx.network, [input_a_trt, input_b_trt], len(output.shape) - 1)
+    layer = ctx.network.add_elementwise(input_a_trt, input_b_trt, op)
+    output._trt = layer.get_output(0)
+
+@tensorrt_converter('torch.gt', enabled=trt_version() >= '7.0')
+@tensorrt_converter('torch.Tensor.__gt__', enabled=trt_version() >= '7.0')
+def convert_gt(ctx):
+    return convert_elementwise(ctx, trt.ElementWiseOperation.GREATER)
+
+@tensorrt_converter('torch.lt', enabled=trt_version() >= '7.0')
+@tensorrt_converter('torch.Tensor.__lt__', enabled=trt_version() >= '7.0')
+def convert_lt(ctx):
+    return convert_elementwise(ctx, trt.ElementWiseOperation.LESS)
+
+@tensorrt_converter('torch.eq', enabled=trt_version() >= '7.0')
+@tensorrt_converter('torch.Tensor.__eq__', enabled=trt_version() >= '7.0')
+def convert_eq(ctx):
+    return convert_elementwise(ctx, trt.ElementWiseOperation.EQUAL)
+
+class GT(torch.nn.Module):
+    def __init__(self):
+        super(GT, self).__init__()
+
+    def forward(self, x, y):
+        return x > y
+
+class LT(torch.nn.Module):
+    def __init__(self):
+        super(LT, self).__init__()
+
+    def forward(self, x, y):
+        return x < y
+
+class EQ(torch.nn.Module):
+    def __init__(self):
+        super(EQ, self).__init__()
+
+    def forward(self, x, y):
+        return x == y
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 6, 6), (1, 3, 6, 6)], enabled=trt_version() >= '7.0')
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 6, 6), (1, 3, 6, 6)], enabled=trt_version() >= '7.0')
+def test_gt_basic():
+    return GT()
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 6, 6), (1, 3, 6, 6)], enabled=trt_version() >= '7.0')
+def test_lt_basic():
+    return LT()
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 6, 6), (1, 3, 6, 6)], enabled=trt_version() >= '7.0')
+def test_eq_basic():
+    return EQ()
diff --git a/torch2trt/converters/conv_functional.py b/torch2trt/converters/conv_functional.py
new file mode 100644
index 00000000..e9cc42d4
--- /dev/null
+++ b/torch2trt/converters/conv_functional.py
@@ -0,0 +1,127 @@
+from torch2trt.torch2trt import *
+from torch2trt.module_test import add_module_test
+
+
+@tensorrt_converter('torch.nn.functional.conv2d', enabled=trt_version() >= '7.0')
+@tensorrt_converter('torch.nn.functional.conv3d', enabled=trt_version() >= '7.0')
+def convert_Conv_trt7_functional(ctx):
+    input = get_arg(ctx, 'input', pos=0, default=None)
+    weight = get_arg(ctx, 'weight', pos=1, default=None)
+    bias = get_arg(ctx, 'bias', pos=2, default=None)
+    stride = get_arg(ctx, 'stride', pos=3, default=1)
+    padding = get_arg(ctx, 'padding', pos=4, default=0)
+    dilation = get_arg(ctx, 'dilation', pos=5, default=1)
+    groups = get_arg(ctx, 'groups', pos=6, default=1)
+
+    input_trt = add_missing_trt_tensors(ctx.network, [input])[0]
+    output = ctx.method_return
+
+    input_dim = input.dim() - 2
+
+    out_channels = int(weight.shape[0])
+    kernel_size = tuple(weight.shape[2:])  # always a tuple, one entry per spatial dim
+
+    if not isinstance(stride, tuple):
+        stride = (stride, ) * input_dim
+
+    if not isinstance(padding, tuple):
+        padding = (padding, ) * input_dim
+
+    if not isinstance(dilation, tuple):
+        dilation = (dilation, ) * input_dim
+
+    kernel = weight.detach().cpu().numpy()
+
+    if bias is not None:
+        bias = bias.detach().cpu().numpy()
+
+    layer = ctx.network.add_convolution_nd(
+        input=input_trt,
+        num_output_maps=out_channels,
+        kernel_shape=kernel_size,
+        kernel=kernel,
+        bias=bias)
+    layer.stride_nd = stride
+    layer.padding_nd = padding
+    layer.dilation_nd = dilation
+
+    if groups is not None:
+        layer.num_groups = groups
+
+    output._trt = layer.get_output(0)
+
+
+class FunctionalConv2d(torch.nn.Module):
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+        self.conv = torch.nn.Conv2d(*args, **kwargs)
+
+    def forward(self, x):
+        x = torch.nn.functional.conv2d(
+            x,
+            self.conv.weight,
+            self.conv.bias,
+            self.conv.stride,
+            self.conv.padding,
+            self.conv.dilation,
+            self.conv.groups
+        )
+        return x
+
+class FunctionalConv3d(torch.nn.Module):
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+        self.conv = torch.nn.Conv3d(*args, **kwargs)
+
+    def forward(self, x):
+        x = torch.nn.functional.conv3d(
+            x,
+            self.conv.weight,
+            self.conv.bias,
+            self.conv.stride,
+            self.conv.padding,
+            self.conv.dilation,
+            self.conv.groups
+        )
+        return x
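+
+# The converter above derives the layer configuration from the weight tensor
+# itself: conv2d weights are laid out as (out_channels, in_channels // groups,
+# kH, kW), so weight.shape[0] gives the output channel count and
+# weight.shape[2:] the kernel size. For example (hypothetical shapes):
+#
+#   w = torch.randn(5, 10, 3, 3)  # the weight of Conv2d(10, 5, kernel_size=3)
+#   out_channels, kernel_size = int(w.shape[0]), tuple(w.shape[2:])  # 5, (3, 3)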
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 224, 224)], enabled=trt_version() >= '7.0')
+def test_Conv2d_basic_trt7_functional():
+    return FunctionalConv2d(10, 5, kernel_size=1, stride=1, padding=0)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 224, 224)], enabled=trt_version() >= '7.0')
+def test_Conv2d_stride2_trt7_functional():
+    return FunctionalConv2d(10, 5, kernel_size=1, stride=2, padding=0)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 224, 224)], enabled=trt_version() >= '7.0')
+def test_Conv2d_kernel3_trt7_functional():
+    return FunctionalConv2d(10, 5, kernel_size=3, stride=2, padding=1)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 224, 224)], enabled=trt_version() >= '7.0')
+def test_Conv2d_dilation2_trt7_functional():
+    return FunctionalConv2d(10, 5, kernel_size=3, stride=1, padding=1, dilation=2)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 64, 64, 64)], enabled=trt_version() >= '7.0')
+def test_Conv3d_basic_trt7_functional():
+    return FunctionalConv3d(10, 5, kernel_size=1, stride=1, padding=0)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 64, 64, 64)], enabled=trt_version() >= '7.0')
+def test_Conv3d_stride2_trt7_functional():
+    return FunctionalConv3d(10, 5, kernel_size=1, stride=2, padding=0)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 64, 64, 64)], enabled=trt_version() >= '7.0')
+def test_Conv3d_kernel3_trt7_functional():
+    return FunctionalConv3d(10, 5, kernel_size=3, stride=2, padding=1)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 64, 64, 64)], enabled=trt_version() >= '7.0')
+def test_Conv3d_dilation2_trt7_functional():
+    return FunctionalConv3d(10, 5, kernel_size=3, stride=1, padding=1, dilation=2)
diff --git a/torch2trt/converters/div.py b/torch2trt/converters/div.py
new file mode 100644
index 00000000..f7e95642
--- /dev/null
+++ b/torch2trt/converters/div.py
@@ -0,0 +1,123 @@
+from torch2trt.torch2trt import *
+from torch2trt.module_test import add_module_test
+
+
+@tensorrt_converter('torch.div')
+@tensorrt_converter('torch.Tensor.__div__')  # py2
+@tensorrt_converter('torch.Tensor.__idiv__')  # py2
+@tensorrt_converter('torch.Tensor.__truediv__')  # py3
+@tensorrt_converter('torch.Tensor.__itruediv__')  # py3
+def convert_div(ctx):
+    input_a = ctx.method_args[0]
+    input_b = ctx.method_args[1]
+    output = ctx.method_return
+    input_a_trt, input_b_trt = add_missing_trt_tensors(ctx.network, [input_a, input_b])
+    input_a_trt, input_b_trt = broadcast_trt_tensors(ctx.network, [input_a_trt, input_b_trt], len(output.shape) - 1)
+    layer = ctx.network.add_elementwise(input_a_trt, input_b_trt, trt.ElementWiseOperation.DIV)
+    output._trt = layer.get_output(0)
+
+
+@tensorrt_converter('torch.Tensor.__rdiv__')  # py2
+@tensorrt_converter('torch.Tensor.__rtruediv__')  # py3
+def convert_rdiv(ctx):
+    input_a = ctx.method_args[1]  # inputs switched for rdiv
+    input_b = ctx.method_args[0]
+    output = ctx.method_return
+    input_a_trt, input_b_trt = add_missing_trt_tensors(ctx.network, [input_a, input_b])
+    input_a_trt, input_b_trt = broadcast_trt_tensors(ctx.network, [input_a_trt, input_b_trt], len(output.shape) - 1)
+    layer = ctx.network.add_elementwise(input_a_trt, input_b_trt, trt.ElementWiseOperation.DIV)
+    output._trt = layer.get_output(0)
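+
+
+# Reflected division note: for `100 / x`, Python first tries
+# (100).__truediv__(x), which returns NotImplemented, and then calls
+# x.__rtruediv__(100). The converter therefore receives method_args == (x, 100)
+# and swaps the operands back before emitting the DIV layer;
+# add_missing_trt_tensors materializes the scalar as a TensorRT constant.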
+
+
+class Div(torch.nn.Module):
+    def __init__(self):
+        super(Div, self).__init__()
+
+    def forward(self, x, y):
+        return x / y
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224), (1, 3, 224, 224)])
+def test_div_basic():
+    return Div()
+
+
+class IDiv(torch.nn.Module):
+    def __init__(self):
+        super(IDiv, self).__init__()
+
+    def forward(self, x, y):
+        x /= y
+        return x
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224), (1, 3, 224, 224)])
+def test_div_idiv():
+    return IDiv()
+
+
+class TorchDiv(torch.nn.Module):
+    def __init__(self):
+        super(TorchDiv, self).__init__()
+
+    def forward(self, x, y):
+        return torch.div(x, y)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224), (1, 3, 224, 224)])
+def test_div_torchdiv():
+    return TorchDiv()
+
+
+class RDivInt(torch.nn.Module):
+    def __init__(self):
+        super(RDivInt, self).__init__()
+
+    def forward(self, x):
+        return 100 / x
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)])
+def test_rdiv_int():
+    return RDivInt()
+
+
+class RDivFloat(torch.nn.Module):
+    def __init__(self):
+        super(RDivFloat, self).__init__()
+
+    def forward(self, x):
+        return 100.0 / x
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)])
+def test_rdiv_float():
+    return RDivFloat()
+
+
+class DivConstantNoBatch(torch.nn.Module):
+    def __init__(self):
+        super(DivConstantNoBatch, self).__init__()
+        self.register_buffer('y', torch.ones((3, 10, 10)))
+
+    def forward(self, x):
+        return x / self.y
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 10, 10)])
+def test_div_constant_nobatch():
+    return DivConstantNoBatch()
+
+
+class DivConstantBatch(torch.nn.Module):
+    def __init__(self):
+        super(DivConstantBatch, self).__init__()
+        self.register_buffer('y', torch.ones((1, 3, 10, 10)))
+
+    def forward(self, x):
+        return x / self.y
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 10, 10)])
+def test_div_constant_batch():
+    return DivConstantBatch()
diff --git a/torch2trt/converters/dummy_converters.py b/torch2trt/converters/dummy_converters.py
new file mode 100644
index 00000000..8f790e79
--- /dev/null
+++ b/torch2trt/converters/dummy_converters.py
@@ -0,0 +1,37 @@
+from torch2trt.torch2trt import *
+
+
+def is_private(method):
+    method = method.split('.')[-1]  # remove module prefix
+    return method[0] == '_' and method[1] != '_'  # single leading underscore only
+
+def is_function_type(method):
+    fntype = eval(method + '.__class__.__name__')
+    return fntype == 'function' or fntype == 'builtin_function_or_method' or fntype == 'method_descriptor'
+
+def get_methods(namespace):
+    methods = []
+    for method in dir(eval(namespace)):
+        full_method = namespace + '.' + method
+        if not is_private(full_method) and is_function_type(full_method):
+            methods.append(full_method)
+    return methods
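+
+# get_methods simply enumerates the public callables of a namespace, e.g.
+# (illustrative, abridged): get_methods('torch') -> ['torch.abs', 'torch.add',
+# 'torch.cat', ...]. Every name collected here receives the warning-only
+# converter below, so tracing a model that hits an unsupported op prints a
+# warning rather than failing silently.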
+
+
+TORCH_METHODS = []
+TORCH_METHODS += get_methods('torch')
+TORCH_METHODS += get_methods('torch.Tensor')
+TORCH_METHODS += get_methods('torch.nn.functional')
+
+
+for method in TORCH_METHODS:
+
+    @tensorrt_converter(method, is_real=False)
+    def warn_method(ctx):
+        print('Warning: Encountered known unsupported method %s' % ctx.method_str)
+
+
+@tensorrt_converter('torch.Tensor.dim', is_real=False)
+@tensorrt_converter('torch.Tensor.size', is_real=False)
+def dont_warn(ctx):
+    pass
\ No newline at end of file
diff --git a/torch2trt/converters/expand.py b/torch2trt/converters/expand.py
new file mode 100644
index 00000000..e0d07540
--- /dev/null
+++ b/torch2trt/converters/expand.py
@@ -0,0 +1,43 @@
+from torch2trt.torch2trt import *
+from torch2trt.module_test import add_module_test
+
+
+@tensorrt_converter('torch.Tensor.expand')
+def convert_expand(ctx):
+    input = ctx.method_args[0]
+    sizes = ctx.method_args[1:]
+    output = ctx.method_return
+
+    inshape = tuple(input.shape)[1:]  # exclude batch
+    shape = tuple(output.shape)[1:]
+    ndim = len(shape)
+    start = tuple([0] * ndim)
+    stride = tuple([int(i == o) for i, o in zip(inshape, shape)])  # stride == 1 if dimensions match, 0 otherwise
+
+    layer = ctx.network.add_slice(input._trt, start, shape, stride)
+
+    output._trt = layer.get_output(0)
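+
+# The zero-stride slice mirrors how Tensor.expand works inside PyTorch:
+# expanded dimensions are views with stride 0, so every output index reads the
+# same input element. Worked example (batch excluded): expanding (1, 1, 3) to
+# (3, 3, 3) yields stride (0, 0, 1) -- the first two dims repeat, the last
+# passes through.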
+
+
+class ExpandModule(torch.nn.Module):
+    def __init__(self, *sizes):
+        super(ExpandModule, self).__init__()
+        self.sizes = sizes
+
+    def forward(self, x):
+        return x.expand(*self.sizes)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 1, 3, 3)])
+def test_tensor_expand_singledim():
+    return ExpandModule(1, 3, 3, 3)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 1, 1, 3)])
+def test_tensor_expand_multidim():
+    return ExpandModule(1, 3, 3, 3)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 1, 1, 3)])
+def test_tensor_expand_inferdim():
+    return ExpandModule(1, 3, -1, -1)
\ No newline at end of file
diff --git a/torch2trt/converters/floordiv.py b/torch2trt/converters/floordiv.py
new file mode 100644
index 00000000..9864ca4e
--- /dev/null
+++ b/torch2trt/converters/floordiv.py
@@ -0,0 +1,81 @@
+from torch2trt.torch2trt import *
+from torch2trt.module_test import add_module_test
+
+
+@tensorrt_converter('torch.Tensor.__floordiv__')
+@tensorrt_converter('torch.Tensor.__ifloordiv__')
+@tensorrt_converter('torch.floor_divide')
+def convert_floordiv(ctx):
+    input_a = ctx.method_args[0]
+    input_b = ctx.method_args[1]
+    output = ctx.method_return
+    input_a_trt, input_b_trt = add_missing_trt_tensors(ctx.network, [input_a, input_b])
+    input_a_trt, input_b_trt = broadcast_trt_tensors(ctx.network, [input_a_trt, input_b_trt], len(output.shape) - 1)
+    # We cannot use ElementWiseOperation.FLOOR_DIV directly, because Torch
+    # truncates negative results toward 0 while TensorRT's FLOOR_DIV rounds
+    # toward -Inf. Emulate truncation instead:
+    #   sign = ab / |ab|
+    #   result = sign * (|a| // |b|)
+    ab_layer = ctx.network.add_elementwise(input_a_trt, input_b_trt, trt.ElementWiseOperation.PROD)
+    abs_ab_layer = ctx.network.add_unary(ab_layer.get_output(0), trt.UnaryOperation.ABS)
+    sign_layer = ctx.network.add_elementwise(ab_layer.get_output(0), abs_ab_layer.get_output(0),
+                                             trt.ElementWiseOperation.DIV)
+    abs_a_layer = ctx.network.add_unary(input_a_trt, trt.UnaryOperation.ABS)
+    abs_b_layer = ctx.network.add_unary(input_b_trt, trt.UnaryOperation.ABS)
+    abs_floor_layer = ctx.network.add_elementwise(abs_a_layer.get_output(0), abs_b_layer.get_output(0),
+                                                  trt.ElementWiseOperation.FLOOR_DIV)
+    out_layer = ctx.network.add_elementwise(sign_layer.get_output(0), abs_floor_layer.get_output(0),
+                                            trt.ElementWiseOperation.PROD)
+    output._trt = out_layer.get_output(0)
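+
+
+# Worked example of the sign trick above, with a = -7 and b = 2:
+#   TensorRT FLOOR_DIV:  floor(-7 / 2) = -4   (rounds toward -Inf)
+#   torch behavior here: trunc(-7 / 2) = -3   (rounds toward zero)
+#   emulation: sign = -14 / |-14| = -1;  |a| // |b| = 7 // 2 = 3;  -1 * 3 = -3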
+
+
+class FloorDiv(torch.nn.Module):
+    def __init__(self):
+        super(FloorDiv, self).__init__()
+
+    def forward(self, x, y):
+        return x // y
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 40, 20), (1, 3, 1, 20)])
+def test_floordiv_op():
+    return FloorDiv()
+
+
+class FloorDivAssign(torch.nn.Module):
+    def __init__(self):
+        super(FloorDivAssign, self).__init__()
+
+    def forward(self, x, y):
+        x //= y
+        return x
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 40, 20), (1, 3, 1, 20)])
+def test_floordiv_op_assign():
+    return FloorDivAssign()
+
+
+class FloorDivConst(torch.nn.Module):
+    def __init__(self):
+        super(FloorDivConst, self).__init__()
+
+    def forward(self, x):
+        return x // 2.
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 40, 20)])
+def test_floordiv_op_const():
+    return FloorDivConst()
+
+
+class TorchFloorDiv(torch.nn.Module):
+    def __init__(self):
+        super(TorchFloorDiv, self).__init__()
+
+    def forward(self, x, y):
+        return torch.floor_divide(x, y)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 40, 20), (1, 3, 1, 20)])
+def test_floordiv_func():
+    return TorchFloorDiv()
diff --git a/torch2trt/converters/getitem.py b/torch2trt/converters/getitem.py
new file mode 100644
index 00000000..92203d50
--- /dev/null
+++ b/torch2trt/converters/getitem.py
@@ -0,0 +1,155 @@
+from torch2trt.torch2trt import *
+from torch2trt.module_test import add_module_test
+
+
+def slice_to_trt(dim_size, dim_slice):
+
+    start = 0 if dim_slice.start is None else dim_slice.start
+    stop = dim_size if dim_slice.stop is None else dim_slice.stop
+    stride = 1 if dim_slice.step is None else dim_slice.step
+
+    size = (stop - start - 1) // stride + 1
+
+    return start, size, stride
+
+
+def num_slice_types(slices):
+    num_slice = 0
+    for s in slices:
+        if isinstance(s, slice) or isinstance(s, int):
+            num_slice += 1
+    return num_slice
+
+
+@tensorrt_converter('torch.Tensor.__getitem__')
+def convert_tensor_getitem(ctx):
+    input = ctx.method_args[0]
+    slices = ctx.method_args[1]
+    output = ctx.method_return
+
+    # a bare index such as x[0] arrives as a single item, not a tuple
+    if not isinstance(slices, tuple):
+        slices = (slices,)
+
+    input_trt = input._trt
+
+    # Step 1 - Replace ellipsis with expanded slices
+
+    num_ellipsis = len(input.shape) - num_slice_types(slices)
+
+    new_slices = []
+    for s in slices:
+
+        if s == Ellipsis:
+            while num_ellipsis > 0:
+                new_slices.append(slice(None, None, None))
+                num_ellipsis -= 1
+        elif isinstance(s, slice):
+            new_slices.append(s)
+        elif s is None:
+            new_slices.append(None)
+        elif isinstance(s, int):
+            new_slices.append(s)
+
+    # fill missing slices at end
+    while num_slice_types(new_slices) < len(input.shape):
+        new_slices.append(slice(None, None, None))
+
+    # Step 2 - Remove batch from slices (TRT from this point)
+
+    slices = tuple(new_slices[1:])  # remove batch
+
+    # Step 3 - Add slice layer (will currently ignore 'None' slices)
+
+    starts = []
+    sizes = []
+    strides = []
+
+    input_dim = 0
+    for s in slices:
+
+        if input_dim >= len(input_trt.shape):
+            break
+
+        input_size = int(input_trt.shape[input_dim])
+
+        if isinstance(s, slice):
+            start, size, stride = slice_to_trt(input_size, s)
+            starts.append(start)
+            sizes.append(size)
+            strides.append(stride)
+            input_dim += 1
+
+        elif isinstance(s, int):
+            starts.append(s)
+            sizes.append(1)
+            strides.append(1)
+            input_dim += 1
+
+    output_trt = ctx.network.add_slice(input_trt, starts, sizes, strides).get_output(0)
+
+    # Step 4 - Add shuffle layer to insert dimensions for 'None' slices and remove dimensions for 'int' slices
+
+    num_non_slice = len([s for s in slices if not isinstance(s, slice)])
+    if num_non_slice > 0:
+        layer = ctx.network.add_shuffle(output_trt)
+        layer.reshape_dims = tuple(output.shape[1:])  # exclude batch
+        output_trt = layer.get_output(0)
+
+    output._trt = output_trt
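+
+
+# slice_to_trt maps a Python slice onto TensorRT's (start, size, stride)
+# triple, where size counts how many elements the slice touches. Worked
+# example for dim_size = 4 and x[1:4:2]: start = 1, stride = 2,
+# size = (4 - 1 - 1) // 2 + 1 = 2, i.e. indices 1 and 3.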
+
+
+class LambdaModule(torch.nn.Module):
+    def __init__(self, fn):
+        super(LambdaModule, self).__init__()
+        self.fn = fn
+
+    def forward(self, x):
+        return self.fn(x)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 3)])
+def test_tensor_getitem_1d_int():
+    return LambdaModule(lambda x: x[:, 0])
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 4, 3)])
+def test_tensor_getitem_2d_int():
+    return LambdaModule(lambda x: x[:, 0])
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 4, 3)])
+def test_tensor_getitem_2d_strided():
+    return LambdaModule(lambda x: x[:, ::2])
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 4, 3)])
+def test_tensor_getitem_2d_strided_offset():
+    return LambdaModule(lambda x: x[:, 1::2])
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 4, 3)])
+def test_tensor_getitem_2d_strided_range():
+    return LambdaModule(lambda x: x[:, 1:3:2])
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 4, 3)])
+def test_tensor_getitem_2d_insert_dim():
+    return LambdaModule(lambda x: x[:, None])
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 4, 3)])
+def test_tensor_getitem_2d_insert_dim_ellipsis():
+    return LambdaModule(lambda x: x[:, None, ...])
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 4, 3)])
+def test_tensor_getitem_2d_append_dim():
+    return LambdaModule(lambda x: x[:, ..., None])
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 4, 3)])
+def test_tensor_getitem_2d_append_2dim():
+    return LambdaModule(lambda x: x[:, ..., None, None])
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 4, 3)])
+def test_tensor_getitem_2d_weird_combo():
+    return LambdaModule(lambda x: x[:, 0:3:4, None, None, 1, ...])
diff --git a/torch2trt/converters/group_norm.py b/torch2trt/converters/group_norm.py
new file mode 100644
index 00000000..6b7c37c3
--- /dev/null
+++ b/torch2trt/converters/group_norm.py
@@ -0,0 +1,48 @@
+import torch.nn as nn
+from torch2trt.torch2trt import *
+from torch2trt.module_test import add_module_test
+
+def has_group_norm_plugin():
+    try:
+        from torch2trt.plugins import GroupNormPlugin
+        return True
+    except:
+        return False
+
+
+def get_group_norm_plugin(num_groups, weight, bias, eps):
+    from torch2trt.plugins import GroupNormPlugin
+    PLUGIN_NAME = 'group_norm'
+    registry = trt.get_plugin_registry()
+    creator = [c for c in registry.plugin_creator_list if c.name == PLUGIN_NAME and c.plugin_namespace == 'torch2trt'][0]
+    torch2trt_plugin = GroupNormPlugin(num_groups=num_groups, weight=weight, bias=bias, eps=eps)
+    return creator.deserialize_plugin(PLUGIN_NAME, torch2trt_plugin.serializeToString())
+
+@tensorrt_converter('torch.nn.GroupNorm.forward', enabled=has_group_norm_plugin())
+def convert_group_norm_trt(ctx):
+    module = ctx.method_args[0]
+    input = ctx.method_args[1]
+    num_groups = module.num_groups
+    weight = module.weight
+    bias = module.bias
+    eps = module.eps
+    input_trt = add_missing_trt_tensors(ctx.network, [input])[0]
+    output = ctx.method_return
+    plugin = get_group_norm_plugin(num_groups, weight, bias, eps)
+
+    layer = ctx.network.add_plugin_v2([input_trt], plugin)
+
+    output._trt = layer.get_output(0)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 112, 112)], enabled=has_group_norm_plugin())
+def test_group_norm_trt_g2_fp32():
+    return torch.nn.GroupNorm(2, 10)
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 112, 112)], enabled=has_group_norm_plugin())
+def test_group_norm_trt_g2_eps_fp32():
+    return torch.nn.GroupNorm(2, 10, eps=1e-4)
diff --git a/torch2trt/converters/identity.py b/torch2trt/converters/identity.py
new file mode 100644
index 00000000..f7ef1f97
--- /dev/null
+++ b/torch2trt/converters/identity.py
@@ -0,0 +1,22 @@
+from torch2trt.torch2trt import *
+
+
+@tensorrt_converter('torch.Tensor.contiguous')
+@tensorrt_converter('torch.nn.functional.dropout')
+@tensorrt_converter('torch.nn.functional.dropout2d')
+@tensorrt_converter('torch.nn.functional.dropout3d')
+def convert_functional_identity(ctx):
+    input = ctx.method_args[0]
+    input_trt = add_missing_trt_tensors(ctx.network, [input])[0]
+    output = ctx.method_return
+    output._trt = input_trt
+
+
+@tensorrt_converter('torch.nn.Dropout.forward')
+@tensorrt_converter('torch.nn.Dropout2d.forward')
+@tensorrt_converter('torch.nn.Dropout3d.forward')
+def convert_identity(ctx):
+    input = ctx.method_args[1]
+    input_trt = add_missing_trt_tensors(ctx.network, [input])[0]
+    output = ctx.method_return
+    output._trt = input_trt
diff --git a/torch2trt/converters/instance_norm.py b/torch2trt/converters/instance_norm.py
new file mode 100644
index 00000000..5785fb20
--- /dev/null
+++ b/torch2trt/converters/instance_norm.py
@@ -0,0 +1,150 @@
+from torch2trt.torch2trt import *
+from torch2trt.module_test import add_module_test
+
+
+def _add_scale_1d2d3d(network, x_trt, mode, offset, scale, power):
+    ndim = len(x_trt.shape)
+
+    y_trt = x_trt
+
+    # reshape to 3D, collapsing everything after the first two dims,
+    # since add_scale expects exactly three (non-batch) dimensions
+    if ndim != 3:
+        layer = network.add_shuffle(y_trt)
+        layer.reshape_dims = (x_trt.shape[0], x_trt.shape[1], -1)
+        y_trt = layer.get_output(0)
+
+    y_trt = network.add_scale(y_trt, mode, offset, scale, power).get_output(0)
+
+    # reshape the scale output back to the original dimensions
+    if ndim != 3:
+        layer = network.add_shuffle(y_trt)
+        layer.reshape_dims = tuple(x_trt.shape)
+        y_trt = layer.get_output(0)
+
+    return y_trt
+
+@tensorrt_converter('torch.instance_norm')
+@tensorrt_converter('torch.nn.functional.instance_norm')
+def convert_instance_norm(ctx):
+    input = get_arg(ctx, 'input', pos=0, default=None)
+    running_mean = get_arg(ctx, 'running_mean', pos=1, default=None)
+    running_var = get_arg(ctx, 'running_var', pos=2, default=None)
+    weight = get_arg(ctx, 'weight', pos=3, default=None)
+    bias = get_arg(ctx, 'bias', pos=4, default=None)
+    use_input_stats = get_arg(ctx, 'use_input_stats', pos=5, default=True)
+    momentum = get_arg(ctx, 'momentum', pos=6, default=0.1)
+    eps = get_arg(ctx, 'eps', pos=7, default=1e-05)
+    output = ctx.method_return
+
+    # CASE 1 - USING RUNNING STATISTICS
+    if not use_input_stats:
+
+        # equivalent to batch norm: y = scale * x + offset, where
+        #   scale  = weight / sqrt(running_var + eps)
+        #   offset = bias - running_mean * scale
+        scale = 1.0 / np.sqrt(running_var.detach().cpu().numpy() + eps)
+        offset = -running_mean.detach().cpu().numpy() * scale
+        power = np.ones_like(scale)
+
+        if weight is not None:
+            # fold in the affine parameters; the offset must be scaled by the
+            # weight as well, since y = weight * (x - mean) / std + bias
+            offset = offset * weight.detach().cpu().numpy() + bias.detach().cpu().numpy()
+            scale = scale * weight.detach().cpu().numpy()
+
+        result_trt = _add_scale_1d2d3d(ctx.network, input._trt, trt.ScaleMode.CHANNEL, offset, scale, power)
+
+        output._trt = result_trt
+
+    # CASE 2 - USING INPUT STATS
+    else:
+
+        eps_np = np.array([eps], dtype=np.float32)
+        keep_dims = True
+        reduce_axes = torch_dim_to_trt_axes(tuple(range(2, len(input.shape))))
+
+        # compute mean over spatial dimensions
+        mean_trt = ctx.network.add_reduce(input._trt, trt.ReduceOperation.AVG, reduce_axes, keep_dims).get_output(0)
+
+        # compute variance over spatial dimensions (include eps, to reduce layer count)
+        delta_trt = ctx.network.add_elementwise(input._trt, mean_trt, trt.ElementWiseOperation.SUB).get_output(0)
+        var_trt = ctx.network.add_scale(delta_trt, trt.ScaleMode.UNIFORM, np.zeros_like(eps_np), np.ones_like(eps_np), 2 * np.ones_like(eps_np)).get_output(0)
+        var_trt = ctx.network.add_reduce(var_trt, trt.ReduceOperation.AVG, reduce_axes, keep_dims).get_output(0)
+
+        # compute sqrt(var + eps)
+        var_trt = ctx.network.add_scale(var_trt, trt.ScaleMode.UNIFORM, eps_np, np.ones_like(eps_np), 0.5 * np.ones_like(eps_np)).get_output(0)
+
+        # compute final result
+        result_trt = ctx.network.add_elementwise(delta_trt, var_trt, trt.ElementWiseOperation.DIV).get_output(0)
+
+        # compute affine (if applicable)
+        if weight is not None:
+
+            weight_np = weight.detach().cpu().numpy()
+            bias_np = bias.detach().cpu().numpy()
+
+            result_trt = _add_scale_1d2d3d(ctx.network, result_trt, trt.ScaleMode.CHANNEL, bias_np, weight_np, np.ones_like(bias_np))
+
+        output._trt = result_trt
+
+
+# STATIC
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 3)])
+def test_instance_norm_1d_static():
+    return torch.nn.InstanceNorm1d(10, track_running_stats=True)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 3, 3)])
+def test_instance_norm_2d_static():
+    return torch.nn.InstanceNorm2d(10, track_running_stats=True)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 3, 3, 3)])
+def test_instance_norm_3d_static():
+    return torch.nn.InstanceNorm3d(10, track_running_stats=True)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 3)])
+def test_instance_norm_1d_static_affine():
+    return torch.nn.InstanceNorm1d(10, affine=True, track_running_stats=True)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 3, 3)])
+def test_instance_norm_2d_static_affine():
+    return torch.nn.InstanceNorm2d(10, affine=True, track_running_stats=True)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 3, 3, 3)])
+def test_instance_norm_3d_static_affine():
+    return torch.nn.InstanceNorm3d(10, affine=True, track_running_stats=True)
+
+# DYNAMIC
+
+# @TODO(jwelsh): 1D dynamic test failing
+# @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 3)])
+# def test_instance_norm_1d_dynamic():
+#     return torch.nn.InstanceNorm1d(10, track_running_stats=False)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 3, 3)])
+def test_instance_norm_2d_dynamic():
+    return torch.nn.InstanceNorm2d(10, track_running_stats=False)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 3, 3, 3)])
+def test_instance_norm_3d_dynamic():
+    return torch.nn.InstanceNorm3d(10, track_running_stats=False)
+
+
+# @TODO(jwelsh): 1D dynamic test failing
+# @add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 3)])
+# def test_instance_norm_1d_dynamic_affine():
+#     return torch.nn.InstanceNorm1d(10, affine=True, track_running_stats=False)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 3, 3)])
+def test_instance_norm_2d_dynamic_affine():
+    return torch.nn.InstanceNorm2d(10, affine=True, track_running_stats=False)
+
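+
+# Reference semantics for the dynamic branch above, as plain PyTorch (an
+# illustrative sketch; x is NCHW and eps matches the converter argument):
+#
+#   mean = x.mean(dim=(2, 3), keepdim=True)
+#   var = ((x - mean) ** 2).mean(dim=(2, 3), keepdim=True)
+#   y = (x - mean) / torch.sqrt(var + eps)
+#
+# The two add_scale calls implement the squaring (power=2) and the square
+# root with the eps shift (shift=eps, power=0.5).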
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 3, 3, 3)])
+def test_instance_norm_3d_dynamic_affine():
+    return torch.nn.InstanceNorm3d(10, affine=True, track_running_stats=False)
diff --git a/torch2trt/converters/interpolate.py b/torch2trt/converters/interpolate.py
new file mode 100644
index 00000000..dfa20d19
--- /dev/null
+++ b/torch2trt/converters/interpolate.py
@@ -0,0 +1,179 @@
+import torch.nn.functional as F
+import torch.nn as nn
+from torch2trt.torch2trt import *
+from torch2trt.module_test import add_module_test
+import collections.abc
+
+
+def has_interpolate_plugin():
+    try:
+        from torch2trt.plugins import InterpolatePlugin
+        return True
+    except:
+        return False
+
+def get_interpolate_plugin(size, mode, align_corners):
+    from torch2trt.plugins import InterpolatePlugin
+    PLUGIN_NAME = 'interpolate'
+    registry = trt.get_plugin_registry()
+    creator = [c for c in registry.plugin_creator_list if c.name == PLUGIN_NAME and c.plugin_namespace == 'torch2trt'][0]
+    torch2trt_plugin = InterpolatePlugin(size=size, mode=mode, align_corners=align_corners)
+    return creator.deserialize_plugin(PLUGIN_NAME, torch2trt_plugin.serializeToString())
+
+
+@tensorrt_converter('torch.nn.functional.interpolate', enabled=trt_version() < '7.1' and has_interpolate_plugin())
+def convert_interpolate_plugin(ctx):
+    input = ctx.method_args[0]
+    input_trt = add_missing_trt_tensors(ctx.network, [input])[0]
+    output = ctx.method_return
+
+    mode = get_arg(ctx, 'mode', pos=3, default='nearest')
+    align_corners = get_arg(ctx, 'align_corners', pos=4, default=None)
+
+    # currently only works for NCHW
+    size = list(output.shape[2:])
+
+    plugin = get_interpolate_plugin(size=size, mode=mode, align_corners=align_corners)
+
+    layer = ctx.network.add_plugin_v2([input_trt], plugin)
+
+    output._trt = layer.get_output(0)
+
+
+@tensorrt_converter('torch.nn.functional.interpolate', enabled=trt_version() >= '7.1')
+@tensorrt_converter('torch.nn.functional.upsample', enabled=trt_version() >= '7.1')
+def convert_interpolate_trt7(ctx):
+    # parse args
+    input = get_arg(ctx, 'input', pos=0, default=None)
+    size = get_arg(ctx, 'size', pos=1, default=None)
+    scale_factor = get_arg(ctx, 'scale_factor', pos=2, default=None)
+    mode = get_arg(ctx, 'mode', pos=3, default='nearest')
+    align_corners = get_arg(ctx, 'align_corners', pos=4, default=None)
+
+    input_dim = input.dim() - 2
+
+    input_trt = add_missing_trt_tensors(ctx.network, [input])[0]
+    output = ctx.method_return
+    layer = ctx.network.add_resize(input=input_trt)
+
+    shape = size
+    if shape is not None:
+        if isinstance(shape, collections.abc.Sequence):
+            shape = [input.size(1)] + list(shape)
+        else:
+            shape = [input.size(1)] + [shape] * input_dim
+
+        layer.shape = shape
+
+    scales = scale_factor
+    if scales is not None:
+        if not isinstance(scales, collections.abc.Sequence):
+            scales = [scales] * input_dim
+        layer.scales = [1] + list(scales)
+
+    resize_mode = mode
+    if resize_mode.lower() in ["linear", "bilinear", "trilinear"]:
+        layer.resize_mode = trt.ResizeMode.LINEAR
+    else:
+        layer.resize_mode = trt.ResizeMode.NEAREST
+
+    if align_corners is not None:
+        layer.align_corners = align_corners
+
+    output._trt = layer.get_output(0)
+
+
+class Interpolate(torch.nn.Module):
+    def __init__(self, size, mode, align_corners):
+        super(Interpolate, self).__init__()
+        self.size = size
+        self.mode = mode
+        self.align_corners = align_corners
+
+    def forward(self, x):
+        return F.interpolate(x, self.size, mode=self.mode, align_corners=self.align_corners)
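+
+
+# Example of the argument normalization above (hypothetical values): for an
+# NCHW input with input.size(1) == 4, size=3 becomes layer.shape == [4, 3, 3]
+# (channel count prepended, scalar size repeated per spatial dim), while
+# scale_factor=2 becomes layer.scales == [1, 2, 2] (channel dim unscaled).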
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 112, 112)], enabled=trt_version() < '7.1' and has_interpolate_plugin())
+def test_interpolate_nearest():
+    return Interpolate((224, 224), 'nearest', None)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 112, 112)], enabled=trt_version() < '7.1' and has_interpolate_plugin())
+def test_interpolate_bilinear():
+    return Interpolate((224, 224), 'bilinear', False)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 112, 112)], enabled=trt_version() < '7.1' and has_interpolate_plugin())
+def test_interpolate_bicubic():
+    return Interpolate((224, 224), 'bicubic', False)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 112, 112)], enabled=trt_version() < '7.1' and has_interpolate_plugin())
+def test_interpolate_area():
+    return Interpolate((56, 56), 'area', None)
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 112, 112)], enabled=trt_version() < '7.1' and has_interpolate_plugin())
+def test_upsample_scale_factor2():
+    return nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False)
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 2, 12, 12)], enabled=trt_version() >= '7.1')
+def test_nearest_mode():
+    return torch.nn.Upsample(scale_factor=2, mode="nearest")
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 4, 12, 12)], enabled=trt_version() >= '7.1')
+def test_bilinear_mode():
+    return torch.nn.Upsample(scale_factor=3, mode="bilinear", align_corners=False)
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 12, 12)], enabled=trt_version() >= '7.1')
+def test_align_corner():
+    return torch.nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True)
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 13, 13)], enabled=trt_version() >= '7.1')
+def test_bilinear_mode_odd_input_shape():
+    return torch.nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 4, 12, 12)], enabled=trt_version() >= '7.1')
+def test_size_parameter():
+    return torch.nn.Upsample(size=3, mode="nearest")
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 13, 13)], enabled=trt_version() >= '7.1')
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 1, 1)], enabled=trt_version() >= '7.1')
+def test_size_parameter_odd_input():
+    return torch.nn.Upsample(size=[6, 3], mode="nearest")
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 4, 6, 6, 6)], enabled=trt_version() >= '7.1')
+def test_nearest_mode_3d():
+    return torch.nn.Upsample(scale_factor=2, mode="nearest")
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 5, 5, 5)], enabled=trt_version() >= '7.1')
+def test_bilinear_mode_3d():
+    return torch.nn.Upsample(scale_factor=3, mode="trilinear", align_corners=False)
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 4, 8, 8, 8)], enabled=trt_version() >= '7.1')
+def test_align_corner_3d():
+    return torch.nn.Upsample(scale_factor=4, mode="trilinear", align_corners=True)
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 6, 7, 7, 7)], enabled=trt_version() >= '7.1')
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 2, 4, 4)], enabled=trt_version() >= '7.1')
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 1, 1, 1)], enabled=trt_version() >= '7.1')
+def test_bilinear_mode_odd_input_shape_3d():
+    return torch.nn.Upsample(scale_factor=2, mode="trilinear", align_corners=False)
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 1, 12, 12, 12)], enabled=trt_version() >= '7.1')
+def test_size_parameter_3d():
+    return torch.nn.Upsample(size=3, mode="trilinear", align_corners=True)
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 7, 9, 5)], enabled=trt_version() >= '7.1')
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 4, 3, 5, 1)], enabled=trt_version() >= '7.1')
+def test_size_parameter_odd_input_3d():
+    return torch.nn.Upsample(size=[11, 14, 17], mode="trilinear", align_corners=False)
diff --git a/torch2trt/converters/max.py b/torch2trt/converters/max.py
new file mode 100644
index 00000000..ac835638
--- /dev/null
+++ b/torch2trt/converters/max.py
@@ -0,0 +1,62 @@
+from torch2trt.torch2trt import *
+from torch2trt.module_test import add_module_test
+from .unary import UnaryModule
+
+
+def __convert_max_elementwise(ctx):
+    input_a = ctx.method_args[0]
+    input_b = ctx.method_args[1]
+    output = ctx.method_return
+    input_a_trt, input_b_trt = add_missing_trt_tensors(ctx.network, [input_a, input_b])
+    input_a_trt, input_b_trt = broadcast_trt_tensors(ctx.network, [input_a_trt, input_b_trt], len(output.shape) - 1)
+    layer = ctx.network.add_elementwise(input_a_trt, input_b_trt, trt.ElementWiseOperation.MAX)
+    output._trt = layer.get_output(0)
+
+
+def __convert_max_reduce(ctx):
+    input = ctx.method_args[0]
+    dim = get_arg(ctx, 'dim', pos=1, default=tuple(range(1, len(input.shape))))
+    keepdim = get_arg(ctx, 'keepdim', pos=2, default=False)
+    input_trt = add_missing_trt_tensors(ctx.network, [input])[0]
+    # torch.max(x) returns a bare tensor, torch.max(x, dim) a (values, indices) pair
+    if isinstance(ctx.method_return, torch.Tensor):
+        output_val = ctx.method_return
+    else:
+        output_val = ctx.method_return[0]
+    layer = ctx.network.add_reduce(input_trt, trt.ReduceOperation.MAX, torch_dim_to_trt_axes(dim), keepdim)
+    output_val._trt = layer.get_output(0)
+
+
+@tensorrt_converter('torch.max')
+@tensorrt_converter('torch.Tensor.max')
+def convert_max(ctx):
+    if len(ctx.method_args) > 1 and isinstance(ctx.method_args[1], torch.Tensor):
+        __convert_max_elementwise(ctx)
+    else:
+        __convert_max_reduce(ctx)
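+
+
+# Dispatch note: torch.max is overloaded. With a second tensor argument it is
+# an elementwise maximum (ElementWise MAX); with a dim argument (or none at
+# all) it is a reduction (Reduce MAX). For example, torch.max(x, y) takes the
+# elementwise path, while torch.max(x, 1) reduces over dim 1 -- and only the
+# values, not the indices, are wired into the TensorRT graph.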
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3)])
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)])
+def test_max_reduce_dim1():
+    return UnaryModule(lambda x: torch.max(x, 1)[0])
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)])
+def test_max_reduce_dim2():
+    return UnaryModule(lambda x: torch.max(x, 2)[0])
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3)])
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)])
+def test_max_reduce_dim1_keepdim():
+    return UnaryModule(lambda x: torch.max(x, 1, keepdim=True)[0])
+
+
+class MaxElementwise(torch.nn.Module):
+    def forward(self, x, y):
+        return torch.max(x, y)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3), (1, 3, 3)])
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3), (1,)])  # broadcast
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3), (1, 3, 3)])  # broadcast
+def test_max_elementwise():
+    return MaxElementwise()
diff --git a/torch2trt/converters/max_pool2d.py b/torch2trt/converters/max_pool2d.py
new file mode 100644
index 00000000..cc4fddeb
--- /dev/null
+++ b/torch2trt/converters/max_pool2d.py
@@ -0,0 +1,53 @@
+from torch2trt.torch2trt import *
+from torch2trt.module_test import add_module_test
+
+
+@tensorrt_converter('torch.nn.functional.max_pool2d')
+def convert_max_pool2d(ctx):
+    # parse args
+    input = get_arg(ctx, 'input', pos=0, default=None)
+    kernel_size = get_arg(ctx, 'kernel_size', pos=1, default=None)
+    stride = get_arg(ctx, 'stride', pos=2, default=None)
+    padding = get_arg(ctx, 'padding', pos=3, default=0)
+    dilation = get_arg(ctx, 'dilation', pos=4, default=1)
+    ceil_mode = get_arg(ctx, 'ceil_mode', pos=5, default=False)
+
+    # get input trt tensor (or create constant if it doesn't exist)
+    input_trt = add_missing_trt_tensors(ctx.network, [input])[0]
+
+    output = ctx.method_return
+
+    # get kernel size
+    if not isinstance(kernel_size, tuple):
+        kernel_size = (kernel_size, ) * 2
+
+    # get stride; an omitted stride defaults to the kernel size, as in PyTorch
+    if stride is None:
+        stride = kernel_size
+    if not isinstance(stride, tuple):
+        stride = (stride, ) * 2
+
+    # get padding
+    if not isinstance(padding, tuple):
+        padding = (padding, ) * 2
+
+    layer = ctx.network.add_pooling(
+        input=input_trt, type=trt.PoolingType.MAX, window_size=kernel_size)
+
+    layer.stride = stride
+    layer.padding = padding
+
+    if ceil_mode:
+        layer.padding_mode = trt.PaddingMode.EXPLICIT_ROUND_UP
+
+    output._trt = layer.get_output(0)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 4, 6)])
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 5, 7)])
+def test_MaxPool2d_without_ceil_mode():
+    return torch.nn.MaxPool2d(kernel_size=3, stride=2, padding=1, ceil_mode=False)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 4, 6)])
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 5, 7)])
+def test_MaxPool2d_with_ceil_mode():
+    return torch.nn.MaxPool2d(kernel_size=3, stride=2, padding=1, ceil_mode=True)
\ No newline at end of file
diff --git a/torch2trt/converters/mean.py b/torch2trt/converters/mean.py
new file mode 100644
index 00000000..c75689e4
--- /dev/null
+++ b/torch2trt/converters/mean.py
@@ -0,0 +1,68 @@
+from torch2trt.torch2trt import *
+from torch2trt.module_test import add_module_test
+
+
+@tensorrt_converter('torch.mean')
+@tensorrt_converter('torch.Tensor.mean')
+def convert_mean(ctx):
+    input = ctx.method_args[0]
+    input_trt = add_missing_trt_tensors(ctx.network, [input])[0]
+    output = ctx.method_return
+
+    # get dims from args or kwargs; default to all non-batch dimensions so a
+    # bare mean() call does not leave `dim` undefined
+    if 'dim' in ctx.method_kwargs:
+        dim = ctx.method_kwargs['dim']
+    elif len(ctx.method_args) >= 2:
+        dim = ctx.method_args[1]
+    else:
+        dim = tuple(range(1, len(input.shape)))
+
+    # convert list to tuple
+    if isinstance(dim, list):
+        dim = tuple(dim)
+
+    if not isinstance(dim, tuple):
+        dim = (dim, )
+
+    # create axes bitmask for reduce layer
+    axes = 0
+    for d in dim:
+        axes |= 1 << (d - 1)  # -1 to remove batch dimension
+
+    # get whether to keep dimensions
+    if 'keepdim' in ctx.method_kwargs:
+        keep_dims = ctx.method_kwargs['keepdim']
+    elif len(ctx.method_args) == 3:
+        keep_dims = ctx.method_args[2]
+    else:
+        keep_dims = False
+
+    layer = ctx.network.add_reduce(input_trt, trt.ReduceOperation.AVG, axes, keep_dims)
+    output._trt = layer.get_output(0)
+
+
+class Mean(torch.nn.Module):
+    def __init__(self, dim, keepdim):
+        super(Mean, self).__init__()
+        self.dim = dim
+        self.keepdim = keepdim
+
+    def forward(self, x):
+        return x.mean(self.dim, self.keepdim)
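+
+
+# The reduce axes form a bitmask over non-batch dimensions (bit 0 = dim 1).
+# Worked example: dim=(1, 2) on an NCHW tensor sets
+# axes = (1 << 0) | (1 << 1) = 0b11, i.e. reduce over C and H while keeping W.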
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3)])
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)])
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)])
+def test_mean_channel():
+    return Mean(1, False)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)])
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)])
+def test_mean_tuple():
+    return Mean((1, 2), False)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3)])
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)])
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)])
+def test_mean_keepdim():
+    return Mean(1, True)
\ No newline at end of file
diff --git a/torch2trt/converters/min.py b/torch2trt/converters/min.py
new file mode 100644
index 00000000..e8f891ad
--- /dev/null
+++ b/torch2trt/converters/min.py
@@ -0,0 +1,62 @@
+from torch2trt.torch2trt import *
+from torch2trt.module_test import add_module_test
+from .unary import UnaryModule
+
+
+def __convert_min_elementwise(ctx):
+    input_a = ctx.method_args[0]
+    input_b = ctx.method_args[1]
+    output = ctx.method_return
+    input_a_trt, input_b_trt = add_missing_trt_tensors(ctx.network, [input_a, input_b])
+    input_a_trt, input_b_trt = broadcast_trt_tensors(ctx.network, [input_a_trt, input_b_trt], len(output.shape) - 1)
+    layer = ctx.network.add_elementwise(input_a_trt, input_b_trt, trt.ElementWiseOperation.MIN)
+    output._trt = layer.get_output(0)
+
+
+def __convert_min_reduce(ctx):
+    input = ctx.method_args[0]
+    dim = get_arg(ctx, 'dim', pos=1, default=tuple(range(1, len(input.shape))))
+    keepdim = get_arg(ctx, 'keepdim', pos=2, default=False)
+    input_trt = add_missing_trt_tensors(ctx.network, [input])[0]
+    # torch.min(x) returns a bare tensor, torch.min(x, dim) a (values, indices) pair
+    if isinstance(ctx.method_return, torch.Tensor):
+        output_val = ctx.method_return
+    else:
+        output_val = ctx.method_return[0]
+    layer = ctx.network.add_reduce(input_trt, trt.ReduceOperation.MIN, torch_dim_to_trt_axes(dim), keepdim)
+    output_val._trt = layer.get_output(0)
+
+
+@tensorrt_converter('torch.min')
+@tensorrt_converter('torch.Tensor.min')
+def convert_min(ctx):
+    if len(ctx.method_args) > 1 and isinstance(ctx.method_args[1], torch.Tensor):
+        __convert_min_elementwise(ctx)
+    else:
+        __convert_min_reduce(ctx)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3)])
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)])
+def test_min_reduce_dim1():
+    return UnaryModule(lambda x: torch.min(x, 1)[0])
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)])
+def test_min_reduce_dim2():
+    return UnaryModule(lambda x: torch.min(x, 2)[0])
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3)])
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)])
+def test_min_reduce_dim1_keepdim():
+    return UnaryModule(lambda x: torch.min(x, 1, keepdim=True)[0])
+
+
+class MinElementwise(torch.nn.Module):
+    def forward(self, x, y):
+        return torch.min(x, y)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3), (1, 3, 3)])
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3), (1,)])  # broadcast
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3), (1, 3, 3)])  # broadcast
+def test_min_elementwise():
+    return MinElementwise()
diff --git a/torch2trt/converters/mod.py b/torch2trt/converters/mod.py
new file mode 100644
index 00000000..6cf69435
--- /dev/null
+++ b/torch2trt/converters/mod.py
@@ -0,0 +1,99 @@
+from torch2trt.torch2trt import *
+from torch2trt.module_test import add_module_test
+
+
+@tensorrt_converter('torch.fmod')
+def convert_fmod(ctx):
+    input_a = ctx.method_args[0]
+    input_b = ctx.method_args[1]
+    output = ctx.method_return
+    input_a_trt, input_b_trt = add_missing_trt_tensors(ctx.network, [input_a, input_b])
+    input_a_trt, input_b_trt = broadcast_trt_tensors(ctx.network, [input_a_trt, input_b_trt], len(output.shape) - 1)
+    # We cannot use ElementWiseOperation.FLOOR_DIV directly, because torch.fmod
+    # truncates negative results toward 0 while TensorRT's FLOOR_DIV rounds
+    # toward -Inf. Recover truncated division first:
+    #   sign = ab / |ab|
+    #   trunc(a / b) = sign * (|a| // |b|)
+    ab_layer = ctx.network.add_elementwise(input_a_trt, input_b_trt, trt.ElementWiseOperation.PROD)
+    abs_ab_layer = ctx.network.add_unary(ab_layer.get_output(0), trt.UnaryOperation.ABS)
+    sign_layer = ctx.network.add_elementwise(ab_layer.get_output(0), abs_ab_layer.get_output(0),
+                                             trt.ElementWiseOperation.DIV)
+    abs_a_layer = ctx.network.add_unary(input_a_trt, trt.UnaryOperation.ABS)
+    abs_b_layer = ctx.network.add_unary(input_b_trt, trt.UnaryOperation.ABS)
+    abs_floor_layer = ctx.network.add_elementwise(abs_a_layer.get_output(0), abs_b_layer.get_output(0),
+                                                  trt.ElementWiseOperation.FLOOR_DIV)
+    # a % b = a - trunc(a / b) * b
+    floordiv_layer = ctx.network.add_elementwise(sign_layer.get_output(0), abs_floor_layer.get_output(0),
+                                                 trt.ElementWiseOperation.PROD)
+    prod_layer = ctx.network.add_elementwise(floordiv_layer.get_output(0), input_b_trt, trt.ElementWiseOperation.PROD)
+    sub_layer = ctx.network.add_elementwise(input_a_trt, prod_layer.get_output(0), trt.ElementWiseOperation.SUB)
+    output._trt = sub_layer.get_output(0)
+
+
+# The `%` operator needs its own converter because torch.Tensor.__mod__,
+# unlike torch.fmod, floors toward -Inf (a report is filed at
+# https://github.com/pytorch/pytorch/issues/52425); flooring matches
+# TensorRT's FLOOR_DIV, so the plain identity below converts it exactly.
+@tensorrt_converter('torch.Tensor.__mod__')
+def convert_mod(ctx):
+    input_a = ctx.method_args[0]
+    input_b = ctx.method_args[1]
+    output = ctx.method_return
+    input_a_trt, input_b_trt = add_missing_trt_tensors(ctx.network, [input_a, input_b])
+    input_a_trt, input_b_trt = broadcast_trt_tensors(ctx.network, [input_a_trt, input_b_trt], len(output.shape) - 1)
+    # a % b = a - (a // b) * b
+    floordiv_layer = ctx.network.add_elementwise(input_a_trt, input_b_trt, trt.ElementWiseOperation.FLOOR_DIV)
+    prod_layer = ctx.network.add_elementwise(floordiv_layer.get_output(0), input_b_trt, trt.ElementWiseOperation.PROD)
+    mod_layer = ctx.network.add_elementwise(input_a_trt, prod_layer.get_output(0), trt.ElementWiseOperation.SUB)
+    output._trt = mod_layer.get_output(0)
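+
+
+# Worked example of the two remainder conventions, with a = -7 and b = 2:
+#   torch.fmod(-7, 2) -> -1    (trunc: -7 - trunc(-3.5) * 2 = -7 - (-3) * 2)
+#   (-7) % 2          ->  1    (floor: -7 - floor(-3.5) * 2 = -7 - (-4) * 2)
+# The converters above reproduce exactly these conventions in TensorRT.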
+
+
+class Mod(torch.nn.Module):
+    def __init__(self):
+        super(Mod, self).__init__()
+
+    def forward(self, x, y):
+        return x % y
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 40, 20), (1, 3, 1, 20)])
+def test_mod_op():
+    return Mod()
+
+
+class ModAssign(torch.nn.Module):
+    def __init__(self):
+        super(ModAssign, self).__init__()
+
+    def forward(self, x, y):
+        x %= y
+        return x
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 40, 20), (1, 3, 1, 20)])
+def test_mod_op_assign():
+    return ModAssign()
+
+
+class ModConst(torch.nn.Module):
+    def __init__(self):
+        super(ModConst, self).__init__()
+
+    def forward(self, x):
+        return x % 2.
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 40, 20)])
+def test_mod_op_const():
+    return ModConst()
+
+
+class TorchMod(torch.nn.Module):
+    def __init__(self):
+        super(TorchMod, self).__init__()
+
+    def forward(self, x, y):
+        return torch.fmod(x, y)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 40, 20), (1, 3, 40, 20)])
+def test_mod_func():
+    return TorchMod()
diff --git a/torch2trt/converters/mul.py b/torch2trt/converters/mul.py
new file mode 100644
index 00000000..eefd744c
--- /dev/null
+++ b/torch2trt/converters/mul.py
@@ -0,0 +1,108 @@
+from torch2trt.torch2trt import *
+from torch2trt.module_test import add_module_test
+
+
+@tensorrt_converter('torch.mul')
+@tensorrt_converter('torch.Tensor.__imul__')
+@tensorrt_converter('torch.Tensor.__mul__')
+@tensorrt_converter('torch.Tensor.__rmul__')
+def convert_mul(ctx):
+    input_a = ctx.method_args[0]
+    input_b = ctx.method_args[1]
+    output = ctx.method_return
+    input_a_trt, input_b_trt = add_missing_trt_tensors(ctx.network, [input_a, input_b])
+    input_a_trt, input_b_trt = broadcast_trt_tensors(ctx.network, [input_a_trt, input_b_trt], len(output.shape) - 1)
+    layer = ctx.network.add_elementwise(input_a_trt, input_b_trt, trt.ElementWiseOperation.PROD)
+    output._trt = layer.get_output(0)
+
+class Mul(torch.nn.Module):
+    def __init__(self):
+        super(Mul, self).__init__()
+
+    def forward(self, x, y):
+        return x * y
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224), (1, 3, 224, 224)])
+def test_mul_basic():
+    return Mul()
+
+
+class IMul(torch.nn.Module):
+    def __init__(self):
+        super(IMul, self).__init__()
+
+    def forward(self, x, y):
+        x *= y
+        return x
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224), (1, 3, 224, 224)])
+def test_mul_imul():
+    return IMul()
+
+
+class TorchMul(torch.nn.Module):
+    def __init__(self):
+        super(TorchMul, self).__init__()
+
+    def forward(self, x, y):
+        return torch.mul(x, y)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224), (1, 3, 224, 224)])
+def test_mul_torchmul():
+    return TorchMul()
+
+
+class RMulInt(torch.nn.Module):
+    def __init__(self):
+        super(RMulInt, self).__init__()
+
+    def forward(self, x):
+        return 10 * x
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)])
+def test_rmul_int():
+    return RMulInt()
+
+
+class RMulFloat(torch.nn.Module):
+    def __init__(self):
+        super(RMulFloat, self).__init__()
+
+    def forward(self, x):
+        return 10.0 * x
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)])
+def test_rmul_float():
+    return RMulFloat()
+
+
+class MulConstantNoBatch(torch.nn.Module):
+    def __init__(self):
+        super(MulConstantNoBatch, self).__init__()
+        self.register_buffer('y', torch.ones((3, 10, 10)))
+
+    def forward(self, x):
+        return x * self.y
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 10, 10)])
+def test_mul_constant_nobatch():
+    return MulConstantNoBatch()
+
+
+class MulConstantBatch(torch.nn.Module):
+    def __init__(self):
+        super(MulConstantBatch, self).__init__()
+        self.register_buffer('y', torch.ones((1, 3, 10, 10)))
+
+    def forward(self, x):
+        return x * self.y
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 10, 10)])
+def test_mul_constant_batch():
+    return MulConstantBatch()
diff --git a/torch2trt/converters/narrow.py b/torch2trt/converters/narrow.py
new file mode 100644
index 00000000..dcf1d696
--- /dev/null
+++ b/torch2trt/converters/narrow.py
@@ -0,0 +1,40 @@
+import tensorrt as trt
+from torch2trt.torch2trt import *
+from torch2trt.module_test import add_module_test
+
+@tensorrt_converter('torch.Tensor.narrow')
+@tensorrt_converter('torch.narrow')
+def convert_narrow(ctx):
+    inputs = get_arg(ctx, 'input', pos=0, default=None)
+    dim = get_arg(ctx, 'dim', pos=1, default=0)
+    output = ctx.method_return
+    shape = list(inputs.shape)
+    if dim < 0:
+        dim = len(shape) + dim
+    start = [0] * len(shape)
+    stride = [1] * len(shape)
+    start[dim] = get_arg(ctx, 'start', pos=2, default=None)
+    shape[dim] = get_arg(ctx, 'length', pos=3, default=None)
+    # the slice layer does not consider the batch dimension
+    input_trt = add_missing_trt_tensors(ctx.network, [inputs])[0]
+    layer = ctx.network.add_slice(input=input_trt, start=start[1:], shape=shape[1:], stride=stride[1:])
+    output._trt = layer.get_output(0)
+
+class Narrow(torch.nn.Module):
+    def __init__(self, dim, start, length):
+        super(Narrow, self).__init__()
+        self.dim = dim
+        self.start = start
+        self.length = length
+
+    def forward(self, x):
+        return torch.narrow(x, self.dim, self.start, self.length)
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224)])
+def test_narrow1():
+    return Narrow(1, 0, 2)
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224)])
+def test_narrow2():
+    return Narrow(2, 0, 50)
+
+
diff --git a/torch2trt/converters/ne.py b/torch2trt/converters/ne.py
new file mode 100644
index 00000000..c28f16e7
--- /dev/null
+++ b/torch2trt/converters/ne.py
@@ -0,0 +1,54 @@
+from torch2trt.torch2trt import *
+from torch2trt.module_test import add_module_test
+
+
+@tensorrt_converter('torch.ne')
+@tensorrt_converter('torch.Tensor.__ne__')
+def convert_ne(ctx):
+    input_a = ctx.method_args[0]
+    input_b = ctx.method_args[1]
+    output = ctx.method_return
+    input_a_trt, input_b_trt = add_missing_trt_tensors(ctx.network, [input_a, input_b])
+    input_a_trt, input_b_trt = broadcast_trt_tensors(ctx.network, [input_a_trt, input_b_trt], len(output.shape) - 1)
+    # a != b is computed as NOT(a == b)
+    layer_1 = ctx.network.add_elementwise(input_a_trt, input_b_trt, trt.ElementWiseOperation.EQUAL)
+    layer_2 = ctx.network.add_unary(layer_1.get_output(0), trt.UnaryOperation.NOT)
+    output._trt = layer_2.get_output(0)
+
+
+class NotEqual(torch.nn.Module):
+    def __init__(self):
+        super(NotEqual, self).__init__()
+
+    def forward(self, x, y):
+        return x != y
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 40, 20), (1, 3, 1, 20)])
+def test_ne_op():
+    return NotEqual()
+
+
+class NotEqualConst(torch.nn.Module):
+    def __init__(self):
+        super(NotEqualConst, self).__init__()
+
+    def forward(self, x):
+        return x != 13.62
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 40, 20)])
+def test_ne_op_const():
+    return NotEqualConst()
+
+
+class TorchNotEqual(torch.nn.Module):
+    def __init__(self):
+        super(TorchNotEqual, self).__init__()
+
+    def forward(self, x, y):
+        return torch.ne(x, y)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 40, 20), (1, 3, 1, 20)])
+def test_ne_torch():
+    return TorchNotEqual()
diff --git a/torch2trt/converters/normalize.py b/torch2trt/converters/normalize.py
new file mode 100644
index 00000000..9f639dbf
--- /dev/null
+++ b/torch2trt/converters/normalize.py
@@ -0,0 +1,67 @@
+from torch2trt.torch2trt import *
+from torch2trt.module_test import add_module_test
+
+
+@tensorrt_converter('torch.nn.functional.normalize')
+def convert_normalize(ctx):
+    # get args
+    input = get_arg(ctx, 'input', pos=0, default=None)
+    p = get_arg(ctx, 'p', pos=1, default=2)
+    dim = get_arg(ctx, 'dim', pos=2, default=1)
+    eps = get_arg(ctx, 'eps', pos=3, default=1e-12)
+
+    output = ctx.method_return
+
+    # add broadcastable scalar constants to network
+    input_trt, eps_trt, p_trt, p_inv_trt = add_missing_trt_tensors(ctx.network, [input, eps, p, 1.0 / p])
+    input_trt, eps_trt, p_trt, p_inv_trt = broadcast_trt_tensors(ctx.network, [input_trt, eps_trt, p_trt, p_inv_trt], len(input_trt.shape))
+
+    # compute norm = sum(abs(x)**p, dim=dim)**(1./p)
+    norm = ctx.network.add_unary(input_trt, trt.UnaryOperation.ABS).get_output(0)
+    norm = ctx.network.add_elementwise(norm, p_trt, trt.ElementWiseOperation.POW).get_output(0)
+    norm = ctx.network.add_reduce(norm, trt.ReduceOperation.SUM, torch_dim_to_trt_axes(dim), keep_dims=True).get_output(0)
+    norm = ctx.network.add_elementwise(norm, p_inv_trt, trt.ElementWiseOperation.POW).get_output(0)
+
+    # clamp norm = max(norm, eps)
+    norm = ctx.network.add_elementwise(norm, eps_trt, trt.ElementWiseOperation.MAX).get_output(0)
+
+    # divide input by norm
+    output._trt = ctx.network.add_elementwise(input_trt, norm, trt.ElementWiseOperation.DIV).get_output(0)
+
+
+class Normalize(torch.nn.Module):
+    def __init__(self, *args, **kwargs):
+        super(Normalize, self).__init__()
+        self.args = args
+        self.kwargs = kwargs
+
+    def forward(self, x):
+        return torch.nn.functional.normalize(x, *self.args, **self.kwargs)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3)])
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)])
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)])
+def test_normalize_basic():
+    return Normalize()
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3)])
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)])
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)])
+def test_normalize_l1_basic():
+    return Normalize(p=1.0)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3)])
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)])
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)])
+def test_normalize_l1p5_basic():
+    return Normalize(p=1.5)
+
+
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)])
+@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)])
+def test_normalize_l2_height():
+    return Normalize(p=2.0, dim=2)
\ No newline at end of file
diff --git a/torch2trt/converters/pad.py b/torch2trt/converters/pad.py
new file mode 100644
index 00000000..7fa9ba88
--- /dev/null
+++ b/torch2trt/converters/pad.py
@@ -0,0 +1,33 @@
+from torch2trt.torch2trt import *
+from torch2trt.module_test import add_module_test
+
+
+@tensorrt_converter('torch.nn.functional.pad')
+def convert_pad(ctx):
+    input = ctx.method_args[0]
+    input_trt = add_missing_trt_tensors(ctx.network, [input])[0]
+    output = ctx.method_return
+
+    pad = ctx.method_args[1]
+    pre_padding = (pad[2], pad[0])
+    post_padding = (pad[3], pad[1])
+
+    # mode / value are ignored since they are not supported by TensorRT
+    layer = ctx.network.add_padding(input_trt, pre_padding, post_padding)
+    output._trt = layer.get_output(0)
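+
+
+# F.pad takes its 2D padding as (left, right, top, bottom), while TensorRT's
+# add_padding takes (top, left) / (bottom, right) pairs; the converter above
+# reorders accordingly. Worked example: pad=(1, 2, 3, 4) becomes
+# pre_padding=(3, 1) and post_padding=(4, 2).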
file mode 100644 index 00000000..dc2355d3 --- /dev/null +++ b/torch2trt/converters/permute.py @@ -0,0 +1,58 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + +@tensorrt_converter('torch.Tensor.permute') +def convert_permute(ctx): + input = ctx.method_args[0] + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + output = ctx.method_return + + # permutation -1 because TRT does not include batch dim + if isinstance(ctx.method_args[1], int): + permutation = tuple(ctx.method_args[1:]) # handle permute(a, b, c) + else: + permutation = tuple(ctx.method_args[1]) # handle permute([a, b, c]) + + assert(permutation[0] == 0) # cannot move batch dim + + trt_permutation = tuple([p - 1 for p in permutation])[1:] + + layer = ctx.network.add_shuffle(input_trt) + layer.second_transpose = tuple(trt_permutation) + + output._trt = layer.get_output(0) + + +class Permute(torch.nn.Module): + def __init__(self, *args): + super(Permute, self).__init__() + self.args = args + def forward(self, x): + return x.permute(*self.args).contiguous() + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 4, 5)]) +def test_permute_2d_0123(): + return Permute(0, 1, 2, 3) + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 4, 5)]) +def test_permute_2d_0312(): + return Permute(0, 3, 1, 2) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 4, 5, 6)]) +def test_permute_3d_01234(): + return Permute(0, 1, 2, 3, 4) + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 4, 5, 6)]) +def test_permute_3d_04132(): + return Permute(0, 4, 1, 3, 2) + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 4, 5, 6)]) +def test_permute_list(): + return Permute([0, 4, 1, 3, 2]) + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 4, 5, 6)]) +def test_permute_tuple(): + return Permute((0, 4, 1, 3, 2)) \ No newline at end of file diff --git a/torch2trt/converters/pow.py b/torch2trt/converters/pow.py new file mode 100644 index 00000000..357cdb77 --- /dev/null +++ b/torch2trt/converters/pow.py @@ -0,0 +1,92 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + +@tensorrt_converter('torch.pow') +@tensorrt_converter('torch.Tensor.__ipow__') +@tensorrt_converter('torch.Tensor.__pow__') +def convert_pow(ctx): + input_a = ctx.method_args[0] + input_b = ctx.method_args[1] + output = ctx.method_return + input_a_trt, input_b_trt = add_missing_trt_tensors(ctx.network, [input_a, input_b]) + input_a_trt, input_b_trt = broadcast_trt_tensors(ctx.network, [input_a_trt, input_b_trt], len(output.shape) - 1) + layer = ctx.network.add_elementwise(input_a_trt, input_b_trt, trt.ElementWiseOperation.POW) + output._trt = layer.get_output(0) + + +@tensorrt_converter('torch.Tensor.__rpow__') +def convert_pow(ctx): + input_a = ctx.method_args[1] + input_b = ctx.method_args[0] # flipped for rpow + output = ctx.method_return + input_a_trt, input_b_trt = add_missing_trt_tensors(ctx.network, [input_a, input_b]) + input_a_trt, input_b_trt = broadcast_trt_tensors(ctx.network, [input_a_trt, input_b_trt], len(output.shape) - 1) + layer = ctx.network.add_elementwise(input_a_trt, input_b_trt, trt.ElementWiseOperation.POW) + output._trt = layer.get_output(0) + + +class Pow(torch.nn.Module): + def __init__(self): + super(Pow, self).__init__() + + def forward(self, x, y): + return x ** y + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224), (1, 3, 224, 224)]) +def test_pow_basic(): + return Pow() + + +# 
__ipow__ not yet impl in torch +# class IPow(torch.nn.Module): +# def __init__(self): +# super(IPow, self).__init__() + +# def forward(self, x, y): +# x **= y +# return x + + +# @add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224), (1, 3, 224, 224)]) +# def test_pow_ipow(): +# return IPow() + + +class TorchPow(torch.nn.Module): + def __init__(self): + super(TorchPow, self).__init__() + + def forward(self, x, y): + return torch.pow(x, y) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224), (1, 3, 224, 224)]) +def test_torch_pow(): + return TorchPow() + + +class RpowInt(torch.nn.Module): + def __init__(self): + super(RpowInt, self).__init__() + + def forward(self, x): + return 2 ** x + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224)]) +def test_rpow_int(): + return RpowInt() + + +class RpowFloat(torch.nn.Module): + def __init__(self): + super(RpowFloat, self).__init__() + + def forward(self, x): + return 2.0 ** x + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224)]) +def test_rpow_float(): + return RpowFloat() diff --git a/torch2trt/converters/prelu.py b/torch2trt/converters/prelu.py new file mode 100644 index 00000000..256ae13c --- /dev/null +++ b/torch2trt/converters/prelu.py @@ -0,0 +1,47 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + +@tensorrt_converter('torch.nn.functional.prelu') +def convert_prelu(ctx): + input = get_arg(ctx, 'input', pos=0, default=None) + weight = get_arg(ctx, 'weight', pos=1, default=None) + output = ctx.method_return + + weight_shape = [1] * (len(input.shape) - 1) + weight_shape[0] = weight.numel() + + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + + + # y = prelu(x) = relu(x) - alpha * relu(-x) + weight_trt = ctx.network.add_constant(weight_shape, -weight.detach().view(weight_shape).cpu().numpy()).get_output(0) # detach so considered leaf + + # x >= 0 + a = ctx.network.add_activation(input_trt, trt.ActivationType.RELU).get_output(0) + + # x <= 0 + b = ctx.network.add_unary(input_trt, trt.UnaryOperation.NEG).get_output(0) + b = ctx.network.add_activation(b, trt.ActivationType.RELU).get_output(0) + b = ctx.network.add_elementwise(b, weight_trt, trt.ElementWiseOperation.PROD).get_output(0) + + # y = a + b + y = ctx.network.add_elementwise(a, b, trt.ElementWiseOperation.SUM) + + output._trt = y.get_output(0) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 5)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 3)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 3, 3)]) +def test_prelu_scalar(): + return torch.nn.PReLU() + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 5)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 3)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 3, 3)]) +def test_prelu_vector(): + m = torch.nn.PReLU(5) + m.weight = torch.nn.Parameter(torch.randn(5)) # randn so each channel different + return m \ No newline at end of file diff --git a/torch2trt/converters/prod.py b/torch2trt/converters/prod.py new file mode 100644 index 00000000..fda1d3e6 --- /dev/null +++ b/torch2trt/converters/prod.py @@ -0,0 +1,38 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test +from .unary import UnaryModule + + +@tensorrt_converter('torch.prod') +@tensorrt_converter('torch.Tensor.prod') +def convert_prod(ctx): + input = ctx.method_args[0] + dim = get_arg(ctx, 'dim', pos=1, 
default=tuple(range(1, len(input.shape)))) + keepdim = get_arg(ctx, 'keepdim', pos=2, default=False) + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + output = ctx.method_return + layer = ctx.network.add_reduce(input_trt, trt.ReduceOperation.PROD, torch_dim_to_trt_axes(dim), keepdim) + output._trt = layer.get_output(0) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)]) +def test_prod_reduce_all(): + return UnaryModule(lambda x: torch.prod(x)) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)]) +def test_prod_reduce_dim1(): + return UnaryModule(lambda x: torch.prod(x, 1)) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)]) +def test_prod_reduce_dim22(): + return UnaryModule(lambda x: torch.prod(x, 2)) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)]) +def test_prod_reduce_dim1_keepdim(): + return UnaryModule(lambda x: torch.prod(x, 1, keepdim=True)) diff --git a/torch2trt/converters/relu.py b/torch2trt/converters/relu.py new file mode 100644 index 00000000..986c1f30 --- /dev/null +++ b/torch2trt/converters/relu.py @@ -0,0 +1,49 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + +@tensorrt_converter('torch.relu') +@tensorrt_converter('torch.relu_') +@tensorrt_converter('torch.nn.functional.relu') +@tensorrt_converter('torch.nn.functional.relu_') +@tensorrt_converter('torch.Tensor.relu') +def convert_functional_relu(ctx): + ctx.method_args = (torch.nn.ReLU(),) + ctx.method_args + convert_relu(ctx) + + +@tensorrt_converter('torch.nn.ReLU.forward') +def convert_relu(ctx): + input = ctx.method_args[1] + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + output = ctx.method_return + layer = ctx.network.add_activation( + input=input_trt, type=trt.ActivationType.RELU) + output._trt = layer.get_output(0) + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 4, 5)]) +def test_relu_basic(): + return torch.nn.ReLU() + + +class FunctionalRelu(torch.nn.Module): + def forward(self, x): + return torch.nn.functional.relu(x) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 4, 5)]) +def test_functional_relu_basic(): + return FunctionalRelu() + + +class TensorRelu(torch.nn.Module): + def __init__(self): + super(TensorRelu, self).__init__() + + def forward(self, x): + return x.relu() + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 40, 20)]) +def test_tensor_relu(): + return TensorRelu() diff --git a/torch2trt/converters/relu6.py b/torch2trt/converters/relu6.py new file mode 100644 index 00000000..0d809aa4 --- /dev/null +++ b/torch2trt/converters/relu6.py @@ -0,0 +1,40 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + +@tensorrt_converter('torch.nn.functional.relu6') +def convert_functional_relu6(ctx): + ctx.method_args = (torch.nn.ReLU6(),) + ctx.method_args + convert_relu6(ctx) + + +@tensorrt_converter('torch.nn.ReLU6.forward') +def convert_relu6(ctx): + input = ctx.method_args[1] + output = ctx.method_return + + input_a_trt, input_b_trt = add_missing_trt_tensors(ctx.network, [input, 6]) + input_a_trt, input_b_trt = broadcast_trt_tensors(ctx.network, [input_a_trt, input_b_trt], len(output.shape) - 1) + + layer = ctx.network.add_activation( + input=input_a_trt, 
type=trt.ActivationType.RELU) + layer = ctx.network.add_elementwise( + layer.get_output(0), input_b_trt, trt.ElementWiseOperation.MIN) + + output._trt = layer.get_output(0) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 4, 5)]) +def test_relu6_basic(): + return torch.nn.ReLU6() + + +class FunctionalRelu6(torch.nn.Module): + def forward(self, x): + return torch.nn.functional.relu6(x) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 4, 5)]) +def test_functional_relu6_basic(): + return FunctionalRelu6() + diff --git a/torch2trt/converters/sigmoid.py b/torch2trt/converters/sigmoid.py new file mode 100644 index 00000000..59639475 --- /dev/null +++ b/torch2trt/converters/sigmoid.py @@ -0,0 +1,32 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + +@tensorrt_converter('torch.nn.functional.sigmoid') +@tensorrt_converter('torch.sigmoid') +@tensorrt_converter('torch.Tensor.sigmoid') +def convert_sigmoid(ctx): + input = ctx.method_args[0] + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + output = ctx.method_return + + layer = ctx.network.add_activation(input_trt, trt.ActivationType.SIGMOID) + output._trt = layer.get_output(0) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)]) +def test_sigmoid_basic(): + return torch.nn.Sigmoid() + + +class TensorSigmoid(torch.nn.Module): + def __init__(self): + super(TensorSigmoid, self).__init__() + + def forward(self, x): + return x.sigmoid() + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 40, 20)]) +def test_tensor_sigmoid(): + return TensorSigmoid() diff --git a/torch2trt/converters/softmax.py b/torch2trt/converters/softmax.py new file mode 100644 index 00000000..39cd627d --- /dev/null +++ b/torch2trt/converters/softmax.py @@ -0,0 +1,50 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + +@tensorrt_converter('torch.nn.functional.softmax') +def convert_softmax(ctx): + input = ctx.method_args[0] + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + output = ctx.method_return + + # get dim from args or kwargs + if 'dim' in ctx.method_kwargs: + dim = ctx.method_kwargs['dim'] + elif len(ctx.method_args) >= 2: + dim = ctx.method_args[1] + + # convert negative dims + if dim < 0: + dim = len(input.shape) + dim + + axes = 1 << (dim - 1) + + layer = ctx.network.add_softmax(input=input_trt) + layer.axes = axes + + output._trt = layer.get_output(0) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)]) +def test_softmax_module(): + return torch.nn.Softmax(1) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)]) +def test_softmax_module_dim2(): + return torch.nn.Softmax(2) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)]) +def test_softmax_module_neg1(): + return torch.nn.Softmax(-1) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)]) +def test_softmax_module_dim_neg2(): + return torch.nn.Softmax(-2)
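To make the axis arithmetic in convert_softmax concrete: with implicit-batch TensorRT the batch dimension is not part of the network tensor, so torch dimension d maps to TRT axis d - 1, and the layer's axes field is a bitmask over TRT axes. A small illustrative check:

# torch dim -> TRT softmax axes bitmask (implicit batch)
for dim, ndim in [(1, 4), (-1, 4), (2, 3)]:
    if dim < 0:
        dim = ndim + dim          # same negative-dim handling as the converter
    print(dim, 1 << (dim - 1))    # prints (1, 1), (3, 4), (2, 2)

diff --git a/torch2trt/converters/split.py b/torch2trt/converters/split.py new file mode 100644 index 00000000..f3ee135c --- /dev/null +++ b/torch2trt/converters/split.py @@ -0,0 +1,80 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + +@tensorrt_converter('torch.split')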
+@tensorrt_converter('torch.Tensor.split') +def convert_split(ctx): + input = get_arg(ctx, 'input', 0, None) + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + # we don't need to parse split/chunk (arg 1) + # since we infer size from output tensors + dim = get_arg(ctx, 'dim', 2, 0) + + outputs = ctx.method_return + + assert(dim >= 1) + + start = [0] * len(input.shape[1:]) # exclude batch + stride = [1] * len(start) + offset = 0 + trt_dim = dim - 1 + + # add slice layers + for i, output in enumerate(outputs): + shape = list(output.shape[1:]) # exclude batch dim + start[trt_dim] = offset + layer = ctx.network.add_slice(input_trt, start=start, shape=shape, stride=stride) + output._trt = layer.get_output(0) + offset = offset + shape[trt_dim] + + +class TorchSplit(torch.nn.Module): + + def __init__(self, *args, **kwargs): + super(TorchSplit, self).__init__() + self.args = args + self.kwargs = kwargs + + def forward(self, x): + return torch.split(x, *self.args, **self.kwargs) + + +class TensorSplit(torch.nn.Module): + + def __init__(self, *args, **kwargs): + super(TensorSplit, self).__init__() + self.args = args + self.kwargs = kwargs + + def forward(self, x): + return x.split(*self.args, **self.kwargs) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)]) +def test_torch_split_1_1(): + return TorchSplit(1, 1) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)]) +def test_torch_split_2_1(): + return TorchSplit(2, 1) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)]) +def test_torch_split_3_1(): + return TorchSplit(3, 1) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)]) +def test_torch_split_3_2(): + return TorchSplit(3, 2) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)]) +def test_tensor_split_3_2(): + return TensorSplit(3, 2) \ No newline at end of file diff --git a/torch2trt/converters/stack.py b/torch2trt/converters/stack.py new file mode 100644 index 00000000..b86e909a --- /dev/null +++ b/torch2trt/converters/stack.py @@ -0,0 +1,40 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + +def unsqueeze(ctx, input, dim): + layer = ctx.network.add_shuffle(trt_(ctx.network, input)) + + shape = input.shape[1:dim] + (1,) + input.shape[dim:] + layer.reshape_dims = tuple(shape) + + return layer.get_output(0) + + +@tensorrt_converter('torch.stack', enabled=trt_version() >= '7.0') +def convert_cat_trt7(ctx): + inputs = get_arg(ctx, 'input', pos=0, default=None) + dim = get_arg(ctx, 'dim', pos=1, default=0) + + output = ctx.method_return + trt_inputs = [unsqueeze(ctx, i, dim) for i in inputs] + + layer = ctx.network.add_concatenation(inputs=trt_inputs) + layer.axis = dim - 1 + output._trt = layer.get_output(0) + +class Stack(torch.nn.Module): + def __init__(self, dim): + super(Stack, self).__init__() + self.dim = dim + + def forward(self, *x): + return torch.stack(x, dim=self.dim) + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 4, 4), (1, 4, 4), (1, 4, 4)], enabled=trt_version() >= '7.0') +def test_Stack_basic_trt7(): + return Stack(3) + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 4, 4), (1, 4, 4), (1, 4, 4)], enabled=trt_version() >= 
'7.0') +def test_Stack_basic2_trt7(): + return Stack(1) diff --git a/torch2trt/converters/sub.py b/torch2trt/converters/sub.py new file mode 100644 index 00000000..1ae2a124 --- /dev/null +++ b/torch2trt/converters/sub.py @@ -0,0 +1,118 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + +@tensorrt_converter('torch.sub') +@tensorrt_converter('torch.Tensor.__isub__') +@tensorrt_converter('torch.Tensor.__sub__') +def convert_sub(ctx): + input_a = ctx.method_args[0] + input_b = ctx.method_args[1] + output = ctx.method_return + input_a_trt, input_b_trt = add_missing_trt_tensors(ctx.network, [input_a, input_b]) + input_a_trt, input_b_trt = broadcast_trt_tensors(ctx.network, [input_a_trt, input_b_trt], len(output.shape) - 1) + layer = ctx.network.add_elementwise(input_a_trt, input_b_trt, trt.ElementWiseOperation.SUB) + output._trt = layer.get_output(0) + + +@tensorrt_converter('torch.Tensor.__rsub__') +def convert_sub(ctx): + input_a = ctx.method_args[1] + input_b = ctx.method_args[0] # flipped for rsub + output = ctx.method_return + input_a_trt, input_b_trt = add_missing_trt_tensors(ctx.network, [input_a, input_b]) + input_a_trt, input_b_trt = broadcast_trt_tensors(ctx.network, [input_a_trt, input_b_trt], len(output.shape) - 1) + layer = ctx.network.add_elementwise(input_a_trt, input_b_trt, trt.ElementWiseOperation.SUB) + output._trt = layer.get_output(0) + + +class Sub(torch.nn.Module): + def __init__(self): + super(Sub, self).__init__() + + def forward(self, x, y): + return x - y + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224), (1, 3, 224, 224)]) +def test_sub_basic(): + return Sub() + + +class ISub(torch.nn.Module): + def __init__(self): + super(ISub, self).__init__() + + def forward(self, x, y): + x -= y + return x + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224), (1, 3, 224, 224)]) +def test_sub_isub(): + return ISub() + + +class TorchSub(torch.nn.Module): + def __init__(self): + super(TorchSub, self).__init__() + + def forward(self, x, y): + return torch.sub(x, y) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224), (1, 3, 224, 224)]) +def test_torch_sub(): + return TorchSub() + + +class RSubInt(torch.nn.Module): + def __init__(self): + super(RSubInt, self).__init__() + + def forward(self, x): + return 1 - x + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224)]) +def test_rsub_int(): + return RSubInt() + + +class RSubFloat(torch.nn.Module): + def __init__(self): + super(RSubFloat, self).__init__() + + def forward(self, x): + return 1.0 - x + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 224, 224)]) +def test_rsub_float(): + return RSubFloat() + +class SubConstantNoBatch(torch.nn.Module): + def __init__(self): + super(SubConstantNoBatch, self).__init__() + self.register_buffer('y', torch.ones((3, 10, 10))) + + def forward(self, x): + return x - self.y + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 10, 10)]) +def test_sub_constant_nobatch(): + return SubConstantNoBatch() + + +class SubConstantBatch(torch.nn.Module): + def __init__(self): + super(SubConstantBatch, self).__init__() + self.register_buffer('y', torch.ones((1, 3, 10, 10))) + + def forward(self, x): + return x - self.y + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 10, 10)]) +def test_sub_constant_batch(): + return SubConstantBatch() diff --git a/torch2trt/converters/sum.py b/torch2trt/converters/sum.py new file mode 100644 index 
00000000..272cd433 --- /dev/null +++ b/torch2trt/converters/sum.py @@ -0,0 +1,52 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test +from .unary import UnaryModule +from torch import nn + +@tensorrt_converter('torch.sum') +@tensorrt_converter('torch.Tensor.sum') +def convert_sum(ctx): + input = ctx.method_args[0] + dim = get_arg(ctx, 'dim', pos=1, default=tuple(range(1, len(input.shape)))) + keepdim = get_arg(ctx, 'keepdim', pos=2, default=False) + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + output = ctx.method_return + layer = ctx.network.add_reduce(input_trt, trt.ReduceOperation.SUM, torch_dim_to_trt_axes(dim), keepdim) + output._trt = layer.get_output(0) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)]) +def test_sum_reduce_all(): + return UnaryModule(lambda x: torch.sum(x)) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)]) +def test_sum_reduce_dim1(): + return UnaryModule(lambda x: torch.sum(x, 1)) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)]) +def test_sum_reduce_dim22(): + return UnaryModule(lambda x: torch.sum(x, 2)) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)]) +def test_sum_reduce_dim1_keepdim(): + return UnaryModule(lambda x: torch.sum(x, 1, keepdim=True)) + + +class DisparityRegression(nn.Module): + def __init__(self, maxdisp): + super(DisparityRegression, self).__init__() + self.register_buffer('disp', torch.arange(maxdisp, dtype=torch.float32).view(maxdisp, 1, 1)) + + def forward(self, x): + return torch.sum(x * self.disp, 1) + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 3, 3)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 10, 23, 23)]) +def test_disparity_reg(): + return DisparityRegression(10) diff --git a/torch2trt/converters/tanh.py b/torch2trt/converters/tanh.py new file mode 100644 index 00000000..00ca37b1 --- /dev/null +++ b/torch2trt/converters/tanh.py @@ -0,0 +1,18 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + +@tensorrt_converter('torch.nn.functional.tanh') +@tensorrt_converter('torch.tanh') +def convert_tanh(ctx): + input = ctx.method_args[0] + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + output = ctx.method_return + + layer = ctx.network.add_activation(input_trt, trt.ActivationType.TANH) + output._trt = layer.get_output(0) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)]) +def test_tanh_basic(): + return torch.nn.Tanh() \ No newline at end of file
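convert_sum above, like convert_prod and the other reduce converters, maps dim through torch_dim_to_trt_axes. The real helper lives in torch2trt.torch2trt; the following reimplementation is only a plausible sketch of its implicit-batch behavior, for orientation:

def torch_dim_to_trt_axes_sketch(dim):
    # torch dim d (batch dim 0 excluded) becomes bit (d - 1) of the reduce-axes mask
    if not isinstance(dim, tuple):
        dim = (dim,)
    axes = 0
    for d in dim:
        axes |= 1 << (d - 1)
    return axes

assert torch_dim_to_trt_axes_sketch(1) == 1       # reduce over torch dim 1
assert torch_dim_to_trt_axes_sketch((1, 2)) == 3  # dims 1 and 2 -> bits 0 and 1

diff --git a/torch2trt/converters/tensor.py b/torch2trt/converters/tensor.py new file mode 100644 index 00000000..15406ac7 --- /dev/null +++ b/torch2trt/converters/tensor.py @@ -0,0 +1,22 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + +@tensorrt_converter('torch.tensor') +def convert_tensor(ctx): + output = ctx.method_return + layer = ctx.network.add_constant(tuple(output.shape), output.detach().cpu().numpy()) + output._trt = layer.get_output(0) + + +class TorchTensor(torch.nn.Module): + def __init__(self): + super(TorchTensor, self).__init__() + + def forward(self, x): + return x + torch.tensor([[1., 2., 3.], [4., 5., 6.]], device=torch.device('cuda')) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 2, 3)])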
+def test_tensor_creation(): + return TorchTensor() diff --git a/torch2trt/converters/transpose.py b/torch2trt/converters/transpose.py new file mode 100644 index 00000000..c5131592 --- /dev/null +++ b/torch2trt/converters/transpose.py @@ -0,0 +1,51 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + +@tensorrt_converter("torch.transpose", enabled=trt_version() < '7.0') +def convert_transpose(ctx): + input = ctx.method_args[0] + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + output = ctx.method_return + # permutation -1 because TRT does not include batch dim + permutation = list(range(len(input.shape) - 1)) + dim0 = ctx.method_args[1] - 1 + dim1 = ctx.method_args[2] - 1 + permutation[dim0] = dim1 + permutation[dim1] = dim0 + layer = ctx.network.add_shuffle(input_trt) + layer.second_transpose = tuple(permutation) + output._trt = layer.get_output(0) + + +@tensorrt_converter('torch.transpose', enabled=trt_version() >= '7.0') +def convert_transpose_trt7(ctx): + input = ctx.method_args[0] + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + output = ctx.method_return + # permutation -1 because TRT does not include batch dim + permutation = list(range(len(input.shape) - 1)) + dim0 = ctx.method_args[1] - 1 + dim1 = ctx.method_args[2] - 1 + permutation[dim0] = dim1 + permutation[dim1] = dim0 + layer = ctx.network.add_shuffle(input_trt) + layer.second_transpose = tuple(permutation) + output._trt = layer.get_output(0) + + + +class Transpose(torch.nn.Module): + def __init__(self, dim0, dim1): + super(Transpose, self).__init__() + self.dim0 = dim0 + self.dim1 = dim1 + + def forward(self, x): + return torch.transpose(x, self.dim0, self.dim1).contiguous() + + +@add_module_test(torch.float32, torch.device("cuda"), [(1, 3, 3)]) +@add_module_test(torch.float32, torch.device("cuda"), [(1, 3, 3, 3)]) +def test_transpose_12(): + return Transpose(1, 2) diff --git a/torch2trt/converters/unary.py b/torch2trt/converters/unary.py new file mode 100644 index 00000000..6f80f670 --- /dev/null +++ b/torch2trt/converters/unary.py @@ -0,0 +1,281 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + +def __convert_unary(ctx, op): + input = get_arg(ctx, 'input', pos=0, default=None) + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + output = ctx.method_return + layer = ctx.network.add_unary(input_trt, op) + output._trt = layer.get_output(0) + + +class UnaryModule(torch.nn.Module): + def __init__(self, fn): + super(UnaryModule, self).__init__() + self.fn = fn + + def forward(self, x): + return self.fn(x) + +# EXP : Exponentiation + + +@tensorrt_converter('torch.exp') +@tensorrt_converter('torch.exp_') +@tensorrt_converter('torch.Tensor.exp') +@tensorrt_converter('torch.Tensor.exp_') +def convert_exp(ctx): + __convert_unary(ctx, trt.UnaryOperation.EXP) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 3)]) +def test_exp(): + return UnaryModule(lambda x: torch.exp(x)) + + +# LOG : Log (base e) + + +@tensorrt_converter('torch.log') +@tensorrt_converter('torch.log_') +@tensorrt_converter('torch.Tensor.log') +@tensorrt_converter('torch.Tensor.log_') +def convert_log(ctx): + __convert_unary(ctx, trt.UnaryOperation.LOG) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 3)]) +def test_log(): + return UnaryModule(lambda x: torch.log(x)) + + +# SQRT : Square root + + +@tensorrt_converter('torch.sqrt') +@tensorrt_converter('torch.sqrt_') +@tensorrt_converter('torch.Tensor.sqrt') 
+@tensorrt_converter('torch.Tensor.sqrt_') +def convert_sqrt(ctx): + __convert_unary(ctx, trt.UnaryOperation.SQRT) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 3)]) +def test_sqrt(): + return UnaryModule(lambda x: torch.sqrt(x)) + + +# RECIP : Reciprocal + + +@tensorrt_converter('torch.reciprocal') +@tensorrt_converter('torch.reciprocal_') +@tensorrt_converter('torch.Tensor.reciprocal') +@tensorrt_converter('torch.Tensor.reciprocal_') +def convert_reciprocal(ctx): + __convert_unary(ctx, trt.UnaryOperation.RECIP) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 3)]) +def test_reciprocal(): + return UnaryModule(lambda x: torch.reciprocal(x)) + + +# ABS : Absolute value + + +@tensorrt_converter('torch.abs') +@tensorrt_converter('torch.abs_') +@tensorrt_converter('torch.Tensor.abs') +@tensorrt_converter('torch.Tensor.abs_') +def convert_abs(ctx): + __convert_unary(ctx, trt.UnaryOperation.ABS) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 3)]) +def test_abs(): + return UnaryModule(lambda x: torch.abs(x)) + + +# NEG : Negation + +@tensorrt_converter('torch.neg') +@tensorrt_converter('torch.neg_') +@tensorrt_converter('torch.Tensor.neg') +@tensorrt_converter('torch.Tensor.__neg__') +@tensorrt_converter('torch.Tensor.neg_') +def convert_neg(ctx): + __convert_unary(ctx, trt.UnaryOperation.NEG) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 3)]) +def test_neg(): + return UnaryModule(lambda x: torch.neg(x)) + + +# SIN : Sine + + +@tensorrt_converter('torch.sin') +@tensorrt_converter('torch.sin_') +@tensorrt_converter('torch.Tensor.sin') +@tensorrt_converter('torch.Tensor.sin_') +def convert_sin(ctx): + __convert_unary(ctx, trt.UnaryOperation.SIN) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 3)]) +def test_sin(): + return UnaryModule(lambda x: torch.sin(x)) + + +# COS : Cosine + + +@tensorrt_converter('torch.cos') +@tensorrt_converter('torch.cos_') +@tensorrt_converter('torch.Tensor.cos') +@tensorrt_converter('torch.Tensor.cos_') +def convert_cos(ctx): + __convert_unary(ctx, trt.UnaryOperation.COS) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 3)]) +def test_cos(): + return UnaryModule(lambda x: torch.cos(x)) + + +# TAN : Tangent + + +@tensorrt_converter('torch.tan') +@tensorrt_converter('torch.tan_') +@tensorrt_converter('torch.Tensor.tan') +@tensorrt_converter('torch.Tensor.tan_') +def convert_tan(ctx): + __convert_unary(ctx, trt.UnaryOperation.TAN) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 3)]) +def test_tan(): + return UnaryModule(lambda x: torch.tan(x)) + + +# SINH : Hyperbolic sine + + +@tensorrt_converter('torch.sinh') +@tensorrt_converter('torch.sinh_') +@tensorrt_converter('torch.Tensor.sinh') +@tensorrt_converter('torch.Tensor.sinh_') +def convert_sinh(ctx): + __convert_unary(ctx, trt.UnaryOperation.SINH) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 3)]) +def test_sinh(): + return UnaryModule(lambda x: torch.sinh(x)) + + +# COSH : Hyperbolic cosine + + +@tensorrt_converter('torch.cosh') +@tensorrt_converter('torch.cosh_') +@tensorrt_converter('torch.Tensor.cosh') +@tensorrt_converter('torch.Tensor.cosh_') +def convert_cosh(ctx): + __convert_unary(ctx, trt.UnaryOperation.COSH) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 3)]) +def test_cosh(): + return UnaryModule(lambda x: torch.cosh(x)) + + +# ASIN : Inverse sine + + +@tensorrt_converter('torch.asin') +@tensorrt_converter('torch.asin_') 
+@tensorrt_converter('torch.Tensor.asin') +@tensorrt_converter('torch.Tensor.asin_') +def convert_asin(ctx): + __convert_unary(ctx, trt.UnaryOperation.ASIN) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 3)]) +def test_asin(): + return UnaryModule(lambda x: torch.asin(x)) + + +# ACOS : Inverse cosine + + +@tensorrt_converter('torch.acos') +@tensorrt_converter('torch.acos_') +@tensorrt_converter('torch.Tensor.acos') +@tensorrt_converter('torch.Tensor.acos_') +def convert_acos(ctx): + __convert_unary(ctx, trt.UnaryOperation.ACOS) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 3)]) +def test_acos(): + return UnaryModule(lambda x: torch.acos(x)) + + +# ATAN : Inverse tangent + + +@tensorrt_converter('torch.atan') +@tensorrt_converter('torch.atan_') +@tensorrt_converter('torch.Tensor.atan') +@tensorrt_converter('torch.Tensor.atan_') +def convert_atan(ctx): + __convert_unary(ctx, trt.UnaryOperation.ATAN) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 3)]) +def test_atan(): + return UnaryModule(lambda x: torch.atan(x)) + + +# Not yet implemented: ASINH (inverse hyperbolic sine), ACOSH (inverse hyperbolic cosine), ATANH (inverse hyperbolic tangent) + +# CEIL : Ceiling + + +@tensorrt_converter('torch.ceil') +@tensorrt_converter('torch.ceil_') +@tensorrt_converter('torch.Tensor.ceil') +@tensorrt_converter('torch.Tensor.ceil_') +def convert_ceil(ctx): + __convert_unary(ctx, trt.UnaryOperation.CEIL) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 3)]) +def test_ceil(): + return UnaryModule(lambda x: torch.ceil(x)) + + +# FLOOR : Floor + + +@tensorrt_converter('torch.floor') +@tensorrt_converter('torch.floor_') +@tensorrt_converter('torch.Tensor.floor') +@tensorrt_converter('torch.Tensor.floor_') +def convert_floor(ctx): + __convert_unary(ctx, trt.UnaryOperation.FLOOR) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 5, 3)]) +def test_floor(): + return UnaryModule(lambda x: torch.floor(x)) \ No newline at end of file diff --git a/torch2trt/converters/view.py b/torch2trt/converters/view.py new file mode 100644 index 00000000..b60dabb4 --- /dev/null +++ b/torch2trt/converters/view.py @@ -0,0 +1,77 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + +@tensorrt_converter('torch.flatten') +@tensorrt_converter('torch.Tensor.reshape') +@tensorrt_converter('torch.Tensor.view') +@tensorrt_converter('torch.Tensor.squeeze') +@tensorrt_converter('torch.Tensor.unsqueeze') +@tensorrt_converter('torch.Tensor.flatten') +@tensorrt_converter('torch.squeeze') +@tensorrt_converter('torch.unsqueeze') +def convert_view(ctx): + input = ctx.method_args[0] + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + output = ctx.method_return + layer = ctx.network.add_shuffle(input_trt) + layer.reshape_dims = tuple(output.shape[1:]) + output._trt = layer.get_output(0) + + +class View(torch.nn.Module): + def __init__(self, *dims): + super(View, self).__init__() + self.dims = dims + + def forward(self, x): + return x.view(*self.dims) + + +class Squeeze(torch.nn.Module): + def __init__(self, dim): + super(Squeeze, self).__init__() + self.dim = dim + + def forward(self, x): + return x.squeeze(dim=self.dim) + +class UnSqueeze(torch.nn.Module): + def __init__(self, dim): + super(UnSqueeze, self).__init__() + self.dim = dim + + def forward(self, x): + return x.unsqueeze(dim=self.dim) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3)]) +@add_module_test(torch.float32, 
torch.device('cuda'), [(1, 3, 3)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)]) +def test_view_1d(): + return View(1, -1) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3)]) +def test_view_2d(): + return View(1, 1, -1) + + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3, 6)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 3, 3, 3, 6)]) +def test_view_3d(): + return View(1, 3, 3, -1) + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 7)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 5, 3)]) +def test_unsqueeze(): + return UnSqueeze(2) + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 1)]) +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 1, 3)]) +def test_squeeze(): + return Squeeze(2) + + diff --git a/torch2trt/module_test.py b/torch2trt/module_test.py new file mode 100644 index 00000000..fb158fe8 --- /dev/null +++ b/torch2trt/module_test.py @@ -0,0 +1,35 @@ +import torch +import torchvision + + +class ModuleTest(object): + def __init__(self, module_fn, dtype, device, input_shapes, **torch2trt_kwargs): + self.module_fn = module_fn + self.dtype = dtype + self.device = device + self.input_shapes = input_shapes + self.torch2trt_kwargs = torch2trt_kwargs + + def module_name(self): + return self.module_fn.__module__ + '.' + self.module_fn.__name__ + + +MODULE_TESTS = [ +] + + +def add_module_test(dtype, device, input_shapes, enabled=True, **torch2trt_kwargs): + def register_module_test(module): + global MODULE_TESTS + MODULE_TESTS += [ModuleTest(module, dtype, device, input_shapes, **torch2trt_kwargs)] + return module + + def pass_module_test(module): + return module + + if enabled: + return register_module_test + else: + return pass_module_test diff --git a/torch2trt/plugins/group_norm.cpp b/torch2trt/plugins/group_norm.cpp new file mode 100644 index 00000000..ccc7b51d --- /dev/null +++ b/torch2trt/plugins/group_norm.cpp @@ -0,0 +1,296 @@ +#include <torch/extension.h> +#include <torch/script.h> +#include <iostream> +#include <string> +#include <sstream> +#include <NvInfer.h> +#include <ATen/ATen.h> +#include <ATen/cuda/CUDAEvent.h> +#include <torch/torch.h> +#include <cuda_runtime_api.h> + +using namespace nvinfer1; + +namespace torch2trt { + +class GroupNormPlugin : public IPluginV2 { +private: + // configured by class + at::TensorOptions tensor_options; + std::vector<int64_t> input_sizes; + std::vector<int64_t> output_sizes; + DataType dtype; + + // group norm parameters, configured by user + int64_t num_groups; + at::Tensor weight; + at::Tensor bias; + double eps; + + +public: + + // create from arguments + GroupNormPlugin(int64_t num_groups, at::Tensor weight, at::Tensor bias, double eps) : + num_groups{num_groups}, weight{weight}, bias{bias}, eps{eps} + {} + + GroupNormPlugin(const char *data, size_t length) : GroupNormPlugin(std::string(data, length)) {} + + GroupNormPlugin(const std::string &data){ + deserializeFromString(data); + } + + void deserializeFromString(const std::string &data) { + std::istringstream data_stream(data); + torch::serialize::InputArchive input_archive; + input_archive.load_from(data_stream); + { + torch::IValue value; + input_archive.read("num_groups", value); + num_groups = value.toInt(); // scalar field, unlike the size vectors below + } + { + torch::IValue value; + input_archive.read("weight", value); + weight = value.toTensor(); + } + { + torch::IValue value; + 
input_archive.read("bias", value); + bias = value.toTensor(); + } + + { + torch::IValue value; + input_archive.read("eps", value); + eps = value.toDouble(); // scalar field + } + { + torch::IValue value; + input_archive.read("dtype", value); + dtype = (DataType) value.toInt(); + } + { + torch::IValue value; + input_archive.read("input_sizes", value); +#ifdef USE_DEPRECATED_INTLIST + input_sizes = value.toIntListRef().vec(); +#else + input_sizes = value.toIntVector(); +#endif + } + { + torch::IValue value; + input_archive.read("output_sizes", value); +#ifdef USE_DEPRECATED_INTLIST + output_sizes = value.toIntListRef().vec(); +#else + output_sizes = value.toIntVector(); +#endif + } + } + std::string serializeToString() const { + torch::serialize::OutputArchive output_archive; + output_archive.write("num_groups", torch::IValue(num_groups)); + output_archive.write("weight", torch::IValue(weight)); + output_archive.write("bias", torch::IValue(bias)); + output_archive.write("eps", torch::IValue(eps)); + output_archive.write("dtype", torch::IValue((int) dtype)); + output_archive.write("input_sizes", torch::IValue(input_sizes)); + output_archive.write("output_sizes", torch::IValue(output_sizes)); + std::ostringstream data_str; + output_archive.save_to(data_str); + return data_str.str(); + } + + const char* getPluginType() const override { + return "group_norm"; + }; + + const char* getPluginVersion() const override { + return "1"; + } + + int getNbOutputs() const override { + return 1; + } + + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override { + Dims dims; + dims.nbDims = inputs->nbDims; + + for (int i = 0; i < inputs->nbDims; i++) { + dims.d[i] = inputs->d[i]; + } + + return dims; + } + + bool supportsFormat(DataType type, PluginFormat format) const override { + if (format != PluginFormat::kNCHW) { + return false; + } + if (type == DataType::kINT32 || type == DataType::kINT8) { + return false; + } + return true; + } + + void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, + int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) override { + + // set data type + if (type == DataType::kFLOAT) { + tensor_options = tensor_options.dtype(c10::kFloat); + dtype = type; + } else if (type == DataType::kHALF) { + tensor_options = tensor_options.dtype(c10::kHalf); + dtype = type; + } + + // set input sizes + input_sizes.resize(inputDims[0].nbDims); + for (int i = 0; i < inputDims[0].nbDims; i++) { + input_sizes[i] = inputDims[0].d[i]; + } + + // set output sizes + output_sizes.resize(outputDims[0].nbDims); + for (int i = 0; i < outputDims[0].nbDims; i++) { + output_sizes[i] = outputDims[0].d[i]; + } + } + + int initialize() override { + // set device + tensor_options = tensor_options.device(c10::kCUDA); + + // set data type + if (dtype == DataType::kFLOAT) { + tensor_options = tensor_options.dtype(c10::kFloat); + } else if (dtype == DataType::kHALF) { + tensor_options = tensor_options.dtype(c10::kHalf); + } + + + weight = weight.to(tensor_options); + bias = bias.to(tensor_options); + + return 0; + } + + void terminate() override {} + + size_t getWorkspaceSize(int maxBatchSize) const override { return 0; } + + int enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override { + // get input / output dimensions + std::vector<int64_t> batch_input_sizes = input_sizes; + std::vector<int64_t> batch_output_sizes = output_sizes; + 
batch_input_sizes.insert(batch_input_sizes.begin(), batchSize); + batch_output_sizes.insert(batch_output_sizes.begin(), batchSize); + + // create tensor wrappers + at::Tensor input = at::from_blob((void*) inputs[0], batch_input_sizes, [](void*){}, tensor_options); + at::Tensor output = at::from_blob(outputs[0], batch_output_sizes, [](void*){}, tensor_options); + + // create new torch cuda stream + at::cuda::CUDAStream torch_stream = at::cuda::getStreamFromPool(); + at::cuda::CUDAStreamGuard torch_guard(torch_stream); + + // capture current work on tensorrt cuda stream + cudaEvent_t event; + cudaEventCreate(&event); + cudaEventRecord(event, stream); + + // make torch cuda stream wait on tensorrt work + cudaStreamWaitEvent(torch_stream.stream(), event, 0); + + // enqueue work + // group_norm function from PyTorch: https://pytorch.org/cppdocs/api/function_namespaceat_1a6bc1e9504ea440c6c96ff8a8b94333f2.html#exhale-function-namespaceat-1a6bc1e9504ea440c6c96ff8a8b94333f2 + at::Tensor output_tmp = at::group_norm(input, num_groups, weight, bias, eps); + output.copy_(output_tmp); + + // capture event on enqueued stream + cudaEvent_t torch_event; + cudaEventCreate(&torch_event); + cudaEventRecord(torch_event, torch_stream.stream()); + cudaStreamWaitEvent(stream, torch_event, 0); + + cudaEventDestroy(event); + cudaEventDestroy(torch_event); + + return 0; + } + + + size_t getSerializationSize() const override { + return serializeToString().size(); + } + + void serialize(void* buffer) const override { + std::string data = serializeToString(); + size_t size = getSerializationSize(); + data.copy((char *) buffer, size); + } + + void destroy() override {} + + IPluginV2* clone() const override { + return new GroupNormPlugin(num_groups, weight, bias, eps); + } + + void setPluginNamespace(const char* pluginNamespace) override {} + + const char *getPluginNamespace() const override { + return "torch2trt"; + } + +}; + +class GroupNormPluginCreator : public IPluginCreator { +public: + GroupNormPluginCreator() {} + + const char *getPluginNamespace() const override { + return "torch2trt"; + } + + const char *getPluginName() const override { + return "group_norm"; + } + + const char *getPluginVersion() const override { + return "1"; + } + + IPluginV2 *deserializePlugin(const char *name, const void *data, size_t length) override { + return new GroupNormPlugin((const char*) data, length); + } + + void setPluginNamespace(const char *N) override {} + const PluginFieldCollection *getFieldNames() override { return nullptr; } + + IPluginV2 *createPlugin(const char *name, const PluginFieldCollection *fc) override { return nullptr; } + +}; + + +REGISTER_TENSORRT_PLUGIN(GroupNormPluginCreator); + +} // namespace torch2trt
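Since both plugins round-trip their state through torch::serialize archives, a quick Python-level check becomes possible once the pybind bindings in plugins.cpp (further below) are compiled. A hypothetical sketch; the torch2trt.plugins import path and the build step are assumptions, not something this patch pins down:

# hypothetical usage; assumes the extension built from plugins.cpp is importable
import torch
from torch2trt.plugins import GroupNormPlugin

plugin = GroupNormPlugin(num_groups=2, weight=torch.ones(4), bias=torch.zeros(4), eps=1e-5)
data = plugin.serializeToString()       # bytes of a torch serialize archive
restored = GroupNormPlugin(data=data)   # rebuilds the plugin from those bytes
print(restored.getSerializationSize())  # should match len(data)

diff --git a/torch2trt/plugins/interpolate.cpp b/torch2trt/plugins/interpolate.cpp new file mode 100644 index 00000000..cf463d0f --- /dev/null +++ b/torch2trt/plugins/interpolate.cpp @@ -0,0 +1,284 @@ +#include <torch/extension.h> +#include <torch/script.h> +#include <iostream> +#include <string> +#include <sstream> +#include <NvInfer.h> +#include <ATen/ATen.h> +#include <ATen/cuda/CUDAEvent.h> +#include <torch/torch.h> +#include <cuda_runtime_api.h> + +using namespace nvinfer1; + +namespace torch2trt { + + +class InterpolatePlugin : public IPluginV2 { +private: + + // configured by class + at::TensorOptions tensor_options; + std::vector<int64_t> input_sizes; + std::vector<int64_t> output_sizes; + DataType dtype; + + // configured by user + std::vector<int64_t> size; + std::string mode; + bool align_corners; + +public: + + // create from arguments + InterpolatePlugin(std::vector<int64_t> size, std::string mode, bool align_corners) : + size(size), mode(mode), 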
align_corners(align_corners) + {} + + InterpolatePlugin(const char *data, size_t length) : InterpolatePlugin(std::string(data, length)) {} + + // create from serialized data + InterpolatePlugin(const std::string &data) { + deserializeFromString(data); + } + + void deserializeFromString(const std::string &data) { + std::istringstream data_stream(data); + torch::serialize::InputArchive input_archive; + input_archive.load_from(data_stream); + { + torch::IValue value; + input_archive.read("size", value); +#ifdef USE_DEPRECATED_INTLIST + size = value.toIntListRef().vec(); +#else + size = value.toIntVector(); +#endif + } + { + torch::IValue value; + input_archive.read("mode", value); + mode = value.toStringRef(); + } + { + torch::IValue value; + input_archive.read("align_corners", value); + align_corners = value.toBool(); + } + { + torch::IValue value; + input_archive.read("dtype", value); + dtype = (DataType) value.toInt(); + } + { + torch::IValue value; + input_archive.read("input_sizes", value); +#ifdef USE_DEPRECATED_INTLIST + input_sizes = value.toIntListRef().vec(); +#else + input_sizes = value.toIntVector(); +#endif + } + { + torch::IValue value; + input_archive.read("output_sizes", value); +#ifdef USE_DEPRECATED_INTLIST + output_sizes = value.toIntListRef().vec(); +#else + output_sizes = value.toIntVector(); +#endif + } + } + + std::string serializeToString() const { + torch::serialize::OutputArchive output_archive; + output_archive.write("size", torch::IValue(size)); + output_archive.write("mode", torch::IValue(mode)); + output_archive.write("align_corners", torch::IValue(align_corners)); + output_archive.write("dtype", torch::IValue((int) dtype)); + output_archive.write("input_sizes", torch::IValue(input_sizes)); + output_archive.write("output_sizes", torch::IValue(output_sizes)); + std::ostringstream data_str; + output_archive.save_to(data_str); + return data_str.str(); + } + + const char* getPluginType() const override { + return "interpolate"; + }; + + const char* getPluginVersion() const override { + return "1"; + } + + int getNbOutputs() const override { + return 1; + } + + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override { + Dims dims; + dims.nbDims = inputs->nbDims; + + dims.d[0] = inputs->d[0]; + for (int i = 0; i < size.size(); i++) { + dims.d[i + 1] = size[i]; + } + + return dims; + } + + bool supportsFormat(DataType type, PluginFormat format) const override { + if (format != PluginFormat::kNCHW) { + return false; + } + if (type == DataType::kINT32 || type == DataType::kINT8) { + return false; + } + return true; + } + + void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, + int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) override { + + // set data type + if (type == DataType::kFLOAT) { + tensor_options = tensor_options.dtype(c10::kFloat); + dtype = type; + } else if (type == DataType::kHALF) { + tensor_options = tensor_options.dtype(c10::kHalf); + dtype = type; + } + + // set input sizes + input_sizes.resize(inputDims[0].nbDims); + for (int i = 0; i < inputDims[0].nbDims; i++) { + input_sizes[i] = inputDims[0].d[i]; + } + + // set output sizes + output_sizes.resize(outputDims[0].nbDims); + for (int i = 0; i < outputDims[0].nbDims; i++) { + output_sizes[i] = outputDims[0].d[i]; + } + } + + int initialize() override { + // set device + tensor_options = tensor_options.device(c10::kCUDA); + + // set data type + if (dtype == DataType::kFLOAT) { + tensor_options = 
tensor_options.dtype(c10::kFloat); + } else if (dtype == DataType::kHALF) { + tensor_options = tensor_options.dtype(c10::kHalf); + } + + return 0; + } + + void terminate() override {} + + size_t getWorkspaceSize(int maxBatchSize) const override { return 0; } + + int enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override { + // get input / output dimensions + std::vector<int64_t> batch_input_sizes = input_sizes; + std::vector<int64_t> batch_output_sizes = output_sizes; + batch_input_sizes.insert(batch_input_sizes.begin(), batchSize); + batch_output_sizes.insert(batch_output_sizes.begin(), batchSize); + + // create tensor wrappers + at::Tensor input = at::from_blob((void*) inputs[0], batch_input_sizes, [](void*){}, tensor_options); + at::Tensor output = at::from_blob(outputs[0], batch_output_sizes, [](void*){}, tensor_options); + + // create new torch cuda stream + at::cuda::CUDAStream torch_stream = at::cuda::getStreamFromPool(); + at::cuda::CUDAStreamGuard torch_guard(torch_stream); + + // capture current work on tensorrt cuda stream + cudaEvent_t event; + cudaEventCreate(&event); + cudaEventRecord(event, stream); + + // make torch cuda stream wait on tensorrt work + cudaStreamWaitEvent(torch_stream.stream(), event, 0); + + // enqueue work + if (mode == "bilinear") { + at::upsample_bilinear2d_out(output, input, {size[0], size[1]}, align_corners); + } else if (mode == "nearest") { + at::upsample_nearest2d_out(output, input, {size[0], size[1]}); + } else if (mode == "area") { + at::adaptive_avg_pool2d_out(output, input, {size[0], size[1]}); + } else if (mode == "bicubic") { + at::upsample_bicubic2d_out(output, input, {size[0], size[1]}, align_corners); + } + + // capture event on enqueued stream + cudaEvent_t torch_event; + cudaEventCreate(&torch_event); + cudaEventRecord(torch_event, torch_stream.stream()); + + cudaStreamWaitEvent(stream, torch_event, 0); + + cudaEventDestroy(event); + cudaEventDestroy(torch_event); + + return 0; + } + + size_t getSerializationSize() const override { + return serializeToString().size(); + } + + void serialize(void* buffer) const override { + std::string data = serializeToString(); + size_t size = getSerializationSize(); + data.copy((char *) buffer, size); + } + + void destroy() override {} + + IPluginV2* clone() const override { + return new InterpolatePlugin(size, mode, align_corners); + } + + void setPluginNamespace(const char* pluginNamespace) override {} + + const char *getPluginNamespace() const override { + return "torch2trt"; + } + +}; + +class InterpolatePluginCreator : public IPluginCreator { +public: + InterpolatePluginCreator() {} + + const char *getPluginNamespace() const override { + return "torch2trt"; + } + + const char *getPluginName() const override { + return "interpolate"; + } + + const char *getPluginVersion() const override { + return "1"; + } + + IPluginV2 *deserializePlugin(const char *name, const void *data, size_t length) override { + return new InterpolatePlugin((const char*) data, length); + } + + void setPluginNamespace(const char *N) override {} + const PluginFieldCollection *getFieldNames() override { return nullptr; } + + IPluginV2 *createPlugin(const char *name, const PluginFieldCollection *fc) override { return nullptr; } + +}; + + +REGISTER_TENSORRT_PLUGIN(InterpolatePluginCreator); + +} // namespace torch2trt
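Before the bindings module, it is worth seeing where this plugin surfaces for users. A minimal end-to-end sketch (illustrative only; it assumes the F.interpolate converter registered elsewhere in this change routes the call to InterpolatePlugin, and that TensorRT and a CUDA device are available):

import torch
import torch.nn.functional as F
from torch2trt import torch2trt

class Upsample(torch.nn.Module):
    def forward(self, x):
        # bilinear resize; served by the plugin rather than a native TRT layer
        return F.interpolate(x, size=(112, 112), mode='bilinear', align_corners=False)

model = Upsample().cuda().eval()
x = torch.randn(1, 3, 56, 56).cuda()
model_trt = torch2trt(model, [x])
print(torch.max(torch.abs(model(x) - model_trt(x))))  # expect a small value

diff --git a/torch2trt/plugins/plugins.cpp b/torch2trt/plugins/plugins.cpp new file mode 100644 index 00000000..f548949f --- /dev/null +++ b/torch2trt/plugins/plugins.cpp @@ 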
-0,0 +1,30 @@ +#include <torch/extension.h> +#include "interpolate.cpp" +#include "group_norm.cpp" + + +using namespace nvinfer1; + +namespace torch2trt { + PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + py::class_<InterpolatePlugin>(m, "InterpolatePlugin") + .def(py::init<std::vector<int64_t>, std::string, bool>(), py::arg("size"), py::arg("mode"), py::arg("align_corners")) + .def(py::init<const std::string &>(), py::arg("data")) + .def("getSerializationSize", &InterpolatePlugin::getSerializationSize) + .def("deserializeFromString", &InterpolatePlugin::deserializeFromString) + .def("serializeToString", [](const InterpolatePlugin& plugin) { + std::string data = plugin.serializeToString(); + return py::bytes(data); + }); + py::class_<GroupNormPlugin>(m, "GroupNormPlugin") + .def(py::init<int64_t, at::Tensor, at::Tensor, double>(), py::arg("num_groups"), py::arg("weight"), py::arg("bias"), py::arg("eps")) + .def(py::init<const std::string &>(), py::arg("data")) + .def("getSerializationSize", &GroupNormPlugin::getSerializationSize) + .def("deserializeFromString", &GroupNormPlugin::deserializeFromString) + .def("serializeToString", [](const GroupNormPlugin& plugin) { + std::string data = plugin.serializeToString(); + return py::bytes(data); + }); + + } +} // namespace torch2trt diff --git a/torch2trt/test.py b/torch2trt/test.py new file mode 100644 index 00000000..dec9bb88 --- /dev/null +++ b/torch2trt/test.py @@ -0,0 +1,147 @@ +from torch2trt import * +from .module_test import ModuleTest, MODULE_TESTS +import time +import argparse +import re +import runpy +import traceback +from termcolor import colored + + +def run(self): + # create module + module = self.module_fn() + module = module.to(self.device) + module = module.type(self.dtype) + module = module.eval() + + # create inputs for conversion + inputs_conversion = () + for shape in self.input_shapes: + inputs_conversion += (torch.zeros(shape).to(self.device).type(self.dtype), ) + + + # convert module + module_trt = torch2trt(module, inputs_conversion, max_workspace_size=1 << 20, **self.torch2trt_kwargs) + + # create inputs for torch/trt.. 
copy of inputs to handle inplace ops + inputs = () + for shape in self.input_shapes: + inputs += (torch.randn(shape).to(self.device).type(self.dtype), ) + inputs_trt = tuple([tensor.clone() for tensor in inputs]) + + + # test output against original + outputs = module(*inputs) + outputs_trt = module_trt(*inputs_trt) + + if not isinstance(outputs, tuple): + outputs = (outputs, ) + + # compute max error + max_error = 0 + for i in range(len(outputs)): + max_error_i = 0 + if outputs[i].dtype == torch.bool: + max_error_i = torch.sum(outputs[i] ^ outputs_trt[i]) + else: + max_error_i = torch.max(torch.abs(outputs[i] - outputs_trt[i])) + + if max_error_i > max_error: + max_error = max_error_i + + # benchmark pytorch throughput + torch.cuda.current_stream().synchronize() + t0 = time.time() + for i in range(50): + outputs = module(*inputs) + torch.cuda.current_stream().synchronize() + t1 = time.time() + + fps = 50.0 / (t1 - t0) + + # benchmark tensorrt throughput + torch.cuda.current_stream().synchronize() + t0 = time.time() + for i in range(50): + outputs = module_trt(*inputs) + torch.cuda.current_stream().synchronize() + t1 = time.time() + + fps_trt = 50.0 / (t1 - t0) + + # benchmark pytorch latency + torch.cuda.current_stream().synchronize() + t0 = time.time() + for i in range(50): + outputs = module(*inputs) + torch.cuda.current_stream().synchronize() + t1 = time.time() + + ms = 1000.0 * (t1 - t0) / 50.0 + + # benchmark tensorrt latency + torch.cuda.current_stream().synchronize() + t0 = time.time() + for i in range(50): + outputs = module_trt(*inputs) + torch.cuda.current_stream().synchronize() + t1 = time.time() + + ms_trt = 1000.0 * (t1 - t0) / 50.0 + + return max_error, fps, fps_trt, ms, ms_trt + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser.add_argument('--output', '-o', help='Test output file path', type=str, default='torch2trt_test.md') + parser.add_argument('--name', help='Regular expression to filter modules to test by name', type=str, default='.*') + parser.add_argument('--tolerance', help='Maximum error to print warning for entry', type=float, default='-1') + parser.add_argument('--include', help='Addition python file to include defining additional tests', action='append', default=[]) + parser.add_argument('--use_onnx', help='Whether to test using ONNX or torch2trt tracing', action='store_true') + args = parser.parse_args() + + for include in args.include: + runpy.run_module(include) + + num_tests, num_success, num_tolerance, num_error = 0, 0, 0, 0 + for test in MODULE_TESTS: + + # filter by module name + name = test.module_name() + if not re.search(args.name, name): + continue + + num_tests += 1 + # run test + try: + if args.use_onnx: + test.torch2trt_kwargs.update({'use_onnx': True}) + + max_error, fps, fps_trt, ms, ms_trt = run(test) + + # write entry + line = '| %s | %s | %s | %s | %.2E | %.3g | %.3g | %.3g | %.3g |' % (name, test.dtype.__repr__().split('.')[-1], str(test.input_shapes), str(test.torch2trt_kwargs), max_error, fps, fps_trt, ms, ms_trt) + + if args.tolerance >= 0 and max_error > args.tolerance: + print(colored(line, 'yellow')) + num_tolerance += 1 + else: + print(line) + num_success += 1 + except: + line = '| %s | %s | %s | %s | N/A | N/A | N/A | N/A | N/A |' % (name, test.dtype.__repr__().split('.')[-1], str(test.input_shapes), str(test.torch2trt_kwargs)) + print(colored(line, 'red')) + num_error += 1 + tb = traceback.format_exc() + print(tb) + + with open(args.output, 'a+') as f: + f.write(line + '\n') + + print('NUM_TESTS: %d' % 
diff --git a/torch2trt/tests/__init__.py b/torch2trt/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/torch2trt/tests/test_contiguous.py b/torch2trt/tests/test_contiguous.py
new file mode 100644
index 00000000..f2047072
--- /dev/null
+++ b/torch2trt/tests/test_contiguous.py
@@ -0,0 +1,20 @@
+import torch
+from torch2trt import torch2trt
+
+
+def test_contiguous():
+    net = torch.nn.Conv2d(3, 10, kernel_size=3)
+    net.eval().cuda()
+
+    test_tensor = torch.randn((1, 25, 25, 3)).cuda().permute((0, 3, 1, 2))
+
+    with torch.no_grad():
+        test_out = net(test_tensor)
+
+    with torch.no_grad():
+        trt_net = torch2trt(net, [test_tensor])
+        test_trt_out = trt_net(test_tensor)
+
+    delta = (test_out.contiguous() - test_trt_out.contiguous()).abs().sum()
+    assert delta < 1e-3, f"Delta: {delta}"
+
diff --git a/torch2trt/tests/torchvision/__init__.py b/torch2trt/tests/torchvision/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/torch2trt/tests/torchvision/classification.py b/torch2trt/tests/torchvision/classification.py
new file mode 100644
index 00000000..acf15447
--- /dev/null
+++ b/torch2trt/tests/torchvision/classification.py
@@ -0,0 +1,148 @@
+import torch
+import torchvision
+from torch2trt.module_test import add_module_test
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def alexnet():
+    return torchvision.models.alexnet(pretrained=False)
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def squeezenet1_0():
+    return torchvision.models.squeezenet1_0(pretrained=False)
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def squeezenet1_1():
+    return torchvision.models.squeezenet1_1(pretrained=False)
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def resnet18():
+    return torchvision.models.resnet18(pretrained=False)
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def resnet34():
+    return torchvision.models.resnet34(pretrained=False)
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def resnet50():
+    return torchvision.models.resnet50(pretrained=False)
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def resnet101():
+    return torchvision.models.resnet101(pretrained=False)
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def resnet152():
+    return torchvision.models.resnet152(pretrained=False)
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def densenet121():
+    return torchvision.models.densenet121(pretrained=False)
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def densenet169():
+    return torchvision.models.densenet169(pretrained=False)
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def densenet201():
+    return torchvision.models.densenet201(pretrained=False)
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def densenet161():
+    return torchvision.models.densenet161(pretrained=False)
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def vgg11():
+    return torchvision.models.vgg11(pretrained=False)
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def vgg13():
+    return torchvision.models.vgg13(pretrained=False)
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def vgg16():
+    return torchvision.models.vgg16(pretrained=False)
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def vgg19():
+    return torchvision.models.vgg19(pretrained=False)
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def vgg11_bn():
+    return torchvision.models.vgg11_bn(pretrained=False)
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def vgg13_bn():
+    return torchvision.models.vgg13_bn(pretrained=False)
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def vgg16_bn():
+    return torchvision.models.vgg16_bn(pretrained=False)
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def vgg19_bn():
+    return torchvision.models.vgg19_bn(pretrained=False)
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def mobilenet_v2():
+    return torchvision.models.mobilenet_v2(pretrained=False)
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def shufflenet_v2_x0_5():
+    return torchvision.models.shufflenet_v2_x0_5(pretrained=False)
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def shufflenet_v2_x1_0():
+    return torchvision.models.shufflenet_v2_x1_0(pretrained=False)
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def shufflenet_v2_x1_5():
+    return torchvision.models.shufflenet_v2_x1_5(pretrained=False)
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def shufflenet_v2_x2_0():
+    return torchvision.models.shufflenet_v2_x2_0(pretrained=False)
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def mnasnet0_5():
+    return torchvision.models.mnasnet0_5(pretrained=False)
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def mnasnet0_75():
+    return torchvision.models.mnasnet0_75(pretrained=False)
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def mnasnet1_0():
+    return torchvision.models.mnasnet1_0(pretrained=False)
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def mnasnet1_3():
+    return torchvision.models.mnasnet1_3(pretrained=False)
\ No newline at end of file
diff --git a/torch2trt/tests/torchvision/save_load.py b/torch2trt/tests/torchvision/save_load.py
new file mode 100644
index 00000000..15af6728
--- /dev/null
+++ b/torch2trt/tests/torchvision/save_load.py
@@ -0,0 +1,24 @@
+from torch2trt import *
+import torchvision
+import torch
+from .segmentation import deeplabv3_resnet50
+
+
+if __name__ == '__main__':
+    model = deeplabv3_resnet50().cuda().eval().half()
+    data = torch.randn((1, 3, 224, 224)).cuda().half()
+
+    print('Running torch2trt...')
+    model_trt = torch2trt(model, [data], fp16_mode=True, max_workspace_size=1<<25)
+
+    print('Saving model...')
+    torch.save(model_trt.state_dict(), '.test_model.pth')
+
+    print('Loading model...')
+    model_trt_2 = TRTModule()
+    model_trt_2.load_state_dict(torch.load('.test_model.pth'))
+
+    assert(model_trt_2.engine is not None)
+
+    print(torch.max(torch.abs(model_trt_2(data) - model(data))))
+    print(torch.max(torch.abs(model_trt_2(data) - model_trt(data))))
\ No newline at end of file
diff --git a/torch2trt/tests/torchvision/segmentation.py b/torch2trt/tests/torchvision/segmentation.py
new file mode 100644
index 00000000..6cc915dd
--- /dev/null
+++ b/torch2trt/tests/torchvision/segmentation.py
@@ -0,0 +1,39 @@
+import torch
+import torchvision
+from torch2trt.module_test import add_module_test
+
+
+class ModelWrapper(torch.nn.Module):
+    def __init__(self, model):
+        super(ModelWrapper, self).__init__()
+        self.model = model
+    def forward(self, x):
+        return self.model(x)['out']
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def deeplabv3_resnet50():
+    bb = torchvision.models.segmentation.deeplabv3_resnet50(pretrained=False)
+    model = ModelWrapper(bb)
+    return model
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def deeplabv3_resnet101():
+    bb = torchvision.models.segmentation.deeplabv3_resnet101(pretrained=False)
+    model = ModelWrapper(bb)
+    return model
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def fcn_resnet50():
+    bb = torchvision.models.segmentation.fcn_resnet50(pretrained=False)
+    model = ModelWrapper(bb)
+    return model
+
+
+@add_module_test(torch.float16, torch.device('cuda'), [(1, 3, 224, 224)], fp16_mode=True)
+def fcn_resnet101():
+    bb = torchvision.models.segmentation.fcn_resnet101(pretrained=False)
+    model = ModelWrapper(bb)
+    return model
\ No newline at end of file
diff --git a/torch2trt/torch2trt.py b/torch2trt/torch2trt.py
new file mode 100644
index 00000000..cc628ede
--- /dev/null
+++ b/torch2trt/torch2trt.py
@@ -0,0 +1,624 @@
+import torch
+import tensorrt as trt
+import copy
+import numpy as np
+import io
+from collections import defaultdict
+import importlib
+
+from .calibration import (
+    TensorBatchDataset,
+    DatasetCalibrator,
+    DEFAULT_CALIBRATION_ALGORITHM,
+)
+
+# UTILITY FUNCTIONS
+
+
+def trt_version():
+    return trt.__version__
+
+
+def torch_version():
+    return torch.__version__
+
+
+def torch_dtype_to_trt(dtype):
+    if trt_version() >= '7.0' and dtype == torch.bool:
+        return trt.bool
+    elif dtype == torch.int8:
+        return trt.int8
+    elif dtype == torch.int32:
+        return trt.int32
+    elif dtype == torch.float16:
+        return trt.float16
+    elif dtype == torch.float32:
+        return trt.float32
+    else:
+        raise TypeError("%s is not supported by tensorrt" % dtype)
+
+
+def torch_dtype_from_trt(dtype):
+    if dtype == trt.int8:
+        return torch.int8
+    elif trt_version() >= '7.0' and dtype == trt.bool:
+        return torch.bool
+    elif dtype == trt.int32:
+        return torch.int32
+    elif dtype == trt.float16:
+        return torch.float16
+    elif dtype == trt.float32:
+        return torch.float32
+    else:
+        raise TypeError("%s is not supported by torch" % dtype)
+
+
+def torch_device_to_trt(device):
+    if device.type == torch.device("cuda").type:
+        return trt.TensorLocation.DEVICE
+    elif device.type == torch.device("cpu").type:
+        return trt.TensorLocation.HOST
+    else:
+        raise TypeError("%s is not supported by tensorrt" % device)
+
+
+def torch_device_from_trt(device):
+    if device == trt.TensorLocation.DEVICE:
+        return torch.device("cuda")
+    elif device == trt.TensorLocation.HOST:
+        return torch.device("cpu")
+    else:
+        raise TypeError("%s is not supported by torch" % device)
+
+
+def trt_num_inputs(engine):
+    count = 0
+    for i in range(engine.num_bindings):
+        if engine.binding_is_input(i):
+            count += 1
+    return count
+
+
+def trt_num_outputs(engine):
+    count = 0
+    for i in range(engine.num_bindings):
+        if not engine.binding_is_input(i):
+            count += 1
+    return count
+
+
+def torch_dim_to_trt_axes(dim):
+    """Converts torch dim, or tuple of dims to a tensorrt axes bitmask"""
+    if not isinstance(dim, tuple):
+        dim = (dim,)
+
+    # create axes bitmask for reduce layer
+    axes = 0
+    for d in dim:
+        axes |= 1 << (d - 1)  # -1 to remove batch dimension
+
+    return axes
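Editor's note: the bitmask above is easy to misread, so a quick worked example (illustrative only). With the batch dimension excluded, torch dim 1 maps to bit 0 and dim 2 to bit 1:

```python
# dim 1 -> 1 << 0 == 1, dim 2 -> 1 << 1 == 2, so (1, 2) -> 0b11 == 3
assert torch_dim_to_trt_axes(1) == 1
assert torch_dim_to_trt_axes((1, 2)) == 3
```
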
+
+
+def add_trt_constant(network, tensor):
+    shape = tuple(tensor.shape[1:])
+    array = tensor[0].detach().cpu().numpy()
+    layer = network.add_constant(shape, array)
+    return layer.get_output(0)
+
+
+def check_torch_dtype(*tensors):
+    dtype = None
+    for t in tensors:
+        if isinstance(t, torch.Tensor):
+            if dtype is None:
+                dtype = t.dtype
+            else:
+                assert dtype == t.dtype, 'Tensor data types must match'
+    assert (
+        dtype is not None
+    ), 'Data type could not be inferred from any item in list'
+    return dtype
+
+
+def add_missing_trt_tensors(network, tensors):
+    """Creates missing TensorRT tensors as constants and attaches them to the Torch Tensors"""
+    trt_tensors = [None] * len(tensors)
+
+    dtype = check_torch_dtype(*tensors)
+
+    for i, t in enumerate(tensors):
+        trt_tensor = None
+
+        # GET TRT TENSOR (OR CREATE TRT CONSTANT)
+
+        # get tensor w/ _trt
+        # or... add constant for scalar primitive
+        if isinstance(t, float) or isinstance(t, int):
+            shape = (1,)
+            scalar = t * torch.ones(shape, dtype=dtype).cpu().numpy()
+            trt_tensor = network.add_constant(shape, scalar).get_output(0)
+        elif hasattr(t, "_trt"):
+            trt_tensor = t._trt
+
+        # or... add constant for leaf tensor w/o _trt
+        else:
+
+            # remove all preceding ones, these can be re-inserted later when broadcasting
+            num_preceding_ones = 0
+            for j in range(len(t.shape)):
+                if int(t.shape[j]) == 1:
+                    num_preceding_ones += 1
+                else:
+                    break
+            shape = tuple(t.shape[num_preceding_ones:])
+
+            weight = t.detach().cpu().numpy()
+            t._trt = network.add_constant(shape, weight).get_output(0)
+            trt_tensor = t._trt
+
+
+        assert trt_tensor is not None
+
+        trt_tensors[i] = trt_tensor
+
+    return trt_tensors
+
+
+def broadcast_trt_tensors(network, trt_tensors, broadcast_ndim):
+    """Broadcast TensorRT tensors to the specified dimension by pre-padding shape 1 dims"""
+    broadcasted_trt_tensors = [None] * len(trt_tensors)
+
+    for i, t in enumerate(trt_tensors):
+
+        if len(t.shape) < broadcast_ndim:
+            # append 1 size dims to front
+            diff = broadcast_ndim - len(t.shape)
+            shape = tuple([1] * diff + list(t.shape))
+            layer = network.add_shuffle(t)
+            layer.reshape_dims = shape
+            trt_tensor = layer.get_output(0)
+        else:
+            trt_tensor = t
+
+        broadcasted_trt_tensors[i] = trt_tensor
+
+    return broadcasted_trt_tensors
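Editor's note: to make the interplay of these two helpers concrete, a hedged sketch. It is not runnable standalone; `network` is the TensorRT network and `a`, `b` are torch tensors as seen inside a converter:

```python
# Turn both operands into ITensors (creating constants where needed),
# then pad the lower-rank one with leading 1s so elementwise layers work.
trt_a, trt_b = add_missing_trt_tensors(network, [a, b])
ndim = max(len(trt_a.shape), len(trt_b.shape))
trt_a, trt_b = broadcast_trt_tensors(network, [trt_a, trt_b], ndim)
```
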
+
+
+def trt_(network, *tensors):
+    """Creates missing TensorRT tensors and adds shuffle layers to make tensors broadcastable"""
+    trt_tensors = [None] * len(tensors)
+
+    dtype = check_torch_dtype(*tensors)
+
+    # get broadcast dimension
+    broadcast_num_dim = 0
+    for t in tensors:
+        if isinstance(t, torch.Tensor):
+            if not hasattr(t, "_trt"):
+                num_dim = len(t.shape)  # don't exclude batch for constants
+            else:
+                num_dim = len(
+                    t._trt.shape
+                )  # non-leaf tensors must already have _trt, get shape from that
+            if num_dim > broadcast_num_dim:
+                broadcast_num_dim = num_dim
+
+    for i, t in enumerate(tensors):
+        trt_tensor = None
+
+        # GET TRT TENSOR (OR CREATE TRT CONSTANT)
+
+        # get tensor w/ _trt
+        if isinstance(t, torch.Tensor) and hasattr(t, "_trt"):
+            trt_tensor = t._trt
+
+        # or... add constant for leaf tensor w/o _trt
+        elif isinstance(t, torch.Tensor) and not hasattr(t, "_trt"):
+            # add leaf tensor
+            shape = tuple(t.shape)  # don't exclude batch when adding constants...?
+            weight = t.detach().cpu().numpy()
+            t._trt = network.add_constant(shape, weight).get_output(0)
+            trt_tensor = t._trt
+
+        # or... add constant for scalar primitive
+        elif isinstance(t, float) or isinstance(t, int):
+            shape = (1,) * broadcast_num_dim
+            scalar = t * torch.ones(shape, dtype=dtype).cpu().numpy()
+            trt_tensor = network.add_constant(shape, scalar).get_output(0)
+
+        assert trt_tensor is not None
+
+        # MAKE TRT TENSOR BROADCASTABLE IF IT IS NOT ALREADY
+
+        if len(trt_tensor.shape) < broadcast_num_dim:
+            # append 1 size dims to front
+            diff = broadcast_num_dim - len(trt_tensor.shape)
+            shape = tuple([1] * diff + list(trt_tensor.shape))
+            layer = network.add_shuffle(trt_tensor)
+            layer.reshape_dims = shape
+            trt_tensor = layer.get_output(0)
+
+        trt_tensors[i] = trt_tensor
+
+    if len(trt_tensors) == 1:
+        return trt_tensors[0]
+    else:
+        return tuple(trt_tensors)
+
+
+# CONVERSION REGISTRY AND HOOKS
+
+
+CONVERTERS = {}
+
+
+def get_arg(ctx, name, pos, default):
+    if name in ctx.method_kwargs:
+        return ctx.method_kwargs[name]
+    elif len(ctx.method_args) > pos:
+        return ctx.method_args[pos]
+    else:
+        return default
+
+
+def attach_converter(ctx, method, converter, method_str):
+    """Gets a function that executes PyTorch method and TensorRT converter"""
+
+    def wrapper(*args, **kwargs):
+        skip = True
+
+        # check if another (parent) converter has lock
+        if not ctx.lock:
+            if converter["is_real"]:
+                ctx.lock = True  # only real converters can acquire lock
+                skip = False
+
+        # run original method
+        outputs = method(*args, **kwargs)
+
+        if not skip:
+            ctx.method_args = args
+            ctx.method_kwargs = kwargs
+            ctx.method_return = outputs
+            ctx.method_str = method_str
+
+            # print('%s' % (converter.__name__,))
+            converter["converter"](ctx)
+
+            # convert to None so conversion will fail for unsupported layers
+            ctx.method_args = None
+            ctx.method_kwargs = None
+            ctx.method_return = None
+            ctx.lock = False
+
+        return outputs
+
+    return wrapper
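Editor's note: as context for the hook machinery above, a converter registered through the `tensorrt_converter` decorator (defined later in this file) reads its inputs from `ctx` and attaches a TensorRT layer output. A representative sketch, modeled on the project's ReLU converter:

```python
@tensorrt_converter('torch.nn.ReLU.forward')
def convert_relu(ctx):
    # for module methods, method_args[0] is the module itself; [1] is the input
    input = ctx.method_args[1]
    output = ctx.method_return
    layer = ctx.network.add_activation(
        input=input._trt, type=trt.ActivationType.RELU)
    output._trt = layer.get_output(0)
```
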
+
+
+class ConversionHook(object):
+    """Attaches TensorRT converter to PyTorch method call"""
+
+    def __init__(self, ctx, key, converter):
+        self.ctx = ctx
+        self.key = key
+        self.converter = converter
+
+    def _set_method(self, method):
+        module = self.converter['module']
+        exec('module.%s = method' % self.converter['qual_name'])
+
+    def __enter__(self):
+        self._set_method(
+            attach_converter(
+                self.ctx, self.converter['method_impl'], self.converter, self.converter['method_str']
+            )
+        )
+
+    def __exit__(self, type, val, tb):
+        self._set_method(self.converter['method_impl'])
+
+def default_input_names(num_inputs):
+    return ["input_%d" % i for i in range(num_inputs)]
+
+def default_output_names(num_outputs):
+    return ["output_%d" % i for i in range(num_outputs)]
+
+
+class LayerNamingNetworkWrapper(object):
+    def __init__(self, ctx, network):
+        self._ctx = ctx
+        self._network = network
+        self._layer_counts = defaultdict(lambda: 0)
+
+    def _set_layer_name(self, layer):
+        def arg_str(arg):
+            if isinstance(arg, torch.Tensor):
+                return "tensor(shape=%s, dtype=%s)" % (str(list(arg.shape)), str(arg.dtype))
+            return str(arg)
+
+        self._layer_counts[layer.type.name] += 1
+        args = [arg_str(arg) for arg in self._ctx.method_args]
+        kwargs = ["%s=%s" % (key, arg_str(arg)) for key, arg in self._ctx.method_kwargs.items()]
+        layer.name = "[%s #%d] %s(%s)" % (layer.type.name, self._layer_counts[layer.type.name],
+                                          self._ctx.method_str, ", ".join(args + kwargs))
+
+    def __getattr__(self, name):
+        attr = getattr(self._network, name)
+        if callable(attr):
+            def wrapper(*args, **kwargs):
+                ret = attr(*args, **kwargs)
+                if isinstance(ret, trt.ILayer):
+                    self._set_layer_name(ret)
+                return ret
+
+            return wrapper
+        else:
+            return attr
+
+
+class ConversionContext(object):
+
+    def __init__(self, network, converters=CONVERTERS, torch2trt_kwargs=None):
+        self.network = LayerNamingNetworkWrapper(self, network)
+        self.lock = False
+        self.method_args = None
+        self.method_kwargs = None
+        self.method_return = None
+        self.torch2trt_kwargs = torch2trt_kwargs
+        self.hooks = [
+            ConversionHook(self, key, converter)
+            for key, converter in converters.items()
+        ]
+
+    def __enter__(self):
+        for hook in self.hooks:
+            hook.__enter__()
+        return self
+
+    def __exit__(self, type, val, tb):
+        for hook in self.hooks:
+            hook.__exit__(type, val, tb)
+
+    def add_inputs(self, torch_inputs, names=None):
+        if names is None:
+            names = default_input_names(len(torch_inputs))
+        self.input_names = names
+
+        for i, torch_input in enumerate(torch_inputs):
+            if not hasattr(torch_input, "_trt"):
+                trt_tensor = self.network.add_input(
+                    name=names[i],
+                    shape=tuple(torch_input.shape)[1:],
+                    dtype=torch_dtype_to_trt(torch_input.dtype),
+                )
+                trt_tensor.location = torch_device_to_trt(torch_input.device)
+                torch_input._trt = trt_tensor
+
+    def mark_outputs(self, torch_outputs, names=None):
+        if names is None:
+            names = default_output_names(len(torch_outputs))
+        self.output_names = names
+
+        for i, torch_output in enumerate(torch_outputs):
+            trt_tensor = torch_output._trt
+            trt_tensor.name = names[i]
+            trt_tensor.location = torch_device_to_trt(torch_output.device)
+            trt_tensor.dtype = torch_dtype_to_trt(torch_output.dtype)
+            self.network.mark_output(trt_tensor)
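Editor's note: with this wrapper in place, engine layers receive descriptive names such as `[CONVOLUTION #2] torch.nn.functional.conv2d(tensor(shape=[1, 3, 224, 224], dtype=torch.float32), ...)` (an illustrative example of the `[type #count] method(args)` format built by `_set_layer_name`), which makes TensorRT builder and profiler output much easier to map back to the originating PyTorch calls.
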
+
+
+class TRTModule(torch.nn.Module):
+    def __init__(self, engine=None, input_names=None, output_names=None):
+        super(TRTModule, self).__init__()
+        self._register_state_dict_hook(TRTModule._on_state_dict)
+        self.engine = engine
+        if self.engine is not None:
+            self.context = self.engine.create_execution_context()
+        self.input_names = input_names
+        self.output_names = output_names
+
+    def _on_state_dict(self, state_dict, prefix, local_metadata):
+        state_dict[prefix + "engine"] = bytearray(self.engine.serialize())
+        state_dict[prefix + "input_names"] = self.input_names
+        state_dict[prefix + "output_names"] = self.output_names
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        engine_bytes = state_dict[prefix + "engine"]
+
+        with trt.Logger() as logger, trt.Runtime(logger) as runtime:
+            self.engine = runtime.deserialize_cuda_engine(engine_bytes)
+            self.context = self.engine.create_execution_context()
+
+        self.input_names = state_dict[prefix + "input_names"]
+        self.output_names = state_dict[prefix + "output_names"]
+
+    def forward(self, *inputs):
+        batch_size = inputs[0].shape[0]
+        bindings = [None] * (len(self.input_names) + len(self.output_names))
+
+        # create output tensors
+        outputs = [None] * len(self.output_names)
+        for i, output_name in enumerate(self.output_names):
+            idx = self.engine.get_binding_index(output_name)
+            dtype = torch_dtype_from_trt(self.engine.get_binding_dtype(idx))
+            shape = (batch_size,) + tuple(self.engine.get_binding_shape(idx))
+            device = torch_device_from_trt(self.engine.get_location(idx))
+            output = torch.empty(size=shape, dtype=dtype, device=device)
+            outputs[i] = output
+            bindings[idx] = output.data_ptr()
+
+        for i, input_name in enumerate(self.input_names):
+            idx = self.engine.get_binding_index(input_name)
+            bindings[idx] = inputs[i].contiguous().data_ptr()
+
+        self.context.execute_async(
+            batch_size, bindings, torch.cuda.current_stream().cuda_stream
+        )
+
+        outputs = tuple(outputs)
+        if len(outputs) == 1:
+            outputs = outputs[0]
+
+        return outputs
+
+    def enable_profiling(self):
+        if not self.context.profiler:
+            self.context.profiler = trt.Profiler()
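Editor's note: before the entry point itself, a reminder of the call pattern it supports. This mirrors the usage documented in the project README; the pretrained weights and input size are incidental:

```python
import torch
from torch2trt import torch2trt
from torchvision.models.alexnet import alexnet

# create a regular pytorch model and an example input on the GPU
model = alexnet(pretrained=True).eval().cuda()
x = torch.ones((1, 3, 224, 224)).cuda()

# convert and run; the returned TRTModule behaves like a torch.nn.Module
model_trt = torch2trt(model, [x])
y_trt = model_trt(x)
```
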
+
+
+def torch2trt(module,
+              inputs,
+              input_names=None,
+              output_names=None,
+              log_level=trt.Logger.ERROR,
+              max_batch_size=1,
+              fp16_mode=False,
+              max_workspace_size=1<<25,
+              strict_type_constraints=False,
+              keep_network=True,
+              int8_mode=False,
+              int8_calib_dataset=None,
+              int8_calib_algorithm=DEFAULT_CALIBRATION_ALGORITHM,
+              int8_calib_batch_size=1,
+              use_onnx=False,
+              **kwargs):
+
+    # capture arguments to provide to context
+    kwargs.update(locals())
+    kwargs.pop('kwargs')
+
+    inputs_in = inputs
+
+    # copy inputs to avoid modifications to source data
+    inputs = [tensor.clone()[0:1] for tensor in inputs]  # only run single entry
+
+    logger = trt.Logger(log_level)
+    builder = trt.Builder(logger)
+
+    if isinstance(inputs, list):
+        inputs = tuple(inputs)
+    if not isinstance(inputs, tuple):
+        inputs = (inputs,)
+
+    # run once to get num outputs
+    outputs = module(*inputs)
+    if not isinstance(outputs, tuple) and not isinstance(outputs, list):
+        outputs = (outputs,)
+
+    if input_names is None:
+        input_names = default_input_names(len(inputs))
+    if output_names is None:
+        output_names = default_output_names(len(outputs))
+
+    if use_onnx:
+
+        f = io.BytesIO()
+        torch.onnx.export(module, inputs, f, input_names=input_names, output_names=output_names)
+        f.seek(0)
+        onnx_bytes = f.read()
+        network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
+        parser = trt.OnnxParser(network, logger)
+        parser.parse(onnx_bytes)
+
+    else:
+        network = builder.create_network()
+        with ConversionContext(network, torch2trt_kwargs=kwargs) as ctx:
+
+            ctx.add_inputs(inputs, input_names)
+
+            outputs = module(*inputs)
+
+            if not isinstance(outputs, tuple) and not isinstance(outputs, list):
+                outputs = (outputs,)
+            ctx.mark_outputs(outputs, output_names)
+
+    builder.max_workspace_size = max_workspace_size
+    builder.fp16_mode = fp16_mode
+    builder.max_batch_size = max_batch_size
+    builder.strict_type_constraints = strict_type_constraints
+
+    if int8_mode:
+
+        # default to use input tensors for calibration
+        if int8_calib_dataset is None:
+            int8_calib_dataset = TensorBatchDataset(inputs_in)
+
+        builder.int8_mode = True
+
+        # @TODO(jwelsh): Should we set batch_size=max_batch_size?  Need to investigate memory consumption
+        builder.int8_calibrator = DatasetCalibrator(
+            inputs, int8_calib_dataset, batch_size=int8_calib_batch_size, algorithm=int8_calib_algorithm
+        )
+
+    engine = builder.build_cuda_engine(network)
+
+    module_trt = TRTModule(engine, input_names, output_names)
+
+    if keep_network:
+        module_trt.network = network
+
+    return module_trt
+
+
+# DEFINE ALL CONVERSION FUNCTIONS
+
+def get_module_qualname(name):
+    s = name.split('.')
+
+    for i in range(len(s)):
+        idx = len(s) - i - 1
+        modulename, qualname = ".".join(s[:idx]), ".".join(s[idx:])
+        try:
+            module = importlib.import_module(modulename)
+            return module, modulename, qualname
+        except Exception:
+            pass
+
+    raise RuntimeError("Could not import module")
+
+
+def tensorrt_converter(method, is_real=True, enabled=True, imports=[]):
+
+    if isinstance(method, str):
+        module, module_name, qual_name = get_module_qualname(method)
+    else:
+        module, module_name, qual_name = importlib.import_module(method.__module__), method.__module__, method.__qualname__
+
+    try:
+        method_impl = eval('copy.deepcopy(module.%s)' % qual_name)
+    except Exception:
+        method_impl = None
+        enabled = False
+
+    def register_converter(converter):
+        CONVERTERS[method] = {
+            "converter": converter,
+            "is_real": is_real,
+            "module": module,
+            "module_name": module_name,
+            "qual_name": qual_name,
+            "method_str": module_name + '.' + qual_name,
+            "method_impl": method_impl
+        }
+        return converter
+
+    def pass_converter(converter):
+        return converter
+
+    if enabled:
+        return register_converter
+    else:
+        return pass_converter
diff --git a/torch2trt/utils.py b/torch2trt/utils.py
new file mode 100644
index 00000000..f5493e00
--- /dev/null
+++ b/torch2trt/utils.py
@@ -0,0 +1,63 @@
+import graphviz
+import tensorrt as trt
+
+
+def trt_network_to_dot_graph(network):
+    dot = graphviz.Digraph(comment="Network")
+
+    # add nodes (layers)
+    for i in range(network.num_layers):
+        layer = network.get_layer(i)
+        dot.node(layer.name)
+
+    # add nodes (inputs)
+    for i in range(network.num_inputs):
+        dot.node(network.get_input(i).name)
+
+    # add nodes (outputs)
+    for i in range(network.num_outputs):
+        dot.node(network.get_output(i).name)
+
+    # add layer->layer edges
+    for a in range(network.num_layers):
+        layer_a = network.get_layer(a)
+
+        for b in range(network.num_layers):
+            layer_b = network.get_layer(b)
+
+            for i in range(layer_a.num_outputs):
+                output_i = layer_a.get_output(i)
+
+                for j in range(layer_b.num_inputs):
+                    input_j = layer_b.get_input(j)
+
+                    if output_i == input_j:
+                        dot.edge(layer_a.name, layer_b.name, label=str(input_j.shape))
+
+    # add input->layer edges
+    for i in range(network.num_inputs):
+        input_i = network.get_input(i)
+
+        for b in range(network.num_layers):
+            layer_b = network.get_layer(b)
+
+            for j in range(layer_b.num_inputs):
+                input_j = layer_b.get_input(j)
+
+                if input_i == input_j:
+                    dot.edge(input_i.name, layer_b.name, label=str(input_j.shape))
+
+    # add layer->output edges
+    for i in range(network.num_outputs):
+        output_i = network.get_output(i)
+
+        for b in range(network.num_layers):
+            layer_b = network.get_layer(b)
+
+            for j in range(layer_b.num_outputs):
+                output_j = layer_b.get_output(j)
+
+                if output_i == output_j:
+                    dot.edge(layer_b.name, output_i.name, label=str(output_j.shape))
+
+    return dot
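Editor's note: a short sketch of how the helper above pairs with `torch2trt(..., keep_network=True)`, which stores the traced `INetworkDefinition` on the returned `TRTModule`. Rendering details are standard graphviz API, not part of this diff:

```python
from torch2trt.utils import trt_network_to_dot_graph

# model_trt is assumed to come from torch2trt(model, [x], keep_network=True)
dot = trt_network_to_dot_graph(model_trt.network)
dot.render('trt_network', format='png')  # writes trt_network.png via graphviz
```
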