Skip to content

Commit abaaf96

Browse files
committed
feat: TorchTRT Annotation Layer for Cuda generated kernels
1 parent 2361ec5 commit abaaf96

16 files changed

Lines changed: 2335 additions & 5 deletions

File tree

.github/workflows/build-test-linux-x86_64.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -459,6 +459,12 @@ jobs:
459459
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml automatic_plugin/test_automatic_plugin_with_attrs.py
460460
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml automatic_plugin/test_flashinfer_rmsnorm.py
461461
popd
462+
pushd .
463+
# cuda-python is an optional runtime dep for the torch_tensorrt.annotation QDP layer.
464+
python -m pip install cuda-python
465+
cd tests/py/annotation
466+
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_annotation_test_results.xml .
467+
popd
462468
463469
L2-torchscript-tests:
464470
name: ${{ matrix.display-name }}
Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
"""
2+
.. _cuda_python_aot_annotation:
3+
4+
Non-Pointwise Custom Plugin via torch_tensorrt.annotation
5+
==========================================================
6+
7+
This example demonstrates a shape-changing (non-pointwise) CUDA kernel that
8+
duplicates each input element into two output elements:
9+
y[2*i] = x[i], y[2*i + 1] = x[i]
10+
"""
11+
12+
import sys
13+
14+
import torch
15+
import torch_tensorrt
16+
17+
if not torch_tensorrt.ENABLED_FEATURES.qdp_plugin:
18+
print(
19+
"[cuda_python_aot_annotation] Skipping example: "
20+
"torch_tensorrt.annotation requires TensorRT QDP plugin support."
21+
)
22+
sys.exit(0)
23+
24+
try:
25+
import tensorrt.plugin as trtp
26+
except ImportError:
27+
print("[cuda_python_aot_annotation] Skipping example: tensorrt.plugin unavailable.")
28+
sys.exit(0)
29+
30+
try:
31+
from cuda.core import Device as _Device
32+
from cuda.core import LaunchConfig as _LaunchConfig
33+
from cuda.core import Program as _Program
34+
from cuda.core import ProgramOptions as _ProgramOptions
35+
from cuda.core import launch as _cuda_launch
36+
except ImportError:
37+
try:
38+
from cuda.core.experimental import Device as _Device
39+
from cuda.core.experimental import LaunchConfig as _LaunchConfig
40+
from cuda.core.experimental import Program as _Program
41+
from cuda.core.experimental import ProgramOptions as _ProgramOptions
42+
from cuda.core.experimental import launch as _cuda_launch
43+
except ImportError:
44+
print(
45+
"[cuda_python_aot_annotation] Skipping example: cuda-python is not "
46+
"installed. Install with `pip install cuda-python` to run this example."
47+
)
48+
sys.exit(0)
49+
50+
import torch_tensorrt.annotation as tta
51+
52+
53+
CU_REPEAT2 = """
54+
extern "C" __global__ void repeat2_kernel(
55+
const float* __restrict__ x, const int n, float* __restrict__ y) {
56+
const int i = blockIdx.x * blockDim.x + threadIdx.x;
57+
if (i < n) {
58+
const float v = x[i];
59+
y[2 * i] = v;
60+
y[2 * i + 1] = v;
61+
}
62+
}
63+
"""
64+
65+
_device = _Device()
66+
_device.set_current()
67+
_opts = _ProgramOptions(
68+
std="c++17", arch=f"sm_{_device.arch}", include_path=["/usr/local/cuda/include"]
69+
)
70+
_program = _Program(CU_REPEAT2, code_type="c++", options=_opts)
71+
_module = _program.compile("ptx", name_expressions=("repeat2_kernel",))
72+
_kernel = _module.get_kernel("repeat2_kernel")
73+
74+
75+
class _PTStream:
76+
def __cuda_stream__(self):
77+
return (0, torch.cuda.current_stream().cuda_stream)
78+
79+
80+
def _eager_repeat2(x: torch.Tensor) -> torch.Tensor:
81+
if x.dtype != torch.float32:
82+
raise ValueError("This example expects float32 input")
83+
flat = x.contiguous().view(-1)
84+
n = int(flat.numel())
85+
y = torch.empty((n * 2,), device=x.device, dtype=x.dtype)
86+
block = 256
87+
grid = max(1, (n + block - 1) // block)
88+
stream = _device.create_stream(_PTStream())
89+
_cuda_launch(
90+
stream,
91+
_LaunchConfig(grid=(grid,), block=(block,)),
92+
_kernel,
93+
flat.data_ptr(),
94+
n,
95+
y.data_ptr(),
96+
)
97+
return y
98+
99+
100+
def _aot_repeat2(inputs, outputs, tactic):
101+
n = inputs[0].shape_expr.numel()
102+
params = trtp.KernelLaunchParams()
103+
params.grid_x = trtp.cdiv(n, 256)
104+
params.block_x = 256
105+
params.shared_mem = 0
106+
extra = trtp.SymIntExprs(1)
107+
extra[0] = trtp.SymInt32(n)
108+
return params, extra
109+
110+
111+
@tta.cuda_plugin(
112+
op_name="ann_ex::repeat2",
113+
kernel_source=CU_REPEAT2,
114+
kernel_name="repeat2_kernel",
115+
eager_fn=_eager_repeat2,
116+
aot_fn=_aot_repeat2,
117+
supports_dynamic_shapes=True,
118+
)
119+
def _repeat2_meta(x: torch.Tensor) -> torch.Tensor:
120+
return torch.empty((x.numel() * 2,), device=x.device, dtype=x.dtype)
121+
122+
123+
class Repeat2Model(torch.nn.Module):
124+
def forward(self, x: torch.Tensor) -> torch.Tensor:
125+
return torch.ops.ann_ex.repeat2(x)
126+
127+
128+
if __name__ == "__main__":
129+
x = torch.randn(1024, device="cuda", dtype=torch.float32)
130+
ref = torch.repeat_interleave(x, 2, dim=0)
131+
132+
model = Repeat2Model().cuda().eval()
133+
eager_out = model(x)
134+
print("Eager result matches repeat_interleave:", torch.allclose(eager_out, ref, atol=1e-4))
135+
136+
print("Compiling with Torch-TensorRT...")
137+
with torch_tensorrt.logging.debug():
138+
trt_model = torch_tensorrt.compile(
139+
model,
140+
inputs=[x],
141+
enabled_precisions={torch.float32},
142+
min_block_size=1,
143+
)
144+
145+
with torch.no_grad():
146+
for _ in range(5):
147+
out = trt_model(x)
148+
assert torch.allclose(out, ref, atol=1e-2, rtol=1e-2), "Mismatch!"
149+
150+
print("TRT inference successful - results match repeat_interleave")

py/torch_tensorrt/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,9 @@ def _register_with_torch() -> None:
9999
from torch_tensorrt.dynamo import backend # noqa: F401
100100
from torch_tensorrt import dynamo # noqa: F401
101101

102+
if ENABLED_FEATURES.qdp_plugin:
103+
from torch_tensorrt import annotation # noqa: F401
104+
102105
from torch_tensorrt._compile import * # noqa: F403
103106
from torch_tensorrt.dynamo.runtime._MutableTorchTensorRTModule import (
104107
MutableTorchTensorRTModule,

py/torch_tensorrt/_features.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
linked_file_full_path = os.path.join(trtorch_dir, linked_file)
4444
linked_file_runtime_full_path = os.path.join(trtorch_dir, linked_file_runtime)
4545

46-
_TENSORRT_RTX = tensorrt._package_name == "tensorrt_rtx"
46+
_TENSORRT_RTX = getattr(tensorrt, "_package_name", "") == "tensorrt_rtx"
4747
_TS_FE_AVAIL = os.path.isfile(linked_file_full_path)
4848
_TORCHTRT_RT_AVAIL = _TS_FE_AVAIL or os.path.isfile(linked_file_runtime_full_path)
4949
_DYNAMO_FE_AVAIL = version.parse(sanitized_torch_version()) >= version.parse("2.1.dev")
@@ -57,10 +57,8 @@
5757
elif importlib.util.find_spec("tensorrt.plugin") and importlib.util.find_spec(
5858
"tensorrt.plugin._lib"
5959
):
60-
# there is a bug in tensorrt 10.14.* and 10.15.* that causes the plugin to not work, disable it for now
61-
if tensorrt.__version__.startswith("10.15.") or tensorrt.__version__.startswith(
62-
"10.14."
63-
):
60+
# TensorRT 10.14.* has a known bug that breaks QDP plugins; 10.15 and later work.
 61+
        if tensorrt.__version__.startswith("10.14."):
6462
_QDP_PLUGIN_AVAIL = False
6563
else:
6664
_QDP_PLUGIN_AVAIL = True
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
"""
2+
torch_tensorrt.annotation (experimental)
3+
==========================================
4+
High-level decorators for registering custom CUDA C++ kernels — compiled at
5+
runtime with NVRTC via **cuda-python** — as TensorRT Quick Deployable Plugins
6+
(QDP) with full AOT support.
7+
8+
Two registration paths are offered:
9+
10+
``cuda_plugin``
11+
One-shot decorator that combines ``cuda_python`` + ``custom_plugin`` for
12+
lower boilerplate in common cases.
13+
14+
``custom_plugin``
15+
Full auto-registration. Provide an eager (CUDA) implementation and a
16+
meta/fake implementation; the framework registers both the PyTorch custom
17+
op **and** the TensorRT plugin.
18+
19+
``register_custom_plugin``
20+
TRT-only registration. Use when ``@torch.library.custom_op`` has already
21+
been called. Only the TRT plugin descriptor, AOT implementation, and
22+
Torch-TensorRT converter are added.
23+
24+
``pointwise_aot``
25+
Helper that creates a standard AOT launch-config function for pointwise
26+
kernels using 1D launch geometry.
27+
28+
``pointwise_eager``
29+
Helper that builds a unary pointwise eager CUDA implementation from kernel
30+
source, reducing cuda-python compile/launch boilerplate in examples.
31+
32+
``kernel_template_aot`` / ``kernel_template_eager``
33+
Generic templates for non-pointwise and multi-dimensional kernels where
34+
users provide output allocation and launch/argument mapping callbacks.
35+
36+
Minimal example (full auto-registration)::
37+
38+
import torch, torch_tensorrt
39+
import torch_tensorrt.annotation as tta
40+
import tensorrt.plugin as trtp
41+
42+
cu_code = \"\"\"
43+
extern "C" __global__ void pointwise_relu(const float* x, int n, float* y) {
44+
int i = blockIdx.x * blockDim.x + threadIdx.x;
45+
if (i < n) y[i] = x[i] > 0.f ? x[i] : 0.f;
46+
}
47+
\"\"\"
48+
49+
def _eager_relu(x: torch.Tensor) -> torch.Tensor:
50+
from cuda.core import Device, LaunchConfig, launch as cuda_launch
51+
y = torch.empty_like(x)
52+
n = x.numel()
53+
block = 256
54+
cfg = LaunchConfig(grid=(max(1, (n + block - 1) // block),), block=(block,))
55+
dev = Device(); dev.set_current()
56+
# wrap current PyTorch stream so the kernel stays on the same stream
57+
class _Stream:
58+
def __cuda_stream__(self): return (0, torch.cuda.current_stream().cuda_stream)
59+
s = dev.create_stream(_Stream())
60+
cuda_launch(s, cfg, _kernel_obj, x.data_ptr(), n, y.data_ptr())
61+
return y
62+
63+
def _aot_relu(inputs, outputs, tactic):
64+
N = inputs[0].shape_expr.numel()
65+
p = trtp.KernelLaunchParams()
66+
p.grid_x = trtp.cdiv(N, 256)
67+
p.block_x = 256
68+
p.shared_mem = 0
69+
extra = trtp.SymIntExprs(1)
70+
extra[0] = trtp.SymInt32(N)
71+
return p, extra
72+
73+
spec = tta.cuda_python(cu_code, "pointwise_relu", aot_fn=_aot_relu, eager_fn=_eager_relu)
74+
75+
@tta.custom_plugin("myns::relu", spec, supports_dynamic_shapes=True)
76+
def _(x: torch.Tensor) -> torch.Tensor:
77+
return torch.empty_like(x)
78+
79+
# Use in a model
80+
class M(torch.nn.Module):
81+
def forward(self, x): return torch.ops.myns.relu(x)
82+
83+
model_trt = torch_tensorrt.compile(M().cuda().eval(), inputs=[torch.randn(1024, device="cuda")])
84+
"""
85+
86+
from torch_tensorrt.annotation._specs import CudaPythonSpec
87+
from torch_tensorrt.annotation._custom_plugin import (
88+
cuda_plugin,
89+
cuda_python,
90+
custom_plugin,
91+
pointwise_aot,
92+
pointwise_eager,
93+
register_custom_plugin,
94+
)
95+
from torch_tensorrt.annotation._kernel_spec import (
96+
Custom,
97+
DimSize,
98+
Elementwise,
99+
InputDecl,
100+
KernelSpec,
101+
Numel,
102+
OutputDecl,
103+
ReduceDims,
104+
Reduction,
105+
SameAs,
106+
)
107+
from torch_tensorrt.annotation._kernel_plugin import kernel_plugin
108+
109+
__all__ = [
110+
"CudaPythonSpec",
111+
"Custom",
112+
"DimSize",
113+
"Elementwise",
114+
"InputDecl",
115+
"KernelSpec",
116+
"Numel",
117+
"OutputDecl",
118+
"ReduceDims",
119+
"Reduction",
120+
"SameAs",
121+
"cuda_plugin",
122+
"cuda_python",
123+
"custom_plugin",
124+
"kernel_plugin",
125+
"pointwise_aot",
126+
"pointwise_eager",
127+
"register_custom_plugin",
128+
]

0 commit comments

Comments
 (0)