
Commit 05b7b9f

Authored by ethansfng (Ethan Ng) and manuelcandales
Add support for strongly typed op_quantized_relu (#13345)
Differential Revision: D80117641

Co-authored-by: Ethan Ng <[email protected]>
Co-authored-by: Manuel Candales <[email protected]>
1 parent: d262061 · commit: 05b7b9f

9 files changed: +314 −17 lines

backends/cadence/aot/functions.yaml

Lines changed: 10 additions & 0 deletions

```diff
@@ -219,6 +219,16 @@
     - arg_meta: null
       kernel_name: impl::reference::quantized_relu_per_tensor_out
 
+- func: cadence::quantized_relu_asym8s_asym8s.per_tensor_out(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::quantized_relu_asym8s_asym8s_per_tensor_out
+
+- func: cadence::quantized_relu_asym8u_asym8u.per_tensor_out(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::quantized_relu_asym8u_asym8u_per_tensor_out
+
 - func: cadence::quantized_matmul.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null
```
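Both typed overloads keep the schema of the generic `quantized_relu.per_tensor_out`: an input tensor plus four scalars (`X_zero_point`, `out_zero_point`, `out_multiplier`, `out_shift`). For orientation, here is a hand-written Python sketch of what a per-tensor quantized ReLU with these parameters computes; the exact fixed-point rounding and shift convention of the Cadence kernels is an assumption (it is not visible in this diff), so treat the requantization step as illustrative only.

```python
import torch

def quantized_relu_per_tensor_sketch(
    x: torch.Tensor,      # int8 or uint8 input
    in_zero_point: int,
    out_zero_point: int,
    out_multiplier: int,  # assumed Q31 fixed-point multiplier
    out_shift: int,       # assumed left-shift amount
) -> torch.Tensor:
    # ReLU in the quantized domain: everything at or below the input
    # zero point maps to zero; keep only the positive part.
    acc = (x.to(torch.int64) - in_zero_point).clamp(min=0)
    # Requantize with the fixed-point multiplier; the Q31 encoding and
    # shift direction here are assumptions, not taken from the diff.
    y = (acc * out_multiplier) >> (31 - out_shift)
    # Re-center on the output zero point and saturate to the dtype range.
    info = torch.iinfo(x.dtype)
    return (y + out_zero_point).clamp(info.min, info.max).to(x.dtype)
```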

backends/cadence/aot/functions_hifi.yaml

Lines changed: 10 additions & 0 deletions

```diff
@@ -339,6 +339,16 @@
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::quantized_relu_per_tensor_out
 
+- func: cadence::quantized_relu_asym8s_asym8s.per_tensor_out(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_relu_asym8s_asym8s_per_tensor_out
+
+- func: cadence::quantized_relu_asym8u_asym8u.per_tensor_out(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_relu_asym8u_asym8u_per_tensor_out
+
 - func: cadence::quantized_matmul.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null
```

backends/cadence/aot/ops_registrations.py

Lines changed: 36 additions & 0 deletions

```diff
@@ -232,6 +232,20 @@
     "quantized_relu.per_tensor_out(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, "
     "int out_shift, *, Tensor(a!) out) -> Tensor(a!)"
 )
+lib.define(
+    "quantized_relu_asym8s_asym8s.per_tensor(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, int out_shift) -> Tensor"
+)
+lib.define(
+    "quantized_relu_asym8s_asym8s.per_tensor_out(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, "
+    "int out_shift, *, Tensor(a!) out) -> Tensor(a!)"
+)
+lib.define(
+    "quantized_relu_asym8u_asym8u.per_tensor(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, int out_shift) -> Tensor"
+)
+lib.define(
+    "quantized_relu_asym8u_asym8u.per_tensor_out(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, "
+    "int out_shift, *, Tensor(a!) out) -> Tensor(a!)"
+)
 lib.define(
     "quantized_add.out(Tensor X, Tensor X_scale, Tensor X_zero_point, Tensor Y, Tensor Y_scale, "
     "Tensor Y_zero_point, float out_scale, int out_zero_point, *, Tensor(a!) out) -> Tensor(a!)"
@@ -770,6 +784,28 @@ def quantized_relu_per_tensor_meta(
     return input.new_empty(input.size(), dtype=input.dtype)
 
 
+@register_fake("cadence::quantized_relu_asym8s_asym8s.per_tensor")
+def quantized_relu_asym8s_asym8s_per_tensor_meta(
+    input: torch.Tensor,
+    in_zero_point: int,
+    out_zero_point: int,
+    out_multiplier: int,
+    out_shift: int,
+) -> torch.Tensor:
+    return input.new_empty(input.size(), dtype=input.dtype)
+
+
+@register_fake("cadence::quantized_relu_asym8u_asym8u.per_tensor")
+def quantized_relu_asym8u_asym8u_per_tensor_meta(
+    input: torch.Tensor,
+    in_zero_point: int,
+    out_zero_point: int,
+    out_multiplier: int,
+    out_shift: int,
+) -> torch.Tensor:
+    return input.new_empty(input.size(), dtype=input.dtype)
+
+
 @register_fake("cadence::fully_connected")
 def fully_connected_meta(
     src: torch.Tensor,
```
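The `@register_fake` meta kernels give export-time tracing shape and dtype information without running a real kernel: since ReLU is shape-preserving, each fake simply returns an empty tensor matching the input. A minimal, self-contained sketch of the same pattern, using a hypothetical `my_ops` namespace and op name that are not part of this diff:

```python
import torch
from torch.library import Library, register_fake

# Hypothetical library, defined only to illustrate the pattern above.
lib = Library("my_ops", "DEF")
lib.define("relu_like(Tensor x, int zero_point) -> Tensor")

@register_fake("my_ops::relu_like")
def relu_like_meta(x: torch.Tensor, zero_point: int) -> torch.Tensor:
    # Shape/dtype propagation only; no real computation happens here.
    return x.new_empty(x.size(), dtype=x.dtype)

# Under FakeTensorMode (as used during export), calling the op runs the
# fake kernel and yields a correctly shaped fake tensor.
from torch._subclasses.fake_tensor import FakeTensorMode
with FakeTensorMode():
    fx = torch.empty(2, 3, dtype=torch.int8)
    out = torch.ops.my_ops.relu_like(fx, 0)
    print(out.shape, out.dtype)  # torch.Size([2, 3]) torch.int8
```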

backends/cadence/aot/tests/test_type_dispatch_passes.py

Lines changed: 48 additions & 0 deletions

```diff
@@ -137,3 +137,51 @@ def test_mixed_types_error(self) -> None:
         with self.assertRaises(RuntimeError) as context:
             cast(PassResult, p(gm)).graph_module
         self.assertIn("Unsupported input types", str(context.exception))
+
+    def test_int8_dispatch_quantized_relu(self) -> None:
+        """Test int8 input should dispatch to asym8s_asym8s variant for quantized_relu"""
+        x = torch.randint(-128, 127, (2, 3), dtype=torch.int8)
+        gm = single_op_builder(
+            placeholders=(x,),
+            op=exir_ops.edge.cadence.quantized_relu.per_tensor,
+            args=(x, 0, 0, 1, 0),
+        )
+        p = CompileTimeTypeDispatchPass()
+        gm = cast(PassResult, p(gm)).graph_module
+        # Original op should be replaced
+        self.assertEqual(
+            count_node(gm, exir_ops.edge.cadence.quantized_relu.per_tensor),
+            0,
+        )
+        # Should be replaced with int8 specific variant
+        self.assertEqual(
+            count_node(
+                gm,
+                exir_ops.edge.cadence.quantized_relu_asym8s_asym8s.per_tensor,
+            ),
+            1,
+        )
+
+    def test_uint8_dispatch_quantized_relu(self) -> None:
+        """Test uint8 input should dispatch to asym8u_asym8u variant for quantized_relu"""
+        x = torch.randint(0, 255, (2, 3), dtype=torch.uint8)
+        gm = single_op_builder(
+            placeholders=(x,),
+            op=exir_ops.edge.cadence.quantized_relu.per_tensor,
+            args=(x, 0, 0, 1, 0),
+        )
+        p = CompileTimeTypeDispatchPass()
+        gm = cast(PassResult, p(gm)).graph_module
+        # Original op should be replaced
+        self.assertEqual(
+            count_node(gm, exir_ops.edge.cadence.quantized_relu.per_tensor),
+            0,
+        )
+        # Should be replaced with uint8 specific variant
+        self.assertEqual(
+            count_node(
+                gm,
+                exir_ops.edge.cadence.quantized_relu_asym8u_asym8u.per_tensor,
+            ),
+            1,
+        )
```

backends/cadence/aot/type_dispatch.py

Lines changed: 40 additions & 17 deletions

```diff
@@ -23,40 +23,63 @@ class CompileTimeTypeDispatchPass(ExportPass):
     Replaces generic ops with ops that have explicit types.
     """
 
-    _TYPE_DISPATCH_MAP: dict[tuple[torch.dtype, torch.dtype], str] = {
+    _BINARY_TYPE_DISPATCH_MAP: dict[tuple[torch.dtype, torch.dtype], str] = {
         (torch.int8, torch.int8): "asym8sxasym8s_asym8s",
         (torch.uint8, torch.uint8): "asym8uxasym8u_asym8u",
     }
 
-    _SUPPORTED_OPS: dict[OpOverload, str] = {
+    _UNARY_TYPE_DISPATCH_MAP: dict[torch.dtype, str] = {
+        torch.int8: "asym8s_asym8s",
+        torch.uint8: "asym8u_asym8u",
+    }
+
+    _BINARY_SUPPORTED_OPS: dict[OpOverload, str] = {
         exir_ops.edge.cadence.quantized_fully_connected.per_tensor: "quantized_fully_connected",
         exir_ops.edge.cadence.quantized_linear.per_tensor: "quantized_linear",
     }
 
+    _SUPPORTED_UNARY_OPS: dict[OpOverload, str] = {
+        exir_ops.edge.cadence.quantized_relu.per_tensor: "quantized_relu",
+    }
+
     def call_operator(
         self,
         op: OpOverload,
         args: tuple[Argument, ...],
         kwargs: dict[str, Argument],
         meta: NodeMetadata,
     ) -> ProxyValue:
-        if op not in self._SUPPORTED_OPS:
-            return super().call_operator(op, args, kwargs, meta)
+        if op in self._BINARY_SUPPORTED_OPS:
+            # pyre-ignore[16]: None has no attribute `to_tensor`.
+            input_dtype = args[0].to_tensor().dtype
+            weight_dtype = args[1].to_tensor().dtype
+            dtype_pair = (input_dtype, weight_dtype)
+
+            if dtype_pair not in self._BINARY_TYPE_DISPATCH_MAP:
+                raise RuntimeError(
+                    f"Unsupported input types for {op}: {input_dtype} and {weight_dtype}"
+                )
+
+            base_op_name = self._BINARY_SUPPORTED_OPS[op]
+            type_suffix = self._BINARY_TYPE_DISPATCH_MAP[dtype_pair]
+
+            typed_op_name = f"{base_op_name}_{type_suffix}"
+            typed_op = getattr(exir_ops.edge.cadence, typed_op_name).per_tensor
+
+            return super().call_operator(typed_op, args, kwargs, meta)
+
+        elif op in self._SUPPORTED_UNARY_OPS:
+            input_dtype = args[0].to_tensor().dtype
 
-        # pyre-ignore[16]: None has no attribute `to_tensor`.
-        input_dtype = args[0].to_tensor().dtype
-        weight_dtype = args[1].to_tensor().dtype
-        dtype_pair = (input_dtype, weight_dtype)
+            if input_dtype not in self._UNARY_TYPE_DISPATCH_MAP:
+                raise RuntimeError(f"Unsupported input type for {op}: {input_dtype}")
 
-        if dtype_pair not in self._TYPE_DISPATCH_MAP:
-            raise RuntimeError(
-                f"Unsupported input types for {op}: {input_dtype} and {weight_dtype}"
-            )
+            base_op_name = self._SUPPORTED_UNARY_OPS[op]
+            type_suffix = self._UNARY_TYPE_DISPATCH_MAP[input_dtype]
 
-        base_op_name = self._SUPPORTED_OPS[op]
-        type_suffix = self._TYPE_DISPATCH_MAP[dtype_pair]
+            typed_op_name = f"{base_op_name}_{type_suffix}"
+            typed_op = getattr(exir_ops.edge.cadence, typed_op_name).per_tensor
 
-        typed_op_name = f"{base_op_name}_{type_suffix}"
-        typed_op = getattr(exir_ops.edge.cadence, typed_op_name).per_tensor
+            return super().call_operator(typed_op, args, kwargs, meta)
 
-        return super().call_operator(typed_op, args, kwargs, meta)
+        return super().call_operator(op, args, kwargs, meta)
```
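Stripped of the `ExportPass` plumbing, the new unary dispatch path reduces to a dtype-keyed name lookup. A self-contained sketch (plain strings in place of `OpOverload`s, so it runs without ExecuTorch installed):

```python
import torch

# Mirrors _UNARY_TYPE_DISPATCH_MAP in the pass above.
_UNARY_TYPE_DISPATCH_MAP = {
    torch.int8: "asym8s_asym8s",
    torch.uint8: "asym8u_asym8u",
}

def dispatch_unary(base_op_name: str, input_dtype: torch.dtype) -> str:
    """Map a generic op name plus an input dtype to its typed variant name."""
    if input_dtype not in _UNARY_TYPE_DISPATCH_MAP:
        raise RuntimeError(f"Unsupported input type for {base_op_name}: {input_dtype}")
    return f"{base_op_name}_{_UNARY_TYPE_DISPATCH_MAP[input_dtype]}"

assert dispatch_unary("quantized_relu", torch.int8) == "quantized_relu_asym8s_asym8s"
assert dispatch_unary("quantized_relu", torch.uint8) == "quantized_relu_asym8u_asym8u"
```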
New file: HiFi kernel `quantized_relu_asym8s_asym8s_per_tensor_out` (file path header not shown on this page)

Lines changed: 52 additions & 0 deletions

```cpp
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/cadence/hifi/kernels/kernels.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <xa_nnlib_kernels_api.h>

namespace cadence {
namespace impl {
namespace HiFi {
namespace native {

using ::executorch::aten::Tensor;
using ::executorch::runtime::KernelRuntimeContext;

void quantized_relu_asym8s_asym8s_per_tensor_out(
    KernelRuntimeContext& ctx,
    const Tensor& input,
    const int64_t in_zero_point,
    const int64_t out_zero_point,
    const int64_t out_multiplier,
    const int64_t out_shift,
    Tensor& output) {
  const int8_t* __restrict__ input_data = input.const_data_ptr<int8_t>();
  int8_t* __restrict__ output_data = output.mutable_data_ptr<int8_t>();

  const int32_t out_multiplier_int32 = static_cast<int32_t>(out_multiplier);
  const int32_t out_shift_int32 = static_cast<int32_t>(out_shift);

  const int32_t ret = xa_nn_vec_relu_asym8s_asym8s(
      output_data,
      input_data,
      in_zero_point,
      out_multiplier_int32,
      out_shift_int32,
      out_zero_point,
      -128,
      127,
      input.numel());
  ET_DCHECK_MSG(
      ret == 0, "HiFi quantized_relu_asym8s_asym8s_per_tensor failed");
}

} // namespace native
} // namespace HiFi
} // namespace impl
} // namespace cadence
```
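The uint8 kernel below is identical apart from the element type and the saturation bounds passed to the NNLib call, which move from the int8 range [-128, 127] to the uint8 range [0, 255]. A quick check that these hard-coded bounds match the dtype limits:

```python
import torch

# The clamp bounds hard-coded in the two kernels are just the dtype ranges.
for dtype in (torch.int8, torch.uint8):
    info = torch.iinfo(dtype)
    print(f"{dtype}: [{info.min}, {info.max}]")
# torch.int8: [-128, 127]
# torch.uint8: [0, 255]
```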
New file: HiFi kernel `quantized_relu_asym8u_asym8u_per_tensor_out` (file path header not shown on this page)

Lines changed: 52 additions & 0 deletions

```cpp
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/cadence/hifi/kernels/kernels.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <xa_nnlib_kernels_api.h>

namespace cadence {
namespace impl {
namespace HiFi {
namespace native {

using ::executorch::aten::Tensor;
using ::executorch::runtime::KernelRuntimeContext;

void quantized_relu_asym8u_asym8u_per_tensor_out(
    KernelRuntimeContext& ctx,
    const Tensor& input,
    const int64_t in_zero_point,
    const int64_t out_zero_point,
    const int64_t out_multiplier,
    const int64_t out_shift,
    Tensor& output) {
  const uint8_t* __restrict__ input_data = input.const_data_ptr<uint8_t>();
  uint8_t* __restrict__ output_data = output.mutable_data_ptr<uint8_t>();

  const int32_t out_multiplier_int32 = static_cast<int32_t>(out_multiplier);
  const int32_t out_shift_int32 = static_cast<int32_t>(out_shift);

  const int32_t ret = xa_nn_vec_relu_asym8u_asym8u(
      output_data,
      input_data,
      in_zero_point,
      out_multiplier_int32,
      out_shift_int32,
      out_zero_point,
      0,
      255,
      input.numel());
  ET_DCHECK_MSG(
      ret == 0, "HiFi quantized_relu_asym8u_asym8u_per_tensor failed");
}

} // namespace native
} // namespace HiFi
} // namespace impl
} // namespace cadence
```

backends/cadence/hifi/operators/targets.bzl

Lines changed: 2 additions & 0 deletions

```diff
@@ -73,6 +73,8 @@ OPERATORS = [
     "quantized_linear_asym8uxasym8u_asym8u_per_tensor_out",
     "quantized_matmul_out",
     "quantized_relu_out",
+    "quantized_relu_asym8s_asym8s_per_tensor_out",
+    "quantized_relu_asym8u_asym8u_per_tensor_out",
     "quantize_per_tensor",
     "remainder",
     "rsqrt",
```
