
Commit 38a5ce1

sidart committed
Initial draft CMSIS-NN integration (WIP)
1 parent f6cc262 commit 38a5ce1

File tree

backends/cortex_m/CMakeLists.txt
backends/cortex_m/ops/op_add.cpp
backends/cortex_m/ops/op_aten_add_tensor.cpp
backends/cortex_m/ops/operators.py
backends/cortex_m/ops/operators.yaml
backends/cortex_m/passes/replace_quant_nodes_pass.py

6 files changed: +127 −6 lines changed

backends/cortex_m/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
@@ -25,9 +25,10 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake)
 
 # Cortex-M ops kernel sources
 set(_cortex_m_kernels__srcs
+  ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_add.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_aten_add_tensor.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantize_per_tensor.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_dequantize_per_tensor.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_add.cpp
 )
 
 # Generate C++ bindings to register kernels into Executorch (for runtime).

backends/cortex_m/ops/op_add.cpp

Lines changed: 6 additions & 2 deletions
@@ -1,5 +1,6 @@
 #include <executorch/runtime/kernel/kernel_includes.h>
-#include <cinttypes>
+#include <iostream>
+
 namespace cortex_m {
 namespace native {
 
@@ -13,7 +14,9 @@ Tensor& add_out(
     const Tensor& input2,
     const ScalarType dtype,
     Tensor& out) {
-
+  std::cout << "add_out kernel called" << std::endl;
+  ET_LOG(Info, "xxxxxxxxxx add_out kernel called");
+
   // Ensure input is char type
   ET_CHECK_MSG(
       input1.scalar_type() == ScalarType::Char,
@@ -37,6 +40,7 @@ Tensor& add_out(
       "dtype %" PRId8 " is not int8 (Char)",
       static_cast<int8_t>(dtype));
 
+  assert(false);
 
   return out;
 }
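This kernel is still scaffolding: it logs, validates that everything is int8, then trips assert(false) before doing any arithmetic. A real CMSIS-NN elementwise add (e.g. arm_elementwise_add_s8, which this commit does not call yet) must match dequantize-add-requantize semantics, rescaling each operand rather than summing raw int8 values. A minimal Python sketch of that reference math; the scales and zero points (s1, z1, etc.) are hypothetical values, not from this commit:

import torch

# Hypothetical quantization parameters (not from this commit).
s1, z1 = 0.05, 0   # input1 scale / zero point
s2, z2 = 0.10, 0   # input2 scale / zero point
so, zo = 0.12, 0   # output scale / zero point

q1 = torch.tensor([10, 20, 30], dtype=torch.int8)
q2 = torch.tensor([5, 5, 5], dtype=torch.int8)

# Reference semantics: dequantize, add in float, requantize with clamping.
fp = (q1.int() - z1) * s1 + (q2.int() - z2) * s2
qo = torch.clamp(torch.round(fp / so) + zo, -128, 127).to(torch.int8)
print(qo)  # tensor([ 8, 12, 17], dtype=torch.int8)

CMSIS-NN performs the same rescaling with fixed-point multipliers and shifts instead of floats, which is why its add kernel takes per-operand offset/multiplier/shift arguments.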
backends/cortex_m/ops/op_aten_add_tensor.cpp

Lines changed: 49 additions & 0 deletions

@@ -0,0 +1,49 @@
+#include <executorch/runtime/kernel/kernel_includes.h>
+#include <iostream>
+
+namespace cortex_m {
+namespace native {
+
+using Tensor = executorch::aten::Tensor;
+using ScalarType = executorch::aten::ScalarType;
+using KernelRuntimeContext = torch::executor::KernelRuntimeContext;
+
+Tensor& aten_add_tensor(
+    KernelRuntimeContext& ctx,
+    const Tensor& self,
+    const Tensor& other,
+    const ScalarType dtype,
+    Tensor& out) {
+  ET_LOG(Info, "xxxxxxxxxx aten_add_tensor kernel called");
+
+  // Ensure input is char type
+  ET_CHECK_MSG(
+      self.scalar_type() == ScalarType::Char,
+      "self.scalar_type() %" PRId8 " is not char type",
+      static_cast<int8_t>(self.scalar_type()));
+
+  ET_CHECK_MSG(
+      other.scalar_type() == ScalarType::Char,
+      "other.scalar_type() %" PRId8 " is not char type",
+      static_cast<int8_t>(other.scalar_type()));
+
+  // Check dtype is int8 (Char)
+  ET_CHECK_MSG(
+      dtype == ScalarType::Char,
+      "dtype %" PRId8 " is not int8 (Char)",
+      static_cast<int8_t>(dtype));
+
+  // Example: element-wise add self and other into out
+  // (Assuming Tensor has data() and size() methods)
+  const int8_t* self_data = self.const_data_ptr<int8_t>();
+  const int8_t* other_data = other.const_data_ptr<int8_t>();
+  int8_t* out_data = out.mutable_data_ptr<int8_t>();
+  size_t numel = self.numel(); // or self.size() if that's the API
+  for (size_t i = 0; i < numel; ++i) {
+    out_data[i] = self_data[i] + other_data[i];
+  }
+  return out;
+}
+
+} // namespace native
+} // namespace cortex_m
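Note that the loop above adds raw int8 values, so it ignores quantization parameters entirely and wraps silently on overflow. A quick demonstration of the wraparound in plain PyTorch:

import torch

a = torch.tensor([100], dtype=torch.int8)
b = torch.tensor([100], dtype=torch.int8)
# 100 + 100 = 200 does not fit in int8 and wraps modulo 256.
print(a + b)  # tensor([-56], dtype=torch.int8)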

backends/cortex_m/ops/operators.py

Lines changed: 55 additions & 2 deletions
@@ -13,6 +13,61 @@
 # New operator library with a custom namespace to allow fusion etc.
 lib = Library("cortex_m", "DEF")
 
+###
+# add.Tensor
+###
+
+lib.define(
+    "add.Tensor(Tensor self, Tensor other, ScalarType dtype) -> (Tensor Z)"
+)
+
+lib.define(
+    "add_Tensor.out(Tensor self, Tensor other, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)"
+)
+
+@impl(lib, "add.Tensor", "CompositeExplicitAutograd")
+def aten_add_tensor_impl(
+    input1: torch.Tensor,
+    input2: torch.Tensor,
+    dtype: torch.dtype,
+    out: torch.Tensor,
+) -> torch.Tensor:
+    """
+    The implementation of aten add.Tensor.
+    """
+    return exir_ops.edge.aten.add.Tensor(input1, input2, dtype)
+
+###
+# add.out
+###
+
+lib.define(
+    "add(Tensor input1, Tensor input2, ScalarType dtype) -> (Tensor Z)"
+)
+
+lib.define(
+    "add.out(Tensor input1, Tensor input2, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)"
+)
+
+@impl(lib, "add.out", "CompositeExplicitAutograd")
+def add_out_impl(
+    input1: torch.Tensor,
+    input2: torch.Tensor,
+    dtype: torch.dtype,
+    out: torch.Tensor,
+) -> torch.Tensor:
+    """
+    The implementation of cmsis-nn add.out.
+    """
+    if input1.dtype == torch.qint8:
+        return exir_ops.edge.quantized_decomposed.add.default(
+            input1, input2, dtype, dtype
+        )
+    else:
+        return exir_ops.edge.aten.add.default(
+            input1, input2, dtype, dtype
+        )
+
 ###
 # dequantize_per_tensor
 ###
@@ -25,7 +80,6 @@
     "quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)"
 )
 
-
 @register_fake("cortex_m::quantize_per_tensor")
 def quantize_per_tensor_meta(
     input: torch.Tensor,
@@ -37,7 +91,6 @@ def quantize_per_tensor_meta(
 ) -> torch.Tensor:
     return torch.empty_like(input, dtype=dtype)
 
-
 @impl(lib, "quantize_per_tensor", "CompositeExplicitAutograd")
 def quantize_per_tensor_impl(
     input: torch.Tensor,
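For reference, the define/impl pattern used above can be exercised standalone. A minimal sketch in a scratch namespace (demo_m is hypothetical, chosen so it does not clash with the real cortex_m library):

import torch
from torch.library import Library, impl

lib = Library("demo_m", "DEF")  # hypothetical scratch namespace
lib.define("add(Tensor input1, Tensor input2) -> Tensor")

@impl(lib, "add", "CompositeExplicitAutograd")
def add_impl(input1: torch.Tensor, input2: torch.Tensor) -> torch.Tensor:
    return input1 + input2

x = torch.ones(2, dtype=torch.int8)
print(torch.ops.demo_m.add(x, x))  # tensor([2, 2], dtype=torch.int8)

Once defined this way, the op is addressable as torch.ops.<namespace>.<name> and can be targeted by graph passes, which is what the replace_quant_nodes_pass.py change below relies on.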

backends/cortex_m/ops/operators.yaml

Lines changed: 7 additions & 1 deletion
@@ -16,8 +16,14 @@
   - arg_meta: null
     kernel_name: cortex_m::dequantize_per_tensor_out
 
-- func: cortex_m::add.out(Tensor a, Tensor b, Scalar alpha, *, Tensor(a!) out) -> Tensor(a!)
+- func: cortex_m::add.out(Tensor a, Tensor b, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
   kernels:
     - arg_meta: null
       kernel_name: cortex_m::add_out
+
+- func: cortex_m::add.Tensor(Tensor self, Tensor other, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
+  variants: function
+  kernels:
+    - arg_meta: null
+      kernel_name: cortex_m::aten_add_tensor

backends/cortex_m/passes/replace_quant_nodes_pass.py

Lines changed: 8 additions & 0 deletions
@@ -31,6 +31,14 @@ def _is_qualified_int8_node(args) -> bool:
     def __init__(self):
         super().__init__()
         self.op_replacements = {
+            exir_ops.edge.add: {
+                "new_target": exir_ops.edge.cortex_m.add,
+                "qualifier": self._is_qualified_int8_node,
+            },
+            exir_ops.edge.aten.add.Tensor: {
+                "new_target": exir_ops.edge.cortex_m.add.Tensor,
+                "qualifier": self._is_qualified_int8_node,
+            },
             exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: {
                 "new_target": exir_ops.edge.cortex_m.quantize_per_tensor.default,
                 "qualifier": self._is_qualified_int8_node,
