InfiniTensor
diff --git a/‎include/infinicore/ops.hpp‎
Lines changed: 1 addition & 0 deletions b/‎include/infinicore/ops.hpp‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎include/infinicore/ops/add_rms_norm.hpp‎
Lines changed: 20 additions & 0 deletions b/‎include/infinicore/ops/add_rms_norm.hpp‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎include/infiniop.h‎
Lines changed: 1 addition & 0 deletions b/‎include/infiniop.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎include/infiniop/ops/add_rms_norm.h‎
Lines changed: 32 additions & 0 deletions b/‎include/infiniop/ops/add_rms_norm.h‎
Lines changed: 32 additions & 0 deletions
diff --git a/‎python/infinicore/__init__.py‎
Lines changed: 3 additions & 0 deletions b/‎python/infinicore/__init__.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎python/infinicore/ops/add_rms_norm.py‎
Lines changed: 47 additions & 0 deletions b/‎python/infinicore/ops/add_rms_norm.py‎
Lines changed: 47 additions & 0 deletions
diff --git a/‎src/infinicore/ops/add_rms_norm/add_rms_norm.cc‎
Lines changed: 29 additions & 0 deletions b/‎src/infinicore/ops/add_rms_norm/add_rms_norm.cc‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎src/infinicore/ops/add_rms_norm/add_rms_norm_infiniop.cc‎
Lines changed: 50 additions & 0 deletions b/‎src/infinicore/ops/add_rms_norm/add_rms_norm_infiniop.cc‎
Lines changed: 50 additions & 0 deletions
diff --git a/‎src/infinicore/pybind11/ops.hpp‎
Lines changed: 2 additions & 0 deletions b/‎src/infinicore/pybind11/ops.hpp‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/infinicore/pybind11/ops/add_rms_norm.hpp‎
Lines changed: 51 additions & 0 deletions b/‎src/infinicore/pybind11/ops/add_rms_norm.hpp‎
Lines changed: 51 additions & 0 deletions
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "ops/add.hpp"
+#include "ops/add_rms_norm.hpp"
 #include "ops/attention.hpp"
 #include "ops/causal_softmax.hpp"
 #include "ops/matmul.hpp"
 
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+#include <utility>
+
+namespace infinicore::op {
+// Dispatch entry point for the fused "add + RMS normalization" operator.
+// Backends register a kernel matching `schema` through dispatcher(); execute()
+// looks the kernel up by the device type of `y` and invokes it.
+class AddRMSNorm {
+public:
+    // Kernel signature, in order: (y, residual_out, a, b, weight, epsilon).
+    using schema = void (*)(Tensor, Tensor, Tensor, Tensor, Tensor, float);
+    // Runs the kernel registered for the device type of `y`; results are written into `y` and `residual_out`.
+    static void execute(Tensor y, Tensor residual_out, Tensor a, Tensor b, Tensor weight, float epsilon = 1e-5f);
+    // Returns the single dispatcher table used to register and look up kernels for this operator.
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+// Fused add and RMS normalization (allocating variant).
+// Returns: (normalized_result, add_result), i.e. (RMSNorm(a + b) * weight, a + b).
+// The add_result can be used as the residual for subsequent layers.
+std::pair<Tensor, Tensor> add_rms_norm(Tensor a, Tensor b, Tensor weight, float epsilon = 1e-5f);
+// Same computation written into caller-provided tensors: `y` receives the normalized result, `residual_out` receives a + b.
+void add_rms_norm_(Tensor y, Tensor residual_out, Tensor a, Tensor b, Tensor weight, float epsilon = 1e-5f);
+} // namespace infinicore::op
@@ -3,6 +3,7 @@
 
 #include "infiniop/handle.h"
 #include "infiniop/ops/add.h"
+#include "infiniop/ops/add_rms_norm.h"
 #include "infiniop/ops/attention.h"
 #include "infiniop/ops/causal_softmax.h"
 #include "infiniop/ops/clip.h"
 
@@ -0,0 +1,32 @@
+#ifndef __INFINIOP_ADD_RMS_NORM_API_H__
+#define __INFINIOP_ADD_RMS_NORM_API_H__
+
+#include "../operator_descriptor.h"
+
+/* Opaque descriptor handle for the fused add + RMS normalization operator. */
+typedef struct InfiniopDescriptor *infiniopAddRMSNormDescriptor_t;
+
+/* Creates a descriptor and stores it in *desc_ptr.
+ * y_desc / residual_out_desc describe the two outputs (normalized result and a + b),
+ * a_desc / b_desc / weight_desc describe the inputs, epsilon is the stability constant.
+ * Note that residual_out_desc comes last, after epsilon. */
+__C __export infiniStatus_t infiniopCreateAddRMSNormDescriptor(
+    infiniopHandle_t handle,
+    infiniopAddRMSNormDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t a_desc,
+    infiniopTensorDescriptor_t b_desc,
+    infiniopTensorDescriptor_t weight_desc,
+    float epsilon,
+    infiniopTensorDescriptor_t residual_out_desc);
+
+/* Writes to *size the workspace size to pass to infiniopAddRMSNorm (presumably in bytes; confirm against the implementation). */
+__C __export infiniStatus_t infiniopGetAddRMSNormWorkspaceSize(infiniopAddRMSNormDescriptor_t desc, size_t *size);
+
+/* Runs the operator described by desc: y and residual_out are outputs, a, b and weight are read-only inputs.
+ * workspace / workspace_size is the scratch buffer sized via infiniopGetAddRMSNormWorkspaceSize;
+ * stream is the backend stream handle to run on. */
+__C __export infiniStatus_t infiniopAddRMSNorm(infiniopAddRMSNormDescriptor_t desc,
+                                                void *workspace,
+                                                size_t workspace_size,
+                                                void *y,
+                                                const void *a,
+                                                const void *b,
+                                                const void *weight,
+                                                void *residual_out,
+                                                void *stream);
+
+/* Destroys a descriptor obtained from infiniopCreateAddRMSNormDescriptor. */
+__C __export infiniStatus_t infiniopDestroyAddRMSNormDescriptor(infiniopAddRMSNormDescriptor_t desc);
+
+#endif
@@ -40,6 +40,7 @@
     uint8,
 )
 from infinicore.ops.add import add
+from infinicore.ops.add_rms_norm import add_rms_norm, add_rms_norm_
 from infinicore.ops.attention import attention
 from infinicore.ops.matmul import matmul
 from infinicore.ops.mul import mul
@@ -105,6 +106,8 @@
     "uint8",
     # Operations.
     "add",
+    "add_rms_norm",
+    "add_rms_norm_",
     "attention",
     "matmul",
     "mul",
 
@@ -0,0 +1,47 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def add_rms_norm(a, b, weight, epsilon=1e-5, *, out=None):
+    """
+    Fused Add and RMS Normalization.
+
+    Args:
+        a: First input tensor
+        b: Second input tensor
+        weight: Scale weights
+        epsilon: Small constant for numerical stability, default is 1e-5
+        out: Optional output tuple (y, residual_out) for in-place operation
+
+    Returns:
+        Tuple of (normalized_result, add_result): (RMSNorm(a + b) * weight, a + b)
+        The add_result can be used as residual for subsequent layers.
+    """
+    if out is None:
+        result = _infinicore.add_rms_norm(
+            a._underlying, b._underlying, weight._underlying, epsilon
+        )
+        return (Tensor(result[0]), Tensor(result[1]))
+
+    y, residual_out = out
+    _infinicore.add_rms_norm_(
+        y._underlying,
+        residual_out._underlying,
+        a._underlying,
+        b._underlying,
+        weight._underlying,
+        epsilon,
+    )
+    return (y, residual_out)
+
+
+def add_rms_norm_(y, residual_out, a, b, weight, epsilon=1e-5):
+    """In-place Fused Add and RMS Normalization."""
+    _infinicore.add_rms_norm_(
+        y._underlying,
+        residual_out._underlying,
+        a._underlying,
+        b._underlying,
+        weight._underlying,
+        epsilon,
+    )
@@ -0,0 +1,29 @@
+#include "infinicore/ops/add_rms_norm.hpp"
+
+#include "../../utils.hpp"
+
+namespace infinicore::op {
+
+// Returns the dispatcher table shared by every caller of AddRMSNorm.
+// The table is a function-local static, so it is constructed on first use;
+// backend registration code that runs during static initialization of other
+// translation units can therefore call this regardless of initialization order.
+common::OpDispatcher<AddRMSNorm::schema> &AddRMSNorm::dispatcher() {
+    static common::OpDispatcher<AddRMSNorm::schema> dispatcher_;
+    return dispatcher_;
+}
+
+// Runs the fused add + RMS norm kernel registered for the device type of `y`.
+// The tensors are first checked with INFINICORE_ASSERT_TENSORS_SAME_DEVICE, then
+// the device of `y` is made current before the kernel is looked up and invoked.
+void AddRMSNorm::execute(Tensor y, Tensor residual_out, Tensor a, Tensor b, Tensor weight, float epsilon) {
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, residual_out, a, b, weight);
+
+    infinicore::context::setDevice(y->device());
+
+    const auto kernel = dispatcher().lookup(y->device().getType());
+    kernel(y, residual_out, a, b, weight, epsilon);
+}
+
+// Allocating variant: creates both outputs with the shape, dtype and device of `a`,
+// runs the fused operator into them and returns (normalized_result, add_result).
+std::pair<Tensor, Tensor> add_rms_norm(Tensor a, Tensor b, Tensor weight, float epsilon) {
+    // Both outputs mirror `a`, so a single helper describes the allocation.
+    auto allocate_like_a = [&a]() {
+        return Tensor::empty(a->shape(), a->dtype(), a->device());
+    };
+
+    Tensor y = allocate_like_a();
+    Tensor residual_out = allocate_like_a();
+
+    add_rms_norm_(y, residual_out, a, b, weight, epsilon);
+    return std::pair<Tensor, Tensor>(y, residual_out);
+}
+
+// In-place variant: writes the normalized result into `y` and a + b into
+// `residual_out` by forwarding all arguments to AddRMSNorm::execute.
+void add_rms_norm_(Tensor y, Tensor residual_out, Tensor a, Tensor b, Tensor weight, float epsilon) {
+    AddRMSNorm::execute(
+        y, residual_out,
+        a, b, weight,
+        epsilon);
+}
+
+} // namespace infinicore::op
@@ -0,0 +1,50 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/add_rms_norm.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::add_rms_norm_impl::infiniop {
+
+// Per-thread cache of infiniop descriptors, keyed by the hash computed in calculate().
+// The callback passed alongside the capacity destroys a descriptor and clears the handle.
+thread_local common::OpCache<size_t, infiniopAddRMSNormDescriptor_t> caches(
+    100, // capacity
+    [](infiniopAddRMSNormDescriptor_t &desc) {
+        // Nothing to release for an empty slot.
+        if (desc == nullptr) {
+            return;
+        }
+        INFINICORE_CHECK_ERROR(infiniopDestroyAddRMSNormDescriptor(desc));
+        desc = nullptr;
+    });
+
+// infiniop-backed kernel for AddRMSNorm.
+// The operator descriptor is looked up in the current device's cache under a hash of
+// the operands and epsilon; on a miss it is created and stored. The workspace is then
+// sized and allocated, and the operator is launched on the current stream.
+void calculate(Tensor y, Tensor residual_out, Tensor a, Tensor b, Tensor weight, float epsilon) {
+    const size_t key = hash_combine(y, residual_out, a, b, weight, epsilon);
+
+    auto device = context::getDevice();
+    auto &cache = caches.getCache(device);
+
+    // Reuse the cached descriptor for this key, or create and cache a new one.
+    infiniopAddRMSNormDescriptor_t desc = nullptr;
+    if (auto cached = cache.get(key)) {
+        desc = *cached;
+    } else {
+        // Note the C API order: residual_out's descriptor comes last, after epsilon.
+        INFINICORE_CHECK_ERROR(infiniopCreateAddRMSNormDescriptor(
+            context::getInfiniopHandle(device), &desc,
+            y->desc(), a->desc(), b->desc(), weight->desc(), epsilon, residual_out->desc()));
+        cache.put(key, desc);
+    }
+
+    // Ask the backend how much scratch memory it needs, then allocate exactly that.
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetAddRMSNormWorkspaceSize(desc, &workspace_size));
+    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopAddRMSNorm(
+        desc,
+        workspace->data(), workspace_size,
+        y->data(),
+        a->data(), b->data(), weight->data(),
+        residual_out->data(),
+        context::getStream()));
+}
+
+// Registers `calculate` with the AddRMSNorm dispatcher while this translation unit's
+// statics are initialized; the bool exists only to give that code an initializer to run in.
+// NOTE(review): the meaning of the `false` argument is defined by OpDispatcher::registerAll - confirm there.
+static bool registered = []() -> bool {
+    auto &table = AddRMSNorm::dispatcher();
+    table.registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::add_rms_norm_impl::infiniop
@@ -3,6 +3,7 @@
 #include <pybind11/pybind11.h>
 
 #include "ops/add.hpp"
+#include "ops/add_rms_norm.hpp"
 #include "ops/attention.hpp"
 #include "ops/causal_softmax.hpp"
 #include "ops/embedding.hpp"
@@ -24,6 +25,7 @@ namespace infinicore::ops {
 
 inline void bind(py::module &m) {
     bind_add(m);
+    bind_add_rms_norm(m);
     bind_attention(m);
     bind_causal_softmax(m);
     bind_random_sample(m);
 
@@ -0,0 +1,51 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/add_rms_norm.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+// Exposes op::add_rms_norm and op::add_rms_norm_ on the given Python module.
+// The docstrings live in named constants so the m.def calls stay short.
+inline void bind_add_rms_norm(py::module &m) {
+    static constexpr const char *kAllocatingDoc = R"doc(Fused Add and RMS Normalization.
+
+Args:
+    a: First input tensor
+    b: Second input tensor
+    weight: Scale weights
+    epsilon: Small constant for numerical stability, default is 1e-5
+
+Returns:
+    Tuple of (normalized_result, add_result): (RMSNorm(a + b) * weight, a + b)
+    The add_result can be used as residual for subsequent layers.
+)doc";
+
+    static constexpr const char *kInPlaceDoc = R"doc(In-place Fused Add and RMS Normalization.
+
+Args:
+    y: Output tensor for normalized result
+    residual_out: Output tensor for add result (a + b) before normalization
+    a: First input tensor
+    b: Second input tensor
+    weight: Scale weights
+    epsilon: Small constant for numerical stability, default is 1e-5
+)doc";
+
+    // Allocating variant: returns the (normalized_result, add_result) pair.
+    m.def("add_rms_norm", &op::add_rms_norm,
+          py::arg("a"), py::arg("b"), py::arg("weight"),
+          py::arg("epsilon") = 1e-5f,
+          kAllocatingDoc);
+
+    // In-place variant: outputs come first in the argument list.
+    m.def("add_rms_norm_", &op::add_rms_norm_,
+          py::arg("y"), py::arg("residual_out"),
+          py::arg("a"), py::arg("b"), py::arg("weight"),
+          py::arg("epsilon") = 1e-5f,
+          kInPlaceDoc);
+}
+
+} // namespace infinicore::ops