InfiniTensor · voltjia · May 7, 2025 · May 7, 2025
diff --git a/src/ntops/__init__.py b/src/ntops/__init__.py
@@ -1,12 +0,0 @@
-from ntops.abs import abs
-from ntops.add import add
-from ntops.addmm import addmm
-from ntops.bmm import bmm
-from ntops.div import div
-from ntops.exp import exp
-from ntops.gelu import gelu
-from ntops.mm import mm
-from ntops.mul import mul
-from ntops.rsqrt import rsqrt
-
-__all__ = ["abs", "add", "addmm", "bmm", "div", "exp", "gelu", "mm", "mul", "rsqrt"]

diff --git a/src/ntops/abs.py b/src/ntops/abs.py
diff --git a/src/ntops/add.py b/src/ntops/add.py
diff --git a/src/ntops/exp.py b/src/ntops/exp.py
diff --git a/src/ntops/kernels/__init__.py b/src/ntops/kernels/__init__.py
diff --git a/src/ntops/kernels/abs.py b/src/ntops/kernels/abs.py
@@ -0,0 +1,16 @@
+import functools
+
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(input, output):  # body handed to ninetoothed.make; binding `output` is presumably the store (confirm)
+    output = ntl.abs(input)  # noqa: F841
+
+
+@functools.cache  # memoized: repeated calls with the same ndim reuse the first result
+def make(ndim):  # build the abs kernel; both tensors (input, output) are declared as Tensor(ndim)
+    return ninetoothed.make(arrangement, application, (Tensor(ndim), Tensor(ndim)))
diff --git a/src/ntops/kernels/add.py b/src/ntops/kernels/add.py
@@ -0,0 +1,17 @@
+import functools
+
+import ninetoothed
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(input, other, alpha, output):  # body handed to ninetoothed.make; `other` is scaled by alpha before the add
+    output = input + alpha * other  # noqa: F841
+
+
+@functools.cache  # memoized: repeated calls with the same ndim reuse the first result
+def make(ndim):  # build the add kernel from the shared element_wise arrangement
+    tensors = (Tensor(ndim), Tensor(ndim), Tensor(0), Tensor(ndim))  # same order as application's params; alpha is Tensor(0)
+
+    return ninetoothed.make(arrangement, application, tensors)
diff --git a/src/ntops/addmm.py b/src/ntops/kernels/addmm.py
@@ -2,10 +2,9 @@
 
 import ninetoothed
 import ninetoothed.language as ntl
-import torch
 from ninetoothed import Tensor
 
-import ntops.mm as mm
+import ntops.kernels.mm as mm
 
 
 def arrangement(input, x, y, beta, alpha, output):
@@ -22,22 +21,8 @@ def application(input, x, y, beta, alpha, output):
     output = beta * input + alpha * mm_output
 
 
-def addmm(input, x, y, beta, alpha, output=None):
-    m, _ = x.shape
-    _, n = y.shape
-
-    if output is None:
-        output = torch.empty((m, n), dtype=input.dtype, device=input.device)
-
-    kernel = _make()
-
-    kernel(input, x, y, beta, alpha, output)
-
-    return output
-
-
 @functools.cache
-def _make():
+def make():  # renamed from _make (now public); takes no arguments, so functools.cache builds the kernel once
     tensors = (Tensor(2), Tensor(2), Tensor(2), Tensor(0), Tensor(0), Tensor(2))
 
     return ninetoothed.make(arrangement, application, tensors)
diff --git a/src/ntops/bmm.py b/src/ntops/kernels/bmm.py
@@ -1,10 +1,9 @@
 import functools
 
 import ninetoothed
-import torch
 from ninetoothed import Tensor
 
-from ntops.mm import BLOCK_SIZE_K, BLOCK_SIZE_M, BLOCK_SIZE_N, application
+from ntops.kernels.mm import BLOCK_SIZE_K, BLOCK_SIZE_M, BLOCK_SIZE_N, application
 
 
 def arrangement(input, other, output):
@@ -26,20 +25,6 @@ def arrangement(input, other, output):
     return input_arranged, other_arranged, output_arranged
 
 
-def bmm(input, other, output=None):
-    b, m, _ = input.shape
-    _, _, n = other.shape
-
-    if output is None:
-        output = torch.empty((b, m, n), dtype=input.dtype, device=input.device)
-
-    kernel = _make()
-
-    kernel(input, other, output)
-
-    return output
-
-
 @functools.cache
-def _make():
+def make():  # renamed from _make (now public); takes no arguments, so functools.cache builds the kernel once
     return ninetoothed.make(arrangement, application, (Tensor(3), Tensor(3), Tensor(3)))
diff --git a/src/ntops/div.py b/src/ntops/kernels/div.py
@@ -2,10 +2,9 @@
 
 import ninetoothed
 import ninetoothed.language as ntl
-import torch
 from ninetoothed import Tensor
 
-from ntops import element_wise
+from ntops.kernels.element_wise import arrangement
 
 
 def default_application(input, other, output):
@@ -20,26 +19,15 @@ def floor_application(input, other, output):
     output = ntl.floor(input / other)  # noqa: F841
 
 
-def div(input, other, rounding_mode=None, output=None):
-    if output is None:
-        output = torch.empty_like(input)
-
-    kernel = _make(input.ndim, rounding_mode)
-
-    kernel(input, other, output)
-
-    return output
-
-
 @functools.cache
-def _make(ndim, rounding_mode):
-    tensors = (Tensor(ndim), Tensor(ndim), Tensor(ndim))
-
+def make(ndim, rounding_mode):  # cached per (ndim, rounding_mode); any mode other than "trunc"/"floor" uses default_application
     if rounding_mode == "trunc":
         application = trunc_application
     elif rounding_mode == "floor":
         application = floor_application
     else:
         application = default_application
 
-    return ninetoothed.make(element_wise.arrangement, application, tensors)
+    tensors = (Tensor(ndim), Tensor(ndim), Tensor(ndim))  # three Tensor(ndim) declarations; the old caller passed (input, other, output)
+
+    return ninetoothed.make(arrangement, application, tensors)
diff --git a/src/ntops/element_wise.py b/src/ntops/kernels/element_wise.py
diff --git a/src/ntops/kernels/exp.py b/src/ntops/kernels/exp.py
@@ -0,0 +1,16 @@
+import functools
+
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(input, output):  # body handed to ninetoothed.make; binding `output` is presumably the store (confirm)
+    output = ntl.exp(input)  # noqa: F841
+
+
+@functools.cache  # memoized: repeated calls with the same ndim reuse the first result
+def make(ndim):  # build the exp kernel; both tensors (input, output) are declared as Tensor(ndim)
+    return ninetoothed.make(arrangement, application, (Tensor(ndim), Tensor(ndim)))
diff --git a/src/ntops/gelu.py b/src/ntops/kernels/gelu.py
@@ -3,10 +3,9 @@
 
 import ninetoothed
 import ninetoothed.language as ntl
-import torch
 from ninetoothed import Tensor
 
-from ntops import element_wise
+from ntops.kernels.element_wise import arrangement
 
 
 def default_application(input, output):
@@ -28,23 +27,13 @@ def tanh_application(input, output):
     )
 
 
-def gelu(input, approximate="none"):
-    output = torch.empty_like(input)
-
-    kernel = _make(input.ndim, approximate)
-
-    kernel(input, output)
-
-    return output
-
-
 @functools.cache
-def _make(ndim, approximate):
-    tensors = (Tensor(ndim), Tensor(ndim))
-
+def make(ndim, approximate):  # cached per (ndim, approximate); any value other than "tanh" uses default_application
     if approximate == "tanh":
         application = tanh_application
     else:
         application = default_application
 
-    return ninetoothed.make(element_wise.arrangement, application, tensors)
+    tensors = (Tensor(ndim), Tensor(ndim))  # two Tensor(ndim) declarations; the old caller passed (input, output)
+
+    return ninetoothed.make(arrangement, application, tensors)
diff --git a/src/ntops/mm.py b/src/ntops/kernels/mm.py
@@ -2,7 +2,6 @@
 
 import ninetoothed
 import ninetoothed.language as ntl
-import torch
 from ninetoothed import Tensor
 
 BLOCK_SIZE_M = ninetoothed.block_size()
@@ -35,20 +34,6 @@ def application(input, other, output):
     output = accumulator
 
 
-def mm(input, other, output=None):
-    m, _ = input.shape
-    _, n = other.shape
-
-    if output is None:
-        output = torch.empty((m, n), dtype=input.dtype, device=input.device)
-
-    kernel = _make()
-
-    kernel(input, other, output)
-
-    return output
-
-
 @functools.cache
-def _make():
+def make():  # renamed from _make (now public); takes no arguments, so functools.cache builds the kernel once
     return ninetoothed.make(arrangement, application, (Tensor(2), Tensor(2), Tensor(2)))
diff --git a/src/ntops/kernels/mul.py b/src/ntops/kernels/mul.py
@@ -0,0 +1,17 @@
+import functools
+
+import ninetoothed
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(input, other, output):  # body handed to ninetoothed.make; binding `output` is presumably the store (confirm)
+    output = input * other  # noqa: F841
+
+
+@functools.cache  # memoized: repeated calls with the same ndim reuse the first result
+def make(ndim):  # build the mul kernel from the shared element_wise arrangement
+    tensors = (Tensor(ndim), Tensor(ndim), Tensor(ndim))  # same order as application's params: input, other, output
+
+    return ninetoothed.make(arrangement, application, tensors)
diff --git a/src/ntops/kernels/rsqrt.py b/src/ntops/kernels/rsqrt.py
@@ -0,0 +1,16 @@
+import functools
+
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(input, output):  # body handed to ninetoothed.make; input is cast to float32 before rsqrt is applied
+    output = ntl.rsqrt(ntl.cast(input, ntl.float32))  # noqa: F841
+
+
+@functools.cache  # memoized: repeated calls with the same ndim reuse the first result
+def make(ndim):  # build the rsqrt kernel; both tensors (input, output) are declared as Tensor(ndim)
+    return ninetoothed.make(arrangement, application, (Tensor(ndim), Tensor(ndim)))