62 changes: 62 additions & 0 deletions examples/elementwise/elementwise_abs.py
@@ -0,0 +1,62 @@
import argparse

import tilelang
import tilelang.language as T
import torch

tilelang.cache.clear_cache()

parser = argparse.ArgumentParser(description="NPU Kernel Compilation")
parser.add_argument("--m", type=int, default=1024, help="Matrix M dimension")
parser.add_argument("--n", type=int, default=1024, help="Matrix N dimension")
args = parser.parse_args()

M = args.m
N = args.n


@tilelang.jit(out_idx=[-1])
def abs_kernel(M, N, block_M, block_N, dtype="float"):
    # Named abs_kernel to avoid shadowing the Python builtin abs().
    m_num = M // block_M
    n_num = N // block_N

    # Two vector units split each (block_M, block_N) tile along the row axis.
    VEC_NUM = 2

    @T.prim_func
    def main(
            A: T.Tensor((M, N), dtype),
            B: T.Tensor((M, N), dtype),
    ):
        # cid indexes the AI core (one tile per core); vid indexes the vector unit.
        with T.Kernel(m_num * n_num, is_npu=True) as (cid, vid):
            bx = cid // n_num
            by = cid % n_num

            a_ub = T.alloc_ub((block_M // VEC_NUM, block_N), dtype)
            b_ub = T.alloc_ub((block_M // VEC_NUM, block_N), dtype)
            with T.Scope("V"):
                # Stage this vector unit's half-tile from global memory into UB.
                T.copy(A[bx * block_M + vid * block_M // VEC_NUM, by * block_N], a_ub)

                T.barrier_all()
                T.abs(b_ub, a_ub)
                T.barrier_all()

                T.copy(b_ub, B[bx * block_M + vid * block_M // VEC_NUM, by * block_N])

    return main


func = abs_kernel(M, N, 128, 256)

torch.manual_seed(0)

a = torch.randn(M, N).npu()

torch.npu.synchronize()
print("init successful!")

b = func(a)

ref_b = torch.abs(a)

torch.testing.assert_close(b, ref_b, rtol=1e-2, atol=1e-2)
print("Kernel Output Match!")
62 changes: 62 additions & 0 deletions examples/elementwise/elementwise_ln.py
@@ -0,0 +1,62 @@
import argparse

import tilelang
import tilelang.language as T
import torch

tilelang.cache.clear_cache()

parser = argparse.ArgumentParser(description="NPU Kernel Compilation")
parser.add_argument("--m", type=int, default=1024, help="Matrix M dimension")
parser.add_argument("--n", type=int, default=1024, help="Matrix N dimension")
args = parser.parse_args()

M = args.m
N = args.n


@tilelang.jit(out_idx=[-1])
def ln(M, N, block_M, block_N, dtype="float"):
    m_num = M // block_M
    n_num = N // block_N

    VEC_NUM = 2

    @T.prim_func
    def main(
            A: T.Tensor((M, N), dtype),
            B: T.Tensor((M, N), dtype),
    ):
        with T.Kernel(m_num * n_num, is_npu=True) as (cid, vid):
            bx = cid // n_num
            by = cid % n_num

            a_ub = T.alloc_ub((block_M // VEC_NUM, block_N), dtype)
            b_ub = T.alloc_ub((block_M // VEC_NUM, block_N), dtype)
            with T.Scope("V"):
                T.copy(A[bx * block_M + vid * block_M // VEC_NUM, by * block_N], a_ub)

                T.barrier_all()
                T.ln(b_ub, a_ub)
                T.barrier_all()

                T.copy(b_ub, B[bx * block_M + vid * block_M // VEC_NUM, by * block_N])

    return main


func = ln(M, N, 128, 256)

torch.manual_seed(0)

# Take absolute values so every input lies in the domain of log.
a = torch.abs(torch.randn(M, N).npu())

torch.npu.synchronize()
print("init successful!")

b = func(a)

ref_b = torch.log(a)

torch.testing.assert_close(b, ref_b, rtol=1e-2, atol=1e-2)
print("Kernel Output Match!")
64 changes: 64 additions & 0 deletions examples/elementwise/elementwise_shiftleft.py
@@ -0,0 +1,64 @@
import argparse
import random

import tilelang
import tilelang.language as T
import torch

tilelang.cache.clear_cache()

parser = argparse.ArgumentParser(description="NPU Kernel Compilation")
parser.add_argument("--m", type=int, default=1024, help="Matrix M dimension")
parser.add_argument("--n", type=int, default=1024, help="Matrix N dimension")
args = parser.parse_args()

M = args.m
N = args.n
# Inputs are below 128 (7 bits), so a shift of up to 24 keeps results within int32.
scalar_value = random.randint(1, 24)


@tilelang.jit(out_idx=[-1])
def shiftleft(M, N, block_M, block_N, scalar_value, dtype="int32"):
    m_num = M // block_M
    n_num = N // block_N

    VEC_NUM = 2

    @T.prim_func
    def main(
            A: T.Tensor((M, N), dtype),
            B: T.Tensor((M, N), dtype),
    ):
        with T.Kernel(m_num * n_num, is_npu=True) as (cid, vid):
            bx = cid // n_num
            by = cid % n_num

            a_ub = T.alloc_ub((block_M // VEC_NUM, block_N), dtype)
            b_ub = T.alloc_ub((block_M // VEC_NUM, block_N), dtype)
            with T.Scope("V"):
                T.copy(A[bx * block_M + vid * block_M // VEC_NUM, by * block_N], a_ub)

                T.barrier_all()
                T.shiftleft(b_ub, a_ub, scalar_value)
                T.barrier_all()

                T.copy(b_ub, B[bx * block_M + vid * block_M // VEC_NUM, by * block_N])

    return main


func = shiftleft(M, N, 128, 256, scalar_value)

torch.manual_seed(0)

a = torch.randint(low=1, high=101, size=(M, N), dtype=torch.int32).npu()

torch.npu.synchronize()
print("init successful!")

b = func(a)

ref_b = torch.bitwise_left_shift(a, scalar_value)

torch.testing.assert_close(b, ref_b, rtol=1e-2, atol=1e-2)
print("Kernel Output Match!")
64 changes: 64 additions & 0 deletions examples/elementwise/elementwise_shiftright.py
@@ -0,0 +1,64 @@
import argparse
import random

import tilelang
import tilelang.language as T
import torch

tilelang.cache.clear_cache()

parser = argparse.ArgumentParser(description="NPU Kernel Compilation")
parser.add_argument("--m", type=int, default=1024, help="Matrix M dimension")
parser.add_argument("--n", type=int, default=1024, help="Matrix N dimension")
args = parser.parse_args()

M = args.m
N = args.n
# Shift amounts of 32 or more are undefined for int32, so stay below that.
scalar_value = random.randint(1, 31)


@tilelang.jit(out_idx=[-1])
def shiftright(M, N, block_M, block_N, scalar_value, dtype="int32"):
    m_num = M // block_M
    n_num = N // block_N

    VEC_NUM = 2

    @T.prim_func
    def main(
            A: T.Tensor((M, N), dtype),
            B: T.Tensor((M, N), dtype),
    ):
        with T.Kernel(m_num * n_num, is_npu=True) as (cid, vid):
            bx = cid // n_num
            by = cid % n_num

            a_ub = T.alloc_ub((block_M // VEC_NUM, block_N), dtype)
            b_ub = T.alloc_ub((block_M // VEC_NUM, block_N), dtype)
            with T.Scope("V"):
                T.copy(A[bx * block_M + vid * block_M // VEC_NUM, by * block_N], a_ub)

                T.barrier_all()
                T.shiftright(b_ub, a_ub, scalar_value)
                T.barrier_all()

                T.copy(b_ub, B[bx * block_M + vid * block_M // VEC_NUM, by * block_N])

    return main


func = shiftright(M, N, 128, 256, scalar_value)

torch.manual_seed(0)

a = torch.randint(low=1, high=101, size=(M, N), dtype=torch.int32).npu()

torch.npu.synchronize()
print("init successful!")

b = func(a)

ref_b = torch.bitwise_right_shift(a, scalar_value)

torch.testing.assert_close(b, ref_b, rtol=1e-2, atol=1e-2)
print("Kernel Output Match!")
18 changes: 18 additions & 0 deletions src/target/codegen_ascend.cc
@@ -656,6 +656,24 @@ void CodeGenTileLangAscend::VisitExpr_(const CallNode *op, std::ostream &os) {
    }
    this->stream << ", " << PrintExpr(op->args[op->args.size() - 1])
                 << ");\n";
  } else if (op_name == "AscendC::ShiftLeft" || op_name == "AscendC::ShiftRight") {
    std::vector<std::string> var_names;
    // args[1] is the destination buffer access, args[2] the source.
    for (int i = 1; i < 3; i++) {
      auto var_name = print_buffer_offset(op->args[i].as<CallNode>());
      var_names.push_back(var_name);
    }
    this->PrintIndent();
    this->stream << op_name << "(";
    for (size_t i = 0; i < var_names.size(); i++) {
      this->stream << var_names[i];
      if (i != var_names.size() - 1) {
        this->stream << ", ";
      }
    }
    // Remaining arguments are scalars: the shift amount and the element count.
    for (size_t i = 3; i < op->args.size(); i++) {
      this->stream << ", " << PrintExpr(op->args[i]);
    }
    this->stream << ");\n";
  } else if (op_name == "AscendC::Muls" || op_name == "AscendC::Adds") {
    std::vector<std::string> var_names;
    for (int i = 1; i < 3; i++) {
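The new branch prints the two buffer operands first, then every remaining argument (shift amount and element count) as trailing scalars. A Python mirror of that formatting logic, written here only to illustrate the expected emission; the buffer identifier names are made up:

```python
def format_shift_call(op_name: str, dst: str, src: str, *scalars) -> str:
    # Mirrors the C++ loops: buffer operands joined by ", ", scalars appended.
    args = ", ".join([dst, src] + [str(s) for s in scalars])
    return f"{op_name}({args});"

# Hypothetical example of the line the codegen would print for a 64x256 UB tile:
print(format_shift_call("AscendC::ShiftLeft", "b_ub", "a_ub", 3, 16384))
# -> AscendC::ShiftLeft(b_ub, a_ub, 3, 16384);
```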
18 changes: 18 additions & 0 deletions tilelang/language/ascend.py
@@ -419,6 +419,24 @@ def axpy(dst: Buffer, src0: Buffer, scalar_value: PrimExpr):
return scalar_op(dst, src0, scalar_value, "Axpy")


def shiftleft(dst: Buffer, src0: Buffer, scalar_value: PrimExpr):
    size_src = math.prod(src0.shape)
    size_dst = math.prod(dst.shape)

    assert size_src == size_dst, "src and dst element counts must match"

    return T.call_extern("handle", "AscendC::ShiftLeft", dst.access_ptr("w"),
                         src0.access_ptr("r"), scalar_value, size_src)


def shiftright(dst: Buffer, src0: Buffer, scalar_value: PrimExpr):
    size_src = math.prod(src0.shape)
    size_dst = math.prod(dst.shape)

    assert size_src == size_dst, "src and dst element counts must match"

    return T.call_extern("handle", "AscendC::ShiftRight", dst.access_ptr("w"),
                         src0.access_ptr("r"), scalar_value, size_src)


def transpose(dst: Buffer, src: Buffer):
    return T.call_extern("handle", "AscendC::Transpose", dst.access_ptr("w"), src.access_ptr("r"))

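Taken together, the bindings require `dst` and `src0` to hold the same number of elements and forward that count to AscendC as the trailing operand-count argument. A minimal sketch of the size bookkeeping, assuming the (64, 256) UB tiles from the examples above:

```python
import math

# Hypothetical shapes: block_M // VEC_NUM rows by block_N columns, as in the examples.
src_shape = (64, 256)
dst_shape = (64, 256)

assert math.prod(src_shape) == math.prod(dst_shape), "src and dst element counts must match"

count = math.prod(src_shape)
# T.shiftleft(b_ub, a_ub, 3) would then lower to:
#   AscendC::ShiftLeft(b_ub, a_ub, 3, 16384)
print(count)  # 16384
```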