62 changes: 62 additions & 0 deletions examples/elementwise/elementwise_abs.py
@@ -0,0 +1,62 @@
import argparse

import tilelang
import tilelang.language as T
import torch

tilelang.cache.clear_cache()

parser = argparse.ArgumentParser(description="NPU Kernel Compilation")
parser.add_argument("--m", type=int, default=1024, help="Matrix M dimension")
parser.add_argument("--n", type=int, default=1024, help="Matrix N dimension")
args = parser.parse_args()

M = args.m
N = args.n


@tilelang.jit(out_idx=[-1])
def abs_kernel(M, N, block_M, block_N, dtype="float"):
    # Named abs_kernel to avoid shadowing the Python builtin abs().
    m_num = M // block_M
    n_num = N // block_N

    # Two vector units split each (block_M, block_N) tile along the row axis.
    VEC_NUM = 2

    @T.prim_func
    def main(
            A: T.Tensor((M, N), dtype),
            B: T.Tensor((M, N), dtype),
    ):
        # cid indexes the AI core (one tile per core); vid indexes the vector unit.
        with T.Kernel(m_num * n_num, is_npu=True) as (cid, vid):
            bx = cid // n_num
            by = cid % n_num

            a_ub = T.alloc_ub((block_M // VEC_NUM, block_N), dtype)
            b_ub = T.alloc_ub((block_M // VEC_NUM, block_N), dtype)
            with T.Scope("V"):
                # Stage this vector unit's half-tile from global memory into UB.
                T.copy(A[bx * block_M + vid * block_M // VEC_NUM, by * block_N], a_ub)

                T.barrier_all()
                T.abs(b_ub, a_ub)
                T.barrier_all()

                T.copy(b_ub, B[bx * block_M + vid * block_M // VEC_NUM, by * block_N])

    return main


func = abs_kernel(M, N, 128, 256)

torch.manual_seed(0)

a = torch.randn(M, N).npu()

torch.npu.synchronize()
print("init successful!")

b = func(a)

ref_b = torch.abs(a)

torch.testing.assert_close(b, ref_b, rtol=1e-2, atol=1e-2)
print("Kernel Output Match!")
62 changes: 62 additions & 0 deletions examples/elementwise/elementwise_ln.py
@@ -0,0 +1,62 @@
import argparse

import tilelang
import tilelang.language as T
import torch

tilelang.cache.clear_cache()

parser = argparse.ArgumentParser(description="NPU Kernel Compilation")
parser.add_argument("--m", type=int, default=1024, help="Matrix M dimension")
parser.add_argument("--n", type=int, default=1024, help="Matrix N dimension")
args = parser.parse_args()

M = args.m
N = args.n


@tilelang.jit(out_idx=[-1])
def ln(M, N, block_M, block_N, dtype="float"):
    m_num = M // block_M
    n_num = N // block_N

    VEC_NUM = 2

    @T.prim_func
    def main(
            A: T.Tensor((M, N), dtype),
            B: T.Tensor((M, N), dtype),
    ):
        with T.Kernel(m_num * n_num, is_npu=True) as (cid, vid):
            bx = cid // n_num
            by = cid % n_num

            a_ub = T.alloc_ub((block_M // VEC_NUM, block_N), dtype)
            b_ub = T.alloc_ub((block_M // VEC_NUM, block_N), dtype)
            with T.Scope("V"):
                T.copy(A[bx * block_M + vid * block_M // VEC_NUM, by * block_N], a_ub)

                T.barrier_all()
                T.ln(b_ub, a_ub)
                T.barrier_all()

                T.copy(b_ub, B[bx * block_M + vid * block_M // VEC_NUM, by * block_N])

    return main


func = ln(M, N, 128, 256)

torch.manual_seed(0)

# Take absolute values so every input lies in the domain of log.
a = torch.abs(torch.randn(M, N).npu())

torch.npu.synchronize()
print("init successful!")

b = func(a)

ref_b = torch.log(a)

torch.testing.assert_close(b, ref_b, rtol=1e-2, atol=1e-2)
print("Kernel Output Match!")
64 changes: 64 additions & 0 deletions examples/elementwise/elementwise_shiftleft.py
@@ -0,0 +1,64 @@
import argparse
import random

import tilelang
import tilelang.language as T
import torch

tilelang.cache.clear_cache()

parser = argparse.ArgumentParser(description="NPU Kernel Compilation")
parser.add_argument("--m", type=int, default=1024, help="Matrix M dimension")
parser.add_argument("--n", type=int, default=1024, help="Matrix N dimension")
args = parser.parse_args()

M = args.m
N = args.n
# Inputs are below 128 (7 bits), so a shift of up to 24 keeps results within int32.
scalar_value = random.randint(1, 24)


@tilelang.jit(out_idx=[-1])
def shiftleft(M, N, block_M, block_N, scalar_value, dtype="int32"):
    m_num = M // block_M
    n_num = N // block_N

    VEC_NUM = 2

    @T.prim_func
    def main(
            A: T.Tensor((M, N), dtype),
            B: T.Tensor((M, N), dtype),
    ):
        with T.Kernel(m_num * n_num, is_npu=True) as (cid, vid):
            bx = cid // n_num
            by = cid % n_num

            a_ub = T.alloc_ub((block_M // VEC_NUM, block_N), dtype)
            b_ub = T.alloc_ub((block_M // VEC_NUM, block_N), dtype)
            with T.Scope("V"):
                T.copy(A[bx * block_M + vid * block_M // VEC_NUM, by * block_N], a_ub)

                T.barrier_all()
                T.shiftleft(b_ub, a_ub, scalar_value)
                T.barrier_all()

                T.copy(b_ub, B[bx * block_M + vid * block_M // VEC_NUM, by * block_N])

    return main


func = shiftleft(M, N, 128, 256, scalar_value)

torch.manual_seed(0)

a = torch.randint(low=1, high=101, size=(M, N), dtype=torch.int32).npu()

torch.npu.synchronize()
print("init successful!")

b = func(a)

ref_b = torch.bitwise_left_shift(a, scalar_value)

torch.testing.assert_close(b, ref_b, rtol=1e-2, atol=1e-2)
print("Kernel Output Match!")
64 changes: 64 additions & 0 deletions examples/elementwise/elementwise_shiftright.py
@@ -0,0 +1,64 @@
import argparse
import random

import tilelang
import tilelang.language as T
import torch

tilelang.cache.clear_cache()

parser = argparse.ArgumentParser(description="NPU Kernel Compilation")
parser.add_argument("--m", type=int, default=1024, help="Matrix M dimension")
parser.add_argument("--n", type=int, default=1024, help="Matrix N dimension")
args = parser.parse_args()

M = args.m
N = args.n
# Shift amounts of 32 or more are undefined for int32, so stay below that.
scalar_value = random.randint(1, 31)


@tilelang.jit(out_idx=[-1])
def shiftright(M, N, block_M, block_N, scalar_value, dtype="int32"):
    m_num = M // block_M
    n_num = N // block_N

    VEC_NUM = 2

    @T.prim_func
    def main(
            A: T.Tensor((M, N), dtype),
            B: T.Tensor((M, N), dtype),
    ):
        with T.Kernel(m_num * n_num, is_npu=True) as (cid, vid):
            bx = cid // n_num
            by = cid % n_num

            a_ub = T.alloc_ub((block_M // VEC_NUM, block_N), dtype)
            b_ub = T.alloc_ub((block_M // VEC_NUM, block_N), dtype)
            with T.Scope("V"):
                T.copy(A[bx * block_M + vid * block_M // VEC_NUM, by * block_N], a_ub)

                T.barrier_all()
                T.shiftright(b_ub, a_ub, scalar_value)
                T.barrier_all()

                T.copy(b_ub, B[bx * block_M + vid * block_M // VEC_NUM, by * block_N])

    return main


func = shiftright(M, N, 128, 256, scalar_value)

torch.manual_seed(0)

a = torch.randint(low=1, high=101, size=(M, N), dtype=torch.int32).npu()

torch.npu.synchronize()
print("init successful!")

b = func(a)

ref_b = torch.bitwise_right_shift(a, scalar_value)

torch.testing.assert_close(b, ref_b, rtol=1e-2, atol=1e-2)
print("Kernel Output Match!")
18 changes: 18 additions & 0 deletions src/target/codegen_ascend.cc
@@ -656,6 +656,24 @@ void CodeGenTileLangAscend::VisitExpr_(const CallNode *op, std::ostream &os) {
    }
    this->stream << ", " << PrintExpr(op->args[op->args.size() - 1])
                 << ");\n";
  } else if (op_name == "AscendC::ShiftLeft" || op_name == "AscendC::ShiftRight") {
    std::vector<std::string> var_names;
    // args[1] is the destination buffer access, args[2] the source.
    for (int i = 1; i < 3; i++) {
      auto var_name = print_buffer_offset(op->args[i].as<CallNode>());
      var_names.push_back(var_name);
    }
    this->PrintIndent();
    this->stream << op_name << "(";
    for (size_t i = 0; i < var_names.size(); i++) {
      this->stream << var_names[i];
      if (i != var_names.size() - 1) {
        this->stream << ", ";
      }
    }
    // Remaining arguments are scalars: the shift amount and the element count.
    for (size_t i = 3; i < op->args.size(); i++) {
      this->stream << ", " << PrintExpr(op->args[i]);
    }
    this->stream << ");\n";
  } else if (op_name == "AscendC::Muls" || op_name == "AscendC::Adds") {
    std::vector<std::string> var_names;
    for (int i = 1; i < 3; i++) {
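The new branch prints the two buffer operands first, then every remaining argument (shift amount and element count) as trailing scalars. A Python mirror of that formatting logic, written here only to illustrate the expected emission; the buffer identifier names are made up:

```python
def format_shift_call(op_name: str, dst: str, src: str, *scalars) -> str:
    # Mirrors the C++ loops: buffer operands joined by ", ", scalars appended.
    args = ", ".join([dst, src] + [str(s) for s in scalars])
    return f"{op_name}({args});"

# Hypothetical example of the line the codegen would print for a 64x256 UB tile:
print(format_shift_call("AscendC::ShiftLeft", "b_ub", "a_ub", 3, 16384))
# -> AscendC::ShiftLeft(b_ub, a_ub, 3, 16384);
```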
18 changes: 18 additions & 0 deletions tilelang/language/ascend.py
@@ -419,6 +419,24 @@ def axpy(dst: Buffer, src0: Buffer, scalar_value: PrimExpr):
return scalar_op(dst, src0, scalar_value, "Axpy")


def shiftleft(dst: Buffer, src0: Buffer, scalar_value: PrimExpr):
    size_src = math.prod(src0.shape)
    size_dst = math.prod(dst.shape)

    assert size_src == size_dst, "src and dst element counts must match"

    return T.call_extern("handle", "AscendC::ShiftLeft", dst.access_ptr("w"),
                         src0.access_ptr("r"), scalar_value, size_src)


def shiftright(dst: Buffer, src0: Buffer, scalar_value: PrimExpr):
    size_src = math.prod(src0.shape)
    size_dst = math.prod(dst.shape)

    assert size_src == size_dst, "src and dst element counts must match"

    return T.call_extern("handle", "AscendC::ShiftRight", dst.access_ptr("w"),
                         src0.access_ptr("r"), scalar_value, size_src)


def transpose(dst: Buffer, src: Buffer):
    return T.call_extern("handle", "AscendC::Transpose", dst.access_ptr("w"), src.access_ptr("r"))

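Taken together, the bindings require `dst` and `src0` to hold the same number of elements and forward that count to AscendC as the trailing operand-count argument. A minimal sketch of the size bookkeeping, assuming the (64, 256) UB tiles from the examples above:

```python
import math

# Hypothetical shapes: block_M // VEC_NUM rows by block_N columns, as in the examples.
src_shape = (64, 256)
dst_shape = (64, 256)

assert math.prod(src_shape) == math.prod(dst_shape), "src and dst element counts must match"

count = math.prod(src_shape)
# T.shiftleft(b_ub, a_ub, 3) would then lower to:
#   AscendC::ShiftLeft(b_ub, a_ub, 3, 16384)
print(count)  # 16384
```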