Skip to content

Commit c0ffe2c

Browse files
authored
[ascend]zmz/Integrate the aclnn framework (DeepLink-org#871)
all aclnn
1 parent b7eae80 commit c0ffe2c

File tree

9 files changed

+293
-7
lines changed

9 files changed

+293
-7
lines changed

impl/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ elseif (${IMPL_OPT} IN_LIST IMPL_CAMB_TORCH)
6969
elseif (${IMPL_OPT} IN_LIST IMPL_CAMB)
7070
add_subdirectory(camb)
7171
elseif (${IMPL_OPT} IN_LIST IMPL_ASCEND)
72-
add_subdirectory(ascend_npu)
72+
add_subdirectory(ascend_npu)
7373
elseif (${IMPL_OPT} IN_LIST IMPL_SUPA)
7474
add_subdirectory(supa)
7575
elseif (${IMPL_OPT} IN_LIST IMPL_DROPLET)

impl/ascend/CMakeLists.txt

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ else()
99
set(ASCEND_DIR /usr/local/Ascend)
1010
endif()
1111

12-
1312
if(EXISTS ${ASCEND_DIR}/ascend-toolkit/latest/)
1413
message(STATUS "ascend-toolkit exists:" ${ASCEND_DIR}/ascend-toolkit/latest/)
1514
message(STATUS "ASCEND_DIR:" ${ASCEND_DIR})
@@ -19,7 +18,7 @@ else()
1918
message(FATAL_ERROR "No ascend-toolkit found.")
2019
endif()
2120

22-
file(GLOB_RECURSE IMPL_SRC RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} functions/*.cpp functions_mmcv/*.cpp common/*.cpp)
21+
file(GLOB_RECURSE IMPL_SRC RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} functions/*.cpp functions_mmcv/*.cpp common/*.cpp aclnn/*.cpp)
2322
list(APPEND IMPL_SRC ascend_tensor.cpp)
2423

2524
# adaptor
@@ -45,7 +44,7 @@ endif()
4544

4645
add_library(${DEVICEIMPL} SHARED ${IMPL_SRC})
4746
set_target_properties(${DEVICEIMPL} PROPERTIES SUFFIX ".so")
48-
target_link_libraries(${DEVICEIMPL} ascendcl acl_op_compiler)
47+
target_link_libraries(${DEVICEIMPL} ascendcl acl_op_compiler nnopbase opapi)
4948

5049
if(USE_ADAPTOR)
5150
add_dependencies(${DEVICEIMPL} adaptor_code_gen)

impl/ascend/aclnn/aclnn.cpp

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
/**
2+
* @file
3+
* @author DeepLink
4+
* @copyright (c) 2023, DeepLink.
5+
*/
6+
7+
#include "aclnn.hpp"
8+
9+
#include <acl/acl_rt.h>
10+
11+
#include <functional>
12+
#include <numeric>
13+
#include <valarray>
14+
#include <vector>
15+
16+
#include "../common/acloprunner.hpp"
17+
#include "../common/utils.hpp"
18+
19+
namespace impl {
20+
namespace ascend {
21+
22+
int createAclTensor1(diopiConstTensorHandle_t input, aclTensor** tensor) {
    // Wrap a DIOPI tensor handle in a newly created aclTensor.
    // The caller owns the result and must release it with aclDestroyTensor.
    impl::ascend::AscendTensor inAt(input);

    // Build the aclTensor from the DIOPI tensor's shape/stride/dtype metadata.
    // Storage offset is fixed at 0: assumes inAt.data() already points at the
    // first element of the view -- TODO confirm for non-trivial strided views.
    *tensor = aclCreateTensor(inAt.getAclMemShape().data(),
                              inAt.getAclMemShape().size(),
                              inAt.getAclDataType(),
                              inAt.stride().data(),
                              0,
                              inAt.getAclDataFormat(),
                              inAt.getAclMemShape().data(),
                              inAt.getAclMemShape().size(),
                              const_cast<void*>(inAt.data()));
    // NOTE(review): aclCreateTensor failure (nullptr result) is not detected
    // here; callers should check *tensor before use.
    return ACL_SUCCESS;
}
38+
39+
aclScalar* createAclScalar1(const diopiScalar_t* input) {
    // Convert a DIOPI scalar into an aclScalar. The caller owns the result
    // and must release it with aclDestroyScalar. Returns nullptr only when
    // aclCreateScalar itself fails.
    if (input->stype == diopiDtype_t::diopi_dtype_float64) {
        auto v = getValue<double>(input);
        return aclCreateScalar(&v, getAclDataType(input->stype));
    }
    // NOTE(review): every non-float64 dtype is read through the int64 path;
    // verify that getValue<int64_t> converts (rather than reinterprets)
    // other floating-point scalar dtypes.
    auto v = getValue<int64_t>(input);
    return aclCreateScalar(&v, getAclDataType(input->stype));
    // (The original had an unreachable `return nullptr;` after the exhaustive
    // if/else -- removed.)
}
50+
51+
void printContiguousTensor(const aclTensor& tensor, const void* tensorPtr) {
52+
int64_t* shape = nullptr;
53+
uint64_t num = 0;
54+
aclGetViewShape(&tensor, &shape, &num);
55+
std::vector<int64_t> shapeVec(shape, shape + num);
56+
int64_t size = std::accumulate(shapeVec.begin(), shapeVec.end(), 1, std::multiplies<>());
57+
std::vector<float> result(size, 0);
58+
auto ret = aclrtMemcpy(result.data(), result.size() * sizeof(result[0]), tensorPtr, size * sizeof(float), ACL_MEMCPY_DEVICE_TO_HOST);
59+
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("copy result from device to host failed. ERROR: %d\n", ret); return;);
60+
for (int64_t i = 0; i < size; i++) {
61+
LOG_PRINT("result[%ld] is: %f\n", i, result[i]);
62+
}
63+
}
64+
65+
void printContiguousTensor(const aclTensor& tensor, diopiConstTensorHandle_t diopi) {
66+
const void* p = nullptr;
67+
diopiGetTensorDataConst(diopi, &p);
68+
return printContiguousTensor(tensor, p);
69+
}
70+
71+
int aclnnAddAdaptor(diopiContextHandle_t ctx, diopiConstTensorHandle_t self1, diopiConstTensorHandle_t other1, const diopiScalar_t* alpha1,
                    diopiTensorHandle_t out1) {
    // Compute out1 = self1 + alpha1 * other1 through the two-phase aclnnAdd
    // API (workspace-size query, then launch), and synchronize the stream
    // before returning. Returns ACL_SUCCESS (0) on success, an ACL error
    // code otherwise.
    aclrtStream stream;
    diopiGetStream(ctx, &stream);

    // 1. Build the operator inputs/outputs from the DIOPI handles.
    aclTensor* self = nullptr;
    aclTensor* other = nullptr;
    aclScalar* alpha = nullptr;
    aclTensor* out = nullptr;
    // Destroy every wrapper we created, on every exit path. The original
    // version never destroyed them, leaking one aclTensor/aclScalar set per
    // call.
    auto cleanup = [&]() {
        if (self != nullptr) aclDestroyTensor(self);
        if (other != nullptr) aclDestroyTensor(other);
        if (alpha != nullptr) aclDestroyScalar(alpha);
        if (out != nullptr) aclDestroyTensor(out);
    };

    auto ret = createAclTensor1(self1, &self);
    CHECK_RET(ret == ACL_SUCCESS, cleanup(); return ret);
    ret = createAclTensor1(other1, &other);
    CHECK_RET(ret == ACL_SUCCESS, cleanup(); return ret);
    alpha = createAclScalar1(alpha1);
    // BUGFIX: the original executed `return ret` here, and ret still held
    // ACL_SUCCESS from the previous call -- a failed scalar creation was
    // silently reported as success. Return a real error instead.
    CHECK_RET(alpha != nullptr, cleanup(); return ACL_ERROR_BAD_ALLOC);
    ret = createAclTensor1(out1, &out);
    CHECK_RET(ret == ACL_SUCCESS, cleanup(); return ret);

    // 2. First phase of the CANN op: query the required workspace size.
    uint64_t workspaceSize = 0;
    aclOpExecutor* executor = nullptr;
    ret = aclnnAddGetWorkspaceSize(self, other, alpha, out, &workspaceSize, &executor);
    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnAddGetWorkspaceSize failed. ERROR: %d\n", ret); cleanup(); return ret);

    // Allocate the device workspace reported by the first phase.
    void* workspaceAddr = nullptr;
    if (workspaceSize > 0) {
        ret = aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("allocate workspace failed. ERROR: %d\n", ret); cleanup(); return ret;);
    }
    // Free the workspace on every exit path below -- the original leaked it
    // when the launch or the synchronize failed.
    auto releaseWorkspace = [&]() {
        if (workspaceAddr != nullptr) aclrtFree(workspaceAddr);
    };

    // Second phase: launch the kernel on the DIOPI stream.
    ret = aclnnAdd(workspaceAddr, workspaceSize, executor, stream);
    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnAdd failed. ERROR: %d\n", ret); releaseWorkspace(); cleanup(); return ret);
    // 3. Wait for the task to finish before the workspace/wrappers go away.
    // NOTE(review): a full stream sync per op is costly; revisit once the
    // framework supports async completion.
    ret = aclrtSynchronizeStream(stream);
    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtSynchronizeStream failed. ERROR: %d\n", ret); releaseWorkspace(); cleanup(); return ret);

    releaseWorkspace();
    cleanup();
    return 0;
}
119+
120+
int aclnnSinAdaptor(diopiContextHandle_t ctx, diopiConstTensorHandle_t self1, diopiTensorHandle_t out1) {
    // Compute out1 = sin(self1) element-wise through the two-phase aclnnSin
    // API and synchronize the stream. Returns ACL_SUCCESS (0) on success,
    // an ACL error code otherwise.
    aclrtStream stream;
    diopiGetStream(ctx, &stream);

    // Empty tensors need no kernel launch.
    AscendTensor inAt(self1);
    if (inAt.numel() == 0) {
        return 0;
    }

    // 1. Build the operator input/output from the DIOPI handles.
    aclTensor* self = nullptr;
    aclTensor* out = nullptr;
    // Destroy the wrappers on every exit path -- the original version never
    // destroyed them, leaking two aclTensor wrappers per call.
    auto cleanup = [&]() {
        if (self != nullptr) aclDestroyTensor(self);
        if (out != nullptr) aclDestroyTensor(out);
    };
    auto ret = createAclTensor1(self1, &self);
    CHECK_RET(ret == ACL_SUCCESS, cleanup(); return ret);
    ret = createAclTensor1(out1, &out);
    CHECK_RET(ret == ACL_SUCCESS, cleanup(); return ret);

    // 2. First phase of the CANN op: query the required workspace size.
    uint64_t workspaceSize = 0;
    aclOpExecutor* executor = nullptr;
    ret = aclnnSinGetWorkspaceSize(self, out, &workspaceSize, &executor);
    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnSinGetWorkspaceSize failed. ERROR: %d\n", ret); cleanup(); return ret);

    // Allocate the device workspace reported by the first phase.
    void* workspaceAddr = nullptr;
    if (workspaceSize > 0) {
        ret = aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("allocate workspace failed. ERROR: %d\n", ret); cleanup(); return ret;);
    }
    // Free the workspace on every exit path below -- the original leaked it
    // when the launch or the synchronize failed.
    auto releaseWorkspace = [&]() {
        if (workspaceAddr != nullptr) aclrtFree(workspaceAddr);
    };

    // Second phase: launch the kernel on the DIOPI stream.
    ret = aclnnSin(workspaceAddr, workspaceSize, executor, stream);
    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnSin failed. ERROR: %d\n", ret); releaseWorkspace(); cleanup(); return ret);
    // 3. Wait for the task to finish before the workspace/wrappers go away.
    ret = aclrtSynchronizeStream(stream);
    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtSynchronizeStream failed. ERROR: %d\n", ret); releaseWorkspace(); cleanup(); return ret);

    releaseWorkspace();
    cleanup();
    return 0;
}
162+
163+
int aclnnCosAdaptor(diopiContextHandle_t ctx, diopiConstTensorHandle_t self1, diopiTensorHandle_t out1) {
    // Compute out1 = cos(self1) element-wise through the two-phase aclnnCos
    // API and synchronize the stream. Returns ACL_SUCCESS (0) on success,
    // an ACL error code otherwise.
    aclrtStream stream;
    diopiGetStream(ctx, &stream);

    // Empty tensors need no kernel launch.
    AscendTensor inAt(self1);
    if (inAt.numel() == 0) {
        return 0;
    }

    // 1. Build the operator input/output from the DIOPI handles.
    aclTensor* self = nullptr;
    aclTensor* out = nullptr;
    // Destroy the wrappers on every exit path -- the original version never
    // destroyed them, leaking two aclTensor wrappers per call.
    auto cleanup = [&]() {
        if (self != nullptr) aclDestroyTensor(self);
        if (out != nullptr) aclDestroyTensor(out);
    };
    auto ret = createAclTensor1(self1, &self);
    CHECK_RET(ret == ACL_SUCCESS, cleanup(); return ret);
    ret = createAclTensor1(out1, &out);
    CHECK_RET(ret == ACL_SUCCESS, cleanup(); return ret);

    // 2. First phase of the CANN op: query the required workspace size.
    uint64_t workspaceSize = 0;
    aclOpExecutor* executor = nullptr;
    ret = aclnnCosGetWorkspaceSize(self, out, &workspaceSize, &executor);
    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnCosGetWorkspaceSize failed. ERROR: %d\n", ret); cleanup(); return ret);

    // Allocate the device workspace reported by the first phase.
    void* workspaceAddr = nullptr;
    if (workspaceSize > 0) {
        ret = aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("allocate workspace failed. ERROR: %d\n", ret); cleanup(); return ret;);
    }
    // Free the workspace on every exit path below -- the original leaked it
    // when the launch or the synchronize failed.
    auto releaseWorkspace = [&]() {
        if (workspaceAddr != nullptr) aclrtFree(workspaceAddr);
    };

    // Second phase: launch the kernel on the DIOPI stream.
    ret = aclnnCos(workspaceAddr, workspaceSize, executor, stream);
    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnCos failed. ERROR: %d\n", ret); releaseWorkspace(); cleanup(); return ret);
    // 3. Wait for the task to finish before the workspace/wrappers go away.
    ret = aclrtSynchronizeStream(stream);
    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtSynchronizeStream failed. ERROR: %d\n", ret); releaseWorkspace(); cleanup(); return ret);

    releaseWorkspace();
    cleanup();
    return 0;
}
205+
206+
} // namespace ascend
207+
} // namespace impl

impl/ascend/aclnn/aclnn.hpp

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
/**
 * @file
 * @author DeepLink
 * @copyright (c) 2023, DeepLink.
 *
 * Adaptors that route DIOPI operator calls through the CANN aclnn
 * (two-phase workspace-query + launch) operator API.
 */

#ifndef IMPL_ASCEND_ACLNN_ACLNN_HPP_
#define IMPL_ASCEND_ACLNN_ACLNN_HPP_

#include <algorithm>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#include "../ascend_tensor.hpp"
#include "acl/acl.h"
#include "aclnnop/aclnn_add.h"  // TODO(zmz): include the remaining aclnn op headers as more ops are adapted
#include "aclnnop/aclnn_cos.h"
#include "aclnnop/aclnn_sin.h"
#include "impl_functions.hpp"

namespace impl {
namespace ascend {

// NOTE(review): macros ignore namespaces, so CHECK_RET and LOG_PRINT leak
// into every translation unit that includes this header and may clash with
// identically named macros elsewhere; consider DIOPI_-prefixed names.

// Evaluate `return_expr` (typically a log statement plus a return) when
// `cond` is false.
#define CHECK_RET(cond, return_expr) \
    do {                             \
        if (!(cond)) {               \
            return_expr;             \
        }                            \
    } while (0)

// printf-style logging helper (##__VA_ARGS__ is a GNU/clang extension,
// consistent with the rest of the project).
#define LOG_PRINT(message, ...)         \
    do {                                \
        printf(message, ##__VA_ARGS__); \
    } while (0)

// out = self + alpha * other, via aclnnAdd. Returns 0 on success.
int aclnnAddAdaptor(diopiContextHandle_t ctx, diopiConstTensorHandle_t self, diopiConstTensorHandle_t other, const diopiScalar_t* alpha,
                    diopiTensorHandle_t out);

// out = sin(self), element-wise, via aclnnSin. Returns 0 on success.
int aclnnSinAdaptor(diopiContextHandle_t ctx, diopiConstTensorHandle_t self, diopiTensorHandle_t out);

// out = cos(self), element-wise, via aclnnCos. Returns 0 on success.
int aclnnCosAdaptor(diopiContextHandle_t ctx, diopiConstTensorHandle_t self, diopiTensorHandle_t out);

}  // namespace ascend
}  // namespace impl

#endif  // IMPL_ASCEND_ACLNN_ACLNN_HPP_

impl/ascend/functions/binary.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
#include <cmath>
88

9+
#include "../aclnn/aclnn.hpp"
910
#include "../common/acloprunner.hpp"
1011

1112
namespace impl {
@@ -32,6 +33,7 @@ bool isScalarOne(const diopiScalar_t* alpha) {
3233

3334
diopiError_t diopiAdd(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other,
3435
const diopiScalar_t* alpha) {
36+
#if 1
3537
diopiDtype_t outDtype, inputDtype, otherDtype;
3638
diopiGetTensorDtype(out, &outDtype);
3739
diopiGetTensorDtype(input, &inputDtype);
@@ -58,6 +60,9 @@ diopiError_t diopiAdd(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiCo
5860
}
5961

6062
if (outDtype != highType) diopiCastDtype(ctx, out, outTemp);
63+
#else
64+
auto ret = aclnnAddAdaptor(ctx, input, other, alpha, out);
65+
#endif
6166
return diopiSuccess;
6267
}
6368

impl/ascend/functions/cos.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66

77
#include <set>
88

9+
#include "../aclnn/aclnn.hpp"
910
#include "../common/acloprunner.hpp"
10-
1111
namespace impl {
1212
namespace ascend {
1313

@@ -17,6 +17,7 @@ diopiError_t diopiCosInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) {
1717
}
1818

1919
diopiError_t diopiCos(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) {
20+
#if 0
2021
AscendTensor in = AscendTensor(input);
2122
if (0 == in.numel()) {
2223
return diopiSuccess;
@@ -36,6 +37,9 @@ diopiError_t diopiCos(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiCo
3637
AclOpRunner<1, 1>("Cos", ctx).addInput(input).addOutput(out).run();
3738
}
3839

40+
#else
41+
aclnnCosAdaptor(ctx, input, out);
42+
#endif
3943
return diopiSuccess;
4044
}
4145

impl/ascend/functions/sin.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
#include <set>
88

9+
#include "../aclnn/aclnn.hpp"
910
#include "../common/acloprunner.hpp"
1011

1112
namespace impl {
@@ -17,6 +18,7 @@ diopiError_t diopiSinInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) {
1718
}
1819

1920
diopiError_t diopiSin(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) {
21+
#if 0
2022
AscendTensor in(input);
2123
if (0 == in.numel()) {
2224
return diopiSuccess;
@@ -35,7 +37,9 @@ diopiError_t diopiSin(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiCo
3537
} else {
3638
AclOpRunner<1, 1>("Sin", ctx).addInput(input).addOutput(out).run();
3739
}
38-
40+
#else
41+
aclnnSinAdaptor(ctx, input, out);
42+
#endif
3943
return diopiSuccess;
4044
}
4145

0 commit comments

Comments
 (0)