Skip to content

Commit cdd4cbb

Browse files
authored
[camb]tyf/change permute (DeepLink-org#876)
* change permute
1 parent 889c520 commit cdd4cbb

File tree

9 files changed

+77
-138
lines changed

9 files changed

+77
-138
lines changed

adaptor/codegen/gen.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
"NLC": "diopiMemoryFormat_t::ChannelsLast1d",
3939
"NHWC": "diopiMemoryFormat_t::ChannelsLast",
4040
"NDHWC": "diopiMemoryFormat_t::ChannelsLast3d",
41-
"UD": "diopiMemoryFormat_t::Preserve",
41+
"ND": "diopiMemoryFormat_t::Preserve",
4242
}
4343

4444

@@ -443,7 +443,7 @@ def analysis_configs(config: List[dict], funcs_info: dict) -> dict:
443443
or layout == "NCL"
444444
or layout == "NDHWC"
445445
or layout == "NCDHW"
446-
or layout == "UD"
446+
or layout == "ND"
447447
):
448448
op_layouts.append(layout)
449449
else:

adaptor/csrc/convert.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
11
#include "convert.hpp"
22

3-
bool denseCheckAdaptor(diopiSize_t shape, diopiSize_t stride) {
3+
bool denseCheck(diopiSize_t shape, diopiSize_t stride) {
44
int dim = shape.len;
5+
6+
if (isContiguous(shape, stride, diopiMemoryFormat_t::Contiguous)) {
7+
return true;
8+
}
9+
510
std::vector<std::pair<int64_t, int64_t>> stridesSizes(dim, std::pair<int64_t, int64_t>(1, 1));
611

712
for (int i = 0; i < dim; i++) {

adaptor/csrc/convert.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
#include <ostream>
1414
#include <vector>
1515

16-
bool denseCheckAdaptor(diopiSize_t shape, diopiSize_t stride);
16+
bool denseCheck(diopiSize_t shape, diopiSize_t stride);
1717

1818
std::vector<int64_t> calcStrides(diopiSize_t size, diopiMemoryFormat_t format = diopiMemoryFormat_t::Contiguous);
1919

@@ -111,7 +111,7 @@ ConvertType castImpl(diopiContextHandle_t ctx, T src, T* dst, std::vector<diopiM
111111
diopiSize_t dstStride = srcStride;
112112
diopiSize_t dstSize = srcSize;
113113
if (!targetMemoryFormats.empty()) {
114-
if (!denseCheckAdaptor(srcSize, srcStride) && supportMemoryFormats[0] == diopiMemoryFormat_t::Preserve) {
114+
if (!denseCheck(srcSize, srcStride) && supportMemoryFormats[0] == diopiMemoryFormat_t::Preserve) {
115115
targetMemoryFormats.push_back(diopiMemoryFormat_t::Preserve);
116116
needConvertMemoryFormat = true;
117117
}

impl/camb/common/clone.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,11 @@ namespace impl {
1010
namespace camb {
1111
diopiError_t clone(diopiContextHandle_t ctx, const DiopiTensor& inTensor, DiopiTensor& outTensor, diopiMemoryFormat_t memoryFormat) {
1212
cnnlHandle_t handle = cnnlHandlePool.get(ctx);
13+
if (!denseCheck(const_cast<DiopiTensor&>(inTensor))) {
14+
DiopiTensor denseOut;
15+
toDense(ctx, const_cast<DiopiTensor&>(inTensor), denseOut);
16+
const_cast<DiopiTensor&>(inTensor) = denseOut;
17+
}
1318
if (memoryFormat == diopiMemoryFormat_t::Preserve) {
1419
// torch.preserve_format: Used in functions like clone to preserve the memory format of the input tensor.
1520
// If input tensor is allocated in dense non-overlapping memory, the output tensor strides will be copied from the input.
@@ -19,9 +24,10 @@ diopiError_t clone(diopiContextHandle_t ctx, const DiopiTensor& inTensor, DiopiT
1924
} else {
2025
outTensor = requiresTensor(ctx, inTensor.shape(), inTensor.dtype(), memoryFormat);
2126
}
27+
2228
if (inTensor.shape() == outTensor.shape() && inTensor.dim() != 0 && inTensor.dtype() != diopi_dtype_float64 && inTensor.dtype() == outTensor.dtype() &&
23-
denseCheck(inTensor) && denseCheck(outTensor) && (outTensor.isContiguous() || inTensor.isContiguous())) {
24-
DIOPI_CALL(contiguousOut(ctx, const_cast<DiopiTensor&>(inTensor), outTensor));
29+
denseCheck(outTensor)) {
30+
DIOPI_CALL(permuteCopy(ctx, const_cast<DiopiTensor&>(inTensor), outTensor));
2531
return diopiSuccess;
2632
}
2733
CnnlTensorDesc inTensorDesc(inTensor, CNNL_LAYOUT_ARRAY);

impl/camb/common/common.hpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ bool broadcast(DiopiTensor inputTensor, const std::vector<int64_t>& targetShape,
3434

3535
diopiError_t contiguous(diopiContextHandle_t ctx, DiopiTensor& src, diopiMemoryFormat_t memoryFormat = diopiMemoryFormat_t::Contiguous);
3636

37-
diopiError_t contiguousOut(diopiContextHandle_t ctx, DiopiTensor& src, DiopiTensor& dest);
37+
diopiError_t permuteCopy(diopiContextHandle_t ctx, DiopiTensor& src, DiopiTensor& dest);
3838

3939
diopiError_t contiguous(diopiContextHandle_t ctx, DiopiTensor& src, diopiMemoryFormat_t memoryFormat, cnnlTensorLayout_t layoutIn,
4040
cnnlTensorLayout_t layoutOut);
@@ -59,6 +59,10 @@ bool isSlice(const DiopiTensor& src);
5959

6060
bool isSparse(const DiopiTensor& src);
6161

62+
diopiError_t permuteTensor(DiopiTensor& t, const std::vector<int32_t>& order);
63+
64+
diopiError_t getPermuteOrder(const DiopiTensor& src, std::vector<int32_t>& orderOut, std::vector<int32_t>& reverseOrder);
65+
6266
diopiError_t getDenseStride(const DiopiTensor& src, std::vector<int64_t>& dstStride);
6367

6468
diopiError_t sliceToDense(diopiContextHandle_t ctx, DiopiTensor& src, DiopiTensor& dst);

impl/camb/common/contiguous.cpp

Lines changed: 38 additions & 123 deletions
Original file line numberDiff line numberDiff line change
@@ -24,19 +24,7 @@ static diopiError_t transpose(diopiContextHandle_t& ctx, DiopiTensor& in, DiopiT
2424
return diopiSuccess;
2525
}
2626

27-
// static diopiError_t calTensordiopiMemoryFormat_t(const DiopiTensor& tensor, diopiMemoryFormat_t& memoryFormatOut) {
28-
// if (tensor.isContiguous(diopiMemoryFormat_t::ChannelsLast)) {
29-
// memoryFormatOut = diopiMemoryFormat_t::ChannelsLast;
30-
// } else if (tensor.isContiguous(diopiMemoryFormat_t::ChannelsLast3d)) {
31-
// memoryFormatOut = diopiMemoryFormat_t::ChannelsLast3d;
32-
// } else if (tensor.isContiguous(diopiMemoryFormat_t::Contiguous)) {
33-
// memoryFormatOut = diopiMemoryFormat_t::Contiguous;
34-
// } else {
35-
// return diopiNoImplement;
36-
// }
37-
// return diopiSuccess;
38-
// }
39-
static diopiError_t getPermuteOrder(const DiopiTensor& src, std::vector<int32_t>& orderOut, std::vector<int32_t>& reverseOrder) {
27+
diopiError_t getPermuteOrder(const DiopiTensor& src, std::vector<int32_t>& orderOut, std::vector<int32_t>& reverseOrder) {
4028
if (src.isContiguous()) {
4129
orderOut.resize(src.dim());
4230
for (int i = 0; i < src.dim(); ++i) {
@@ -59,6 +47,7 @@ static diopiError_t getPermuteOrder(const DiopiTensor& src, std::vector<int32_t>
5947
stridesSizes[i] = std::pair<int, int>(inputStrides[i], inputSizes[i]);
6048
}
6149

50+
// shape:2,3,4,5 stride:60,1,15,3 -> orderOut: 0,3,1,2, reverseOrder: 0,2,3,1
6251
sort(stridesSizes.begin(), stridesSizes.end(), [](std::pair<int, int> a, std::pair<int, int> b) { return a.first > b.first; });
6352
for (int i = 0; i < dim; ++i) {
6453
auto pair = stridesSizes[i];
@@ -83,73 +72,6 @@ static diopiError_t getPermuteOrder(const DiopiTensor& src, std::vector<int32_t>
8372
return diopiSuccess;
8473
}
8574

86-
static diopiError_t calOrderAndSrcMemoryFormat(const DiopiTensor& src, diopiMemoryFormat_t destMemoryFormat, diopiMemoryFormat_t& srcMemoryFormatOut,
87-
std::vector<int32_t>& orderOut, std::vector<int32_t>& reverseOrder) {
88-
if (src.isContiguous(destMemoryFormat)) {
89-
srcMemoryFormatOut = destMemoryFormat;
90-
orderOut.resize(src.dim());
91-
for (int i = 0; i < src.dim(); ++i) {
92-
orderOut[i] = i;
93-
}
94-
reverseOrder = orderOut;
95-
return diopiSuccess;
96-
}
97-
if (src.isContiguous(diopiMemoryFormat_t::ChannelsLast1d) && destMemoryFormat == diopiMemoryFormat_t::Contiguous) {
98-
if (src.dim() != 3) {
99-
setLastErrorString("the dim of the tensor should be 4, but now is %d.", src.dim());
100-
return diopiNoImplement;
101-
}
102-
srcMemoryFormatOut = diopiMemoryFormat_t::ChannelsLast1d;
103-
orderOut = {0, 2, 1};
104-
reverseOrder = {0, 2, 1};
105-
} else if (src.isContiguous(diopiMemoryFormat_t::Contiguous) && destMemoryFormat == diopiMemoryFormat_t::ChannelsLast1d) {
106-
if (src.dim() != 3) {
107-
setLastErrorString("the dim of the tensor should be 4, but now is %d.", src.dim());
108-
return diopiNoImplement;
109-
}
110-
srcMemoryFormatOut = diopiMemoryFormat_t::Contiguous;
111-
orderOut = {0, 2, 1};
112-
reverseOrder = {0, 2, 1};
113-
} else if (src.isContiguous(diopiMemoryFormat_t::ChannelsLast) && destMemoryFormat == diopiMemoryFormat_t::Contiguous) {
114-
if (src.dim() != 4) {
115-
setLastErrorString("the dim of the tensor should be 4, but now is %d.", src.dim());
116-
return diopiNoImplement;
117-
}
118-
srcMemoryFormatOut = diopiMemoryFormat_t::ChannelsLast;
119-
orderOut = {0, 3, 1, 2};
120-
reverseOrder = {0, 2, 3, 1};
121-
} else if (src.isContiguous(diopiMemoryFormat_t::Contiguous) && destMemoryFormat == diopiMemoryFormat_t::ChannelsLast) {
122-
if (src.dim() != 4) {
123-
setLastErrorString("the dim of the tensor should be 4, but now is %d.", src.dim());
124-
return diopiNoImplement;
125-
}
126-
srcMemoryFormatOut = diopiMemoryFormat_t::Contiguous;
127-
orderOut = {0, 2, 3, 1};
128-
reverseOrder = {0, 3, 1, 2};
129-
} else if (src.isContiguous(diopiMemoryFormat_t::Contiguous) && destMemoryFormat == diopiMemoryFormat_t::ChannelsLast3d) {
130-
if (src.dim() != 5) {
131-
setLastErrorString("the dim of the tensor should be 5, but now is %d.", src.dim());
132-
return diopiNoImplement;
133-
}
134-
srcMemoryFormatOut = diopiMemoryFormat_t::Contiguous;
135-
orderOut = {0, 2, 3, 4, 1};
136-
reverseOrder = {0, 4, 1, 2, 3};
137-
} else if (src.isContiguous(diopiMemoryFormat_t::ChannelsLast3d) && destMemoryFormat == diopiMemoryFormat_t::Contiguous) {
138-
if (src.dim() != 5) {
139-
setLastErrorString("the dim of the tensor should be 5, but now is %d.", src.dim());
140-
return diopiNoImplement;
141-
}
142-
srcMemoryFormatOut = diopiMemoryFormat_t::ChannelsLast3d;
143-
orderOut = {0, 4, 1, 2, 3};
144-
reverseOrder = {0, 2, 3, 4, 1};
145-
} else {
146-
// convert to contiguous format
147-
srcMemoryFormatOut = diopiMemoryFormat_t::Preserve;
148-
return diopiSuccess;
149-
}
150-
return diopiSuccess;
151-
}
152-
15375
diopiError_t calCnnlLayout(diopiMemoryFormat_t memoryFormat, int64_t dim, cnnlTensorLayout_t& cnnlLayout) {
15476
switch (memoryFormat) {
15577
case diopiMemoryFormat_t::ChannelsLast1d:
@@ -234,68 +156,61 @@ diopiError_t contiguous(diopiContextHandle_t ctx, DiopiTensor& src, diopiMemoryF
234156

235157
int64_t dim = src.dim();
236158
DIOPI_CHECK(dim <= 8, "only support less than 8d tensor currently");
237-
diopiMemoryFormat_t srcMemoryFormat;
238-
std::vector<int32_t> order;
239-
std::vector<int32_t> reverseOrder;
240159
DiopiTensor dest;
241-
DIOPI_CALL(calOrderAndSrcMemoryFormat(src, memoryFormat, srcMemoryFormat, order, reverseOrder));
242-
if (srcMemoryFormat == diopiMemoryFormat_t::Preserve) {
243-
DIOPI_CALL(clone(ctx, src, dest, memoryFormat));
244-
src = dest;
245-
return diopiSuccess;
246-
}
247-
dest = requiresTensor(ctx, src.shape(), src.dtype(), memoryFormat);
248-
// set CNNL_LAYOUT_ARRAY because NLC->NCL failed ( no layout NCL);
249-
cnnlTensorLayout_t srcLayout = CNNL_LAYOUT_ARRAY;
250-
cnnlTensorLayout_t destLayout = CNNL_LAYOUT_ARRAY;
251-
252-
std::vector<int64_t> olderDestStride = dest.stride();
253-
std::vector<int64_t> olderDestShape = dest.shape();
254-
if (memoryFormat != diopiMemoryFormat_t::Contiguous) {
255-
DIOPI_CALL(permuteTensor(dest, order));
256-
} else {
257-
DIOPI_CALL(permuteTensor(src, reverseOrder));
258-
}
259-
DIOPI_CALL(transpose(ctx, src, dest, srcLayout, destLayout, order));
260-
// recovery the shape
261-
dest.asStrided(olderDestShape, olderDestStride);
160+
DIOPI_CALL(clone(ctx, src, dest, memoryFormat));
262161
src = dest;
263162
return diopiSuccess;
264163
}
265164

266-
// inplace contiguous
267-
diopiError_t contiguousOut(diopiContextHandle_t ctx, DiopiTensor& src, DiopiTensor& dest) {
165+
diopiError_t permuteCopy(diopiContextHandle_t ctx, DiopiTensor& src, DiopiTensor& dest) {
166+
// using input permute + output permute + cnnltranspose to copy
268167
DIOPI_CHECK(src.shape() == dest.shape(), "src's shape should be the same as dest's");
269168
int64_t dim = src.dim();
270169
DIOPI_CHECK(dim <= 8, "only support less than 8d tensor currently");
271-
std::vector<int32_t> order(dim, 0);
272-
std::vector<int32_t> reverseOrder(dim, 0);
170+
bool srcIsContiguous = src.isContiguous();
171+
bool destIsContiguous = dest.isContiguous();
172+
std::vector<int32_t> inputOrder(dim, 0);
173+
std::vector<int32_t> inputBackOrder(dim, 0); // permuteTensor(input,inputBackOrder)->contiguous
174+
std::vector<int32_t> outputOrder(dim, 0);
175+
std::vector<int32_t> outputBackOrder(dim, 0); // permuteTensor(output,outputBackOrder)->contiguous
176+
std::vector<int32_t> inputToOutputOrder(dim, 0); // into cnnltranspose
177+
178+
// input shape:2,3,4,5 stride:60,1,15,3 -> inputBackOrder: 0,2,3,1, inputOrder: 0,3,1,2
179+
// output shape:2,3,4,5 stride:60,20,1,4 -> outputBackOrder: 0,1,3,2, outputOrder: 0,1,3,2
180+
// inputToOutputOrder: 0,2,1,3
181+
182+
getPermuteOrder(src, inputOrder, inputBackOrder);
183+
getPermuteOrder(dest, outputOrder, outputBackOrder);
273184

274-
if (src.isContiguous()) {
275-
getPermuteOrder(dest, reverseOrder, order);
276-
} else {
277-
getPermuteOrder(src, order, reverseOrder);
278-
}
279-
// set CNNL_LAYOUT_ARRAY because NLC->NCL failed ( no layout NCL);
280185
cnnlTensorLayout_t srcLayout = CNNL_LAYOUT_ARRAY;
281186
cnnlTensorLayout_t destLayout = CNNL_LAYOUT_ARRAY;
282187

283188
std::vector<int64_t> olderDestStride = dest.stride();
284189
std::vector<int64_t> olderDestShape = dest.shape();
285190
std::vector<int64_t> olderSrcStride = src.stride();
286191
std::vector<int64_t> olderSrcShape = src.shape();
287-
// if (destMemoryFormat != diopiMemoryFormat_t::Contiguous) {
288-
if (src.isContiguous()) {
289-
DIOPI_CALL(permuteTensor(dest, order));
290-
} else {
291-
DIOPI_CALL(permuteTensor(src, reverseOrder));
192+
193+
// permute to get contiguous tensor
194+
if (!destIsContiguous) {
195+
DIOPI_CALL(permuteTensor(dest, outputBackOrder));
196+
}
197+
198+
if (!srcIsContiguous) {
199+
DIOPI_CALL(permuteTensor(src, inputBackOrder));
200+
}
201+
202+
for (int i = 0; i < dim; ++i) {
203+
inputToOutputOrder[i] = inputOrder[outputBackOrder[i]];
292204
}
293-
DIOPI_CALL(transpose(ctx, src, dest, srcLayout, destLayout, order));
205+
206+
DIOPI_CALL(transpose(ctx, src, dest, srcLayout, destLayout, inputToOutputOrder));
207+
294208
// recovery the shape and strides
295-
// if (destMemoryFormat != diopiMemoryFormat_t::Contiguous) {
296-
if (src.isContiguous()) {
209+
if (!destIsContiguous) {
297210
dest.asStrided(olderDestShape, olderDestStride);
298-
} else {
211+
}
212+
213+
if (!srcIsContiguous) {
299214
src.asStrided(olderSrcShape, olderSrcStride);
300215
}
301216
return diopiSuccess;

impl/camb/common/denseCheck.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,8 @@ diopiError_t sliceToDense(diopiContextHandle_t ctx, DiopiTensor& src, DiopiTenso
124124

125125
// 得到按照stride从大到小顺序排列的shape,实测这种copy最快,也可以考虑用slice;
126126
// e.g. shape 128 12 64 197 stride 453888 64 1 2304-> shape 128 197 64 12 stride 453888 2304 64 1
127+
// e.g. shape: [128, 768, 14, 14], stride: [151296, 1, 10752, 768]
128+
// -> shape: 128 14 14 768, stride:151296, 10752, 768, 1
127129
std::vector<int64_t> generateShape;
128130
std::vector<int64_t> generateStride;
129131
std::vector<int64_t> generateOutStride(dim, 0);

impl/camb/convert_config.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@
1515
layout: NCL, NLC, NCHW, NHWC, NDHWC
1616

1717
- diopiDivScalar:
18-
layout: UD
18+
layout: ND
1919

2020
- diopiDivScalar:
21-
layout: UD
21+
layout: ND
2222

2323
- diopiBmm:
2424
layout: NCL, NLC, NCHW, NHWC, NCDHW, NDHWC
@@ -81,4 +81,4 @@
8181
#* Ops above are not neccesary to convert format *#
8282

8383
- diopiReluInp:
84-
layout: UD
84+
layout: ND

impl/camb/functions/copy.cpp

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ diopiError_t diopiCopyInp(diopiContextHandle_t ctx, diopiConstTensorHandle_t src
3737
return diopiSuccess;
3838
}
3939
DiopiTensor srcTr(src);
40+
DiopiTensor srcTmpTr(src);
4041
DiopiTensor destTr(dest);
4142
cnnlHandle_t handle = cnnlHandlePool.get(ctx);
4243
if (!srcTr.defined()) {
@@ -47,21 +48,27 @@ diopiError_t diopiCopyInp(diopiContextHandle_t ctx, diopiConstTensorHandle_t src
4748
return diopiSuccess;
4849
}
4950

51+
// if src is not dense, change it to preserved-format dense
52+
if (!denseCheck(srcTr)) {
53+
DiopiTensor denseOut;
54+
toDense(ctx, srcTr, denseOut);
55+
srcTr = denseOut;
56+
}
57+
5058
// memory format convert if memory format is matched.
5159
// cnnlTranspose doesn't support float64 or scalar tensors, and permuteCopy only supports conversion between a contiguous tensor and a non-contiguous tensor.
52-
if (srcTr.shape() == destTr.shape() && srcTr.dim() != 0 && srcTr.dtype() != diopi_dtype_float64 && denseCheck(srcTr) && denseCheck(destTr) &&
53-
(destTr.isContiguous() || srcTr.isContiguous())) {
60+
if (srcTr.shape() == destTr.shape() && srcTr.dim() != 0 && srcTr.dtype() != diopi_dtype_float64 && denseCheck(destTr)) {
5461
DiopiTensor destTmpTr = destTr;
5562
if (destTmpTr.dtype() != srcTr.dtype()) {
5663
destTmpTr = requiresTensor(ctx, destTr.shape(), srcTr.dtype());
5764
}
58-
DIOPI_CALL(contiguousOut(ctx, srcTr, destTmpTr));
65+
DIOPI_CALL(permuteCopy(ctx, srcTr, destTmpTr));
66+
5967
if (destTmpTr.dtype() != destTr.dtype()) {
6068
DIOPI_CALL(dataTypeCast(ctx, destTr, destTmpTr));
6169
}
6270
return diopiSuccess;
6371
}
64-
6572
// Ordinary copy
6673
// broadcast
6774
if (srcTr.shape() != destTr.shape()) {

0 commit comments

Comments
 (0)