3535#define COL_MAJOR 1
3636#endif
3737
38+ // #define MATMUL_DEBUG
39+
3840namespace xf {
3941namespace dsp {
4042namespace aie {
@@ -94,13 +96,28 @@ constexpr loHi getUnTileShuffleOffsetsInt16(unsigned M, unsigned N, unsigned vec
9496 loHi ret = {.lo = offLo, .hi = offHi};
9597 return ret;
9698}
99+
/**
 * Compile-time choice of how many T_D elements one vector should hold when
 * processing an inRow x inCol block of data.
 *
 * The starting point is the number of T_D elements that fit in 512 bits.
 * That count is then adjusted as follows:
 *   1. If the whole block (inRow * inCol) is smaller than the 512-bit count,
 *      the block size itself is returned.
 *   2. If the 512-bit count divides inCol exactly, or inCol is smaller than
 *      the 512-bit count, the 512-bit count is returned unchanged.
 *   3. Otherwise the count is halved repeatedly until it divides inCol.
 *
 * NOTE(review): the choice of 512 bits is not explained by the code. doUnTile
 * separately asserts that the result fits in 1024 bits, so 512 is presumably a
 * preferred working width rather than a hard limit - confirm.
 *
 * @tparam T_D    element type; only sizeof(T_D) is used.
 * @tparam inRow  first dimension of the block, in elements.
 * @tparam inCol  second dimension of the block, in elements.
 * @return        the selected vector size, in elements.
 */
template <typename T_D, unsigned inRow, unsigned inCol>
static constexpr int getVecSize() {
    constexpr unsigned minVBuffSizeforType = (512 / 8) / sizeof(T_D);

    // A type wider than 64 bytes would make the count zero and turn the
    // modulo below into a division by zero; fail with a readable message.
    static_assert(minVBuffSizeforType > 0, "T_D is wider than 512 bits; cannot derive a vector size.");

    if constexpr (minVBuffSizeforType > (inRow * inCol)) {
        // The whole block fits in less than 512 bits.
        return static_cast<int>(inRow * inCol);
    } else if constexpr (inCol % minVBuffSizeforType == 0 || minVBuffSizeforType > inCol) {
        return static_cast<int>(minVBuffSizeforType);
    } else {
        // inCol is larger than the 512-bit count but not a multiple of it.
        // Halve until the count divides inCol; the loop always stops because
        // every value divides evenly once the count reaches 1.
        unsigned vSize = minVBuffSizeforType;
        while (inCol % vSize != 0) {
            vSize /= 2;
        }
        return static_cast<int>(vSize);
    }
}
115+
97116template <unsigned M, unsigned N, unsigned inRow, unsigned inCol, unsigned leadingDim, typename T_D>
98117static void doUnTile (T_D* __restrict inPtr, T_D* outPtr) {
99118 constexpr unsigned minGranularity = (128 / 8 ) / sizeof (T_D);
100119 constexpr unsigned loadSize = (N >= minGranularity) ? N : minGranularity;
101- constexpr unsigned minVBuffSizeforType = (512 / 8 ) / sizeof (T_D);
102- constexpr unsigned vectorSize = (minVBuffSizeforType > (inRow * inCol)) ? (inRow * inCol) : minVBuffSizeforType;
103-
120+ constexpr unsigned vectorSize = getVecSize<T_D, inRow, inCol>();
104121 // static_assert(N >= minGranularity, "Granularity is awkward");
105122 static_assert (vectorSize <= (1024 / 8 ) / sizeof (T_D), " calculated vector size too large for vector register." );
106123 static_assert (!(leadingDim == COL_MAJOR && std::is_same_v<T_D, int16>),
@@ -110,9 +127,6 @@ static void doUnTile(T_D* __restrict inPtr, T_D* outPtr) {
110127 loHi offsets = std::is_same_v<T_D, int16> ? getUnTileShuffleOffsetsInt16 (M, N, vectorSize, leadingDim)
111128 : getUnTileShuffleOffsets (M, N, vectorSize, leadingDim);
112129
113- // printf("M: %d, N: %d, vectorSize: %d, loadSize: %d, leadingDim: %d\n", M, N, vectorSize, loadSize, leadingDim);
114- // printf("Offsets: lo : %0X, hi: %0X\n", offsets.lo, offsets.hi);
115-
116130 const unsigned loadsPerVector = vectorSize / loadSize;
117131 const unsigned tilesPerVector = vectorSize / (M * N);
118132 const unsigned colsPerLoad =
@@ -129,8 +143,7 @@ static void doUnTile(T_D* __restrict inPtr, T_D* outPtr) {
129143
130144 const unsigned vectorsPerCol = inRow / rowsPerVector;
131145 const unsigned vectorsPerRow = inCol / colsPerVector;
132- // printf("colsPerLoad: %d, rowsPerLoad: %d, colsPerVector: %d, rowsPerVector: %d, vectorsPerCol: %d, vectorsPerRow:
133- // %d\n",colsPerLoad, rowsPerLoad, colsPerVector, rowsPerVector, vectorsPerCol, vectorsPerRow );
146+
134147 // Loop through a row first if row major
135148 const unsigned outerLoopCount = (leadingDim == ROW_MAJOR) ? vectorsPerCol : vectorsPerRow;
136149 const unsigned innerLoopCount = (leadingDim == ROW_MAJOR) ? vectorsPerRow : vectorsPerCol;
@@ -145,9 +158,6 @@ static void doUnTile(T_D* __restrict inPtr, T_D* outPtr) {
145158 const unsigned outerDimStoreIncr = (leadingDim == ROW_MAJOR) ? inCol : inRow;
146159 const unsigned innerDimStoreIncr = storeSize;
147160
148- // printf("outerLoopCount: %d, innerLoopCount: %d, outerDimPerVector: %d, innerDimPerVector: %d, storeSize: %d \n",
149- // outerLoopCount, innerLoopCount, outerDimPerVector, innerDimPerVector, storeSize);
150-
151161 const bool shuffleIsNeeded = (leadingDim == COL_MAJOR) || ((leadingDim == ROW_MAJOR) && (loadSize > N));
152162
153163 for (unsigned outerDimIdx = 0 ; outerDimIdx < outerLoopCount; ++outerDimIdx)
@@ -161,8 +171,6 @@ static void doUnTile(T_D* __restrict inPtr, T_D* outPtr) {
161171 for (unsigned innerDimIdx = 0 ; innerDimIdx < innerLoopCount; ++innerDimIdx)
162172 chess_loop_count ((innerLoopCount)) chess_prepare_for_pipelining {
163173 const unsigned ptrInnerBase = innerDimIdx * innerLoopIncr;
164- // printf("outerDimIdx: %d, ptrOuterBase: %d, innerDimIdx: %d, ptrInnerBase: %d\n",outerDimIdx,
165- // ptrOuterBase, innerDimIdx, ptrInnerBase);
166174
167175 aie::vector<T_D, vectorSize> vec;
168176
@@ -188,29 +196,17 @@ static void doUnTile(T_D* __restrict inPtr, T_D* outPtr) {
188196 loadSize * loadIdx; // unlikely
189197
190198 const unsigned loadPtr = innerLoadPtr + ptrInnerBase + ptrOuterBase;
191-
192- // printf("loadPtr=%d, innerLoadPtr=%d\n", loadPtr, innerLoadPtr);
193- // load
194-
195199 vec.insert (loadIdx, aie::load_v<loadSize>(inPtr + loadPtr));
196200 }
197201
198- // myprint(vec, true, "beforeShuffle: ");
199202 if
200- constexpr (shuffleIsNeeded) {
201- // printf("We need to do a shuffle\n");
202- vec = doShuffle (vec, 0 , offsets);
203- // myprint(vec, true, "afterShuffle: ");
204- }
203+ constexpr (shuffleIsNeeded) { vec = doShuffle (vec, 0 , offsets); }
205204#pragma unroll((outerDimPerVector))
206205 for (unsigned outerStoreIdx = 0 ; outerStoreIdx < outerDimPerVector; ++outerStoreIdx) {
207206 const unsigned storeOuterPtr = outerStoreIdx * outerDimStoreIncr;
208207#pragma unroll((std::max(innerDimPerVector / storeSize, (unsigned) 1)))
209208 for (unsigned innerStoreIdx = 0 ;
210209 innerStoreIdx < std::max (innerDimPerVector / storeSize, (unsigned )1 ); ++innerStoreIdx) {
211- // printf("outerStoreIdx=%d, storeOuterPtr=%d, innerStoreIdx=%d, storeInnerPtr=%d\n",
212- // outerStoreIdx,storeOuterPtr, innerStoreIdx, innerStoreIdx*storeSize);
213-
214210 // If we don't shuffle and still load multiple outerDims, then we need to skip over that.
215211 const unsigned sliceIdx =
216212 (!shuffleIsNeeded && outerDimPerVector > 1 )
@@ -220,23 +216,12 @@ static void doUnTile(T_D* __restrict inPtr, T_D* outPtr) {
220216 innerDimIdx * innerDimPerVector +
221217 outerDimIdx * outerDimPerVector * outerDimStoreIncr;
222218
223- // printf("storePtr=%d, sliceIdx=%d\n", storePtr, sliceIdx);
224-
225219 // store direct to window
226220 aie::store_v (outPtr + storePtr, vec.template extract <storeSize>(sliceIdx));
227221 }
228222 }
229223 }
230224 }
231-
232- const unsigned tileSize = (M * N);
233- // for (unsigned AChunk=0; AChunk<(inRow*inCol); AChunk+=tileSize){
234- // aie::vector<T_D, tileSize> APost = aie::load_v<tileSize>(outPtr); outPtr += tileSize;
235- // // //aie::vector<T_D, sizeTileA> A1 = aie::load_v<sizeTileA>(pA1); pA1 += sizeTileA;
236- // myprint(APost,true,"A0postProc: ");
237- // // myprint(A1,true,"A1preProc: ");
238- // //
239- // }
240225}
241226
242227namespace aie = ::aie;
0 commit comments