@@ -5253,7 +5253,7 @@ void G4_BB_SB::footprintMerge(SBNode* node, const SBNode* nextNode)
52535253 return ;
52545254}
52555255
5256- bool G4_BB_SB::hasInternalDependenceWithinDPAS (SBNode* node)
5256+ bool G4_BB_SB::hasInternalDependenceWithinDPAS (SBNode* node) const
52575257{
52585258 const SBFootprint* dstfp = node->getFirstFootprint (Opnd_dst);
52595259
@@ -5322,12 +5322,26 @@ bool G4_BB_SB::hasDependenceBetweenDPASNodes(SBNode* node, SBNode* nextNode)
53225322 return false ;
53235323}
53245324
5325- #define SRC2_CACHE_SIZE 1024
5326- bool G4_BB_SB::src2FootPrintCachePVC (SBNode * curNode, SBNode * nextNode) const
// Returns the size of the DPAS source cache (suppression buffer) for the given source operand:
//   - Opnd_src1: 512
//   - Opnd_src2: 1024, but only when builder.hasDpasSrc2ReadSupression() is true
//   - anything else (including Opnd_src2 without src2 read suppression): 0
// A return value of 0 therefore means "no cache for this operand".
// NOTE(review): the unit looks like bytes -- the caller divides the result by
// numEltPerGRF<Type_UB>() to obtain a GRF count; confirm.
5325+ unsigned short G4_BB_SB::getDpasSrcCacheSize (Gen4_Operand_Number opNum) const
53275326{
5327+ if (opNum == Gen4_Operand_Number::Opnd_src1) {
5328+ return 512 ;
5329+ }
5330+ if (opNum == Gen4_Operand_Number::Opnd_src2) {
5331+ if (builder.hasDpasSrc2ReadSupression ())
5332+ return 1024 ;
5333+ }
5334+ return 0 ;
5335+ }
5336+
5337+ bool G4_BB_SB::dpasSrcFootPrintCache (Gen4_Operand_Number opNum, SBNode * curNode, SBNode * nextNode) const
5338+ {
5339+ // This function is expected to be called only when a suppression buffer exists for the given source operand.
5340+ assert (getDpasSrcCacheSize (opNum) != 0 );
5341+
53285342 BitSet cachedGRF (totalGRFNum, false );
53295343
5330- for (const SBFootprint* fp = curNode->getFirstFootprint (Opnd_src2 ); fp; fp = fp->next )
5344+ for (const SBFootprint* fp = curNode->getFirstFootprint (opNum ); fp; fp = fp->next )
53315345 {
53325346 unsigned short leftB = fp->LeftB / builder.numEltPerGRF <Type_UB>();
53335347 unsigned short rightB = fp->RightB / builder.numEltPerGRF <Type_UB>();
@@ -5337,7 +5351,7 @@ bool G4_BB_SB::src2FootPrintCachePVC(SBNode * curNode, SBNode * nextNode) const
53375351 }
53385352 }
53395353
5340- for (const SBFootprint* fp = nextNode->getFirstFootprint (Opnd_src2 ); fp; fp = fp->next )
5354+ for (const SBFootprint* fp = nextNode->getFirstFootprint (opNum ); fp; fp = fp->next )
53415355 {
53425356 unsigned short leftB = fp->LeftB / builder.numEltPerGRF <Type_UB>();
53435357 unsigned short rightB = fp->RightB / builder.numEltPerGRF <Type_UB>();
@@ -5355,8 +5369,7 @@ bool G4_BB_SB::src2FootPrintCachePVC(SBNode * curNode, SBNode * nextNode) const
53555369 cachedGRFNum++;
53565370 }
53575371 }
5358-
5359- return cachedGRFNum <= (SRC2_CACHE_SIZE + builder.numEltPerGRF <Type_UB>() - 1 ) / builder.numEltPerGRF <Type_UB>();
5372+ return cachedGRFNum <= (getDpasSrcCacheSize (opNum) + builder.numEltPerGRF <Type_UB>() - 1 ) / builder.numEltPerGRF <Type_UB>();
53605373}
53615374
53625375bool G4_BB_SB::src2SameFootPrintDiffType (SBNode * curNode, SBNode * nextNode) const
@@ -5386,8 +5399,9 @@ bool G4_BB_SB::src2SameFootPrintDiffType(SBNode * curNode, SBNode * nextNode) co
53865399 return false ;
53875400}
53885401
5402+
53895403// restrict a macro to :
5390- // 1. Consecutive instructions of same opcode, same datatype in all sources and dest and same register for Src1.
5404+ // 1. Consecutive instructions of same opcode, same datatype in all sources and dest and same register for Src1.
53915405// 2. Allow having variable repeat count
53925406bool G4_BB_SB::isLastDpas (SBNode* curNode, SBNode* nextNode)
53935407{
@@ -5414,14 +5428,8 @@ bool G4_BB_SB::isLastDpas(SBNode* curNode, SBNode* nextNode)
54145428 }
54155429
54165430 G4_InstDpas* dpasInst = curInst->asDpasInst ();
5417- G4_Operand* srcOpnd1 = curInst->getSrc (1 );
5418- G4_Operand* srcOpnd2 = curInst->getSrc (2 );
5419- unsigned short leftBound1 = srcOpnd1->getLinearizedStart ();
5420- unsigned short leftBound2 = srcOpnd2->getLinearizedStart ();
54215431 uint8_t curD = dpasInst->getSystolicDepth ();
54225432 uint8_t curC = dpasInst->getRepeatCount ();
5423- int curSrc1Reg = leftBound1 / builder.numEltPerGRF <Type_UB>();
5424- int curSrc2Reg = leftBound2 / builder.numEltPerGRF <Type_UB>();
54255433
54265434 G4_InstDpas* nextDpasInst = nextInst->asDpasInst ();
54275435 uint8_t nextD = nextDpasInst->getSystolicDepth ();
@@ -5443,15 +5451,8 @@ bool G4_BB_SB::isLastDpas(SBNode* curNode, SBNode* nextNode)
54435451 }
54445452 }
54455453
5446- srcOpnd1 = nextDpasInst->getSrc (1 );
5447- srcOpnd2 = nextDpasInst->getSrc (2 );
5448- leftBound1 = srcOpnd1->getLinearizedStart ();
5449- leftBound2 = srcOpnd2->getLinearizedStart ();
5450- int nextSrc1Reg = leftBound1 / builder.numEltPerGRF <Type_UB>();
5451- int nextSrc2Reg = leftBound2 / builder.numEltPerGRF <Type_UB>();
5452-
5453- if (builder.hasSrc2ReadSupression () &&
5454- builder.hasSrc2ReadSupressionSameRegSameType () &&
5454+ if (builder.hasDpasSrc2ReadSupression () &&
5455+ builder.hasDpasSrc2ReadSupressionSameRegSameType () &&
54555456 src2SameFootPrintDiffType (curNode, nextNode))
54565457 {
54575458 return true ;
@@ -5462,26 +5463,38 @@ bool G4_BB_SB::isLastDpas(SBNode* curNode, SBNode* nextNode)
54625463 return false ;
54635464 }
54645465
5465- // src1 or src2 read suppression
5466- if (curSrc1Reg == nextSrc1Reg ||
5467- (builder.hasSrc2ReadSupression () && (curSrc2Reg == nextSrc2Reg &&
5468- curC == nextC &&
5469- curC == 8 )))
5466+ // A dpas that has a dependence on another dpas in the macro cannot be part of the macro.
5467+ if (hasDependenceBetweenDPASNodes (curNode, nextNode))
5468+ {
5469+ return true ;
5470+ }
5471+
5472+ // Src1 read suppression:
5473+ if (dpasSrcFootPrintCache (Gen4_Operand_Number::Opnd_src1, curNode, nextNode) &&
5474+ curNode->getFirstFootprint (Opnd_src1)->isSameOrNoOverlap (nextNode->getFirstFootprint (Opnd_src1)))
54705475 {
54715476 return false ;
54725477 }
54735478
5474- // Src2 read suppression with GRF cache.
5479+ // Src2 read suppression:
54755480 // Using {Atomic} in the last line of a macro (such as in the lines I highlighted) has some implications in the hardware implementation:
54765481 // 1. In 8x8 macros (such as the one you pasted), this is fine.
54775482 // 2. For other repeat counts, it causes the src1 of the next macro to be ignored.
54785483 // Hardware uses {Atomic} to indicate that the next instruction will reuse the src1. In an 8x8, they always verify
5484+ // 3. non-df dpas
5485+ auto isDFInst = [](G4_INST& inst) {
5486+ for (Gen4_Operand_Number opndNum : {Opnd_dst, Opnd_src0, Opnd_src1, Opnd_src2})
5487+ if (inst.getOperand (opndNum)->getType () == G4_Type::Type_DF)
5488+ return true ;
5489+ return false ;
5490+ };
54795491
5480- if (builder.hasSrc2ReadSupression () &&
5492+ if (builder.hasDpasSrc2ReadSupression () &&
54815493 curC == nextC &&
54825494 curC == 8 &&
5483- src2FootPrintCachePVC (curNode, nextNode) &&
5484- curNode->getFirstFootprint (Opnd_src2)->isWholeOverlap (nextNode->getFirstFootprint (Opnd_src2)))
5495+ !isDFInst (*nextDpasInst) &&
5496+ dpasSrcFootPrintCache (Gen4_Operand_Number::Opnd_src2, curNode, nextNode) &&
5497+ curNode->getFirstFootprint (Opnd_src2)->isSameOrNoOverlap (nextNode->getFirstFootprint (Opnd_src2)))
54855498 {
54865499 return false ;
54875500 }
@@ -5776,11 +5789,8 @@ void G4_BB_SB::SBDDD(G4_BB* bb,
57765789 // Different Depth, src1 and type cannot be merged
57775790 // Same register reuse in dest and src cannot be a part of a macro, even the last one.
57785791 if (sameSrcDst ||
5779- isLastDpas (node, &nextNode) ||
5780- hasDependenceBetweenDPASNodes (node, &nextNode))
5781- {
5792+ isLastDpas (node, &nextNode)) // if isLastDpas is true, it might still can be fwd
57825793 break ;
5783- }
57845794
57855795 if (hasInternalDependenceWithinDPAS (&nextNode))
57865796 {
0 commit comments