Skip to content

Commit 8de77f8

Browse files
DianaChen authored and igcbot committed
vISA SWSB: refactor dpas macro setting
Unify code for handling src1 and src2 read suppression
1 parent 223ccfd commit 8de77f8

File tree

6 files changed

+89
-44
lines changed

6 files changed

+89
-44
lines changed

visa/BinaryEncodingIGA.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -429,6 +429,7 @@ InstOptSet BinaryEncodingIGA::getIGAInstOptSet(G4_INST* inst) const
429429
options.add(InstOpt::NOCOMPACT);
430430
}
431431

432+
432433
return options;
433434
}
434435

visa/G4_IR.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -822,7 +822,6 @@ typedef struct _SWSBInfo
822822
bool isNoDDChkInst() const { return (option & InstOpt_NoDDChk) ? true : false; }
823823
bool isNoDDClrInst() const { return (option & InstOpt_NoDDClr) ? true : false; }
824824
bool isBreakPointInst() const { return (option & InstOpt_BreakPoint) ? true : false; }
825-
826825
// true if inst reads/writes acc either implicitly or explicitly
827826
bool useAcc() const
828827
{

visa/HWCaps.inc

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -924,12 +924,13 @@ SPDX-License-Identifier: MIT
924924
return false;
925925
}
926926

927-
bool hasSrc2ReadSupression() const
927+
bool hasDpasSrc2ReadSupression() const
928928
{
929929
return getPlatform() >= Xe_PVCXT;
930930
}
931931

932-
bool hasSrc2ReadSupressionSameRegSameType() const
932+
933+
bool hasDpasSrc2ReadSupressionSameRegSameType() const
933934
{
934935
return getPlatform() == Xe_PVC && !getOption(vISA_HasPartialInt64);
935936
}

visa/LocalScheduler/SWSB_G4IR.cpp

Lines changed: 46 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -5253,7 +5253,7 @@ void G4_BB_SB::footprintMerge(SBNode* node, const SBNode* nextNode)
52535253
return;
52545254
}
52555255

5256-
bool G4_BB_SB::hasInternalDependenceWithinDPAS(SBNode* node)
5256+
bool G4_BB_SB::hasInternalDependenceWithinDPAS(SBNode* node) const
52575257
{
52585258
const SBFootprint* dstfp = node->getFirstFootprint(Opnd_dst);
52595259

@@ -5322,12 +5322,26 @@ bool G4_BB_SB::hasDependenceBetweenDPASNodes(SBNode* node, SBNode* nextNode)
53225322
return false;
53235323
}
53245324

5325-
#define SRC2_CACHE_SIZE 1024
5326-
bool G4_BB_SB::src2FootPrintCachePVC(SBNode * curNode, SBNode * nextNode) const
5325+
unsigned short G4_BB_SB::getDpasSrcCacheSize(Gen4_Operand_Number opNum) const
53275326
{
5327+
if (opNum == Gen4_Operand_Number::Opnd_src1) {
5328+
return 512;
5329+
}
5330+
if (opNum == Gen4_Operand_Number::Opnd_src2) {
5331+
if (builder.hasDpasSrc2ReadSupression())
5332+
return 1024;
5333+
}
5334+
return 0;
5335+
}
5336+
5337+
bool G4_BB_SB::dpasSrcFootPrintCache(Gen4_Operand_Number opNum, SBNode * curNode, SBNode * nextNode) const
5338+
{
5339+
// this function is expected to be called only when there is a suppression buffer for the given src number
5340+
assert(getDpasSrcCacheSize(opNum) != 0);
5341+
53285342
BitSet cachedGRF(totalGRFNum, false);
53295343

5330-
for (const SBFootprint* fp = curNode->getFirstFootprint(Opnd_src2); fp; fp = fp->next)
5344+
for (const SBFootprint* fp = curNode->getFirstFootprint(opNum); fp; fp = fp->next)
53315345
{
53325346
unsigned short leftB = fp->LeftB / builder.numEltPerGRF<Type_UB>();
53335347
unsigned short rightB = fp->RightB / builder.numEltPerGRF<Type_UB>();
@@ -5337,7 +5351,7 @@ bool G4_BB_SB::src2FootPrintCachePVC(SBNode * curNode, SBNode * nextNode) const
53375351
}
53385352
}
53395353

5340-
for (const SBFootprint* fp = nextNode->getFirstFootprint(Opnd_src2); fp; fp = fp->next)
5354+
for (const SBFootprint* fp = nextNode->getFirstFootprint(opNum); fp; fp = fp->next)
53415355
{
53425356
unsigned short leftB = fp->LeftB / builder.numEltPerGRF<Type_UB>();
53435357
unsigned short rightB = fp->RightB / builder.numEltPerGRF<Type_UB>();
@@ -5355,8 +5369,7 @@ bool G4_BB_SB::src2FootPrintCachePVC(SBNode * curNode, SBNode * nextNode) const
53555369
cachedGRFNum++;
53565370
}
53575371
}
5358-
5359-
return cachedGRFNum <= (SRC2_CACHE_SIZE + builder.numEltPerGRF<Type_UB>() - 1) / builder.numEltPerGRF<Type_UB>();
5372+
return cachedGRFNum <= (getDpasSrcCacheSize(opNum) + builder.numEltPerGRF<Type_UB>() - 1) / builder.numEltPerGRF<Type_UB>();
53605373
}
53615374

53625375
bool G4_BB_SB::src2SameFootPrintDiffType(SBNode * curNode, SBNode * nextNode) const
@@ -5386,8 +5399,9 @@ bool G4_BB_SB::src2SameFootPrintDiffType(SBNode * curNode, SBNode * nextNode) co
53865399
return false;
53875400
}
53885401

5402+
53895403
//restrict a macro to :
5390-
// 1. Consecutive instructions of same opcode, same datatype in all sources and dest and same register for Src1.
5404+
// 1. Consecutive instructions of same opcode, same datatype in all sources and dest and same register for Src1.
53915405
// 2. Allow having variable repeat count
53925406
bool G4_BB_SB::isLastDpas(SBNode* curNode, SBNode* nextNode)
53935407
{
@@ -5414,14 +5428,8 @@ bool G4_BB_SB::isLastDpas(SBNode* curNode, SBNode* nextNode)
54145428
}
54155429

54165430
G4_InstDpas* dpasInst = curInst->asDpasInst();
5417-
G4_Operand* srcOpnd1 = curInst->getSrc(1);
5418-
G4_Operand* srcOpnd2 = curInst->getSrc(2);
5419-
unsigned short leftBound1 = srcOpnd1->getLinearizedStart();
5420-
unsigned short leftBound2 = srcOpnd2->getLinearizedStart();
54215431
uint8_t curD = dpasInst->getSystolicDepth();
54225432
uint8_t curC = dpasInst->getRepeatCount();
5423-
int curSrc1Reg = leftBound1 / builder.numEltPerGRF<Type_UB>();
5424-
int curSrc2Reg = leftBound2 / builder.numEltPerGRF<Type_UB>();
54255433

54265434
G4_InstDpas* nextDpasInst = nextInst->asDpasInst();
54275435
uint8_t nextD = nextDpasInst->getSystolicDepth();
@@ -5443,15 +5451,8 @@ bool G4_BB_SB::isLastDpas(SBNode* curNode, SBNode* nextNode)
54435451
}
54445452
}
54455453

5446-
srcOpnd1 = nextDpasInst->getSrc(1);
5447-
srcOpnd2 = nextDpasInst->getSrc(2);
5448-
leftBound1 = srcOpnd1->getLinearizedStart();
5449-
leftBound2 = srcOpnd2->getLinearizedStart();
5450-
int nextSrc1Reg = leftBound1 / builder.numEltPerGRF<Type_UB>();
5451-
int nextSrc2Reg = leftBound2 / builder.numEltPerGRF<Type_UB>();
5452-
5453-
if (builder.hasSrc2ReadSupression() &&
5454-
builder.hasSrc2ReadSupressionSameRegSameType() &&
5454+
if (builder.hasDpasSrc2ReadSupression() &&
5455+
builder.hasDpasSrc2ReadSupressionSameRegSameType() &&
54555456
src2SameFootPrintDiffType(curNode, nextNode))
54565457
{
54575458
return true;
@@ -5462,26 +5463,38 @@ bool G4_BB_SB::isLastDpas(SBNode* curNode, SBNode* nextNode)
54625463
return false;
54635464
}
54645465

5465-
//src1 or src2 read suppression
5466-
if (curSrc1Reg == nextSrc1Reg ||
5467-
(builder.hasSrc2ReadSupression() && (curSrc2Reg == nextSrc2Reg &&
5468-
curC == nextC &&
5469-
curC == 8)))
5466+
// dpas having dependency to other dpas in the macro cannot be part of the macro
5467+
if (hasDependenceBetweenDPASNodes(curNode, nextNode))
5468+
{
5469+
return true;
5470+
}
5471+
5472+
//Src1 read suppression:
5473+
if (dpasSrcFootPrintCache(Gen4_Operand_Number::Opnd_src1, curNode, nextNode) &&
5474+
curNode->getFirstFootprint(Opnd_src1)->isSameOrNoOverlap(nextNode->getFirstFootprint(Opnd_src1)))
54705475
{
54715476
return false;
54725477
}
54735478

5474-
//Src2 read suppression with GRF cache.
5479+
//Src2 read suppression:
54755480
//Using {Atomic} in the last line of a macro (such as in the lines I highlighted) has some implications in the hardware implementation:
54765481
//1. In 8x8 macros (such as the one you pasted), this is fine.
54775482
//2. In other repetitions, it will cause the src1 of the next macro to be ignored.
54785483
// Hardware uses {Atomic} to indicate that the next instruction will reuse the src1. In an 8x8, they always verify
5484+
//3. non-df dpas
5485+
auto isDFInst = [](G4_INST& inst) {
5486+
for (Gen4_Operand_Number opndNum : {Opnd_dst, Opnd_src0, Opnd_src1, Opnd_src2})
5487+
if (inst.getOperand(opndNum)->getType() == G4_Type::Type_DF)
5488+
return true;
5489+
return false;
5490+
};
54795491

5480-
if (builder.hasSrc2ReadSupression() &&
5492+
if (builder.hasDpasSrc2ReadSupression() &&
54815493
curC == nextC &&
54825494
curC == 8 &&
5483-
src2FootPrintCachePVC(curNode, nextNode) &&
5484-
curNode->getFirstFootprint(Opnd_src2)->isWholeOverlap(nextNode->getFirstFootprint(Opnd_src2)))
5495+
!isDFInst(*nextDpasInst) &&
5496+
dpasSrcFootPrintCache(Gen4_Operand_Number::Opnd_src2, curNode, nextNode) &&
5497+
curNode->getFirstFootprint(Opnd_src2)->isSameOrNoOverlap(nextNode->getFirstFootprint(Opnd_src2)))
54855498
{
54865499
return false;
54875500
}
@@ -5776,11 +5789,8 @@ void G4_BB_SB::SBDDD(G4_BB* bb,
57765789
//Different Depth, src1 and type cannot be merged
57775790
//Same register reuse in dest and src cannot be a part of a macro, even the last one.
57785791
if (sameSrcDst ||
5779-
isLastDpas(node, &nextNode) ||
5780-
hasDependenceBetweenDPASNodes(node, &nextNode))
5781-
{
5792+
isLastDpas(node, &nextNode)) // if isLastDpas is true, it might still can be fwd
57825793
break;
5783-
}
57845794

57855795
if (hasInternalDependenceWithinDPAS(&nextNode))
57865796
{

visa/LocalScheduler/SWSB_G4IR.h

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,6 @@ namespace vISA
219219
return false;
220220
}
221221

222-
223222
//Check if current footprint overlaps footprint2
224223
//FIXME: it's conservative. Because for the indirect, the ranges may be contiguous?
225224
bool isWholeOverlap(const SBFootprint *liveFootprint) const
@@ -256,6 +255,38 @@ namespace vISA
256255
return findOverlap;
257256
}
258257

258+
// check if the current footprint has the same range as the given one, or does not overlap it at all
259+
bool isSameOrNoOverlap(const SBFootprint* liveFootprint) const
260+
{
261+
unsigned short offset = 0;
262+
if (!hasOverlap(liveFootprint, offset))
263+
return true;
264+
265+
266+
for (const SBFootprint* footprint2Ptr = liveFootprint; footprint2Ptr; footprint2Ptr = footprint2Ptr->next)
267+
{
268+
if (fType == footprint2Ptr->fType &&
269+
LeftB == footprint2Ptr->LeftB && RightB == footprint2Ptr->RightB)
270+
continue;
271+
272+
bool findSame = false;
273+
for (const SBFootprint* curFootprintPtr = next; curFootprintPtr; curFootprintPtr = curFootprintPtr->next)
274+
{
275+
FOOTPRINT_TYPE curFType = curFootprintPtr->fType;
276+
if (curFType == footprint2Ptr->fType &&
277+
curFootprintPtr->LeftB == footprint2Ptr->LeftB && curFootprintPtr->RightB == footprint2Ptr->RightB)
278+
{
279+
findSame = true;
280+
break;
281+
}
282+
}
283+
284+
if (!findSame)
285+
return false;
286+
}
287+
return true;
288+
}
289+
259290
};
260291

261292
// Bit set which is used for global dependence analysis for SBID.
@@ -1352,7 +1383,9 @@ namespace vISA
13521383

13531384
int totalGRFNum;
13541385
int tokenAfterDPASCycle;
1355-
1386+
private:
1387+
// dpas read suppression buffer size
1388+
unsigned short getDpasSrcCacheSize(Gen4_Operand_Number opNum) const;
13561389
public:
13571390
LiveGRFBuckets *send_use_kills;
13581391
BB_SWSB_LIST Preds;
@@ -1503,13 +1536,13 @@ namespace vISA
15031536

15041537
void clearKilledBucketNodeXeHP(LiveGRFBuckets* LB, int integerID, int floatID, int longID, int mathID);
15051538

1506-
bool hasInternalDependenceWithinDPAS(SBNode *node);
1539+
bool hasInternalDependenceWithinDPAS(SBNode *node) const;
15071540
bool hasDependenceBetweenDPASNodes(SBNode * node, SBNode * nextNode);
1508-
bool src2FootPrintCachePVC(SBNode* curNode, SBNode* nextNode) const;
1541+
// check if the given src can be cached (by src suppression buffer)
1542+
bool dpasSrcFootPrintCache(Gen4_Operand_Number opNum, SBNode* curNode, SBNode* nextNode) const;
15091543
bool src2SameFootPrintDiffType(SBNode* curNode, SBNode* nextNode) const;
15101544
bool isLastDpas(SBNode * curNode, SBNode * nextNode);
15111545

1512-
15131546
void getLiveOutToken(unsigned allSendNum, const SBNODE_VECT *SBNodes);
15141547

15151548

visa/iga/IGALibrary/Backend/GED/Encoder.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@ void Encoder::encodeBlock(Block *blk)
174174
mustNotCompact = true;
175175
}
176176

177+
177178
int32_t iLen = 16;
178179
if (mustCompact || (!mustNotCompact && m_opts.autoCompact)) {
179180
// try compact first

0 commit comments

Comments
 (0)