Commit 2e15bfc (parent fc9dd58)

[AMDGPU] Add scheduling stage to rewrite MFMA from VGPR to AGPR

Change-Id: I47b2a4274a35f3cf0a6d064674d1d29526e4dfd2
File tree: 2 files changed (+315, −6 lines)

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp (285 additions, 1 deletion)
@@ -29,6 +29,7 @@
 #include "SIMachineFunctionInfo.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineCycleAnalysis.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/MC/LaneBitmask.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -528,6 +529,7 @@ GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
     const MachineSchedContext *C, bool IsLegacyScheduler)
     : GCNSchedStrategy(C) {
   SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
+  SchedStages.push_back(GCNSchedStageID::RewriteSchedule);
   SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
   SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
   SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
@@ -778,6 +780,8 @@ GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
   switch (SchedStageID) {
   case GCNSchedStageID::OccInitialSchedule:
     return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
+  case GCNSchedStageID::RewriteSchedule:
+    return std::make_unique<RewriteScheduleStage>(SchedStageID, *this);
   case GCNSchedStageID::UnclusteredHighRPReschedule:
     return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
   case GCNSchedStageID::ClusteredLowOccupancyReschedule:
@@ -803,7 +807,8 @@ void GCNScheduleDAGMILive::schedule() {
 GCNRegPressure
 GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
   GCNDownwardRPTracker RPTracker(*LIS);
-  RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
+  RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second,
+                    &LiveIns[RegionIdx]);
   return RPTracker.moveMaxPressure();
 }

@@ -940,10 +945,12 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
   Pressure.resize(Regions.size());
   RegionsWithHighRP.resize(Regions.size());
   RegionsWithExcessRP.resize(Regions.size());
+  RegionsWithExcessArchVGPR.resize(Regions.size());
   RegionsWithMinOcc.resize(Regions.size());
   RegionsWithIGLPInstrs.resize(Regions.size());
   RegionsWithHighRP.reset();
   RegionsWithExcessRP.reset();
+  RegionsWithExcessArchVGPR.reset();
   RegionsWithMinOcc.reset();
   RegionsWithIGLPInstrs.reset();

@@ -1002,6 +1009,9 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
   case GCNSchedStageID::OccInitialSchedule:
     OS << "Max Occupancy Initial Schedule";
     break;
+  case GCNSchedStageID::RewriteSchedule:
+    OS << "Instruction Rewriting Reschedule";
+    break;
   case GCNSchedStageID::UnclusteredHighRPReschedule:
     OS << "Unclustered High Register Pressure Reschedule";
     break;
@@ -1035,6 +1045,245 @@ bool GCNSchedStage::initGCNSchedStage() {
   return true;
 }
 
+bool RewriteScheduleStage::initGCNSchedStage() {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  if (!ST.hasGFX90AInsts() || DAG.RegionsWithExcessArchVGPR.none())
+    return false;
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *SRI = ST.getRegisterInfo();
+  SmallPtrSet<MachineInstr *, 16> CrossRCUseCopies;
+  SmallPtrSet<MachineInstr *, 16> CrossRCDefCopies;
+  std::vector<std::pair<MachineInstr *, unsigned>> RewriteInsts;
+
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      if (TII->isMAI(MI)) {
+        int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
+        if (ReplacementOp == -1)
+          continue;
+        const TargetRegisterClass *VGPRRC =
+            DAG.MRI.getRegClass(MI.getOperand(0).getReg());
+        const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(VGPRRC);
+        const TargetRegisterClass *DestConstrainExceptRC =
+            recomputeRegClassExceptRewritable(MI.getOperand(0).getReg(), VGPRRC,
+                                              AGPRRC);
+
+        if (!DestConstrainExceptRC)
+          CrossRCUseCopies.insert(&MI);
+
+        MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+        if (Src2 && Src2->isReg()) {
+          const TargetRegisterClass *Src2ConstrainExceptRC =
+              recomputeRegClassExceptRewritable(Src2->getReg(), VGPRRC, AGPRRC);
+          if ((!Src2ConstrainExceptRC || Src2ConstrainExceptRC != AGPRRC))
+            CrossRCDefCopies.insert(&MI);
+
+          DAG.MRI.setRegClass(Src2->getReg(), AGPRRC);
+        }
+
+        DAG.MRI.setRegClass(MI.getOperand(0).getReg(), AGPRRC);
+
+        auto OriginalOpc = MI.getOpcode();
+        MI.setDesc(TII->get(ReplacementOp));
+        RewriteInsts.push_back({&MI, OriginalOpc});
+      }
+    }
+  }
+
+  bool ShouldRewrite = false;
+  for (unsigned RegionIdx = 0; RegionIdx < DAG.Regions.size(); RegionIdx++) {
+    if (!DAG.RegionsWithExcessArchVGPR[RegionIdx])
+      continue;
+
+    // For the cases we care about (i.e. ArchVGPR usage is greater than the
+    // addressable limit), rewriting alone should bring pressure to a
+    // manageable level. If we find any such region, then the rewrite is
+    // potentially beneficial.
+    auto PressureAfter = DAG.getRealRegPressure(RegionIdx);
+    unsigned MaxCombinedVGPRs = ST.getMaxNumVGPRs(MF);
+    if (PressureAfter.getArchVGPRNum() <= ST.getAddressableNumArchVGPRs() &&
+        PressureAfter.getVGPRNum(true) <= MaxCombinedVGPRs) {
+      ShouldRewrite = true;
+      break;
+    }
+  }
+
+  // If we find that we'll need to insert cross RC copies inside loop bodies,
+  // then bail.
+  if (ShouldRewrite) {
+    CI.clear();
+    CI.compute(MF);
+
+    for (auto *DefMI : CrossRCUseCopies) {
+      auto DefReg = DefMI->getOperand(0).getReg();
+
+      for (auto &UseMI : DAG.MRI.use_nodbg_instructions(DefReg)) {
+        for (unsigned OpNo = 0; OpNo < UseMI.getNumOperands(); OpNo++) {
+          auto &TheOp = UseMI.getOperand(OpNo);
+          if (!TheOp.isReg() || !TheOp.isUse())
+            continue;
+          if (TheOp.getReg() != DefReg)
+            continue;
+
+          auto RequiredRC = UseMI.getRegClassConstraint(OpNo, DAG.TII, DAG.TRI);
+          if (!RequiredRC || SRI->hasAGPRs(RequiredRC))
+            continue;
+
+          unsigned DefDepth = CI.getCycleDepth(DefMI->getParent());
+          if (DefDepth && CI.getCycleDepth(UseMI.getParent()) >= DefDepth) {
+            ShouldRewrite = false;
+            break;
+          }
+        }
+        if (!ShouldRewrite)
+          break;
+      }
+      if (!ShouldRewrite)
+        break;
+    }
+  }
+
+  // If we haven't found the beneficial conditions, prefer the VGPR form, which
+  // may result in fewer cross RC copies.
+  if (!ShouldRewrite) {
+    for (auto RI : RewriteInsts) {
+      MachineInstr *MI = RI.first;
+
+      assert(TII->isMAI(*MI));
+      const TargetRegisterClass *AGPRRC =
+          DAG.MRI.getRegClass(MI->getOperand(0).getReg());
+      const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(AGPRRC);
+
+      MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
+      assert(Src2);
+
+      if (Src2->isReg()) {
+        DAG.MRI.setRegClass(Src2->getReg(), VGPRRC);
+      }
+      DAG.MRI.setRegClass(MI->getOperand(0).getReg(), VGPRRC);
+      MI->setDesc(TII->get(RI.second));
+    }
+
+    return false;
+  }
+
+  DAG.RegionsWithExcessArchVGPR.reset();
+  DAG.RegionsWithExcessRP.reset();
+
+  // Insert cross RC copies for the users of the MFMA result.
+  for (auto MI : CrossRCUseCopies) {
+    auto DefReg = MI->getOperand(0).getReg();
+    SmallVector<MachineInstr *, 4> UseInstrs;
+    for (auto &UseMI : DAG.MRI.use_nodbg_instructions(DefReg))
+      UseInstrs.push_back(&UseMI);
+
+    DenseMap<Register, MachineInstr *> NewCopies;
+    for (auto UseMI : UseInstrs) {
+      for (unsigned OpNo = 0; OpNo < UseMI->getNumOperands(); OpNo++) {
+        auto &TheOp = UseMI->getOperand(OpNo);
+        if (!TheOp.isReg() || !TheOp.isUse())
+          continue;
+        if (TheOp.getReg() != DefReg)
+          continue;
+
+        auto RequiredRC = UseMI->getRegClassConstraint(OpNo, DAG.TII, DAG.TRI);
+
+        if (!RequiredRC || SRI->hasAGPRs(RequiredRC))
+          continue;
+
+        Register DestVGPR;
+        if (!NewCopies.contains(DefReg)) {
+          Register DestVGPR = DAG.MRI.createVirtualRegister(
+              SRI->getEquivalentVGPRClass(DAG.MRI.getRegClass(DefReg)));
+
+          // Insert copy near the user to avoid inserting inside loops.
+          MachineInstrBuilder VGPRCopy =
+              BuildMI(*UseMI->getParent(), UseMI->getIterator(),
+                      UseMI->getDebugLoc(), TII->get(TargetOpcode::COPY))
+                  .addDef(DestVGPR, 0, 0)
+                  .addUse(DefReg, 0, 0);
+
+          NewCopies[DefReg] = VGPRCopy;
+        }
+        DestVGPR = NewCopies[DefReg]->getOperand(0).getReg();
+        TheOp.setReg(DestVGPR);
+      }
+    }
+    if (NewCopies.contains(DefReg)) {
+      DAG.LIS->InsertMachineInstrInMaps(*NewCopies[DefReg]);
+      DAG.LIS->removeInterval(DefReg);
+      DAG.LIS->createAndComputeVirtRegInterval(DefReg);
+      DAG.LIS->createAndComputeVirtRegInterval(
+          NewCopies[DefReg]->getOperand(0).getReg());
+    }
+  }
+
+  // Insert cross RC copies for the use operands of the MFMA.
+  for (auto MI : CrossRCDefCopies) {
+    MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
+    if (!Src2)
+      continue;
+    if (!Src2->isReg())
+      continue;
+    auto Src2Reg = Src2->getReg();
+    SmallVector<MachineInstr *, 4> DefInstrs;
+    for (auto &DefMI : DAG.MRI.def_instructions(Src2Reg))
+      DefInstrs.push_back(&DefMI);
+
+    DenseMap<Register, MachineInstr *> NewCopies;
+    for (auto DefMI : DefInstrs) {
+      for (unsigned OpNo = 0; OpNo < DefMI->getNumOperands(); OpNo++) {
+        auto &TheOp = DefMI->getOperand(OpNo);
+        if (!TheOp.isReg() || !TheOp.isDef())
+          continue;
+        if (TheOp.getReg() != Src2Reg)
+          continue;
+
+        auto RequiredRC = DefMI->getRegClassConstraint(OpNo, DAG.TII, DAG.TRI);
+
+        if (!RequiredRC || SRI->hasAGPRs(RequiredRC))
+          continue;
+
+        Register SrcVGPR;
+        if (!NewCopies.contains(Src2Reg)) {
+          Register SrcVGPR = DAG.MRI.createVirtualRegister(
+              SRI->getEquivalentVGPRClass(DAG.MRI.getRegClass(Src2Reg)));
+
+          // Insert copy near the def to avoid inserting inside loops.
+          MachineInstrBuilder VGPRCopy =
+              BuildMI(*DefMI->getParent(), ++DefMI->getIterator(),
+                      DefMI->getDebugLoc(), TII->get(TargetOpcode::COPY))
+                  .addDef(Src2Reg, 0, 0)
+                  .addUse(SrcVGPR, 0, 0);
+
+          NewCopies[Src2Reg] = VGPRCopy;
+        }
+
+        SrcVGPR = NewCopies[Src2Reg]->getOperand(1).getReg();
+        TheOp.setReg(SrcVGPR);
+      }
+    }
+
+    if (NewCopies.contains(Src2Reg)) {
+      DAG.LIS->InsertMachineInstrInMaps(*NewCopies[Src2Reg]);
+      DAG.LIS->removeInterval(Src2Reg);
+      DAG.LIS->createAndComputeVirtRegInterval(Src2Reg);
+      DAG.LIS->createAndComputeVirtRegInterval(
+          NewCopies[Src2Reg]->getOperand(1).getReg());
+    }
+  }
+
+  // Liveins may have been modified for cross RC copies.
+  RegionPressureMap LiveInUpdater(&DAG, false);
+  LiveInUpdater.buildLiveRegMap();
+
+  for (unsigned RegionIdx = 0; RegionIdx < DAG.Regions.size(); RegionIdx++)
+    DAG.LiveIns[RegionIdx] = LiveInUpdater.getLiveRegsForRegionIdx(RegionIdx);
+
+  return true;
+}
+
 bool UnclusteredHighRPStage::initGCNSchedStage() {
   if (DisableUnclusterHighRP)
     return false;
@@ -1338,6 +1587,9 @@ void GCNSchedStage::checkScheduling() {
     DAG.RegionsWithExcessRP[RegionIdx] = true;
   }
 
+  if (PressureAfter.getArchVGPRNum() > ST.getAddressableNumArchVGPRs())
+    DAG.RegionsWithExcessArchVGPR[RegionIdx] = true;
+
   // Revert if this region's schedule would cause a drop in occupancy or
   // spilling.
   if (shouldRevertScheduling(WavesAfter)) {
@@ -1641,6 +1893,38 @@ void GCNSchedStage::revertScheduling() {
   DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
 }
 
+bool RewriteScheduleStage::isRewriteCandidate(MachineInstr *MI) const {
+
+  if (!static_cast<const SIInstrInfo *>(DAG.TII)->isMAI(*MI))
+    return false;
+  return AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()) != -1;
+}
+
+const TargetRegisterClass *
+RewriteScheduleStage::recomputeRegClassExceptRewritable(
+    Register Reg, const TargetRegisterClass *OldRC,
+    const TargetRegisterClass *NewRC) const {
+
+  // Accumulate constraints from all uses.
+  for (MachineOperand &MO : DAG.MRI.reg_nodbg_operands(Reg)) {
+    // Apply the effect of the given operand to NewRC.
+    MachineInstr *MI = MO.getParent();
+    // We can swap the classes of dst + src2 as a pair to AGPR, so ignore the
+    // effects of rewrite candidates. It just so happens that we can use either
+    // AGPR or VGPR in src0/src1, so don't bother checking the constraint
+    // effects of the individual operands.
+    if (isRewriteCandidate(MI))
+      continue;
+
+    unsigned OpNo = &MO - &MI->getOperand(0);
+    NewRC = MI->getRegClassConstraintEffect(OpNo, NewRC, DAG.TII, DAG.TRI);
+    if (!NewRC || NewRC == OldRC)
+      return nullptr;
+  }
+
+  return NewRC;
+}
+
 bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
                                          SlotIndex OriginalIdx,
                                          SlotIndex RematIdx) const {
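A matching sketch for the src2 side handled by CrossRCDefCopies above, again with illustrative names: when a def of the accumulator cannot itself be constrained to AGPRs, the def is redirected to a fresh VGPR and the cross RC COPY is inserted immediately after it; the cycle-depth check earlier bails out entirely if such a copy would have to land inside a loop body.

  ; Before: src2 produced by a def that must write ArchVGPRs.
  %acc:vreg_128 = ...                    ; VGPR-only def (illustrative)
  %d:vreg_128 = V_MFMA_F32_16X16X4F32_vgprcd_e64 %a, %b, %acc, 0, 0, 0, implicit $mode, implicit $exec

  ; After: the def now feeds a fresh VGPR and the cross RC COPY follows it.
  %t:vreg_128 = ...                      ; same VGPR-only def
  %acc:areg_128 = COPY %t
  %d:areg_128 = V_MFMA_F32_16X16X4F32_e64 %a, %b, %acc, 0, 0, 0, implicit $mode, implicit $exec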
