Commit 13f5604
[AMDGPU] Add scheduling stage to rewrite MFMA from VGPR to AGPR
Change-Id: I47b2a4274a35f3cf0a6d064674d1d29526e4dfd2
1 parent 9295849 commit 13f5604
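
For context: on gfx90a and later, most MFMAs come in two forms, one accumulating in ArchVGPRs and one accumulating in AGPRs. The stage added here flips eligible MFMAs from the VGPR form to the AGPR form when a region's ArchVGPR pressure exceeds the addressable limit. A rough sketch of the rewrite at the MIR level (opcode and register-class names are illustrative assumptions, not taken from this commit):

  ; before: accumulator in ArchVGPRs ("vgprcd" variant)
  %dst:vreg_128_align2 = V_MFMA_F32_16X16X16F16_vgprcd_e64 %a, %b, %src2:vreg_128_align2, ...
  ; after: opcode swapped via getMFMASrcCVDstAGPROp; dst and src2 reclassed to AGPR
  %dst:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %a, %b, %src2:areg_128_align2, ...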

File tree

2 files changed: +313 −5 lines
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 283 additions & 0 deletions
@@ -29,6 +29,7 @@
 #include "SIMachineFunctionInfo.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineCycleAnalysis.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/MC/LaneBitmask.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -535,6 +536,7 @@ GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
     const MachineSchedContext *C, bool IsLegacyScheduler)
     : GCNSchedStrategy(C) {
   SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
+  SchedStages.push_back(GCNSchedStageID::RewriteSchedule);
   SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
   SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
   SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
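
Note the placement: the rewrite stage runs immediately after the initial max-occupancy schedule and before every reschedule stage, so later stages observe post-rewrite opcodes and pressure. The corresponding enum value lives in the header half of this commit, which this view omits; a hedged sketch of what it presumably looks like (the exact numbering is an assumption):

  enum class GCNSchedStageID : unsigned {
    OccInitialSchedule = 0,
    RewriteSchedule = 1, // added by this commit
    UnclusteredHighRPReschedule = 2,
    ClusteredLowOccupancyReschedule = 3,
    PreRARematerialize = 4,
    ILPInitialSchedule = 5,
    MemoryClauseInitialSchedule = 6
  };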
@@ -785,6 +787,8 @@ GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
   switch (SchedStageID) {
   case GCNSchedStageID::OccInitialSchedule:
     return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
+  case GCNSchedStageID::RewriteSchedule:
+    return std::make_unique<RewriteScheduleStage>(SchedStageID, *this);
   case GCNSchedStageID::UnclusteredHighRPReschedule:
     return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
   case GCNSchedStageID::ClusteredLowOccupancyReschedule:
@@ -948,10 +952,12 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
   Pressure.resize(Regions.size());
   RegionsWithHighRP.resize(Regions.size());
   RegionsWithExcessRP.resize(Regions.size());
+  RegionsWithExcessArchVGPR.resize(Regions.size());
   RegionsWithMinOcc.resize(Regions.size());
   RegionsWithIGLPInstrs.resize(Regions.size());
   RegionsWithHighRP.reset();
   RegionsWithExcessRP.reset();
+  RegionsWithExcessArchVGPR.reset();
   RegionsWithMinOcc.reset();
   RegionsWithIGLPInstrs.reset();
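
RegionsWithExcessArchVGPR is a new BitVector tracked next to RegionsWithExcessRP, and CI (used below) is a cycle-analysis member of the new stage; both declarations belong to the header half of this commit, not shown in this view. A hedged reconstruction from how the .cpp uses them (shape and member placement are assumptions):

  class RewriteScheduleStage : public GCNSchedStage {
    // Cycle info, used to refuse rewrites whose cross-RC copies would
    // land inside loop bodies.
    MachineCycleInfo CI;

    bool isRewriteCandidate(MachineInstr *MI) const;
    const TargetRegisterClass *
    recomputeRegClassExceptRewritable(Register Reg,
                                      const TargetRegisterClass *OldRC,
                                      const TargetRegisterClass *NewRC) const;

  public:
    bool initGCNSchedStage() override;
    using GCNSchedStage::GCNSchedStage;
  };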

@@ -1010,6 +1016,9 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
   case GCNSchedStageID::OccInitialSchedule:
     OS << "Max Occupancy Initial Schedule";
     break;
+  case GCNSchedStageID::RewriteSchedule:
+    OS << "Instruction Rewriting Reschedule";
+    break;
   case GCNSchedStageID::UnclusteredHighRPReschedule:
     OS << "Unclustered High Register Pressure Reschedule";
     break;
@@ -1043,6 +1052,245 @@ bool GCNSchedStage::initGCNSchedStage() {
   return true;
 }
 
+bool RewriteScheduleStage::initGCNSchedStage() {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  if (!ST.hasGFX90AInsts() || DAG.RegionsWithExcessArchVGPR.none())
+    return false;
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *SRI = ST.getRegisterInfo();
+  SmallPtrSet<MachineInstr *, 16> CrossRCUseCopies;
+  SmallPtrSet<MachineInstr *, 16> CrossRCDefCopies;
+  std::vector<std::pair<MachineInstr *, unsigned>> RewriteInsts;
+
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      if (TII->isMAI(MI)) {
+        int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
+        if (ReplacementOp == -1)
+          continue;
+        const TargetRegisterClass *VGPRRC =
+            DAG.MRI.getRegClass(MI.getOperand(0).getReg());
+        const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(VGPRRC);
+        const TargetRegisterClass *DestConstrainExceptRC =
+            recomputeRegClassExceptRewritable(MI.getOperand(0).getReg(), VGPRRC,
+                                              AGPRRC);
+
+        if (!DestConstrainExceptRC)
+          CrossRCUseCopies.insert(&MI);
+
+        MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+        if (Src2 && Src2->isReg()) {
+          const TargetRegisterClass *Src2ConstrainExceptRC =
+              recomputeRegClassExceptRewritable(Src2->getReg(), VGPRRC, AGPRRC);
+          if (!Src2ConstrainExceptRC || Src2ConstrainExceptRC != AGPRRC)
+            CrossRCDefCopies.insert(&MI);
+
+          DAG.MRI.setRegClass(Src2->getReg(), AGPRRC);
+        }
+
+        DAG.MRI.setRegClass(MI.getOperand(0).getReg(), AGPRRC);
+
+        auto OriginalOpc = MI.getOpcode();
+        MI.setDesc(TII->get(ReplacementOp));
+        RewriteInsts.push_back({&MI, OriginalOpc});
+      }
+    }
+  }
+
+  bool ShouldRewrite = false;
+  for (unsigned RegionIdx = 0; RegionIdx < DAG.Regions.size(); RegionIdx++) {
+    if (!DAG.RegionsWithExcessArchVGPR[RegionIdx])
+      continue;
+
+    // For the cases we care about (i.e. ArchVGPR usage is greater than the
+    // addressable limit), rewriting alone should bring pressure to a
+    // manageable level. If we find any such region, then the rewrite is
+    // potentially beneficial.
+    auto PressureAfter = DAG.getRealRegPressure(RegionIdx);
+    unsigned MaxCombinedVGPRs = ST.getMaxNumVGPRs(MF);
+    if (PressureAfter.getArchVGPRNum() <= ST.getAddressableNumArchVGPRs() &&
+        PressureAfter.getVGPRNum(true) <= MaxCombinedVGPRs) {
+      ShouldRewrite = true;
+      break;
+    }
+  }
+
+  // If we find that we'll need to insert cross RC copies inside loop bodies,
+  // then bail.
+  if (ShouldRewrite) {
+    CI.clear();
+    CI.compute(MF);
+
+    for (auto *DefMI : CrossRCUseCopies) {
+      auto DefReg = DefMI->getOperand(0).getReg();
+
+      for (auto &UseMI : DAG.MRI.use_nodbg_instructions(DefReg)) {
+        for (unsigned OpNo = 0; OpNo < UseMI.getNumOperands(); OpNo++) {
+          auto &TheOp = UseMI.getOperand(OpNo);
+          if (!TheOp.isReg() || !TheOp.isUse())
+            continue;
+          if (TheOp.getReg() != DefReg)
+            continue;
+
+          auto RequiredRC = UseMI.getRegClassConstraint(OpNo, DAG.TII, DAG.TRI);
+          if (!RequiredRC || SRI->hasAGPRs(RequiredRC))
+            continue;
+
+          unsigned DefDepth = CI.getCycleDepth(DefMI->getParent());
+          if (DefDepth && CI.getCycleDepth(UseMI.getParent()) >= DefDepth) {
+            ShouldRewrite = false;
+            break;
+          }
+        }
+        if (!ShouldRewrite)
+          break;
+      }
+      if (!ShouldRewrite)
+        break;
+    }
+  }
+
+  // If we haven't found the beneficial conditions, prefer the VGPR form, which
+  // may result in fewer cross RC copies.
+  if (!ShouldRewrite) {
+    for (auto RI : RewriteInsts) {
+      MachineInstr *MI = RI.first;
+
+      assert(TII->isMAI(*MI));
+      const TargetRegisterClass *AGPRRC =
+          DAG.MRI.getRegClass(MI->getOperand(0).getReg());
+      const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(AGPRRC);
+
+      MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
+      assert(Src2);
+
+      if (Src2->isReg())
+        DAG.MRI.setRegClass(Src2->getReg(), VGPRRC);
+      DAG.MRI.setRegClass(MI->getOperand(0).getReg(), VGPRRC);
+      MI->setDesc(TII->get(RI.second));
+    }
+
+    return false;
+  }
+
+  DAG.RegionsWithExcessArchVGPR.reset();
+  DAG.RegionsWithExcessRP.reset();
+
+  // Insert cross RC copies for the users of the MFMA result.
+  for (auto MI : CrossRCUseCopies) {
+    auto DefReg = MI->getOperand(0).getReg();
+    SmallVector<MachineInstr *, 4> UseInstrs;
+    for (auto &UseMI : DAG.MRI.use_nodbg_instructions(DefReg))
+      UseInstrs.push_back(&UseMI);
+
+    DenseMap<Register, MachineInstr *> NewCopies;
+    for (auto UseMI : UseInstrs) {
+      for (unsigned OpNo = 0; OpNo < UseMI->getNumOperands(); OpNo++) {
+        auto &TheOp = UseMI->getOperand(OpNo);
+        if (!TheOp.isReg() || !TheOp.isUse())
+          continue;
+        if (TheOp.getReg() != DefReg)
+          continue;
+
+        auto RequiredRC = UseMI->getRegClassConstraint(OpNo, DAG.TII, DAG.TRI);
+
+        if (!RequiredRC || SRI->hasAGPRs(RequiredRC))
+          continue;
+
+        if (!NewCopies.contains(DefReg)) {
+          Register DestVGPR = DAG.MRI.createVirtualRegister(
+              SRI->getEquivalentVGPRClass(DAG.MRI.getRegClass(DefReg)));
+
+          // Insert copy near the user to avoid inserting inside loops.
+          MachineInstrBuilder VGPRCopy =
+              BuildMI(*UseMI->getParent(), UseMI->getIterator(),
+                      UseMI->getDebugLoc(), TII->get(TargetOpcode::COPY))
+                  .addDef(DestVGPR, 0, 0)
+                  .addUse(DefReg, 0, 0);
+
+          NewCopies[DefReg] = VGPRCopy;
+        }
+        TheOp.setReg(NewCopies[DefReg]->getOperand(0).getReg());
+      }
+    }
+    if (NewCopies.contains(DefReg)) {
+      DAG.LIS->InsertMachineInstrInMaps(*NewCopies[DefReg]);
+      DAG.LIS->removeInterval(DefReg);
+      DAG.LIS->createAndComputeVirtRegInterval(DefReg);
+      DAG.LIS->createAndComputeVirtRegInterval(
+          NewCopies[DefReg]->getOperand(0).getReg());
+    }
+  }
+
+  // Insert cross RC copies for the use operands of the MFMA.
+  for (auto MI : CrossRCDefCopies) {
+    MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
+    if (!Src2 || !Src2->isReg())
+      continue;
+    auto Src2Reg = Src2->getReg();
+    SmallVector<MachineInstr *, 4> DefInstrs;
+    for (auto &DefMI : DAG.MRI.def_instructions(Src2Reg))
+      DefInstrs.push_back(&DefMI);
+
+    DenseMap<Register, MachineInstr *> NewCopies;
+    for (auto DefMI : DefInstrs) {
+      for (unsigned OpNo = 0; OpNo < DefMI->getNumOperands(); OpNo++) {
+        auto &TheOp = DefMI->getOperand(OpNo);
+        if (!TheOp.isReg() || !TheOp.isDef())
+          continue;
+        if (TheOp.getReg() != Src2Reg)
+          continue;
+
+        auto RequiredRC = DefMI->getRegClassConstraint(OpNo, DAG.TII, DAG.TRI);
+
+        if (!RequiredRC || SRI->hasAGPRs(RequiredRC))
+          continue;
+
+        if (!NewCopies.contains(Src2Reg)) {
+          Register SrcVGPR = DAG.MRI.createVirtualRegister(
+              SRI->getEquivalentVGPRClass(DAG.MRI.getRegClass(Src2Reg)));
+
+          // Insert copy near the def to avoid inserting inside loops.
+          MachineInstrBuilder VGPRCopy =
+              BuildMI(*DefMI->getParent(), ++DefMI->getIterator(),
+                      DefMI->getDebugLoc(), TII->get(TargetOpcode::COPY))
+                  .addDef(Src2Reg, 0, 0)
+                  .addUse(SrcVGPR, 0, 0);
+
+          NewCopies[Src2Reg] = VGPRCopy;
+        }
+
+        TheOp.setReg(NewCopies[Src2Reg]->getOperand(1).getReg());
+      }
+    }
+
+    if (NewCopies.contains(Src2Reg)) {
+      DAG.LIS->InsertMachineInstrInMaps(*NewCopies[Src2Reg]);
+      DAG.LIS->removeInterval(Src2Reg);
+      DAG.LIS->createAndComputeVirtRegInterval(Src2Reg);
+      DAG.LIS->createAndComputeVirtRegInterval(
+          NewCopies[Src2Reg]->getOperand(1).getReg());
+    }
+  }
+
+  // Liveins may have been modified for cross RC copies.
+  RegionPressureMap LiveInUpdater(&DAG, false);
+  LiveInUpdater.buildLiveRegMap();
+
+  for (unsigned RegionIdx = 0; RegionIdx < DAG.Regions.size(); RegionIdx++)
+    DAG.LiveIns[RegionIdx] = LiveInUpdater.getLiveRegsForRegionIdx(RegionIdx);
+
+  return true;
+}
+
 bool UnclusteredHighRPStage::initGCNSchedStage() {
   if (DisableUnclusterHighRP)
     return false;
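
For intuition about the ShouldRewrite test above: on gfx90a-class targets, 256 ArchVGPRs are addressable and ArchVGPRs plus AGPRs share a combined budget of 512 (the actual bound comes from ST.getMaxNumVGPRs(MF); the numbers here are illustrative, not from the commit). A region holding 300 ArchVGPR values cannot be allocated as-is (300 > 256); if the rewrite moves 130 accumulator values into AGPRs, the region lands at 170 ArchVGPRs + 130 AGPRs, and both checks pass: getArchVGPRNum() = 170 <= 256 and getVGPRNum(true) = 300 <= 512.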
@@ -1348,6 +1596,9 @@ void GCNSchedStage::checkScheduling() {
     DAG.RegionsWithExcessRP[RegionIdx] = true;
   }
 
+  if (PressureAfter.getArchVGPRNum() > ST.getAddressableNumArchVGPRs())
+    DAG.RegionsWithExcessArchVGPR[RegionIdx] = true;
+
   // Revert if this region's schedule would cause a drop in occupancy or
   // spilling.
   if (shouldRevertScheduling(WavesAfter)) {
@@ -1648,6 +1899,38 @@ void GCNSchedStage::revertScheduling() {
   DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
 }
 
+bool RewriteScheduleStage::isRewriteCandidate(MachineInstr *MI) const {
+  if (!static_cast<const SIInstrInfo *>(DAG.TII)->isMAI(*MI))
+    return false;
+  return AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()) != -1;
+}
+
+const TargetRegisterClass *
+RewriteScheduleStage::recomputeRegClassExceptRewritable(
+    Register Reg, const TargetRegisterClass *OldRC,
+    const TargetRegisterClass *NewRC) const {
+
+  // Accumulate constraints from all uses.
+  for (MachineOperand &MO : DAG.MRI.reg_nodbg_operands(Reg)) {
+    // Apply the effect of the given operand to NewRC.
+    MachineInstr *MI = MO.getParent();
+    // We can swap the classes of dst + src2 as a pair to AGPR, so ignore the
+    // effects of rewrite candidates. It just so happens that we can use either
+    // AGPR or VGPR in src0/src1, so don't bother checking the constraint
+    // effects of the individual operands.
+    if (isRewriteCandidate(MI))
+      continue;
+
+    unsigned OpNo = &MO - &MI->getOperand(0);
+    NewRC = MI->getRegClassConstraintEffect(OpNo, NewRC, DAG.TII, DAG.TRI);
+    if (!NewRC || NewRC == OldRC)
+      return nullptr;
+  }
+
+  return NewRC;
+}
+
 bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
                                          SlotIndex OriginalIdx,
                                          SlotIndex RematIdx) const {
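
When a rewritten MFMA result still reaches a user that is constrained to VGPRs, the stage materializes a cross-RC copy next to that user (and, symmetrically, next to the def for src2 inputs) rather than letting it land inside a loop. An illustrative MIR shape (the user opcode below is hypothetical and the register classes are assumptions, not from the commit):

  %acc:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %a, %b, %acc, 0, 0, 0
  %v:vreg_128_align2 = COPY %acc        ; cross-RC copy inserted at the user
  SOME_VGPR_ONLY_USER %v                ; hypothetical VGPR-constrained user

LiveIntervals is then recomputed for both registers, and region live-ins are rebuilt through RegionPressureMap so later stages observe accurate pressure.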
