29 | 29 | #include "SIMachineFunctionInfo.h"
30 | 30 | #include "Utils/AMDGPUBaseInfo.h"
31 | 31 | #include "llvm/ADT/STLExtras.h"
| 32 | +#include "llvm/CodeGen/MachineCycleAnalysis.h" |
32 | 33 | #include "llvm/CodeGen/RegisterClassInfo.h"
33 | 34 | #include "llvm/MC/LaneBitmask.h"
34 | 35 | #include "llvm/Support/ErrorHandling.h"
@@ -535,6 +536,7 @@ GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
535 | 536 | const MachineSchedContext *C, bool IsLegacyScheduler)
536 | 537 | : GCNSchedStrategy(C) {
537 | 538 | SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
| 539 | + SchedStages.push_back(GCNSchedStageID::RewriteSchedule); |
538 | 540 | SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
539 | 541 | SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
540 | 542 | SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
@@ -785,6 +787,8 @@ GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
785 | 787 | switch (SchedStageID) {
786 | 788 | case GCNSchedStageID::OccInitialSchedule:
787 | 789 | return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
| 790 | + case GCNSchedStageID::RewriteSchedule: |
| 791 | + return std::make_unique<RewriteScheduleStage>(SchedStageID, *this); |
788 | 792 | case GCNSchedStageID::UnclusteredHighRPReschedule:
789 | 793 | return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
790 | 794 | case GCNSchedStageID::ClusteredLowOccupancyReschedule:
@@ -948,10 +952,12 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
948 | 952 | Pressure.resize(Regions.size());
949 | 953 | RegionsWithHighRP.resize(Regions.size());
950 | 954 | RegionsWithExcessRP.resize(Regions.size());
| 955 | + RegionsWithExcessArchVGPR.resize(Regions.size()); |
951 | 956 | RegionsWithMinOcc.resize(Regions.size());
952 | 957 | RegionsWithIGLPInstrs.resize(Regions.size());
953 | 958 | RegionsWithHighRP.reset();
954 | 959 | RegionsWithExcessRP.reset();
| 960 | + RegionsWithExcessArchVGPR.reset(); |
955 | 961 | RegionsWithMinOcc.reset();
956 | 962 | RegionsWithIGLPInstrs.reset();
957 | 963 |
@@ -1010,6 +1016,9 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
1010 | 1016 | case GCNSchedStageID::OccInitialSchedule:
1011 | 1017 | OS << "Max Occupancy Initial Schedule";
1012 | 1018 | break;
| 1019 | + case GCNSchedStageID::RewriteSchedule: |
| 1020 | + OS << "Instruction Rewriting Reschedule"; |
| 1021 | + break; |
1013 | 1022 | case GCNSchedStageID::UnclusteredHighRPReschedule:
1014 | 1023 | OS << "Unclustered High Register Pressure Reschedule";
1015 | 1024 | break;
@@ -1043,6 +1052,245 @@ bool GCNSchedStage::initGCNSchedStage() {
1043 | 1052 | return true;
1044 | 1053 | }
1045 | 1054 |
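|  | +// Tentatively rewrite eligible MFMAs from their VGPR form to the AGPR-dst |
|  | +// form. The rewrite is kept only if it brings some over-limit region's |
|  | +// ArchVGPR pressure back under the addressable limit without requiring |
|  | +// cross-RC copies inside loop bodies; otherwise the original opcodes and |
|  | +// register classes are restored. |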
| 1055 | +bool RewriteScheduleStage::initGCNSchedStage() { |
| 1056 | + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
| 1057 | + if (!ST.hasGFX90AInsts() || DAG.RegionsWithExcessArchVGPR.none()) |
| 1058 | + return false; |
| 1059 | + |
| 1060 | + const SIInstrInfo *TII = ST.getInstrInfo(); |
| 1061 | + const SIRegisterInfo *SRI = ST.getRegisterInfo(); |
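|  | + // MFMAs whose result must be copied back to a VGPR for a non-rewritable |
|  | + // user, and MFMAs whose src2 value must be copied from a non-rewritable |
|  | + // VGPR def, respectively. |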
| 1062 | + SmallPtrSet<MachineInstr *, 16> CrossRCUseCopies; |
| 1063 | + SmallPtrSet<MachineInstr *, 16> CrossRCDefCopies; |
| 1064 | + std::vector<std::pair<MachineInstr *, unsigned>> RewriteInsts; |
| 1065 | + |
| 1066 | + for (auto &MBB : MF) { |
| 1067 | + for (auto &MI : MBB) { |
| 1068 | + if (TII->isMAI(MI)) { |
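|  | + // Look up the AGPR-dst form of this MFMA opcode; -1 means there is |
|  | + // none, so the instruction cannot be rewritten. |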
| 1069 | + int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode()); |
| 1070 | + if (ReplacementOp == -1) |
| 1071 | + continue; |
| 1072 | + const TargetRegisterClass *VGPRRC = |
| 1073 | + DAG.MRI.getRegClass(MI.getOperand(0).getReg()); |
| 1074 | + const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(VGPRRC); |
| 1075 | + const TargetRegisterClass *DestConstrainExceptRC = |
| 1076 | + recomputeRegClassExceptRewritable(MI.getOperand(0).getReg(), VGPRRC, |
| 1077 | + AGPRRC); |
| 1078 | + |
| 1079 | + if (!DestConstrainExceptRC) |
| 1080 | + CrossRCUseCopies.insert(&MI); |
| 1081 | + |
| 1082 | + MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); |
| 1083 | + if (Src2 && Src2->isReg()) { |
| 1084 | + const TargetRegisterClass *Src2ConstrainExceptRC = |
| 1085 | + recomputeRegClassExceptRewritable(Src2->getReg(), VGPRRC, AGPRRC); |
| 1086 | + if (!Src2ConstrainExceptRC || Src2ConstrainExceptRC != AGPRRC) |
| 1087 | + CrossRCDefCopies.insert(&MI); |
| 1088 | + |
| 1089 | + DAG.MRI.setRegClass(Src2->getReg(), AGPRRC); |
| 1090 | + } |
| 1091 | + |
| 1092 | + DAG.MRI.setRegClass(MI.getOperand(0).getReg(), AGPRRC); |
| 1093 | + |
| 1094 | + auto OriginalOpc = MI.getOpcode(); |
| 1095 | + MI.setDesc(TII->get(ReplacementOp)); |
| 1096 | + RewriteInsts.push_back({&MI, OriginalOpc}); |
| 1097 | + } |
| 1098 | + } |
| 1099 | + } |
| 1100 | + |
| 1101 | + bool ShouldRewrite = false; |
| 1102 | + for (unsigned RegionIdx = 0; RegionIdx < DAG.Regions.size(); RegionIdx++) { |
| 1103 | + if (!DAG.RegionsWithExcessArchVGPR[RegionIdx]) |
| 1104 | + continue; |
| 1105 | + |
| 1106 | + // For the cases we care about (i.e. ArchVGPR usage is greater than the |
| 1107 | + // addressable limit), rewriting alone should bring pressure to a |
| 1108 | + // manageable level. If we find any such region, then the rewrite is |
| 1109 | + // potentially beneficial. |
| 1110 | + auto PressureAfter = DAG.getRealRegPressure(RegionIdx); |
| 1111 | + unsigned MaxCombinedVGPRs = ST.getMaxNumVGPRs(MF); |
| 1112 | + if (PressureAfter.getArchVGPRNum() <= ST.getAddressableNumArchVGPRs() && |
| 1113 | + PressureAfter.getVGPRNum(true) <= MaxCombinedVGPRs) { |
| 1114 | + ShouldRewrite = true; |
| 1115 | + break; |
| 1116 | + } |
| 1117 | + } |
| 1118 | + |
| 1119 | + // If we find that we'll need to insert cross-RC copies inside loop |
| 1120 | + // bodies, then bail. |
| 1121 | + if (ShouldRewrite) { |
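|  | + // Compute cycle (loop) info to detect whether a copy for a |
|  | + // VGPR-constrained user would land at or below the def's loop depth. |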
| 1122 | + CI.clear(); |
| 1123 | + CI.compute(MF); |
| 1124 | + |
| 1125 | + for (auto *DefMI : CrossRCUseCopies) { |
| 1126 | + auto DefReg = DefMI->getOperand(0).getReg(); |
| 1127 | + |
| 1128 | + for (auto &UseMI : DAG.MRI.use_nodbg_instructions(DefReg)) { |
| 1129 | + for (unsigned OpNo = 0; OpNo < UseMI.getNumOperands(); OpNo++) { |
| 1130 | + auto &TheOp = UseMI.getOperand(OpNo); |
| 1131 | + if (!TheOp.isReg() || !TheOp.isUse()) |
| 1132 | + continue; |
| 1133 | + if (TheOp.getReg() != DefReg) |
| 1134 | + continue; |
| 1135 | + |
| 1136 | + auto RequiredRC = UseMI.getRegClassConstraint(OpNo, DAG.TII, DAG.TRI); |
| 1137 | + if (!RequiredRC || SRI->hasAGPRs(RequiredRC)) |
| 1138 | + continue; |
| 1139 | + |
| 1140 | + unsigned DefDepth = CI.getCycleDepth(DefMI->getParent()); |
| 1141 | + if (DefDepth && CI.getCycleDepth(UseMI.getParent()) >= DefDepth) { |
| 1142 | + ShouldRewrite = false; |
| 1143 | + break; |
| 1144 | + } |
| 1145 | + } |
| 1146 | + if (!ShouldRewrite) |
| 1147 | + break; |
| 1148 | + } |
| 1149 | + if (!ShouldRewrite) |
| 1150 | + break; |
| 1151 | + } |
| 1152 | + } |
| 1153 | + |
| 1154 | + // If we haven't found the beneficial conditions, prefer the VGPR form, |
| 1155 | + // which may result in fewer cross-RC copies. |
| 1156 | + if (!ShouldRewrite) { |
| 1157 | + for (auto RI : RewriteInsts) { |
| 1158 | + MachineInstr *MI = RI.first; |
| 1159 | + |
| 1160 | + assert(TII->isMAI(*MI)); |
| 1161 | + const TargetRegisterClass *AGPRRC = |
| 1162 | + DAG.MRI.getRegClass(MI->getOperand(0).getReg()); |
| 1163 | + const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(AGPRRC); |
| 1164 | + |
| 1165 | + MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2); |
| 1166 | + assert(Src2); |
| 1167 | + |
| 1168 | + if (Src2->isReg()) { |
| 1169 | + DAG.MRI.setRegClass(Src2->getReg(), VGPRRC); |
| 1170 | + } |
| 1171 | + DAG.MRI.setRegClass(MI->getOperand(0).getReg(), VGPRRC); |
| 1172 | + MI->setDesc(TII->get(RI.second)); |
| 1173 | + } |
| 1174 | + |
| 1175 | + return false; |
| 1176 | + } |
| 1177 | + |
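|  | + // The rewrite changed register classes, so the previously recorded |
|  | + // pressure excesses no longer hold; clear them for recomputation. |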
| 1178 | + DAG.RegionsWithExcessArchVGPR.reset(); |
| 1179 | + DAG.RegionsWithExcessRP.reset(); |
| 1180 | + |
| 1181 | + // Insert cross-RC copies for the users of the MFMA result. |
| 1182 | + for (auto MI : CrossRCUseCopies) { |
| 1183 | + auto DefReg = MI->getOperand(0).getReg(); |
| 1184 | + SmallVector<MachineInstr *, 4> UseInstrs; |
| 1185 | + for (auto &UseMI : DAG.MRI.use_nodbg_instructions(DefReg)) |
| 1186 | + UseInstrs.push_back(&UseMI); |
| 1187 | + |
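|  | + // Create at most one VGPR copy per MFMA result and reuse it for all of |
|  | + // that result's VGPR-constrained uses. |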
| 1188 | + DenseMap<Register, MachineInstr *> NewCopies; |
| 1189 | + for (auto UseMI : UseInstrs) { |
| 1190 | + for (unsigned OpNo = 0; OpNo < UseMI->getNumOperands(); OpNo++) { |
| 1191 | + auto &TheOp = UseMI->getOperand(OpNo); |
| 1192 | + if (!TheOp.isReg() || !TheOp.isUse()) |
| 1193 | + continue; |
| 1194 | + if (TheOp.getReg() != DefReg) |
| 1195 | + continue; |
| 1196 | + |
| 1197 | + auto RequiredRC = UseMI->getRegClassConstraint(OpNo, DAG.TII, DAG.TRI); |
| 1198 | + |
| 1199 | + if (!RequiredRC || SRI->hasAGPRs(RequiredRC)) |
| 1200 | + continue; |
| 1201 | + |
| 1202 | + Register DestVGPR; |
| 1203 | + if (!NewCopies.contains(DefReg)) { |
| 1204 | + DestVGPR = DAG.MRI.createVirtualRegister( |
| 1205 | + SRI->getEquivalentVGPRClass(DAG.MRI.getRegClass(DefReg))); |
| 1206 | + |
| 1207 | + // Insert copy near the user to avoid inserting inside loops. |
| 1208 | + MachineInstrBuilder VGPRCopy = |
| 1209 | + BuildMI(*UseMI->getParent(), UseMI->getIterator(), |
| 1210 | + UseMI->getDebugLoc(), TII->get(TargetOpcode::COPY)) |
| 1211 | + .addDef(DestVGPR, 0, 0) |
| 1212 | + .addUse(DefReg, 0, 0); |
| 1213 | + |
| 1214 | + NewCopies[DefReg] = VGPRCopy; |
| 1215 | + } |
| 1216 | + DestVGPR = NewCopies[DefReg]->getOperand(0).getReg(); |
| 1217 | + TheOp.setReg(DestVGPR); |
| 1218 | + } |
| 1219 | + } |
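|  | + // Register the new copy with LiveIntervals and recompute the intervals |
|  | + // of both the original register and the copy. |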
| 1220 | + if (NewCopies.contains(DefReg)) { |
| 1221 | + DAG.LIS->InsertMachineInstrInMaps(*NewCopies[DefReg]); |
| 1222 | + DAG.LIS->removeInterval(DefReg); |
| 1223 | + DAG.LIS->createAndComputeVirtRegInterval(DefReg); |
| 1224 | + DAG.LIS->createAndComputeVirtRegInterval( |
| 1225 | + NewCopies[DefReg]->getOperand(0).getReg()); |
| 1226 | + } |
| 1227 | + } |
| 1228 | + |
| 1229 | + // Insert cross-RC copies for the use operands of the MFMA. |
| 1230 | + for (auto MI : CrossRCDefCopies) { |
| 1231 | + MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2); |
| 1232 | + if (!Src2) |
| 1233 | + continue; |
| 1234 | + if (!Src2->isReg()) |
| 1235 | + continue; |
| 1236 | + auto Src2Reg = Src2->getReg(); |
| 1237 | + SmallVector<MachineInstr *, 4> DefInstrs; |
| 1238 | + for (auto &DefMI : DAG.MRI.def_instructions(Src2Reg)) |
| 1239 | + DefInstrs.push_back(&DefMI); |
| 1240 | + |
| 1241 | + DenseMap<Register, MachineInstr *> NewCopies; |
| 1242 | + for (auto DefMI : DefInstrs) { |
| 1243 | + for (unsigned OpNo = 0; OpNo < DefMI->getNumOperands(); OpNo++) { |
| 1244 | + auto &TheOp = DefMI->getOperand(OpNo); |
| 1245 | + if (!TheOp.isReg() || !TheOp.isDef()) |
| 1246 | + continue; |
| 1247 | + if (TheOp.getReg() != Src2Reg) |
| 1248 | + continue; |
| 1249 | + |
| 1250 | + auto RequiredRC = DefMI->getRegClassConstraint(OpNo, DAG.TII, DAG.TRI); |
| 1251 | + |
| 1252 | + if (!RequiredRC || SRI->hasAGPRs(RequiredRC)) |
| 1253 | + continue; |
| 1254 | + |
| 1255 | + Register SrcVGPR; |
| 1256 | + if (!NewCopies.contains(Src2Reg)) { |
| 1257 | + SrcVGPR = DAG.MRI.createVirtualRegister( |
| 1258 | + SRI->getEquivalentVGPRClass(DAG.MRI.getRegClass(Src2Reg))); |
| 1259 | + |
| 1260 | + // Insert copy near the def to avoid inserting inside loops. |
| 1261 | + MachineInstrBuilder VGPRCopy = |
| 1262 | + BuildMI(*DefMI->getParent(), std::next(DefMI->getIterator()), |
| 1263 | + DefMI->getDebugLoc(), TII->get(TargetOpcode::COPY)) |
| 1264 | + .addDef(Src2Reg, 0, 0) |
| 1265 | + .addUse(SrcVGPR, 0, 0); |
| 1266 | + |
| 1267 | + NewCopies[Src2Reg] = VGPRCopy; |
| 1268 | + } |
| 1269 | + |
| 1270 | + SrcVGPR = NewCopies[Src2Reg]->getOperand(1).getReg(); |
| 1271 | + TheOp.setReg(SrcVGPR); |
| 1272 | + } |
| 1273 | + } |
| 1274 | + |
| 1275 | + if (NewCopies.contains(Src2Reg)) { |
| 1276 | + DAG.LIS->InsertMachineInstrInMaps(*NewCopies[Src2Reg]); |
| 1277 | + DAG.LIS->removeInterval(Src2Reg); |
| 1278 | + DAG.LIS->createAndComputeVirtRegInterval(Src2Reg); |
| 1279 | + DAG.LIS->createAndComputeVirtRegInterval( |
| 1280 | + NewCopies[Src2Reg]->getOperand(1).getReg()); |
| 1281 | + } |
| 1282 | + } |
| 1283 | + |
| 1284 | + // Region live-ins may have been modified by the cross-RC copies. |
| 1285 | + RegionPressureMap LiveInUpdater(&DAG, false); |
| 1286 | + LiveInUpdater.buildLiveRegMap(); |
| 1287 | + |
| 1288 | + for (unsigned RegionIdx = 0; RegionIdx < DAG.Regions.size(); RegionIdx++) |
| 1289 | + DAG.LiveIns[RegionIdx] = LiveInUpdater.getLiveRegsForRegionIdx(RegionIdx); |
| 1290 | + |
| 1291 | + return true; |
| 1292 | +} |
| 1293 | + |
1046 | 1294 | bool UnclusteredHighRPStage::initGCNSchedStage() {
1047 | 1295 | if (DisableUnclusterHighRP)
1048 | 1296 | return false;
@@ -1348,6 +1596,9 @@ void GCNSchedStage::checkScheduling() {
1348 | 1596 | DAG.RegionsWithExcessRP[RegionIdx] = true;
1349 | 1597 | }
1350 | 1598 |
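|  | + // Track regions whose ArchVGPR pressure exceeds the addressable limit; |
|  | + // the rewrite stage only runs if at least one region is flagged. |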
| 1599 | + if (PressureAfter.getArchVGPRNum() > ST.getAddressableNumArchVGPRs()) |
| 1600 | + DAG.RegionsWithExcessArchVGPR[RegionIdx] = true; |
| 1601 | + |
1351 | 1602 | // Revert if this region's schedule would cause a drop in occupancy or
1352 | 1603 | // spilling.
1353 | 1604 | if (shouldRevertScheduling(WavesAfter)) {
@@ -1648,6 +1899,38 @@ void GCNSchedStage::revertScheduling() {
1648 | 1899 | DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1649 | 1900 | }
1650 | 1901 |
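|  | +// An MFMA is a rewrite candidate if an AGPR-dst form of its opcode exists. |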
| 1902 | +bool RewriteScheduleStage::isRewriteCandidate(MachineInstr *MI) const { |
| 1903 | + |
| 1904 | + if (!static_cast<const SIInstrInfo *>(DAG.TII)->isMAI(*MI)) |
| 1905 | + return false; |
| 1906 | + return AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()) != -1; |
| 1907 | +} |
| 1908 | + |
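|  | +// Constrain NewRC by all non-rewritable uses of Reg. Returns nullptr if some |
|  | +// use forces Reg to stay in OldRC, i.e. the rewrite would require a cross-RC |
|  | +// copy for this register. |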
| 1909 | +const TargetRegisterClass * |
| 1910 | +RewriteScheduleStage::recomputeRegClassExceptRewritable( |
| 1911 | + Register Reg, const TargetRegisterClass *OldRC, |
| 1912 | + const TargetRegisterClass *NewRC) const { |
| 1913 | + |
| 1914 | + // Accumulate constraints from all uses. |
| 1915 | + for (MachineOperand &MO : DAG.MRI.reg_nodbg_operands(Reg)) { |
| 1916 | + // Apply the effect of the given operand to NewRC. |
| 1917 | + MachineInstr *MI = MO.getParent(); |
| 1918 | + // We can swap the classes of dst + src2 as a pair to AGPR, so ignore the |
| 1919 | + // effects of rewrite candidates. It just so happens that we can use either |
| 1920 | + // AGPR or VGPR in src0/src1, so don't bother checking the constraint |
| 1921 | + // effects of the individual operands. |
| 1922 | + if (isRewriteCandidate(MI)) |
| 1923 | + continue; |
| 1924 | + |
| 1925 | + unsigned OpNo = &MO - &MI->getOperand(0); |
| 1926 | + NewRC = MI->getRegClassConstraintEffect(OpNo, NewRC, DAG.TII, DAG.TRI); |
| 1927 | + if (!NewRC || NewRC == OldRC) |
| 1928 | + return nullptr; |
| 1929 | + } |
| 1930 | + |
| 1931 | + return NewRC; |
| 1932 | +} |
| 1933 | + |
1651 | 1934 | bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
1652 | 1935 | SlotIndex OriginalIdx,
1653 | 1936 | SlotIndex RematIdx) const {