 #include "SIMachineFunctionInfo.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineCycleAnalysis.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/MC/LaneBitmask.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -528,6 +529,7 @@ GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
     const MachineSchedContext *C, bool IsLegacyScheduler)
     : GCNSchedStrategy(C) {
   SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
+  SchedStages.push_back(GCNSchedStageID::RewriteSchedule);
   SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
   SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
   SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
@@ -778,6 +780,8 @@ GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
   switch (SchedStageID) {
   case GCNSchedStageID::OccInitialSchedule:
     return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
+  case GCNSchedStageID::RewriteSchedule:
+    return std::make_unique<RewriteScheduleStage>(SchedStageID, *this);
   case GCNSchedStageID::UnclusteredHighRPReschedule:
     return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
   case GCNSchedStageID::ClusteredLowOccupancyReschedule:
@@ -803,7 +807,8 @@ void GCNScheduleDAGMILive::schedule() {
 GCNRegPressure
 GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
   GCNDownwardRPTracker RPTracker(*LIS);
-  RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
+  RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second,
+                    &LiveIns[RegionIdx]);
   return RPTracker.moveMaxPressure();
 }
 
@@ -940,10 +945,12 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
   Pressure.resize(Regions.size());
   RegionsWithHighRP.resize(Regions.size());
   RegionsWithExcessRP.resize(Regions.size());
+  RegionsWithExcessArchVGPR.resize(Regions.size());
   RegionsWithMinOcc.resize(Regions.size());
   RegionsWithIGLPInstrs.resize(Regions.size());
   RegionsWithHighRP.reset();
   RegionsWithExcessRP.reset();
+  RegionsWithExcessArchVGPR.reset();
   RegionsWithMinOcc.reset();
   RegionsWithIGLPInstrs.reset();
 
@@ -1002,6 +1009,9 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
   case GCNSchedStageID::OccInitialSchedule:
     OS << "Max Occupancy Initial Schedule";
     break;
+  case GCNSchedStageID::RewriteSchedule:
+    OS << "Instruction Rewriting Reschedule";
+    break;
   case GCNSchedStageID::UnclusteredHighRPReschedule:
     OS << "Unclustered High Register Pressure Reschedule";
     break;
@@ -1035,6 +1045,245 @@ bool GCNSchedStage::initGCNSchedStage() {
   return true;
 }
 
+bool RewriteScheduleStage::initGCNSchedStage() {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  if (!ST.hasGFX90AInsts() || DAG.RegionsWithExcessArchVGPR.none())
+    return false;
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *SRI = ST.getRegisterInfo();
+  SmallPtrSet<MachineInstr *, 16> CrossRCUseCopies;
+  SmallPtrSet<MachineInstr *, 16> CrossRCDefCopies;
+  std::vector<std::pair<MachineInstr *, unsigned>> RewriteInsts;
+
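+  // Tentatively rewrite each qualifying MFMA to the variant that takes src2
+  // and the dst in AGPRs, recording the original opcode so the change can be
+  // undone below if the rewrite turns out not to be profitable.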
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      if (TII->isMAI(MI)) {
+        int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
+        if (ReplacementOp == -1)
+          continue;
+        const TargetRegisterClass *VGPRRC =
+            DAG.MRI.getRegClass(MI.getOperand(0).getReg());
+        const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(VGPRRC);
+        const TargetRegisterClass *DestConstrainExceptRC =
+            recomputeRegClassExceptRewritable(MI.getOperand(0).getReg(), VGPRRC,
+                                              AGPRRC);
+
+        if (!DestConstrainExceptRC)
+          CrossRCUseCopies.insert(&MI);
+
+        MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+        if (Src2 && Src2->isReg()) {
+          const TargetRegisterClass *Src2ConstrainExceptRC =
+              recomputeRegClassExceptRewritable(Src2->getReg(), VGPRRC, AGPRRC);
+          if ((!Src2ConstrainExceptRC || Src2ConstrainExceptRC != AGPRRC))
+            CrossRCDefCopies.insert(&MI);
+
+          DAG.MRI.setRegClass(Src2->getReg(), AGPRRC);
+        }
+
+        DAG.MRI.setRegClass(MI.getOperand(0).getReg(), AGPRRC);
+
+        auto OriginalOpc = MI.getOpcode();
+        MI.setDesc(TII->get(ReplacementOp));
+        RewriteInsts.push_back({&MI, OriginalOpc});
+      }
+    }
+  }
+
+  bool ShouldRewrite = false;
+  for (unsigned RegionIdx = 0; RegionIdx < DAG.Regions.size(); RegionIdx++) {
+    if (!DAG.RegionsWithExcessArchVGPR[RegionIdx])
+      continue;
+
+    // For the cases we care about (i.e. ArchVGPR usage is greater than the
+    // addressable limit), rewriting alone should bring pressure to manageable
+    // level. If we find any such region, then the rewrite is potentially
+    // beneficial.
+    auto PressureAfter = DAG.getRealRegPressure(RegionIdx);
+    unsigned MaxCombinedVGPRs = ST.getMaxNumVGPRs(MF);
+    if (PressureAfter.getArchVGPRNum() <= ST.getAddressableNumArchVGPRs() &&
+        PressureAfter.getVGPRNum(true) <= MaxCombinedVGPRs) {
+      ShouldRewrite = true;
+      break;
+    }
+  }
+
+  // If we find that we'll need to insert cross RC copies inside loop bodies,
+  // then bail
+  if (ShouldRewrite) {
+    CI.clear();
+    CI.compute(MF);
+
+    for (auto *DefMI : CrossRCUseCopies) {
+      auto DefReg = DefMI->getOperand(0).getReg();
+
+      for (auto &UseMI : DAG.MRI.use_nodbg_instructions(DefReg)) {
+        for (unsigned OpNo = 0; OpNo < UseMI.getNumOperands(); OpNo++) {
+          auto &TheOp = UseMI.getOperand(OpNo);
+          if (!TheOp.isReg() || !TheOp.isUse())
+            continue;
+          if (TheOp.getReg() != DefReg)
+            continue;
+
+          auto RequiredRC = UseMI.getRegClassConstraint(OpNo, DAG.TII, DAG.TRI);
+          if (!RequiredRC || SRI->hasAGPRs(RequiredRC))
+            continue;
+
+          unsigned DefDepth = CI.getCycleDepth(DefMI->getParent());
+          if (DefDepth && CI.getCycleDepth(UseMI.getParent()) >= DefDepth) {
+            ShouldRewrite = false;
+            break;
+          }
+        }
+        if (!ShouldRewrite)
+          break;
+      }
+      if (!ShouldRewrite)
+        break;
+    }
+  }
+
+  // If we haven't found the beneficial conditions, prefer the VGPR form which
+  // may result in less cross RC copies.
+  if (!ShouldRewrite) {
+    for (auto RI : RewriteInsts) {
+      MachineInstr *MI = RI.first;
+
+      assert(TII->isMAI(*MI));
+      const TargetRegisterClass *AGPRRC =
+          DAG.MRI.getRegClass(MI->getOperand(0).getReg());
+      const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(AGPRRC);
+
+      MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
+      assert(Src2);
+
+      if (Src2->isReg()) {
+        DAG.MRI.setRegClass(Src2->getReg(), VGPRRC);
+      }
+      DAG.MRI.setRegClass(MI->getOperand(0).getReg(), VGPRRC);
+      MI->setDesc(TII->get(RI.second));
+    }
+
+    return false;
+  }
+
+  DAG.RegionsWithExcessArchVGPR.reset();
+  DAG.RegionsWithExcessRP.reset();
+
+  // Insert cross RC copies for the users of the MFMA result
+  for (auto MI : CrossRCUseCopies) {
+    auto DefReg = MI->getOperand(0).getReg();
+    SmallVector<MachineInstr *, 4> UseInstrs;
+    for (auto &UseMI : DAG.MRI.use_nodbg_instructions(DefReg))
+      UseInstrs.push_back(&UseMI);
+
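+    // Lazily create at most one AGPR-to-VGPR copy per MFMA result, then point
+    // every VGPR-constrained use at that copy.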
+    DenseMap<Register, MachineInstr *> NewCopies;
+    for (auto UseMI : UseInstrs) {
+      for (unsigned OpNo = 0; OpNo < UseMI->getNumOperands(); OpNo++) {
+        auto &TheOp = UseMI->getOperand(OpNo);
+        if (!TheOp.isReg() || !TheOp.isUse())
+          continue;
+        if (TheOp.getReg() != DefReg)
+          continue;
+
+        auto RequiredRC = UseMI->getRegClassConstraint(OpNo, DAG.TII, DAG.TRI);
+
+        if (!RequiredRC || SRI->hasAGPRs(RequiredRC))
+          continue;
+
+        Register DestVGPR;
+        if (!NewCopies.contains(DefReg)) {
+          Register DestVGPR = DAG.MRI.createVirtualRegister(
+              SRI->getEquivalentVGPRClass(DAG.MRI.getRegClass(DefReg)));
+
+          // Insert copy near the user to avoid inserting inside loops.
+          MachineInstrBuilder VGPRCopy =
+              BuildMI(*UseMI->getParent(), UseMI->getIterator(),
+                      UseMI->getDebugLoc(), TII->get(TargetOpcode::COPY))
+                  .addDef(DestVGPR, 0, 0)
+                  .addUse(DefReg, 0, 0);
+
+          NewCopies[DefReg] = VGPRCopy;
+        }
+        DestVGPR = NewCopies[DefReg]->getOperand(0).getReg();
+        TheOp.setReg(DestVGPR);
+      }
+    }
+    if (NewCopies.contains(DefReg)) {
+      DAG.LIS->InsertMachineInstrInMaps(*NewCopies[DefReg]);
+      DAG.LIS->removeInterval(DefReg);
+      DAG.LIS->createAndComputeVirtRegInterval(DefReg);
+      DAG.LIS->createAndComputeVirtRegInterval(
+          NewCopies[DefReg]->getOperand(0).getReg());
+    }
+  }
+
+  // Insert cross RC copies for the use operands of the MFMA
+  for (auto MI : CrossRCDefCopies) {
+    MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
+    if (!Src2)
+      continue;
+    if (!Src2->isReg())
+      continue;
+    auto Src2Reg = Src2->getReg();
+    SmallVector<MachineInstr *, 4> DefInstrs;
+    for (auto &DefMI : DAG.MRI.def_instructions(Src2Reg))
+      DefInstrs.push_back(&DefMI);
+
+    DenseMap<Register, MachineInstr *> NewCopies;
+    for (auto DefMI : DefInstrs) {
+      for (unsigned OpNo = 0; OpNo < DefMI->getNumOperands(); OpNo++) {
+        auto &TheOp = DefMI->getOperand(OpNo);
+        if (!TheOp.isReg() || !TheOp.isDef())
+          continue;
+        if (TheOp.getReg() != Src2Reg)
+          continue;
+
+        auto RequiredRC = DefMI->getRegClassConstraint(OpNo, DAG.TII, DAG.TRI);
+
+        if (!RequiredRC || SRI->hasAGPRs(RequiredRC))
+          continue;
+
+        Register SrcVGPR;
+        if (!NewCopies.contains(Src2Reg)) {
+          Register SrcVGPR = DAG.MRI.createVirtualRegister(
+              SRI->getEquivalentVGPRClass(DAG.MRI.getRegClass(Src2Reg)));
+
+          // Insert copy near the def to avoid inserting inside loops.
+          MachineInstrBuilder VGPRCopy =
+              BuildMI(*DefMI->getParent(), ++DefMI->getIterator(),
+                      DefMI->getDebugLoc(), TII->get(TargetOpcode::COPY))
+                  .addDef(Src2Reg, 0, 0)
+                  .addUse(SrcVGPR, 0, 0);
+
+          NewCopies[Src2Reg] = VGPRCopy;
+        }
+
+        SrcVGPR = NewCopies[Src2Reg]->getOperand(1).getReg();
+        TheOp.setReg(SrcVGPR);
+      }
+    }
+
+    if (NewCopies.contains(Src2Reg)) {
+      DAG.LIS->InsertMachineInstrInMaps(*NewCopies[Src2Reg]);
+      DAG.LIS->removeInterval(Src2Reg);
+      DAG.LIS->createAndComputeVirtRegInterval(Src2Reg);
+      DAG.LIS->createAndComputeVirtRegInterval(
+          NewCopies[Src2Reg]->getOperand(1).getReg());
    }
+  }
+
+  // Liveins may have been modified for cross RC copies
+  RegionPressureMap LiveInUpdater(&DAG, false);
+  LiveInUpdater.buildLiveRegMap();
+
+  for (unsigned RegionIdx = 0; RegionIdx < DAG.Regions.size(); RegionIdx++)
+    DAG.LiveIns[RegionIdx] = LiveInUpdater.getLiveRegsForRegionIdx(RegionIdx);
+
+  return true;
+}
+
 
 bool UnclusteredHighRPStage::initGCNSchedStage() {
   if (DisableUnclusterHighRP)
     return false;
@@ -1338,6 +1587,9 @@ void GCNSchedStage::checkScheduling() {
     DAG.RegionsWithExcessRP[RegionIdx] = true;
   }
 
+  if (PressureAfter.getArchVGPRNum() > ST.getAddressableNumArchVGPRs())
+    DAG.RegionsWithExcessArchVGPR[RegionIdx] = true;
+
   // Revert if this region's schedule would cause a drop in occupancy or
   // spilling.
   if (shouldRevertScheduling(WavesAfter)) {
@@ -1641,6 +1893,38 @@ void GCNSchedStage::revertScheduling() {
   DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
 }
 
+bool RewriteScheduleStage::isRewriteCandidate(MachineInstr *MI) const {
+
+  if (!static_cast<const SIInstrInfo *>(DAG.TII)->isMAI(*MI))
+    return false;
+  return AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()) != -1;
+}
+
+const TargetRegisterClass *
+RewriteScheduleStage::recomputeRegClassExceptRewritable(
+    Register Reg, const TargetRegisterClass *OldRC,
+    const TargetRegisterClass *NewRC) const {
+
+  // Accumulate constraints from all uses.
+  for (MachineOperand &MO : DAG.MRI.reg_nodbg_operands(Reg)) {
+    // Apply the effect of the given operand to NewRC.
+    MachineInstr *MI = MO.getParent();
+    // We can swap the classes of dst + src2 as a pair to AGPR, so ignore the
+    // effects of rewrite candidates. It just so happens that we can use either
+    // AGPR or VGPR in src0/src1, so don't bother checking the constraint
+    // effects of the individual operands.
+    if (isRewriteCandidate(MI))
+      continue;
+
+    unsigned OpNo = &MO - &MI->getOperand(0);
+    NewRC = MI->getRegClassConstraintEffect(OpNo, NewRC, DAG.TII, DAG.TRI);
+    if (!NewRC || NewRC == OldRC)
+      return nullptr;
+  }
+
+  return NewRC;
+}
+
 bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
                                          SlotIndex OriginalIdx,
                                          SlotIndex RematIdx) const {