Skip to content

Commit b5f00f7

Browse files
committed
[AMDGPU] Constrain AV->VReg if we do not exceed RP thresholds
Change-Id: I17cb012504946fa9dca88b32548f922e2ce4b7a9
1 parent d737fe2 commit b5f00f7

12 files changed

+294
-0
lines changed

llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
#include "AMDGPU.h"
3535
#include "GCNSubtarget.h"
3636
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
37+
#include "SIMachineFunctionInfo.h"
3738
#include "SIRegisterInfo.h"
3839
#include "llvm/CodeGen/LiveIntervals.h"
3940
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -54,6 +55,9 @@ class GCNPreRAOptimizationsImpl {
5455

5556
bool processReg(Register Reg);
5657

58+
bool reconstrainRegClass(Register Reg, const TargetRegisterClass *NewRC,
59+
const GCNSubtarget &ST) const;
60+
5761
public:
5862
GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {}
5963
bool run(MachineFunction &MF);
@@ -225,6 +229,38 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) {
225229
return true;
226230
}
227231

232+
/// Try to change the register class of \p Reg to \p NewRC (or to whatever
/// class results from applying the constraints of its operands to \p NewRC).
///
/// Every non-debug operand of \p Reg contributes its register class
/// constraint. The class is only changed when all of those constraints can be
/// satisfied together. Additionally, whenever \p Reg is one side of a COPY,
/// the register on the other side must be a VGPR; otherwise the register is
/// left untouched.
///
/// \param Reg   Register whose class should be changed.
/// \param NewRC Requested register class; a null pointer means "nothing to do".
/// \param ST    Subtarget that provides the instruction info used to query
///              operand constraints.
/// \returns true if the register class of \p Reg was changed, false otherwise.
bool GCNPreRAOptimizationsImpl::reconstrainRegClass(
    Register Reg, const TargetRegisterClass *NewRC,
    const GCNSubtarget &ST) const {
  // Stop early if there is nothing to do.
  if (!NewRC || NewRC == MRI->getRegClass(Reg))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const TargetRegisterClass *ConstrainRC = NewRC;

  // Accumulate constraints from all non-debug operands. Nothing is modified
  // until every operand has been checked, so bailing out is always safe.
  for (MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
    // Apply the effect of the given operand to ConstrainRC.
    MachineInstr *MI = MO.getParent();
    unsigned OpNo = &MO - &MI->getOperand(0);
    ConstrainRC = MI->getRegClassConstraintEffect(OpNo, ConstrainRC, TII, TRI);
    if (!ConstrainRC)
      return false;

    if (!MI->isCopy())
      continue;

    // The partner lookup below computes `1 - OpNo`, which is only meaningful
    // for the two explicit COPY operands (0 = dst, 1 = src). If Reg appears as
    // an additional (implicit) operand the unsigned subtraction would wrap and
    // index out of range, so conservatively refuse to re-constrain.
    if (OpNo > 1)
      return false;

    const MachineOperand &OtherOp = MI->getOperand(1 - OpNo);
    if (!OtherOp.isReg())
      continue;

    // Only constrain when the other side of the copy is already a VGPR.
    if (!TRI->isVGPR(*MRI, OtherOp.getReg()))
      return false;
  }

  MRI->setRegClass(Reg, ConstrainRC);
  return true;
}
263+
228264
bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) {
229265
if (skipFunction(MF.getFunction()))
230266
return false;
@@ -245,6 +281,10 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
245281
TII = ST.getInstrInfo();
246282
MRI = &MF.getRegInfo();
247283
TRI = ST.getRegisterInfo();
284+
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
285+
bool ContrainAVGPRs =
286+
ST.hasGFX90AInsts() && MFI->getMaxArchVGPRPressure() &&
287+
(MFI->getMaxArchVGPRPressure() < ST.getAddressableNumArchVGPRs());
248288

249289
bool Changed = false;
250290

@@ -253,6 +293,15 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
253293
if (!LIS->hasInterval(Reg))
254294
continue;
255295
const TargetRegisterClass *RC = MRI->getRegClass(Reg);
296+
297+
// If we do not need to use AGPRs to assign AVRegs, it is beneficial
298+
// to constrain them to VGPR as this allows for better initial assignment
299+
// (based on register bitwidth).
300+
if (ContrainAVGPRs && TRI->isVectorSuperClass(RC)) {
301+
reconstrainRegClass(Reg, TRI->getEquivalentVGPRClass(RC), ST);
302+
continue;
303+
}
304+
256305
if ((RC->MC->getSizeInBits() != 64 || !TRI->isSGPRClass(RC)) &&
257306
(ST.hasGFX90AInsts() || !TRI->isAGPRClass(RC)))
258307
continue;

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -941,10 +941,12 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
941941
Pressure.resize(Regions.size());
942942
RegionsWithHighRP.resize(Regions.size());
943943
RegionsWithExcessRP.resize(Regions.size());
944+
RegionsWithExcessVGPRRP.resize(Regions.size());
944945
RegionsWithMinOcc.resize(Regions.size());
945946
RegionsWithIGLPInstrs.resize(Regions.size());
946947
RegionsWithHighRP.reset();
947948
RegionsWithExcessRP.reset();
949+
RegionsWithExcessVGPRRP.reset();
948950
RegionsWithMinOcc.reset();
949951
RegionsWithIGLPInstrs.reset();
950952

@@ -1263,6 +1265,14 @@ void GCNSchedStage::finalizeGCNRegion() {
12631265
// reason that the original schedule is better.
12641266
checkScheduling();
12651267

1268+
unsigned MaxArchVGPR = 0;
1269+
for (auto P : DAG.Pressure) {
1270+
if (P.getArchVGPRNum() > MaxArchVGPR)
1271+
MaxArchVGPR = P.getArchVGPRNum();
1272+
}
1273+
1274+
MF.getInfo<SIMachineFunctionInfo>()->setMaxArchVGPRPressure(MaxArchVGPR);
1275+
12661276
if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
12671277
StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
12681278
SavedMutations.swap(DAG.Mutations);
@@ -1331,6 +1341,9 @@ void GCNSchedStage::checkScheduling() {
13311341
unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
13321342
unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
13331343

1344+
if (PressureAfter.getArchVGPRNum() > ST.getAddressableNumArchVGPRs())
1345+
DAG.RegionsWithExcessVGPRRP[RegionIdx] = true;
1346+
13341347
if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
13351348
PressureAfter.getArchVGPRNum() > MaxArchVGPRs ||
13361349
PressureAfter.getAGPRNum() > MaxArchVGPRs ||

llvm/lib/Target/AMDGPU/GCNSchedStrategy.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,9 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
250250
// limit. Register pressure in these regions usually will result in spilling.
251251
BitVector RegionsWithExcessRP;
252252

253+
// Regions whose ArchVGPR RP exceeds the addressable ArchVGPR limit.
254+
BitVector RegionsWithExcessVGPRRP;
255+
253256
// Regions that have the same occupancy as the latest MinOccupancy
254257
BitVector RegionsWithMinOcc;
255258

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -713,6 +713,7 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
713713
HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
714714
HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
715715
Occupancy(MFI.getOccupancy()),
716+
MaxArchVGPRPressure(MFI.getMaxArchVGPRPressure()),
716717
ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
717718
FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
718719
StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
@@ -760,6 +761,7 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
760761
MaxMemoryClusterDWords = YamlMFI.MaxMemoryClusterDWords;
761762
HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress;
762763
Occupancy = YamlMFI.Occupancy;
764+
MaxArchVGPRPressure = YamlMFI.MaxArchVGPRPressure;
763765
IsEntryFunction = YamlMFI.IsEntryFunction;
764766
NoSignedZerosFPMath = YamlMFI.NoSignedZerosFPMath;
765767
MemoryBound = YamlMFI.MemoryBound;

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
274274

275275
// TODO: 10 may be a better default since it's the maximum.
276276
unsigned Occupancy = 0;
277+
unsigned MaxArchVGPRPressure = 0;
277278

278279
SmallVector<StringValue, 2> SpillPhysVGPRS;
279280
SmallVector<StringValue> WWMReservedRegs;
@@ -343,6 +344,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
343344
YamlIO.mapOptional("highBitsOf32BitAddress",
344345
MFI.HighBitsOf32BitAddress, 0u);
345346
YamlIO.mapOptional("occupancy", MFI.Occupancy, 0);
347+
YamlIO.mapOptional("maxArchVGPRPressure", MFI.MaxArchVGPRPressure, 0u);
346348
YamlIO.mapOptional("spillPhysVGPRs", MFI.SpillPhysVGPRS);
347349
YamlIO.mapOptional("wwmReservedRegs", MFI.WWMReservedRegs);
348350
YamlIO.mapOptional("scavengeFI", MFI.ScavengeFI);
@@ -512,6 +514,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
512514
// Current recorded maximum possible occupancy.
513515
unsigned Occupancy;
514516

517+
// The max arch VGPR pressure found during scheduling.
518+
// Defaults to 0 so that a function for which no pressure was ever recorded
// reads as "unknown" (consumers treat 0 as "do not constrain") rather than as
// an indeterminate value.
unsigned MaxArchVGPRPressure = 0;
519+
515520
// Maximum number of dwords that can be clustered during instruction
516521
// scheduler stage.
517522
unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
@@ -1176,6 +1181,12 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
11761181
return MayNeedAGPRs;
11771182
}
11781183

1184+
/// \returns the currently stored maximum ArchVGPR pressure value.
unsigned getMaxArchVGPRPressure() const { return MaxArchVGPRPressure; }
1185+
1186+
/// Overwrites the stored maximum ArchVGPR pressure with
/// \p NewArchVGPRPressure. The previous value is discarded unconditionally;
/// no max() with the old value is taken here.
void setMaxArchVGPRPressure(unsigned NewArchVGPRPressure) {
1187+
MaxArchVGPRPressure = NewArchVGPRPressure;
1188+
}
1189+
11791190
// \returns true if a function has a use of AGPRs via inline asm or
11801191
// has a call which may use it.
11811192
bool mayUseAGPRs(const Function &F) const;
Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
2+
# RUN: llc -mtriple=amdgcn -mcpu=gfx950 -run-pass=amdgpu-pre-ra-optimizations -o - %s | FileCheck -check-prefix=UNIFIED %s
3+
# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=amdgpu-pre-ra-optimizations -o - %s | FileCheck -check-prefix=SPLIT %s
4+
5+
---
6+
name: reconstrain
7+
tracksRegLiveness: true
8+
machineFunctionInfo:
9+
isEntryFunction: true
10+
scratchRSrcReg: '$sgpr24_sgpr25_sgpr26_sgpr27'
11+
frameOffsetReg: '$sgpr32'
12+
stackPtrOffsetReg: '$sgpr32'
13+
argumentInfo:
14+
privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
15+
privateSegmentWaveByteOffset: { reg: '$sgpr33' }
16+
maxArchVGPRPressure: 2
17+
body: |
18+
bb.0:
19+
liveins: $vgpr0, $vgpr1
20+
; UNIFIED-LABEL: name: reconstrain
21+
; UNIFIED: liveins: $vgpr0, $vgpr1
22+
; UNIFIED-NEXT: {{ $}}
23+
; UNIFIED-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
24+
; UNIFIED-NEXT: S_NOP 0, implicit [[DEF]]
25+
; UNIFIED-NEXT: S_ENDPGM 0
26+
;
27+
; SPLIT-LABEL: name: reconstrain
28+
; SPLIT: liveins: $vgpr0, $vgpr1
29+
; SPLIT-NEXT: {{ $}}
30+
; SPLIT-NEXT: [[DEF:%[0-9]+]]:av_64_align2 = IMPLICIT_DEF
31+
; SPLIT-NEXT: S_NOP 0, implicit [[DEF]]
32+
; SPLIT-NEXT: S_ENDPGM 0
33+
%0:av_64_align2 = IMPLICIT_DEF
34+
S_NOP 0, implicit %0
35+
S_ENDPGM 0
36+
...
37+
38+
---
39+
name: unspecified_yaml
40+
tracksRegLiveness: true
41+
machineFunctionInfo:
42+
isEntryFunction: true
43+
scratchRSrcReg: '$sgpr24_sgpr25_sgpr26_sgpr27'
44+
frameOffsetReg: '$sgpr32'
45+
stackPtrOffsetReg: '$sgpr32'
46+
argumentInfo:
47+
privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
48+
privateSegmentWaveByteOffset: { reg: '$sgpr33' }
49+
body: |
50+
bb.0:
51+
liveins: $vgpr0, $vgpr1
52+
; UNIFIED-LABEL: name: unspecified_yaml
53+
; UNIFIED: liveins: $vgpr0, $vgpr1
54+
; UNIFIED-NEXT: {{ $}}
55+
; UNIFIED-NEXT: [[DEF:%[0-9]+]]:av_64_align2 = IMPLICIT_DEF
56+
; UNIFIED-NEXT: S_NOP 0, implicit [[DEF]]
57+
; UNIFIED-NEXT: S_ENDPGM 0
58+
;
59+
; SPLIT-LABEL: name: unspecified_yaml
60+
; SPLIT: liveins: $vgpr0, $vgpr1
61+
; SPLIT-NEXT: {{ $}}
62+
; SPLIT-NEXT: [[DEF:%[0-9]+]]:av_64_align2 = IMPLICIT_DEF
63+
; SPLIT-NEXT: S_NOP 0, implicit [[DEF]]
64+
; SPLIT-NEXT: S_ENDPGM 0
65+
%0:av_64_align2 = IMPLICIT_DEF
66+
S_NOP 0, implicit %0
67+
S_ENDPGM 0
68+
...
69+
70+
---
71+
name: constrain_highrp
72+
tracksRegLiveness: true
73+
machineFunctionInfo:
74+
isEntryFunction: true
75+
scratchRSrcReg: '$sgpr24_sgpr25_sgpr26_sgpr27'
76+
frameOffsetReg: '$sgpr32'
77+
stackPtrOffsetReg: '$sgpr32'
78+
argumentInfo:
79+
privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
80+
privateSegmentWaveByteOffset: { reg: '$sgpr33' }
81+
maxArchVGPRPressure: 255
82+
body: |
83+
bb.0:
84+
liveins: $vgpr0, $vgpr1
85+
; UNIFIED-LABEL: name: constrain_highrp
86+
; UNIFIED: liveins: $vgpr0, $vgpr1
87+
; UNIFIED-NEXT: {{ $}}
88+
; UNIFIED-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
89+
; UNIFIED-NEXT: S_NOP 0, implicit [[DEF]]
90+
; UNIFIED-NEXT: S_ENDPGM 0
91+
;
92+
; SPLIT-LABEL: name: constrain_highrp
93+
; SPLIT: liveins: $vgpr0, $vgpr1
94+
; SPLIT-NEXT: {{ $}}
95+
; SPLIT-NEXT: [[DEF:%[0-9]+]]:av_64_align2 = IMPLICIT_DEF
96+
; SPLIT-NEXT: S_NOP 0, implicit [[DEF]]
97+
; SPLIT-NEXT: S_ENDPGM 0
98+
%0:av_64_align2 = IMPLICIT_DEF
99+
S_NOP 0, implicit %0
100+
S_ENDPGM 0
101+
...
102+
103+
---
104+
name: no_constrain_highrp
105+
tracksRegLiveness: true
106+
machineFunctionInfo:
107+
isEntryFunction: true
108+
scratchRSrcReg: '$sgpr24_sgpr25_sgpr26_sgpr27'
109+
frameOffsetReg: '$sgpr32'
110+
stackPtrOffsetReg: '$sgpr32'
111+
argumentInfo:
112+
privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
113+
privateSegmentWaveByteOffset: { reg: '$sgpr33' }
114+
maxArchVGPRPressure: 256
115+
body: |
116+
bb.0:
117+
liveins: $vgpr0, $vgpr1
118+
; UNIFIED-LABEL: name: no_constrain_highrp
119+
; UNIFIED: liveins: $vgpr0, $vgpr1
120+
; UNIFIED-NEXT: {{ $}}
121+
; UNIFIED-NEXT: [[DEF:%[0-9]+]]:av_64_align2 = IMPLICIT_DEF
122+
; UNIFIED-NEXT: S_NOP 0, implicit [[DEF]]
123+
; UNIFIED-NEXT: S_ENDPGM 0
124+
;
125+
; SPLIT-LABEL: name: no_constrain_highrp
126+
; SPLIT: liveins: $vgpr0, $vgpr1
127+
; SPLIT-NEXT: {{ $}}
128+
; SPLIT-NEXT: [[DEF:%[0-9]+]]:av_64_align2 = IMPLICIT_DEF
129+
; SPLIT-NEXT: S_NOP 0, implicit [[DEF]]
130+
; SPLIT-NEXT: S_ENDPGM 0
131+
%0:av_64_align2 = IMPLICIT_DEF
132+
S_NOP 0, implicit %0
133+
S_ENDPGM 0
134+
...
135+
136+
---
137+
name: no_constrain_highrp1
138+
tracksRegLiveness: true
139+
machineFunctionInfo:
140+
isEntryFunction: true
141+
scratchRSrcReg: '$sgpr24_sgpr25_sgpr26_sgpr27'
142+
frameOffsetReg: '$sgpr32'
143+
stackPtrOffsetReg: '$sgpr32'
144+
argumentInfo:
145+
privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
146+
privateSegmentWaveByteOffset: { reg: '$sgpr33' }
147+
maxArchVGPRPressure: 257
148+
body: |
149+
bb.0:
150+
liveins: $vgpr0, $vgpr1
151+
; UNIFIED-LABEL: name: no_constrain_highrp1
152+
; UNIFIED: liveins: $vgpr0, $vgpr1
153+
; UNIFIED-NEXT: {{ $}}
154+
; UNIFIED-NEXT: [[DEF:%[0-9]+]]:av_64_align2 = IMPLICIT_DEF
155+
; UNIFIED-NEXT: S_NOP 0, implicit [[DEF]]
156+
; UNIFIED-NEXT: S_ENDPGM 0
157+
;
158+
; SPLIT-LABEL: name: no_constrain_highrp1
159+
; SPLIT: liveins: $vgpr0, $vgpr1
160+
; SPLIT-NEXT: {{ $}}
161+
; SPLIT-NEXT: [[DEF:%[0-9]+]]:av_64_align2 = IMPLICIT_DEF
162+
; SPLIT-NEXT: S_NOP 0, implicit [[DEF]]
163+
; SPLIT-NEXT: S_ENDPGM 0
164+
%0:av_64_align2 = IMPLICIT_DEF
165+
S_NOP 0, implicit %0
166+
S_ENDPGM 0
167+
...
168+
169+
---
170+
name: no_constrain_use
171+
tracksRegLiveness: true
172+
machineFunctionInfo:
173+
isEntryFunction: true
174+
scratchRSrcReg: '$sgpr24_sgpr25_sgpr26_sgpr27'
175+
frameOffsetReg: '$sgpr32'
176+
stackPtrOffsetReg: '$sgpr32'
177+
argumentInfo:
178+
privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
179+
privateSegmentWaveByteOffset: { reg: '$sgpr33' }
180+
maxArchVGPRPressure: 0
181+
body: |
182+
bb.0:
183+
liveins: $vgpr0, $vgpr1
184+
; UNIFIED-LABEL: name: no_constrain_use
185+
; UNIFIED: liveins: $vgpr0, $vgpr1
186+
; UNIFIED-NEXT: {{ $}}
187+
; UNIFIED-NEXT: [[DEF:%[0-9]+]]:av_64_align2 = IMPLICIT_DEF
188+
; UNIFIED-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, [[DEF]]
189+
; UNIFIED-NEXT: S_NOP 0, implicit [[DEF]]
190+
; UNIFIED-NEXT: S_ENDPGM 0
191+
;
192+
; SPLIT-LABEL: name: no_constrain_use
193+
; SPLIT: liveins: $vgpr0, $vgpr1
194+
; SPLIT-NEXT: {{ $}}
195+
; SPLIT-NEXT: [[DEF:%[0-9]+]]:av_64_align2 = IMPLICIT_DEF
196+
; SPLIT-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, [[DEF]]
197+
; SPLIT-NEXT: S_NOP 0, implicit [[DEF]]
198+
; SPLIT-NEXT: S_ENDPGM 0
199+
%0:av_64_align2 = IMPLICIT_DEF
200+
INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, %0
201+
S_NOP 0, implicit %0
202+
S_ENDPGM 0
203+
...

0 commit comments

Comments
 (0)