Skip to content
Open
46 changes: 16 additions & 30 deletions llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -447,11 +447,7 @@ void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
// BestSchedules aren't deleted on fail.
unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
// TODO: assert Regions are sorted descending by pressure
const auto &ST = MF.getSubtarget<GCNSubtarget>();
const unsigned DynamicVGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
const auto Occ =
Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);
const auto Occ = Regions.front()->MaxPressure.getOccupancy(MF);
LLVM_DEBUG(dbgs() << "Trying to improve occupancy, target = " << TargetOcc
<< ", current = " << Occ << '\n');

Expand All @@ -460,7 +456,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
// Always build the DAG to add mutations
BuildDAG DAG(*R, *this);

if (R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize) >= NewOcc)
if (R->MaxPressure.getOccupancy(MF) >= NewOcc)
continue;

LLVM_DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3);
Expand All @@ -471,7 +467,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
LLVM_DEBUG(dbgs() << "Occupancy improvement attempt:\n";
printSchedRP(dbgs(), R->MaxPressure, MaxRP));

NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST, DynamicVGPRBlockSize));
NewOcc = std::min(NewOcc, MaxRP.getOccupancy(MF));
if (NewOcc <= Occ)
break;

Expand All @@ -488,15 +484,12 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
}

void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
bool TryMaximizeOccupancy) {
const auto &ST = MF.getSubtarget<GCNSubtarget>();
bool TryMaximizeOccupancy) {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
auto TgtOcc = MFI->getMinAllowedOccupancy();
unsigned DynamicVGPRBlockSize = MFI->getDynamicVGPRBlockSize();

sortRegionsByPressure(TgtOcc);
auto Occ =
Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);
auto Occ = Regions.front()->MaxPressure.getOccupancy(MF);

bool IsReentry = false;
if (TryMaximizeOccupancy && Occ < TgtOcc) {
Expand Down Expand Up @@ -527,21 +520,19 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
const auto RP = getRegionPressure(*R);
LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));

if (RP.getOccupancy(ST, DynamicVGPRBlockSize) < TgtOcc) {
if (RP.getOccupancy(MF) < TgtOcc) {
LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
if (R->BestSchedule.get() && R->BestSchedule->MaxPressure.getOccupancy(
ST, DynamicVGPRBlockSize) >= TgtOcc) {
if (R->BestSchedule.get() &&
R->BestSchedule->MaxPressure.getOccupancy(MF) >= TgtOcc) {
LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
scheduleBest(*R);
} else {
LLVM_DEBUG(dbgs() << ", restoring\n");
Ovr.restoreOrder();
assert(R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize) >=
TgtOcc);
assert(R->MaxPressure.getOccupancy(MF) >= TgtOcc);
}
}
FinalOccupancy =
std::min(FinalOccupancy, RP.getOccupancy(ST, DynamicVGPRBlockSize));
FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(MF));
}
}
MFI->limitOccupancy(FinalOccupancy);
Expand Down Expand Up @@ -582,16 +573,12 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) {
///////////////////////////////////////////////////////////////////////////////
// ILP scheduler port

void GCNIterativeScheduler::scheduleILP(
bool TryMaximizeOccupancy) {
const auto &ST = MF.getSubtarget<GCNSubtarget>();
void GCNIterativeScheduler::scheduleILP(bool TryMaximizeOccupancy) {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
auto TgtOcc = MFI->getMinAllowedOccupancy();
unsigned DynamicVGPRBlockSize = MFI->getDynamicVGPRBlockSize();

sortRegionsByPressure(TgtOcc);
auto Occ =
Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);
auto Occ = Regions.front()->MaxPressure.getOccupancy(MF);

bool IsReentry = false;
if (TryMaximizeOccupancy && Occ < TgtOcc) {
Expand All @@ -612,18 +599,17 @@ void GCNIterativeScheduler::scheduleILP(
const auto RP = getSchedulePressure(*R, ILPSchedule);
LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));

if (RP.getOccupancy(ST, DynamicVGPRBlockSize) < TgtOcc) {
if (RP.getOccupancy(MF) < TgtOcc) {
LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
if (R->BestSchedule.get() && R->BestSchedule->MaxPressure.getOccupancy(
ST, DynamicVGPRBlockSize) >= TgtOcc) {
if (R->BestSchedule.get() &&
R->BestSchedule->MaxPressure.getOccupancy(MF) >= TgtOcc) {
LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
scheduleBest(*R);
}
} else {
scheduleRegion(*R, ILPSchedule, RP);
LLVM_DEBUG(printSchedResult(dbgs(), R, RP));
FinalOccupancy =
std::min(FinalOccupancy, RP.getOccupancy(ST, DynamicVGPRBlockSize));
FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(MF));
}
}
MFI->limitOccupancy(FinalOccupancy);
Expand Down
156 changes: 93 additions & 63 deletions llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,20 +99,22 @@ void GCNRegPressure::inc(unsigned Reg,
bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
unsigned MaxOccupancy) const {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned ArchVGPRThreshold = ST.getMaxNumVectorRegs(MF.getFunction()).first;
unsigned DynamicVGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();

const auto SGPROcc = std::min(MaxOccupancy,
ST.getOccupancyWithNumSGPRs(getSGPRNum()));
const auto VGPROcc = std::min(
MaxOccupancy, ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()),
DynamicVGPRBlockSize));
MaxOccupancy, ST.getOccupancyWithNumVGPRs(
getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold),
DynamicVGPRBlockSize));
const auto OtherSGPROcc = std::min(MaxOccupancy,
ST.getOccupancyWithNumSGPRs(O.getSGPRNum()));
const auto OtherVGPROcc =
std::min(MaxOccupancy,
ST.getOccupancyWithNumVGPRs(O.getVGPRNum(ST.hasGFX90AInsts()),
DynamicVGPRBlockSize));
const auto OtherVGPROcc = std::min(
MaxOccupancy, ST.getOccupancyWithNumVGPRs(
O.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold),
DynamicVGPRBlockSize));

const auto Occ = std::min(SGPROcc, VGPROcc);
const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc);
Expand All @@ -135,35 +137,39 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
unsigned OtherVGPRForSGPRSpills =
(OtherExcessSGPR + (WaveSize - 1)) / WaveSize;

unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs();

// Unified excess pressure conditions, accounting for VGPRs used for SGPR
// spills
unsigned ExcessVGPR =
std::max(static_cast<int>(getVGPRNum(ST.hasGFX90AInsts()) +
VGPRForSGPRSpills - MaxVGPRs),
0);
unsigned OtherExcessVGPR =
std::max(static_cast<int>(O.getVGPRNum(ST.hasGFX90AInsts()) +
OtherVGPRForSGPRSpills - MaxVGPRs),
0);
unsigned ExcessVGPR = std::max(
static_cast<int>(getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) +
VGPRForSGPRSpills - MaxVGPRs),
0);
unsigned OtherExcessVGPR = std::max(
static_cast<int>(O.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) +
OtherVGPRForSGPRSpills - MaxVGPRs),
0);
// Arch VGPR excess pressure conditions, accounting for VGPRs used for SGPR
// spills
unsigned ExcessArchVGPR = std::max(
static_cast<int>(getVGPRNum(false) + VGPRForSGPRSpills - MaxArchVGPRs),
0);
unsigned AddressableArchVGPRs = ST.getAddressableNumArchVGPRs();
unsigned ExcessArchVGPR =
std::max(static_cast<int>(getVGPRNum(false, ArchVGPRThreshold) +
VGPRForSGPRSpills - AddressableArchVGPRs),
0);
unsigned OtherExcessArchVGPR =
std::max(static_cast<int>(O.getVGPRNum(false) + OtherVGPRForSGPRSpills -
MaxArchVGPRs),
std::max(static_cast<int>(O.getVGPRNum(false, ArchVGPRThreshold) +
OtherVGPRForSGPRSpills - AddressableArchVGPRs),
0);
// AGPR excess pressure conditions
unsigned ExcessAGPR = std::max(
static_cast<int>(ST.hasGFX90AInsts() ? (getAGPRNum() - MaxArchVGPRs)
: (getAGPRNum() - MaxVGPRs)),
0);
unsigned ExcessAGPR =
std::max(static_cast<int>(
ST.hasGFX90AInsts()
? (getAGPRNum(ArchVGPRThreshold) - AddressableArchVGPRs)
: (getAGPRNum(ArchVGPRThreshold) - MaxVGPRs)),
0);
unsigned OtherExcessAGPR = std::max(
static_cast<int>(ST.hasGFX90AInsts() ? (O.getAGPRNum() - MaxArchVGPRs)
: (O.getAGPRNum() - MaxVGPRs)),
static_cast<int>(
ST.hasGFX90AInsts()
? (O.getAGPRNum(ArchVGPRThreshold) - AddressableArchVGPRs)
: (O.getAGPRNum(ArchVGPRThreshold) - MaxVGPRs)),
0);

bool ExcessRP = ExcessSGPR || ExcessVGPR || ExcessArchVGPR || ExcessAGPR;
Expand All @@ -184,14 +190,21 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
return VGPRDiff > 0;
if (SGPRDiff != 0) {
unsigned PureExcessVGPR =
std::max(static_cast<int>(getVGPRNum(ST.hasGFX90AInsts()) - MaxVGPRs),
std::max(static_cast<int>(
getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) -
MaxVGPRs),
0) +
std::max(static_cast<int>(getVGPRNum(false) - MaxArchVGPRs), 0);
std::max(static_cast<int>(getVGPRNum(false, ArchVGPRThreshold) -
AddressableArchVGPRs),
0);
unsigned OtherPureExcessVGPR =
std::max(
static_cast<int>(O.getVGPRNum(ST.hasGFX90AInsts()) - MaxVGPRs),
0) +
std::max(static_cast<int>(O.getVGPRNum(false) - MaxArchVGPRs), 0);
std::max(static_cast<int>(
O.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) -
MaxVGPRs),
0) +
std::max(static_cast<int>(O.getVGPRNum(false, ArchVGPRThreshold) -
AddressableArchVGPRs),
0);

// If we have a special case where there is a tie in excess VGPR, but one
// of the pressures has VGPR usage from SGPR spills, prefer the pressure
Expand Down Expand Up @@ -221,38 +234,45 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
if (SW != OtherSW)
return SW < OtherSW;
} else {
auto VW = getVGPRTuplesWeight();
auto OtherVW = O.getVGPRTuplesWeight();
auto VW = getVGPRTuplesWeight(ArchVGPRThreshold);
auto OtherVW = O.getVGPRTuplesWeight(ArchVGPRThreshold);
if (VW != OtherVW)
return VW < OtherVW;
}
}

// Give final precedence to lower general RP.
return SGPRImportant ? (getSGPRNum() < O.getSGPRNum()):
(getVGPRNum(ST.hasGFX90AInsts()) <
O.getVGPRNum(ST.hasGFX90AInsts()));
return SGPRImportant ? (getSGPRNum() < O.getSGPRNum())
: (getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) <
O.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold));
}

/// Return a Printable that writes a one-line summary of \p RP.
///
/// The line contains the ArchVGPR and AGPR counts, the SGPR count, and the
/// large-tuple weights for VGPRs and SGPRs. When \p ST is non-null the
/// per-register-kind occupancies are appended in "(O<n>)" form; when both
/// \p ST and \p MF are non-null the overall occupancy is appended as well.
///
/// \param RP pressure to print; captured by reference, so it must outlive the
///        returned Printable.
/// \param ST optional subtarget used for the occupancy queries.
/// \param DynamicVGPRBlockSize forwarded to getOccupancyWithNumVGPRs.
/// \param MF optional function; supplies the ArchVGPR threshold and is needed
///        for the overall occupancy.
Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST,
                      unsigned DynamicVGPRBlockSize,
                      const MachineFunction *MF) {
  // Without both a subtarget and a function there is nothing to query the
  // ArchVGPR limit from, so the threshold stays at the maximum unsigned value.
  unsigned ArchVGPRThreshold = std::numeric_limits<unsigned int>::max();
  if (ST && MF)
    ArchVGPRThreshold = ST->getMaxNumVectorRegs(MF->getFunction()).first;

  return Printable(
      [&RP, ST, DynamicVGPRBlockSize, ArchVGPRThreshold, MF](raw_ostream &OS) {
        OS << "VGPRs: " << RP.getArchVGPRNum(ArchVGPRThreshold) << ' '
           << "AGPRs: " << RP.getAGPRNum(ArchVGPRThreshold);
        if (ST)
          OS << "(O"
             << ST->getOccupancyWithNumVGPRs(
                    RP.getVGPRNum(ST->hasGFX90AInsts(), ArchVGPRThreshold),
                    DynamicVGPRBlockSize)
             << ')';
        OS << ", SGPRs: " << RP.getSGPRNum();
        if (ST)
          OS << "(O" << ST->getOccupancyWithNumSGPRs(RP.getSGPRNum()) << ')';
        OS << ", LVGPR WT: " << RP.getVGPRTuplesWeight(ArchVGPRThreshold)
           << ", LSGPR WT: " << RP.getSGPRTuplesWeight();
        // The overall occupancy is computed from the MachineFunction. MF is
        // treated as nullable above, so it must be checked here too: guarding
        // on ST alone would dereference a null MF when only a subtarget is
        // supplied.
        if (ST && MF)
          OS << " -> Occ: " << RP.getOccupancy(*MF);
        OS << '\n';
      });
}

static LaneBitmask getDefRegMask(const MachineOperand &MO,
Expand Down Expand Up @@ -398,8 +418,9 @@ void GCNRPTarget::setRegLimits(unsigned NumSGPRs, unsigned NumVGPRs,
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned DynamicVGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
AddressableNumArchVGPRs = ST.getAddressableNumArchVGPRs();
MaxSGPRs = std::min(ST.getAddressableNumSGPRs(), NumSGPRs);
MaxVGPRs = std::min(ST.getAddressableNumArchVGPRs(), NumVGPRs);
MaxVGPRs = std::min(AddressableNumArchVGPRs, NumVGPRs);
MaxUnifiedVGPRs =
ST.hasGFX90AInsts()
? std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs)
Expand All @@ -414,15 +435,21 @@ bool GCNRPTarget::isSaveBeneficial(Register Reg,

if (SRI->isSGPRClass(RC))
return RP.getSGPRNum() > MaxSGPRs;
unsigned NumVGPRs =
SRI->isAGPRClass(RC) ? RP.getAGPRNum() : RP.getArchVGPRNum();

bool ShouldUseAGPR =
SRI->isAGPRClass(RC) ||
(SRI->isVectorSuperClass(RC) &&
RP.getArchVGPRNum(AddressableNumArchVGPRs) >= AddressableNumArchVGPRs);
unsigned NumVGPRs = ShouldUseAGPR
? RP.getAGPRNum(AddressableNumArchVGPRs)
: RP.getArchVGPRNum(AddressableNumArchVGPRs);
return isVGPRBankSaveBeneficial(NumVGPRs);
}

bool GCNRPTarget::satisfied() const {
if (RP.getSGPRNum() > MaxSGPRs)
return false;
if (RP.getVGPRNum(false) > MaxVGPRs &&
if (RP.getVGPRNum(false, AddressableNumArchVGPRs) > MaxVGPRs &&
(!CombineVGPRSavings || !satisifiesVGPRBanksTarget()))
return false;
return satisfiesUnifiedTarget();
Expand Down Expand Up @@ -876,10 +903,13 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) {

OS << "---\nname: " << MF.getName() << "\nbody: |\n";

auto printRP = [](const GCNRegPressure &RP) {
return Printable([&RP](raw_ostream &OS) {
auto printRP = [&MF](const GCNRegPressure &RP) {
return Printable([&RP, &MF](raw_ostream &OS) {
OS << format(PFX " %-5d", RP.getSGPRNum())
<< format(" %-5d", RP.getVGPRNum(false));
<< format(" %-5d", RP.getVGPRNum(false, MF.getSubtarget<GCNSubtarget>()
.getMaxNumVectorRegs(
MF.getFunction())
.first));
});
};

Expand Down
Loading