diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index f253a841f16a6..87f5b9f16868a 100644 --- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -447,11 +447,7 @@ void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) { // BestSchedules aren't deleted on fail. unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) { // TODO: assert Regions are sorted descending by pressure - const auto &ST = MF.getSubtarget(); - const unsigned DynamicVGPRBlockSize = - MF.getInfo()->getDynamicVGPRBlockSize(); - const auto Occ = - Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize); + const auto Occ = Regions.front()->MaxPressure.getOccupancy(MF); LLVM_DEBUG(dbgs() << "Trying to improve occupancy, target = " << TargetOcc << ", current = " << Occ << '\n'); @@ -460,7 +456,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) { // Always build the DAG to add mutations BuildDAG DAG(*R, *this); - if (R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize) >= NewOcc) + if (R->MaxPressure.getOccupancy(MF) >= NewOcc) continue; LLVM_DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3); @@ -471,7 +467,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) { LLVM_DEBUG(dbgs() << "Occupancy improvement attempt:\n"; printSchedRP(dbgs(), R->MaxPressure, MaxRP)); - NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST, DynamicVGPRBlockSize)); + NewOcc = std::min(NewOcc, MaxRP.getOccupancy(MF)); if (NewOcc <= Occ) break; @@ -488,15 +484,12 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) { } void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( - bool TryMaximizeOccupancy) { - const auto &ST = MF.getSubtarget(); + bool TryMaximizeOccupancy) { SIMachineFunctionInfo *MFI = MF.getInfo(); auto TgtOcc = MFI->getMinAllowedOccupancy(); - unsigned DynamicVGPRBlockSize = 
MFI->getDynamicVGPRBlockSize(); sortRegionsByPressure(TgtOcc); - auto Occ = - Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize); + auto Occ = Regions.front()->MaxPressure.getOccupancy(MF); bool IsReentry = false; if (TryMaximizeOccupancy && Occ < TgtOcc) { @@ -527,21 +520,19 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( const auto RP = getRegionPressure(*R); LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP)); - if (RP.getOccupancy(ST, DynamicVGPRBlockSize) < TgtOcc) { + if (RP.getOccupancy(MF) < TgtOcc) { LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc); - if (R->BestSchedule.get() && R->BestSchedule->MaxPressure.getOccupancy( - ST, DynamicVGPRBlockSize) >= TgtOcc) { + if (R->BestSchedule.get() && + R->BestSchedule->MaxPressure.getOccupancy(MF) >= TgtOcc) { LLVM_DEBUG(dbgs() << ", scheduling minimal register\n"); scheduleBest(*R); } else { LLVM_DEBUG(dbgs() << ", restoring\n"); Ovr.restoreOrder(); - assert(R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize) >= - TgtOcc); + assert(R->MaxPressure.getOccupancy(MF) >= TgtOcc); } } - FinalOccupancy = - std::min(FinalOccupancy, RP.getOccupancy(ST, DynamicVGPRBlockSize)); + FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(MF)); } } MFI->limitOccupancy(FinalOccupancy); @@ -582,16 +573,12 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) { /////////////////////////////////////////////////////////////////////////////// // ILP scheduler port -void GCNIterativeScheduler::scheduleILP( - bool TryMaximizeOccupancy) { - const auto &ST = MF.getSubtarget(); +void GCNIterativeScheduler::scheduleILP(bool TryMaximizeOccupancy) { SIMachineFunctionInfo *MFI = MF.getInfo(); auto TgtOcc = MFI->getMinAllowedOccupancy(); - unsigned DynamicVGPRBlockSize = MFI->getDynamicVGPRBlockSize(); sortRegionsByPressure(TgtOcc); - auto Occ = - Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize); + auto Occ = Regions.front()->MaxPressure.getOccupancy(MF); bool 
IsReentry = false; if (TryMaximizeOccupancy && Occ < TgtOcc) { @@ -612,18 +599,17 @@ void GCNIterativeScheduler::scheduleILP( const auto RP = getSchedulePressure(*R, ILPSchedule); LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP)); - if (RP.getOccupancy(ST, DynamicVGPRBlockSize) < TgtOcc) { + if (RP.getOccupancy(MF) < TgtOcc) { LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc); - if (R->BestSchedule.get() && R->BestSchedule->MaxPressure.getOccupancy( - ST, DynamicVGPRBlockSize) >= TgtOcc) { + if (R->BestSchedule.get() && + R->BestSchedule->MaxPressure.getOccupancy(MF) >= TgtOcc) { LLVM_DEBUG(dbgs() << ", scheduling minimal register\n"); scheduleBest(*R); } } else { scheduleRegion(*R, ILPSchedule, RP); LLVM_DEBUG(printSchedResult(dbgs(), R, RP)); - FinalOccupancy = - std::min(FinalOccupancy, RP.getOccupancy(ST, DynamicVGPRBlockSize)); + FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(MF)); } } MFI->limitOccupancy(FinalOccupancy); diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 334afd3a2a5b4..dd007e6cd6b31 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -99,20 +99,22 @@ void GCNRegPressure::inc(unsigned Reg, bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O, unsigned MaxOccupancy) const { const GCNSubtarget &ST = MF.getSubtarget(); + unsigned ArchVGPRThreshold = ST.getMaxNumVectorRegs(MF.getFunction()).first; unsigned DynamicVGPRBlockSize = MF.getInfo()->getDynamicVGPRBlockSize(); const auto SGPROcc = std::min(MaxOccupancy, ST.getOccupancyWithNumSGPRs(getSGPRNum())); const auto VGPROcc = std::min( - MaxOccupancy, ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()), - DynamicVGPRBlockSize)); + MaxOccupancy, ST.getOccupancyWithNumVGPRs( + getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold), + DynamicVGPRBlockSize)); const auto OtherSGPROcc = std::min(MaxOccupancy, 
ST.getOccupancyWithNumSGPRs(O.getSGPRNum())); - const auto OtherVGPROcc = - std::min(MaxOccupancy, - ST.getOccupancyWithNumVGPRs(O.getVGPRNum(ST.hasGFX90AInsts()), - DynamicVGPRBlockSize)); + const auto OtherVGPROcc = std::min( + MaxOccupancy, ST.getOccupancyWithNumVGPRs( + O.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold), + DynamicVGPRBlockSize)); const auto Occ = std::min(SGPROcc, VGPROcc); const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc); @@ -135,35 +137,39 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O, unsigned OtherVGPRForSGPRSpills = (OtherExcessSGPR + (WaveSize - 1)) / WaveSize; - unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs(); - // Unified excess pressure conditions, accounting for VGPRs used for SGPR // spills - unsigned ExcessVGPR = - std::max(static_cast(getVGPRNum(ST.hasGFX90AInsts()) + - VGPRForSGPRSpills - MaxVGPRs), - 0); - unsigned OtherExcessVGPR = - std::max(static_cast(O.getVGPRNum(ST.hasGFX90AInsts()) + - OtherVGPRForSGPRSpills - MaxVGPRs), - 0); + unsigned ExcessVGPR = std::max( + static_cast(getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) + + VGPRForSGPRSpills - MaxVGPRs), + 0); + unsigned OtherExcessVGPR = std::max( + static_cast(O.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) + + OtherVGPRForSGPRSpills - MaxVGPRs), + 0); // Arch VGPR excess pressure conditions, accounting for VGPRs used for SGPR // spills - unsigned ExcessArchVGPR = std::max( - static_cast(getVGPRNum(false) + VGPRForSGPRSpills - MaxArchVGPRs), - 0); + unsigned AddressableArchVGPRs = ST.getAddressableNumArchVGPRs(); + unsigned ExcessArchVGPR = + std::max(static_cast(getVGPRNum(false, ArchVGPRThreshold) + + VGPRForSGPRSpills - AddressableArchVGPRs), + 0); unsigned OtherExcessArchVGPR = - std::max(static_cast(O.getVGPRNum(false) + OtherVGPRForSGPRSpills - - MaxArchVGPRs), + std::max(static_cast(O.getVGPRNum(false, ArchVGPRThreshold) + + OtherVGPRForSGPRSpills - AddressableArchVGPRs), 0); // AGPR excess pressure 
conditions - unsigned ExcessAGPR = std::max( - static_cast(ST.hasGFX90AInsts() ? (getAGPRNum() - MaxArchVGPRs) - : (getAGPRNum() - MaxVGPRs)), - 0); + unsigned ExcessAGPR = + std::max(static_cast( + ST.hasGFX90AInsts() + ? (getAGPRNum(ArchVGPRThreshold) - AddressableArchVGPRs) + : (getAGPRNum(ArchVGPRThreshold) - MaxVGPRs)), + 0); unsigned OtherExcessAGPR = std::max( - static_cast(ST.hasGFX90AInsts() ? (O.getAGPRNum() - MaxArchVGPRs) - : (O.getAGPRNum() - MaxVGPRs)), + static_cast( + ST.hasGFX90AInsts() + ? (O.getAGPRNum(ArchVGPRThreshold) - AddressableArchVGPRs) + : (O.getAGPRNum(ArchVGPRThreshold) - MaxVGPRs)), 0); bool ExcessRP = ExcessSGPR || ExcessVGPR || ExcessArchVGPR || ExcessAGPR; @@ -184,14 +190,21 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O, return VGPRDiff > 0; if (SGPRDiff != 0) { unsigned PureExcessVGPR = - std::max(static_cast(getVGPRNum(ST.hasGFX90AInsts()) - MaxVGPRs), + std::max(static_cast( + getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) - + MaxVGPRs), 0) + - std::max(static_cast(getVGPRNum(false) - MaxArchVGPRs), 0); + std::max(static_cast(getVGPRNum(false, ArchVGPRThreshold) - + AddressableArchVGPRs), + 0); unsigned OtherPureExcessVGPR = - std::max( - static_cast(O.getVGPRNum(ST.hasGFX90AInsts()) - MaxVGPRs), - 0) + - std::max(static_cast(O.getVGPRNum(false) - MaxArchVGPRs), 0); + std::max(static_cast( + O.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) - + MaxVGPRs), + 0) + + std::max(static_cast(O.getVGPRNum(false, ArchVGPRThreshold) - + AddressableArchVGPRs), + 0); // If we have a special case where there is a tie in excess VGPR, but one // of the pressures has VGPR usage from SGPR spills, prefer the pressure @@ -221,38 +234,45 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O, if (SW != OtherSW) return SW < OtherSW; } else { - auto VW = getVGPRTuplesWeight(); - auto OtherVW = O.getVGPRTuplesWeight(); + auto VW = getVGPRTuplesWeight(ArchVGPRThreshold); + auto OtherVW = 
O.getVGPRTuplesWeight(ArchVGPRThreshold); if (VW != OtherVW) return VW < OtherVW; } } // Give final precedence to lower general RP. - return SGPRImportant ? (getSGPRNum() < O.getSGPRNum()): - (getVGPRNum(ST.hasGFX90AInsts()) < - O.getVGPRNum(ST.hasGFX90AInsts())); + return SGPRImportant ? (getSGPRNum() < O.getSGPRNum()) + : (getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) < + O.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold)); } Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST, - unsigned DynamicVGPRBlockSize) { - return Printable([&RP, ST, DynamicVGPRBlockSize](raw_ostream &OS) { - OS << "VGPRs: " << RP.getArchVGPRNum() << ' ' - << "AGPRs: " << RP.getAGPRNum(); - if (ST) - OS << "(O" - << ST->getOccupancyWithNumVGPRs(RP.getVGPRNum(ST->hasGFX90AInsts()), - DynamicVGPRBlockSize) - << ')'; - OS << ", SGPRs: " << RP.getSGPRNum(); - if (ST) - OS << "(O" << ST->getOccupancyWithNumSGPRs(RP.getSGPRNum()) << ')'; - OS << ", LVGPR WT: " << RP.getVGPRTuplesWeight() - << ", LSGPR WT: " << RP.getSGPRTuplesWeight(); - if (ST) - OS << " -> Occ: " << RP.getOccupancy(*ST, DynamicVGPRBlockSize); - OS << '\n'; - }); + unsigned DynamicVGPRBlockSize, + const MachineFunction *MF) { + unsigned ArchVGPRThreshold = std::numeric_limits<unsigned>::max(); + if (ST && MF) + ArchVGPRThreshold = ST->getMaxNumVectorRegs(MF->getFunction()).first; + + return Printable( + [&RP, ST, DynamicVGPRBlockSize, ArchVGPRThreshold, MF](raw_ostream &OS) { + OS << "VGPRs: " << RP.getArchVGPRNum(ArchVGPRThreshold) << ' ' + << "AGPRs: " << RP.getAGPRNum(ArchVGPRThreshold); + if (ST) + OS << "(O" + << ST->getOccupancyWithNumVGPRs( + RP.getVGPRNum(ST->hasGFX90AInsts(), ArchVGPRThreshold), + DynamicVGPRBlockSize) + << ')'; + OS << ", SGPRs: " << RP.getSGPRNum(); + if (ST) + OS << "(O" << ST->getOccupancyWithNumSGPRs(RP.getSGPRNum()) << ')'; + OS << ", LVGPR WT: " << RP.getVGPRTuplesWeight(ArchVGPRThreshold) + << ", LSGPR WT: " << RP.getSGPRTuplesWeight(); + if (ST && MF) + OS << " -> Occ: " << 
RP.getOccupancy(*MF); + OS << '\n'; + }); } static LaneBitmask getDefRegMask(const MachineOperand &MO, @@ -398,8 +418,9 @@ void GCNRPTarget::setRegLimits(unsigned NumSGPRs, unsigned NumVGPRs, const GCNSubtarget &ST = MF.getSubtarget(); unsigned DynamicVGPRBlockSize = MF.getInfo()->getDynamicVGPRBlockSize(); + AddressableNumArchVGPRs = ST.getAddressableNumArchVGPRs(); MaxSGPRs = std::min(ST.getAddressableNumSGPRs(), NumSGPRs); - MaxVGPRs = std::min(ST.getAddressableNumArchVGPRs(), NumVGPRs); + MaxVGPRs = std::min(AddressableNumArchVGPRs, NumVGPRs); MaxUnifiedVGPRs = ST.hasGFX90AInsts() ? std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs) @@ -414,15 +435,21 @@ bool GCNRPTarget::isSaveBeneficial(Register Reg, if (SRI->isSGPRClass(RC)) return RP.getSGPRNum() > MaxSGPRs; - unsigned NumVGPRs = - SRI->isAGPRClass(RC) ? RP.getAGPRNum() : RP.getArchVGPRNum(); + + bool ShouldUseAGPR = + SRI->isAGPRClass(RC) || + (SRI->isVectorSuperClass(RC) && + RP.getArchVGPRNum(AddressableNumArchVGPRs) >= AddressableNumArchVGPRs); + unsigned NumVGPRs = ShouldUseAGPR + ? 
RP.getAGPRNum(AddressableNumArchVGPRs) + : RP.getArchVGPRNum(AddressableNumArchVGPRs); return isVGPRBankSaveBeneficial(NumVGPRs); } bool GCNRPTarget::satisfied() const { if (RP.getSGPRNum() > MaxSGPRs) return false; - if (RP.getVGPRNum(false) > MaxVGPRs && + if (RP.getVGPRNum(false, AddressableNumArchVGPRs) > MaxVGPRs && (!CombineVGPRSavings || !satisifiesVGPRBanksTarget())) return false; return satisfiesUnifiedTarget(); @@ -876,10 +903,13 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) { OS << "---\nname: " << MF.getName() << "\nbody: |\n"; - auto printRP = [](const GCNRegPressure &RP) { - return Printable([&RP](raw_ostream &OS) { + auto printRP = [&MF](const GCNRegPressure &RP) { + return Printable([&RP, &MF](raw_ostream &OS) { OS << format(PFX " %-5d", RP.getSGPRNum()) - << format(" %-5d", RP.getVGPRNum(false)); + << format(" %-5d", RP.getVGPRNum(false, MF.getSubtarget() + .getMaxNumVectorRegs( + MF.getFunction()) + .first)); }); }; diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index ea33a229110c1..8b80cc42c9bb0 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -18,6 +18,7 @@ #define LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H #include "GCNSubtarget.h" +#include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/RegisterPressure.h" #include @@ -43,51 +44,98 @@ struct GCNRegPressure { /// \returns the SGPR32 pressure unsigned getSGPRNum() const { return Value[SGPR]; } - /// \returns the aggregated ArchVGPR32, AccVGPR32, and Pseudo AVGPR pressure - /// dependent upon \p UnifiedVGPRFile - unsigned getVGPRNum(bool UnifiedVGPRFile) const { + unsigned getVGPRNum(bool UnifiedVGPRFile, + unsigned AddressableArchVGPR) const { if (UnifiedVGPRFile) { - return Value[AGPR] - ? getUnifiedVGPRNum(Value[VGPR], Value[AGPR], Value[AVGPR]) - : Value[VGPR] + Value[AVGPR]; + return Value[AGPR] || Value[AVGPR] + ? 
getUnifiedVGPRNum(Value[VGPR], Value[AGPR], Value[AVGPR], + AddressableArchVGPR) + : Value[VGPR]; } // AVGPR assignment priority is based on the width of the register. Account // AVGPR pressure as VGPR. return std::max(Value[VGPR] + Value[AVGPR], Value[AGPR]); } + inline static unsigned getAVGPRsAsVGPRsNum(unsigned NumArchVGPRs, + unsigned NumAVGPRs, + unsigned AddressableArchVGPR) { + + return NumArchVGPRs < AddressableArchVGPR + ? std::min((AddressableArchVGPR - NumArchVGPRs), NumAVGPRs) + : 0; + } + + inline static unsigned getAVGPRsAsAGPRsNum(unsigned NumArchVGPRs, + unsigned NumAGPRs, + unsigned NumAVGPRs, + unsigned AddressableArchVGPR) { + unsigned AVGPRsAsVGPRs = + getAVGPRsAsVGPRsNum(NumArchVGPRs, NumAVGPRs, AddressableArchVGPR); + return NumAVGPRs > AVGPRsAsVGPRs ? NumAVGPRs - AVGPRsAsVGPRs : 0; + } + /// Returns the aggregated VGPR pressure, assuming \p NumArchVGPRs ArchVGPRs /// \p NumAGPRs AGPRS, and \p NumAVGPRs AVGPRs for a target with a unified /// VGPR file. inline static unsigned getUnifiedVGPRNum(unsigned NumArchVGPRs, unsigned NumAGPRs, - unsigned NumAVGPRs) { - - // Assume AVGPRs will be assigned as VGPRs. - return alignTo(NumArchVGPRs + NumAVGPRs, + unsigned NumAVGPRs, + unsigned AddressableArchVGPR) { + + // Until we hit the VGPRThreshold, we will assign AV as VGPR. After that + // point, we will assign as AGPR. 
+ unsigned AVGPRsAsVGPRs = + getAVGPRsAsVGPRsNum(NumArchVGPRs, NumAVGPRs, AddressableArchVGPR); + unsigned AVGPRsAsAGPRs = getAVGPRsAsAGPRsNum( + NumArchVGPRs, NumAGPRs, NumAVGPRs, AddressableArchVGPR); + return alignTo(NumArchVGPRs + AVGPRsAsVGPRs, AMDGPU::IsaInfo::getArchVGPRAllocGranule()) + - NumAGPRs; + NumAGPRs + AVGPRsAsAGPRs; } /// \returns the ArchVGPR32 pressure, plus the AVGPRS which we assume will be /// allocated as VGPR - unsigned getArchVGPRNum() const { return Value[VGPR] + Value[AVGPR]; } + unsigned getArchVGPRNum(unsigned AddressableArchVGPR) const { + unsigned AVGPRsAsVGPRs = + getAVGPRsAsVGPRsNum(Value[VGPR], Value[AVGPR], AddressableArchVGPR); + + return Value[VGPR] + AVGPRsAsVGPRs; + } /// \returns the AccVGPR32 pressure - unsigned getAGPRNum() const { return Value[AGPR]; } + unsigned getAGPRNum(unsigned AddressableArchVGPR) const { + unsigned AVGPRsAsAGPRs = getAVGPRsAsAGPRsNum( + Value[VGPR], Value[AGPR], Value[AVGPR], AddressableArchVGPR); + + return Value[AGPR] + AVGPRsAsAGPRs; + } /// \returns the AVGPR32 pressure unsigned getAVGPRNum() const { return Value[AVGPR]; } - unsigned getVGPRTuplesWeight() const { - return std::max(Value[TOTAL_KINDS + VGPR] + Value[TOTAL_KINDS + AVGPR], - Value[TOTAL_KINDS + AGPR]); + unsigned getVGPRTuplesWeight(unsigned AddressableArchVGPR) const { + unsigned AVGPRsAsVGPRs = + getAVGPRsAsVGPRsNum(Value[TOTAL_KINDS + VGPR], + Value[TOTAL_KINDS + AVGPR], AddressableArchVGPR); + unsigned AVGPRsAsAGPRs = getAVGPRsAsAGPRsNum( + Value[TOTAL_KINDS + VGPR], Value[TOTAL_KINDS + AGPR], + Value[TOTAL_KINDS + AVGPR], AddressableArchVGPR); + + return std::max(Value[TOTAL_KINDS + VGPR] + AVGPRsAsVGPRs, + Value[TOTAL_KINDS + AGPR] + AVGPRsAsAGPRs); } unsigned getSGPRTuplesWeight() const { return Value[TOTAL_KINDS + SGPR]; } - unsigned getOccupancy(const GCNSubtarget &ST, - unsigned DynamicVGPRBlockSize) const { - return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()), - 
ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()), - DynamicVGPRBlockSize)); + unsigned getOccupancy(const MachineFunction &MF) const { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + unsigned DynamicVGPRBlockSize = + MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); + + return std::min( + ST.getOccupancyWithNumSGPRs(getSGPRNum()), + ST.getOccupancyWithNumVGPRs( + getVGPRNum(ST.hasGFX90AInsts(), + ST.getMaxNumVectorRegs(MF.getFunction()).first), + DynamicVGPRBlockSize)); } void inc(unsigned Reg, @@ -95,10 +143,9 @@ struct GCNRegPressure { LaneBitmask NewMask, const MachineRegisterInfo &MRI); - bool higherOccupancy(const GCNSubtarget &ST, const GCNRegPressure &O, - unsigned DynamicVGPRBlockSize) const { - return getOccupancy(ST, DynamicVGPRBlockSize) > - O.getOccupancy(ST, DynamicVGPRBlockSize); + bool higherOccupancy(const GCNRegPressure &O, + const MachineFunction &MF) const { + return getOccupancy(MF) > O.getOccupancy(MF); } /// Compares \p this GCNRegpressure to \p O, returning true if \p this is @@ -151,7 +198,7 @@ struct GCNRegPressure { friend GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2); friend Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST, - unsigned DynamicVGPRBlockSize); + unsigned DynamicVGPRBlockSize, const MachineFunction *MF); }; @@ -220,16 +267,19 @@ class GCNRPTarget { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) friend raw_ostream &operator<<(raw_ostream &OS, const GCNRPTarget &Target) { OS << "Actual/Target: " << Target.RP.getSGPRNum() << '/' << Target.MaxSGPRs - << " SGPRs, " << Target.RP.getArchVGPRNum() << '/' << Target.MaxVGPRs - << " ArchVGPRs, " << Target.RP.getAGPRNum() << '/' << Target.MaxVGPRs - << " AGPRs"; + << " SGPRs, " << Target.RP.getArchVGPRNum(Target.AddressableNumArchVGPRs) + << '/' << Target.MaxVGPRs << " ArchVGPRs, " + << Target.RP.getAGPRNum(Target.AddressableNumArchVGPRs) << '/' + << Target.MaxVGPRs << " AGPRs"; if (Target.MaxUnifiedVGPRs) { - OS << ", " << Target.RP.getVGPRNum(true) 
<< '/' << Target.MaxUnifiedVGPRs - << " VGPRs (unified)"; + OS << ", " << Target.RP.getVGPRNum(true, Target.AddressableNumArchVGPRs) + << '/' << Target.MaxUnifiedVGPRs << " VGPRs (unified)"; } else if (Target.CombineVGPRSavings) { - OS << ", " << Target.RP.getArchVGPRNum() + Target.RP.getAGPRNum() << '/' - << 2 * Target.MaxVGPRs << " VGPRs (combined target)"; + OS << ", " + << Target.RP.getArchVGPRNum(Target.AddressableNumArchVGPRs) + + Target.RP.getAGPRNum(Target.AddressableNumArchVGPRs) + << '/' << 2 * Target.MaxVGPRs << " VGPRs (combined target)"; } return OS; } @@ -238,7 +288,6 @@ class GCNRPTarget { private: /// Current register pressure. GCNRegPressure RP; - /// Target number of SGPRs. unsigned MaxSGPRs; /// Target number of ArchVGPRs and AGPRs. @@ -246,6 +295,8 @@ class GCNRPTarget { /// Target number of overall VGPRs for subtargets with unified RFs. Always 0 /// for subtargets with non-unified RFs. unsigned MaxUnifiedVGPRs; + /// The maximum number of arch vgprs allowed by the subtarget. + unsigned AddressableNumArchVGPRs; /// Whether we consider that the register allocator will be able to swap /// between ArchVGPRs and AGPRs by copying them to a super register class. /// Concretely, this allows savings in one of the VGPR banks to help toward @@ -254,12 +305,15 @@ class GCNRPTarget { inline bool satisifiesVGPRBanksTarget() const { assert(CombineVGPRSavings && "only makes sense with combined savings"); - return RP.getArchVGPRNum() + RP.getAGPRNum() <= 2 * MaxVGPRs; + return RP.getArchVGPRNum(AddressableNumArchVGPRs) + + RP.getAGPRNum(AddressableNumArchVGPRs) <= + 2 * MaxVGPRs; } /// Always satisified when the subtarget doesn't have a unified RF. 
inline bool satisfiesUnifiedTarget() const { - return !MaxUnifiedVGPRs || RP.getVGPRNum(true) <= MaxUnifiedVGPRs; + return !MaxUnifiedVGPRs || + RP.getVGPRNum(true, AddressableNumArchVGPRs) <= MaxUnifiedVGPRs; } inline bool isVGPRBankSaveBeneficial(unsigned NumVGPRs) const { @@ -517,7 +571,8 @@ bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2); Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST = nullptr, - unsigned DynamicVGPRBlockSize = 0); + unsigned DynamicVGPRBlockSize = 0, + const MachineFunction *MF = nullptr); Printable print(const GCNRPTracker::LiveRegSet &LiveRegs, const MachineRegisterInfo &MRI); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index ce1ce687d0038..3cf9a7c0f972e 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -190,10 +190,14 @@ static void getRegisterPressures( TempUpwardTracker.recede(*MI); NewPressure = TempUpwardTracker.getPressure(); } + unsigned ArchVGPRThreshold = DAG->MF.getSubtarget() + .getMaxNumVectorRegs(DAG->MF.getFunction()) + .first; Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum(); Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = - NewPressure.getArchVGPRNum(); - Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum(); + NewPressure.getArchVGPRNum(ArchVGPRThreshold); + Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = + NewPressure.getAGPRNum(ArchVGPRThreshold); } void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, @@ -339,7 +343,10 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, ? 
static_cast(&UpwardTracker) : static_cast(&DownwardTracker); SGPRPressure = T->getPressure().getSGPRNum(); - VGPRPressure = T->getPressure().getArchVGPRNum(); + VGPRPressure = T->getPressure().getArchVGPRNum( + DAG->MF.getSubtarget() + .getMaxNumVectorRegs(DAG->MF.getFunction()) + .first); } } ReadyQueue &Q = Zone.Available; @@ -1140,8 +1147,7 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() { if (DAG.MinOccupancy > InitialOccupancy) { for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX) DAG.RegionsWithMinOcc[IDX] = - DAG.Pressure[IDX].getOccupancy( - DAG.ST, DAG.MFI.getDynamicVGPRBlockSize()) == DAG.MinOccupancy; + DAG.Pressure[IDX].getOccupancy(DAG.MF) == DAG.MinOccupancy; LLVM_DEBUG(dbgs() << StageID << " stage successfully increased occupancy to " @@ -1193,8 +1199,10 @@ bool GCNSchedStage::initGCNRegion() { dbgs() << "Pressure before scheduling:\nRegion live-ins:" << print(DAG.LiveIns[RegionIdx], DAG.MRI) << "Region live-in pressure: " - << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx])) - << "Region register pressure: " << print(PressureBefore)); + << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]), + &ST, 0, &MF) + << "Region register pressure: " + << print(PressureBefore, &ST, 0, &MF)); S.HasHighPressure = false; S.KnownExcessRP = isRegionWithExcessRP(); @@ -1275,17 +1283,17 @@ void GCNSchedStage::checkScheduling() { // Check the results of scheduling. 
PressureAfter = DAG.getRealRegPressure(RegionIdx); - LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter)); + LLVM_DEBUG(dbgs() << "Pressure after scheduling: " + << print(PressureAfter, &ST, 0, &MF)); LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n"); - unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize(); - + unsigned ArchVGPRThreshold = ST.getMaxNumVectorRegs(MF.getFunction()).first; if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && - PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) { + PressureAfter.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) <= + S.VGPRCriticalLimit) { DAG.Pressure[RegionIdx] = PressureAfter; DAG.RegionsWithMinOcc[RegionIdx] = - PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) == - DAG.MinOccupancy; + PressureAfter.getOccupancy(DAG.MF) == DAG.MinOccupancy; // Early out if we have achieved the occupancy target. LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n"); @@ -1294,10 +1302,10 @@ void GCNSchedStage::checkScheduling() { unsigned TargetOccupancy = std::min( S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second); - unsigned WavesAfter = std::min( - TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize)); - unsigned WavesBefore = std::min( - TargetOccupancy, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize)); + unsigned WavesAfter = + std::min(TargetOccupancy, PressureAfter.getOccupancy(DAG.MF)); + unsigned WavesBefore = + std::min(TargetOccupancy, PressureBefore.getOccupancy(DAG.MF)); LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore << ", after " << WavesAfter << ".\n"); @@ -1331,9 +1339,10 @@ void GCNSchedStage::checkScheduling() { unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs()); unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF); - if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs || - PressureAfter.getArchVGPRNum() > MaxArchVGPRs || - PressureAfter.getAGPRNum() > 
MaxArchVGPRs || + if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) > + MaxVGPRs || + PressureAfter.getArchVGPRNum(ArchVGPRThreshold) > MaxArchVGPRs || + PressureAfter.getAGPRNum(ArchVGPRThreshold) > MaxArchVGPRs || PressureAfter.getSGPRNum() > MaxSGPRs) { DAG.RegionsWithHighRP[RegionIdx] = true; DAG.RegionsWithExcessRP[RegionIdx] = true; @@ -1346,8 +1355,7 @@ void GCNSchedStage::checkScheduling() { } else { DAG.Pressure[RegionIdx] = PressureAfter; DAG.RegionsWithMinOcc[RegionIdx] = - PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) == - DAG.MinOccupancy; + PressureAfter.getOccupancy(DAG.MF) == DAG.MinOccupancy; } } @@ -1471,12 +1479,13 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) { // For dynamic VGPR mode, we don't want to waste any VGPR blocks. if (DAG.MFI.isDynamicVGPREnabled()) { + unsigned ArchVGPRThreshold = ST.getMaxNumVectorRegs(MF.getFunction()).first; unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( &ST, DAG.MFI.getDynamicVGPRBlockSize(), - PressureBefore.getVGPRNum(false)); + PressureBefore.getVGPRNum(false, ArchVGPRThreshold)); unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( &ST, DAG.MFI.getDynamicVGPRBlockSize(), - PressureAfter.getVGPRNum(false)); + PressureAfter.getVGPRNum(false, ArchVGPRThreshold)); if (BlocksAfter > BlocksBefore) return true; } @@ -1500,8 +1509,7 @@ bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) { // If RP is not reduced in the unclustered reschedule stage, revert to the // old schedule. 
- if ((WavesAfter <= - PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) && + if ((WavesAfter <= PressureBefore.getOccupancy(DAG.MF) && mayCauseSpilling(WavesAfter)) || GCNSchedStage::shouldRevertScheduling(WavesAfter)) { LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n"); @@ -1523,9 +1531,8 @@ bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) { ScheduleMetrics MAfter = getScheduleMetrics(DAG); unsigned OldMetric = MBefore.getMetric(); unsigned NewMetric = MAfter.getMetric(); - unsigned WavesBefore = std::min( - S.getTargetOccupancy(), - PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize())); + unsigned WavesBefore = + std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(DAG.MF)); unsigned Profit = ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore * ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) / @@ -1579,8 +1586,7 @@ bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) { void GCNSchedStage::revertScheduling() { DAG.RegionsWithMinOcc[RegionIdx] = - PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) == - DAG.MinOccupancy; + PressureBefore.getOccupancy(DAG.MF) == DAG.MinOccupancy; LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n"); DAG.RegionEnd = DAG.RegionBegin; int SkippedDebugInstr = 0; @@ -2017,9 +2023,7 @@ void PreRARematStage::rematerialize() { } } DAG.Pressure[I] = RP; - AchievedOcc = std::min( - AchievedOcc, RP.getOccupancy(ST, MF.getInfo() - ->getDynamicVGPRBlockSize())); + AchievedOcc = std::min(AchievedOcc, RP.getOccupancy(DAG.MF)); } REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n"); } diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp index 6b13b06590102..e29ac72c7ba31 100644 --- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp +++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp @@ -197,9 +197,7 @@ bool SIFormMemoryClausesImpl::checkPressure(const 
MachineInstr &MI, // pointer becomes dead and could otherwise be reused for destination. RPT.advanceToNext(); GCNRegPressure MaxPressure = RPT.moveMaxPressure(); - unsigned Occupancy = MaxPressure.getOccupancy( - *ST, - MI.getMF()->getInfo()->getDynamicVGPRBlockSize()); + unsigned Occupancy = MaxPressure.getOccupancy(*MI.getMF()); // Don't push over half the register budget. We don't want to introduce // spilling just to form a soft clause. @@ -211,7 +209,10 @@ bool SIFormMemoryClausesImpl::checkPressure(const MachineInstr &MI, // tracking does not account for the alignment requirements for SGPRs, or the // fragmentation of registers the allocator will need to satisfy. if (Occupancy >= MFI->getMinAllowedOccupancy() && - MaxPressure.getVGPRNum(ST->hasGFX90AInsts()) <= MaxVGPRs / 2 && + MaxPressure.getVGPRNum( + ST->hasGFX90AInsts(), + ST->getMaxNumVectorRegs(MI.getMF()->getFunction()).first) <= + MaxVGPRs / 2 && MaxPressure.getSGPRNum() <= MaxSGPRs / 2) { LastRecordedOccupancy = Occupancy; return true; diff --git a/llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir b/llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir new file mode 100644 index 0000000000000..a5183ce0d2661 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir @@ -0,0 +1,481 @@ +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=machine-scheduler --debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s +# REQUIRES: asserts + +--- | + define void @avgpr_rp_occ1() #0 { + entry: + unreachable + } + + define void @avgpr_rp_occ2() #1 { + entry: + unreachable + } + + define void @avgpr_rp_occ3() #2 { + entry: + unreachable + } + + define void @avgpr_rp_occ4() #3 { + entry: + unreachable + } + + define void @avgpr_rp_occ5() #4 { + entry: + unreachable + } + + define void @avgpr_rp_occ6() #5 { + entry: + unreachable + } + + define void @avgpr_rp_occ7() #6 { + entry: + unreachable + } + + define void @avgpr_rp_occ8() #7 { + entry: + unreachable + } + + + define void @vgpr_rp_occ1() #0 { + entry: + 
unreachable + } + + define void @vgpr_rp_occ2() #1 { + entry: + unreachable + } + + define void @vgpr_rp_occ3() #2 { + entry: + unreachable + } + + attributes #0 = {"amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="64,64"} + attributes #1 = {"amdgpu-waves-per-eu"="2,2" "amdgpu-flat-work-group-size"="64,64"} + attributes #2 = {"amdgpu-waves-per-eu"="3,3" "amdgpu-flat-work-group-size"="64,64"} + attributes #3 = {"amdgpu-waves-per-eu"="4,4" "amdgpu-flat-work-group-size"="64,64"} + attributes #4 = {"amdgpu-waves-per-eu"="5,5" "amdgpu-flat-work-group-size"="64,64"} + attributes #5 = {"amdgpu-waves-per-eu"="6,6" "amdgpu-flat-work-group-size"="64,64"} + attributes #6 = {"amdgpu-waves-per-eu"="7,7" "amdgpu-flat-work-group-size"="64,64"} + attributes #7 = {"amdgpu-waves-per-eu"="8,8" "amdgpu-flat-work-group-size"="64,64"} + + +... + +# CHECK: avgpr_rp_occ1:%bb.0 +# CHECK: Pressure before scheduling: +# CHECK-NEXT: Region live-ins: +# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8 +# CHECK-NEXT: Region register pressure: VGPRs: 256 AGPRs: 192(O1), SGPRs: 0(O10), LVGPR WT: 256, LSGPR WT: 0 -> Occ: 1 + +--- +name: avgpr_rp_occ1 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } + workGroupIDX: { reg: '$sgpr6' } + privateSegmentWaveByteOffset: { reg: '$sgpr7' } + workItemIDX: { reg: '$vgpr0' } + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + bb.0: + liveins: $vgpr0, $sgpr4_sgpr5 + %1:vreg_1024 = IMPLICIT_DEF + %2:vreg_1024 = IMPLICIT_DEF + %3:vreg_1024 = IMPLICIT_DEF + %4:vreg_1024 = IMPLICIT_DEF + %5:vreg_1024 = IMPLICIT_DEF + %6:vreg_1024 = IMPLICIT_DEF + %7:vreg_1024 = IMPLICIT_DEF + %8:av_1024 = IMPLICIT_DEF + %9:av_1024 = IMPLICIT_DEF + %10:av_1024 = IMPLICIT_DEF + %11:av_1024 = 
IMPLICIT_DEF + %12:av_1024 = IMPLICIT_DEF + %13:av_1024 = IMPLICIT_DEF + %14:av_1024 = IMPLICIT_DEF + SCHED_BARRIER 0 + KILL %1, %2, %3, %4, %5, %6, %7 + + bb.1: + KILL %8, %9, %10, %11, %12, %13, %14 + S_ENDPGM 0 +... + +# CHECK: avgpr_rp_occ2:%bb.0 +# CHECK: Pressure before scheduling: +# CHECK-NEXT: Region live-ins: +# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8 +# CHECK-NEXT: Region register pressure: VGPRs: 128 AGPRs: 64(O2), SGPRs: 0(O10), LVGPR WT: 128, LSGPR WT: 0 -> Occ: 2 + +--- +name: avgpr_rp_occ2 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } + workGroupIDX: { reg: '$sgpr6' } + privateSegmentWaveByteOffset: { reg: '$sgpr7' } + workItemIDX: { reg: '$vgpr0' } + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + bb.0: + liveins: $vgpr0, $sgpr4_sgpr5 + %1:vreg_1024 = IMPLICIT_DEF + %2:vreg_1024 = IMPLICIT_DEF + %3:vreg_1024 = IMPLICIT_DEF + %4:av_1024 = IMPLICIT_DEF + %5:av_1024 = IMPLICIT_DEF + %6:av_1024 = IMPLICIT_DEF + SCHED_BARRIER 0 + KILL %1, %2, %3 + + bb.1: + KILL %4, %5, %6 + S_ENDPGM 0 +... 
+ +# CHECK: avgpr_rp_occ3:%bb.0 +# CHECK: Pressure before scheduling: +# CHECK-NEXT: Region live-ins: +# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8 +# CHECK-NEXT: Region register pressure: VGPRs: 84 AGPRs: 44(O4), SGPRs: 0(O10), LVGPR WT: 84, LSGPR WT: 0 -> Occ: 4 + +--- +name: avgpr_rp_occ3 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } + workGroupIDX: { reg: '$sgpr6' } + privateSegmentWaveByteOffset: { reg: '$sgpr7' } + workItemIDX: { reg: '$vgpr0' } + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + bb.0: + liveins: $vgpr0, $sgpr4_sgpr5 + %1:vreg_1024 = IMPLICIT_DEF + %2:vreg_1024 = IMPLICIT_DEF + %3:av_1024 = IMPLICIT_DEF + %4:av_1024 = IMPLICIT_DEF + SCHED_BARRIER 0 + KILL %1, %2 + + bb.1: + KILL %3, %4 + S_ENDPGM 0 +... 
+ +# CHECK: avgpr_rp_occ4:%bb.0 +# CHECK: Pressure before scheduling: +# CHECK-NEXT: Region live-ins: +# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8 +# CHECK-NEXT: Region register pressure: VGPRs: 64 AGPRs: 64(O4), SGPRs: 0(O10), LVGPR WT: 64, LSGPR WT: 0 -> Occ: 4 + +--- +name: avgpr_rp_occ4 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } + workGroupIDX: { reg: '$sgpr6' } + privateSegmentWaveByteOffset: { reg: '$sgpr7' } + workItemIDX: { reg: '$vgpr0' } + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + bb.0: + liveins: $vgpr0, $sgpr4_sgpr5 + %1:av_1024 = IMPLICIT_DEF + %2:av_1024 = IMPLICIT_DEF + %3:av_1024 = IMPLICIT_DEF + %4:av_1024 = IMPLICIT_DEF + SCHED_BARRIER 0 + KILL %1, %2 + + bb.1: + KILL %3, %4 + S_ENDPGM 0 +... 
+ +# CHECK: avgpr_rp_occ5:%bb.0 +# CHECK: Pressure before scheduling: +# CHECK-NEXT: Region live-ins: +# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8 +# CHECK-NEXT: Region register pressure: VGPRs: 48 AGPRs: 80(O4), SGPRs: 0(O10), LVGPR WT: 80, LSGPR WT: 0 -> Occ: 4 + +--- +name: avgpr_rp_occ5 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } + workGroupIDX: { reg: '$sgpr6' } + privateSegmentWaveByteOffset: { reg: '$sgpr7' } + workItemIDX: { reg: '$vgpr0' } + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + bb.0: + liveins: $vgpr0, $sgpr4_sgpr5 + %1:av_1024 = IMPLICIT_DEF + %2:av_1024 = IMPLICIT_DEF + %3:av_1024 = IMPLICIT_DEF + %4:av_1024 = IMPLICIT_DEF + SCHED_BARRIER 0 + KILL %1, %2 + + bb.1: + KILL %3, %4 + S_ENDPGM 0 +... 
+ +# CHECK: avgpr_rp_occ6:%bb.0 +# CHECK: Pressure before scheduling: +# CHECK-NEXT: Region live-ins: +# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8 +# CHECK-NEXT: Region register pressure: VGPRs: 40 AGPRs: 88(O4), SGPRs: 0(O10), LVGPR WT: 88, LSGPR WT: 0 -> Occ: 4 + +--- +name: avgpr_rp_occ6 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } + workGroupIDX: { reg: '$sgpr6' } + privateSegmentWaveByteOffset: { reg: '$sgpr7' } + workItemIDX: { reg: '$vgpr0' } + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + bb.0: + liveins: $vgpr0, $sgpr4_sgpr5 + %1:av_1024 = IMPLICIT_DEF + %2:av_1024 = IMPLICIT_DEF + %3:av_1024 = IMPLICIT_DEF + %4:av_1024 = IMPLICIT_DEF + SCHED_BARRIER 0 + KILL %1, %2 + + bb.1: + KILL %3, %4 + S_ENDPGM 0 +... 
+ +# CHECK: avgpr_rp_occ7:%bb.0 +# CHECK: Pressure before scheduling: +# CHECK-NEXT: Region live-ins: +# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8 +# CHECK-NEXT: Region register pressure: VGPRs: 36 AGPRs: 92(O4), SGPRs: 0(O10), LVGPR WT: 92, LSGPR WT: 0 -> Occ: 4 + +--- +name: avgpr_rp_occ7 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } + workGroupIDX: { reg: '$sgpr6' } + privateSegmentWaveByteOffset: { reg: '$sgpr7' } + workItemIDX: { reg: '$vgpr0' } + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + bb.0: + liveins: $vgpr0, $sgpr4_sgpr5 + %1:av_1024 = IMPLICIT_DEF + %2:av_1024 = IMPLICIT_DEF + %3:av_1024 = IMPLICIT_DEF + %4:av_1024 = IMPLICIT_DEF + SCHED_BARRIER 0 + KILL %1, %2 + + bb.1: + KILL %3, %4 + S_ENDPGM 0 +... 
+ +# CHECK: avgpr_rp_occ8:%bb.0 +# CHECK: Pressure before scheduling: +# CHECK-NEXT: Region live-ins: +# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8 +# CHECK-NEXT: Region register pressure: VGPRs: 32 AGPRs: 96(O4), SGPRs: 0(O10), LVGPR WT: 96, LSGPR WT: 0 -> Occ: 4 + +--- +name: avgpr_rp_occ8 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } + workGroupIDX: { reg: '$sgpr6' } + privateSegmentWaveByteOffset: { reg: '$sgpr7' } + workItemIDX: { reg: '$vgpr0' } + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + bb.0: + liveins: $vgpr0, $sgpr4_sgpr5 + %1:av_1024 = IMPLICIT_DEF + %2:av_1024 = IMPLICIT_DEF + %3:av_1024 = IMPLICIT_DEF + %4:av_1024 = IMPLICIT_DEF + SCHED_BARRIER 0 + KILL %1, %2 + + bb.1: + KILL %3, %4 + S_ENDPGM 0 +... 
+ +# CHECK: vgpr_rp_occ1:%bb.0 +# CHECK: Pressure before scheduling: +# CHECK-NEXT: Region live-ins: +# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8 +# CHECK-NEXT: Region register pressure: VGPRs: 448 AGPRs: 0(O1), SGPRs: 0(O10), LVGPR WT: 448, LSGPR WT: 0 -> Occ: 1 + +--- +name: vgpr_rp_occ1 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } + workGroupIDX: { reg: '$sgpr6' } + privateSegmentWaveByteOffset: { reg: '$sgpr7' } + workItemIDX: { reg: '$vgpr0' } + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + bb.0: + liveins: $vgpr0, $sgpr4_sgpr5 + %1:vreg_1024 = IMPLICIT_DEF + %2:vreg_1024 = IMPLICIT_DEF + %3:vreg_1024 = IMPLICIT_DEF + %4:vreg_1024 = IMPLICIT_DEF + %5:vreg_1024 = IMPLICIT_DEF + %6:vreg_1024 = IMPLICIT_DEF + %7:vreg_1024 = IMPLICIT_DEF + %8:vreg_1024 = IMPLICIT_DEF + %9:vreg_1024 = IMPLICIT_DEF + %10:vreg_1024 = IMPLICIT_DEF + %11:vreg_1024 = IMPLICIT_DEF + %12:vreg_1024 = IMPLICIT_DEF + %13:vreg_1024 = IMPLICIT_DEF + %14:vreg_1024 = IMPLICIT_DEF + SCHED_BARRIER 0 + KILL %1, %2, %3, %4, %5, %6, %7 + + bb.1: + KILL %8, %9, %10, %11, %12, %13, %14 + S_ENDPGM 0 +... 
+ +# CHECK: vgpr_rp_occ2:%bb.0 +# CHECK: Pressure before scheduling: +# CHECK-NEXT: Region live-ins: +# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8 +# CHECK-NEXT: Region register pressure: VGPRs: 192 AGPRs: 0(O2), SGPRs: 0(O10), LVGPR WT: 192, LSGPR WT: 0 -> Occ: 2 + +--- +name: vgpr_rp_occ2 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } + workGroupIDX: { reg: '$sgpr6' } + privateSegmentWaveByteOffset: { reg: '$sgpr7' } + workItemIDX: { reg: '$vgpr0' } + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + bb.0: + liveins: $vgpr0, $sgpr4_sgpr5 + %1:vreg_1024 = IMPLICIT_DEF + %2:vreg_1024 = IMPLICIT_DEF + %3:vreg_1024 = IMPLICIT_DEF + %4:vreg_1024 = IMPLICIT_DEF + %5:vreg_1024 = IMPLICIT_DEF + %6:vreg_1024 = IMPLICIT_DEF + SCHED_BARRIER 0 + KILL %1, %2, %3 + + bb.1: + KILL %4, %5, %6 + S_ENDPGM 0 +... 
+ +# CHECK: vgpr_rp_occ3:%bb.0 +# CHECK: Pressure before scheduling: +# CHECK-NEXT: Region live-ins: +# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8 +# CHECK-NEXT: Region register pressure: VGPRs: 128 AGPRs: 0(O4), SGPRs: 0(O10), LVGPR WT: 128, LSGPR WT: 0 -> Occ: 4 + + +--- +name: vgpr_rp_occ3 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } + workGroupIDX: { reg: '$sgpr6' } + privateSegmentWaveByteOffset: { reg: '$sgpr7' } + workItemIDX: { reg: '$vgpr0' } + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + bb.0: + liveins: $vgpr0, $sgpr4_sgpr5 + %1:vreg_1024 = IMPLICIT_DEF + %2:vreg_1024 = IMPLICIT_DEF + %3:vreg_1024 = IMPLICIT_DEF + %4:vreg_1024 = IMPLICIT_DEF + SCHED_BARRIER 0 + KILL %1, %2 + + bb.1: + KILL %3, %4 + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-liveins.mir b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-liveins.mir index 2a08c52e447ba..72181346764fb 100644 --- a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-liveins.mir +++ b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-liveins.mir @@ -6,7 +6,7 @@ # CHECK-NEXT: test_get_liveins:%bb.0 # CHECK: ********** MI Scheduling ********** # CHECK-NEXT: test_get_liveins:%bb.1 -# CHECK: Region live-in pressure: VGPRs: 1 AGPRs: 0, SGPRs: 0, LVGPR WT: 0, LSGPR WT: 0 +# CHECK: Region live-in pressure: VGPRs: 1 AGPRs: 0(O10), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 10 # CHECK: ScheduleDAGMILive::schedule starting ---