From 52cbccd2cd6185be0499998ec969952865218169 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Wed, 23 Jul 2025 15:41:11 -0700 Subject: [PATCH 01/11] [AMDGPU] More accurately account for AVGPR pressure Change-Id: I6f129c2723b52a391a96178e390f60535164ac9b --- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 114 +++++++++++------- llvm/lib/Target/AMDGPU/GCNRegPressure.h | 94 ++++++++++----- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 27 +++-- .../lib/Target/AMDGPU/SIFormMemoryClauses.cpp | 4 +- 4 files changed, 152 insertions(+), 87 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 334afd3a2a5b4..286c8d9529731 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -99,20 +99,22 @@ void GCNRegPressure::inc(unsigned Reg, bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O, unsigned MaxOccupancy) const { const GCNSubtarget &ST = MF.getSubtarget(); + unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs(); unsigned DynamicVGPRBlockSize = MF.getInfo()->getDynamicVGPRBlockSize(); const auto SGPROcc = std::min(MaxOccupancy, ST.getOccupancyWithNumSGPRs(getSGPRNum())); const auto VGPROcc = std::min( - MaxOccupancy, ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()), - DynamicVGPRBlockSize)); + MaxOccupancy, + ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs), + DynamicVGPRBlockSize)); const auto OtherSGPROcc = std::min(MaxOccupancy, ST.getOccupancyWithNumSGPRs(O.getSGPRNum())); - const auto OtherVGPROcc = - std::min(MaxOccupancy, - ST.getOccupancyWithNumVGPRs(O.getVGPRNum(ST.hasGFX90AInsts()), - DynamicVGPRBlockSize)); + const auto OtherVGPROcc = std::min( + MaxOccupancy, ST.getOccupancyWithNumVGPRs( + O.getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs), + DynamicVGPRBlockSize)); const auto Occ = std::min(SGPROcc, VGPROcc); const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc); @@ -135,35 +137,36 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O, unsigned OtherVGPRForSGPRSpills = (OtherExcessSGPR + (WaveSize - 1)) / WaveSize; - unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs(); - // Unified excess pressure conditions, accounting for VGPRs used for SGPR // spills unsigned ExcessVGPR = - std::max(static_cast(getVGPRNum(ST.hasGFX90AInsts()) + + std::max(static_cast(getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs) + VGPRForSGPRSpills - MaxVGPRs), 0); - unsigned OtherExcessVGPR = - std::max(static_cast(O.getVGPRNum(ST.hasGFX90AInsts()) + - OtherVGPRForSGPRSpills - MaxVGPRs), - 0); + unsigned OtherExcessVGPR = std::max( + static_cast(O.getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs) + + OtherVGPRForSGPRSpills - MaxVGPRs), + 0); // Arch VGPR excess pressure conditions, accounting for VGPRs used for SGPR // spills - unsigned ExcessArchVGPR = std::max( - static_cast(getVGPRNum(false) + VGPRForSGPRSpills - MaxArchVGPRs), - 0); + unsigned ExcessArchVGPR = + std::max(static_cast(getVGPRNum(false, MaxArchVGPRs) + + VGPRForSGPRSpills - MaxArchVGPRs), + 0); unsigned OtherExcessArchVGPR = - std::max(static_cast(O.getVGPRNum(false) + OtherVGPRForSGPRSpills - - MaxArchVGPRs), + std::max(static_cast(O.getVGPRNum(false, MaxArchVGPRs) + + OtherVGPRForSGPRSpills - MaxArchVGPRs), 0); // AGPR excess pressure conditions - unsigned ExcessAGPR = std::max( - static_cast(ST.hasGFX90AInsts() ? (getAGPRNum() - MaxArchVGPRs) - : (getAGPRNum() - MaxVGPRs)), - 0); + unsigned ExcessAGPR = + std::max(static_cast(ST.hasGFX90AInsts() + ? (getAGPRNum(MaxArchVGPRs) - MaxArchVGPRs) + : (getAGPRNum(MaxArchVGPRs) - MaxVGPRs)), + 0); unsigned OtherExcessAGPR = std::max( - static_cast(ST.hasGFX90AInsts() ? (O.getAGPRNum() - MaxArchVGPRs) - : (O.getAGPRNum() - MaxVGPRs)), + static_cast(ST.hasGFX90AInsts() + ? (O.getAGPRNum(MaxArchVGPRs) - MaxArchVGPRs) + : (O.getAGPRNum(MaxArchVGPRs) - MaxVGPRs)), 0); bool ExcessRP = ExcessSGPR || ExcessVGPR || ExcessArchVGPR || ExcessAGPR; @@ -184,14 +187,21 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O, return VGPRDiff > 0; if (SGPRDiff != 0) { unsigned PureExcessVGPR = - std::max(static_cast(getVGPRNum(ST.hasGFX90AInsts()) - MaxVGPRs), - 0) + - std::max(static_cast(getVGPRNum(false) - MaxArchVGPRs), 0); + std::max( + static_cast(getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs) - + MaxVGPRs), + 0) + + std::max( + static_cast(getVGPRNum(false, MaxArchVGPRs) - MaxArchVGPRs), + 0); unsigned OtherPureExcessVGPR = std::max( - static_cast(O.getVGPRNum(ST.hasGFX90AInsts()) - MaxVGPRs), + static_cast(O.getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs) - + MaxVGPRs), 0) + - std::max(static_cast(O.getVGPRNum(false) - MaxArchVGPRs), 0); + std::max(static_cast(O.getVGPRNum(false, MaxArchVGPRs) - + MaxArchVGPRs), + 0); // If we have a special case where there is a tie in excess VGPR, but one // of the pressures has VGPR usage from SGPR spills, prefer the pressure @@ -221,33 +231,36 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O, if (SW != OtherSW) return SW < OtherSW; } else { - auto VW = getVGPRTuplesWeight(); - auto OtherVW = O.getVGPRTuplesWeight(); + auto VW = getVGPRTuplesWeight(MaxArchVGPRs); + auto OtherVW = O.getVGPRTuplesWeight(MaxArchVGPRs); if (VW != OtherVW) return VW < OtherVW; } } // Give final precedence to lower general RP. - return SGPRImportant ? (getSGPRNum() < O.getSGPRNum()): - (getVGPRNum(ST.hasGFX90AInsts()) < - O.getVGPRNum(ST.hasGFX90AInsts())); + return SGPRImportant ? (getSGPRNum() < O.getSGPRNum()) + : (getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs) < + O.getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs)); } Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST, unsigned DynamicVGPRBlockSize) { return Printable([&RP, ST, DynamicVGPRBlockSize](raw_ostream &OS) { - OS << "VGPRs: " << RP.getArchVGPRNum() << ' ' - << "AGPRs: " << RP.getAGPRNum(); + OS << "VGPRs: " << RP.getArchVGPRNum(ST->getAddressableNumArchVGPRs()) + << ' ' << "AGPRs: " << RP.getAGPRNum(ST->getAddressableNumArchVGPRs()); if (ST) OS << "(O" - << ST->getOccupancyWithNumVGPRs(RP.getVGPRNum(ST->hasGFX90AInsts()), - DynamicVGPRBlockSize) + << ST->getOccupancyWithNumVGPRs( + RP.getVGPRNum(ST->hasGFX90AInsts(), + ST->getAddressableNumArchVGPRs()), + DynamicVGPRBlockSize) << ')'; OS << ", SGPRs: " << RP.getSGPRNum(); if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(RP.getSGPRNum()) << ')'; - OS << ", LVGPR WT: " << RP.getVGPRTuplesWeight() + OS << ", LVGPR WT: " + << RP.getVGPRTuplesWeight(ST->getAddressableNumArchVGPRs()) << ", LSGPR WT: " << RP.getSGPRTuplesWeight(); if (ST) OS << " -> Occ: " << RP.getOccupancy(*ST, DynamicVGPRBlockSize); @@ -398,8 +411,9 @@ void GCNRPTarget::setRegLimits(unsigned NumSGPRs, unsigned NumVGPRs, const GCNSubtarget &ST = MF.getSubtarget(); unsigned DynamicVGPRBlockSize = MF.getInfo()->getDynamicVGPRBlockSize(); + AddressableNumArchVGPRs = ST.getAddressableNumArchVGPRs(); MaxSGPRs = std::min(ST.getAddressableNumSGPRs(), NumSGPRs); - MaxVGPRs = std::min(ST.getAddressableNumArchVGPRs(), NumVGPRs); + MaxVGPRs = std::min(AddressableNumArchVGPRs, NumVGPRs); MaxUnifiedVGPRs = ST.hasGFX90AInsts() ? std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs) @@ -414,15 +428,21 @@ bool GCNRPTarget::isSaveBeneficial(Register Reg, if (SRI->isSGPRClass(RC)) return RP.getSGPRNum() > MaxSGPRs; - unsigned NumVGPRs = - SRI->isAGPRClass(RC) ? RP.getAGPRNum() : RP.getArchVGPRNum(); + + bool ShouldUseAGPR = + SRI->isAGPRClass(RC) || + (SRI->isVectorSuperClass(RC) && + RP.getArchVGPRNum(AddressableNumArchVGPRs) >= AddressableNumArchVGPRs); + unsigned NumVGPRs = ShouldUseAGPR + ? RP.getAGPRNum(AddressableNumArchVGPRs) + : RP.getArchVGPRNum(AddressableNumArchVGPRs); return isVGPRBankSaveBeneficial(NumVGPRs); } bool GCNRPTarget::satisfied() const { if (RP.getSGPRNum() > MaxSGPRs) return false; - if (RP.getVGPRNum(false) > MaxVGPRs && + if (RP.getVGPRNum(false, AddressableNumArchVGPRs) > MaxVGPRs && (!CombineVGPRSavings || !satisifiesVGPRBanksTarget())) return false; return satisfiesUnifiedTarget(); @@ -876,10 +896,12 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) { OS << "---\nname: " << MF.getName() << "\nbody: |\n"; - auto printRP = [](const GCNRegPressure &RP) { - return Printable([&RP](raw_ostream &OS) { + auto printRP = [&MF](const GCNRegPressure &RP) { + return Printable([&RP, &MF](raw_ostream &OS) { OS << format(PFX " %-5d", RP.getSGPRNum()) - << format(" %-5d", RP.getVGPRNum(false)); + << format(" %-5d", + RP.getVGPRNum(false, MF.getSubtarget() + .getAddressableNumArchVGPRs())); }); }; diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index ea33a229110c1..a8c1c3bfd8703 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -43,13 +43,13 @@ struct GCNRegPressure { /// \returns the SGPR32 pressure unsigned getSGPRNum() const { return Value[SGPR]; } - /// \returns the aggregated ArchVGPR32, AccVGPR32, and Pseudo AVGPR pressure - /// dependent upon \p UnifiedVGPRFile - unsigned getVGPRNum(bool UnifiedVGPRFile) const { + unsigned getVGPRNum(bool UnifiedVGPRFile, + unsigned AddressableArchVGPR) const { if (UnifiedVGPRFile) { - return Value[AGPR] - ? getUnifiedVGPRNum(Value[VGPR], Value[AGPR], Value[AVGPR]) - : Value[VGPR] + Value[AVGPR]; + return Value[AGPR] || Value[AVGPR] + ? getUnifiedVGPRNum(Value[VGPR], Value[AGPR], Value[AVGPR], + AddressableArchVGPR) + : Value[VGPR]; } // AVGPR assignment priority is based on the width of the register. Account // AVGPR pressure as VGPR. @@ -61,33 +61,60 @@ struct GCNRegPressure { /// VGPR file. inline static unsigned getUnifiedVGPRNum(unsigned NumArchVGPRs, unsigned NumAGPRs, - unsigned NumAVGPRs) { - - // Assume AVGPRs will be assigned as VGPRs. - return alignTo(NumArchVGPRs + NumAVGPRs, + unsigned NumAVGPRs, + unsigned AddressableArchVGPR) { + + // Until we hit the VGPRThreshold, we will assign AV as VGPR. After that + // point, we will assign as AGPR. + unsigned AVGPRsAsVGPRs = + NumArchVGPRs < AddressableArchVGPR + ? std::min((AddressableArchVGPR - NumArchVGPRs), NumAVGPRs) + : 0; + unsigned AVGPRsAsAGPRs = + NumAVGPRs > AVGPRsAsVGPRs ? NumAVGPRs - AVGPRsAsVGPRs : 0; + return alignTo(NumArchVGPRs + AVGPRsAsVGPRs, AMDGPU::IsaInfo::getArchVGPRAllocGranule()) + - NumAGPRs; + NumAGPRs + AVGPRsAsAGPRs; } /// \returns the ArchVGPR32 pressure, plus the AVGPRS which we assume will be /// allocated as VGPR - unsigned getArchVGPRNum() const { return Value[VGPR] + Value[AVGPR]; } + unsigned getArchVGPRNum(unsigned AddressableArchVGPR) const { + return std::min(Value[VGPR] + Value[AVGPR], AddressableArchVGPR); + } /// \returns the AccVGPR32 pressure - unsigned getAGPRNum() const { return Value[AGPR]; } + unsigned getAGPRNum(unsigned AddressableArchVGPR) const { + unsigned VGPRsForAGPRs = + Value[VGPR] + Value[AVGPR] > AddressableArchVGPR + ? (Value[VGPR] + Value[AVGPR] - AddressableArchVGPR) + : 0; + return Value[AGPR] + VGPRsForAGPRs; + } /// \returns the AVGPR32 pressure unsigned getAVGPRNum() const { return Value[AVGPR]; } - unsigned getVGPRTuplesWeight() const { - return std::max(Value[TOTAL_KINDS + VGPR] + Value[TOTAL_KINDS + AVGPR], - Value[TOTAL_KINDS + AGPR]); + unsigned getVGPRTuplesWeight(unsigned AddressableArchVGPR) const { + unsigned AVGPRsAsVGPRs = + Value[TOTAL_KINDS + VGPR] < AddressableArchVGPR + ? std::min(AddressableArchVGPR - Value[TOTAL_KINDS + VGPR], + Value[TOTAL_KINDS + AVGPR]) + : 0; + unsigned AVGPRsAsAGPRs = Value[TOTAL_KINDS + AVGPR] > AVGPRsAsVGPRs + ? Value[TOTAL_KINDS + AVGPR] - AVGPRsAsVGPRs + : 0; + + return std::max(Value[TOTAL_KINDS + VGPR] + AVGPRsAsVGPRs, + Value[TOTAL_KINDS + AGPR] + AVGPRsAsAGPRs); } unsigned getSGPRTuplesWeight() const { return Value[TOTAL_KINDS + SGPR]; } unsigned getOccupancy(const GCNSubtarget &ST, unsigned DynamicVGPRBlockSize) const { - return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()), - ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()), - DynamicVGPRBlockSize)); + return std::min( + ST.getOccupancyWithNumSGPRs(getSGPRNum()), + ST.getOccupancyWithNumVGPRs( + getVGPRNum(ST.hasGFX90AInsts(), ST.getAddressableNumArchVGPRs()), + DynamicVGPRBlockSize)); } void inc(unsigned Reg, @@ -151,7 +178,7 @@ struct GCNRegPressure { friend GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2); - friend Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST, + friend Printable print(const GCNRegPressure &RP, unsigned DynamicVGPRBlockSize); }; @@ -220,16 +247,19 @@ class GCNRPTarget { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) friend raw_ostream &operator<<(raw_ostream &OS, const GCNRPTarget &Target) { OS << "Actual/Target: " << Target.RP.getSGPRNum() << '/' << Target.MaxSGPRs - << " SGPRs, " << Target.RP.getArchVGPRNum() << '/' << Target.MaxVGPRs - << " ArchVGPRs, " << Target.RP.getAGPRNum() << '/' << Target.MaxVGPRs - << " AGPRs"; + << " SGPRs, " << Target.RP.getArchVGPRNum(Target.AddressableNumArchVGPRs) + << '/' << Target.MaxVGPRs << " ArchVGPRs, " + << Target.RP.getAGPRNum(Target.AddressableNumArchVGPRs) << '/' + << Target.MaxVGPRs << " AGPRs"; if (Target.MaxUnifiedVGPRs) { - OS << ", " << Target.RP.getVGPRNum(true) << '/' << Target.MaxUnifiedVGPRs - << " VGPRs (unified)"; + OS << ", " << Target.RP.getVGPRNum(true, Target.AddressableNumArchVGPRs) + << '/' << Target.MaxUnifiedVGPRs << " VGPRs (unified)"; } else if (Target.CombineVGPRSavings) { - OS << ", " << Target.RP.getArchVGPRNum() + Target.RP.getAGPRNum() << '/' - << 2 * Target.MaxVGPRs << " VGPRs (combined target)"; + OS << ", " + << Target.RP.getArchVGPRNum(Target.AddressableNumArchVGPRs) + + Target.RP.getAGPRNum(Target.AddressableNumArchVGPRs) + << '/' << 2 * Target.MaxVGPRs << " VGPRs (combined target)"; } return OS; } @@ -238,7 +268,6 @@ class GCNRPTarget { private: /// Current register pressure. GCNRegPressure RP; - /// Target number of SGPRs. unsigned MaxSGPRs; /// Target number of ArchVGPRs and AGPRs. @@ -246,6 +275,8 @@ class GCNRPTarget { /// Target number of overall VGPRs for subtargets with unified RFs. Always 0 /// for subtargets with non-unified RFs. unsigned MaxUnifiedVGPRs; + /// The maximum number of arch vgprs allowed by the subtarget. + unsigned AddressableNumArchVGPRs; /// Whether we consider that the register allocator will be able to swap /// between ArchVGPRs and AGPRs by copying them to a super register class. /// Concretely, this allows savings in one of the VGPR banks to help toward @@ -254,12 +285,15 @@ class GCNRPTarget { inline bool satisifiesVGPRBanksTarget() const { assert(CombineVGPRSavings && "only makes sense with combined savings"); - return RP.getArchVGPRNum() + RP.getAGPRNum() <= 2 * MaxVGPRs; + return RP.getArchVGPRNum(AddressableNumArchVGPRs) + + RP.getAGPRNum(AddressableNumArchVGPRs) <= + 2 * MaxVGPRs; } /// Always satisified when the subtarget doesn't have a unified RF. inline bool satisfiesUnifiedTarget() const { - return !MaxUnifiedVGPRs || RP.getVGPRNum(true) <= MaxUnifiedVGPRs; + return !MaxUnifiedVGPRs || + RP.getVGPRNum(true, AddressableNumArchVGPRs) <= MaxUnifiedVGPRs; } inline bool isVGPRBankSaveBeneficial(unsigned NumVGPRs) const { diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index ce1ce687d0038..772c979809b75 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -190,10 +190,13 @@ static void getRegisterPressures( TempUpwardTracker.recede(*MI); NewPressure = TempUpwardTracker.getPressure(); } + unsigned AddressableArchVGPR = + DAG->MF.getSubtarget().getAddressableNumArchVGPRs(); Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum(); Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = - NewPressure.getArchVGPRNum(); - Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum(); + NewPressure.getArchVGPRNum(AddressableArchVGPR); + Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = + NewPressure.getAGPRNum(AddressableArchVGPR); } void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, @@ -339,7 +342,8 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, ? static_cast(&UpwardTracker) : static_cast(&DownwardTracker); SGPRPressure = T->getPressure().getSGPRNum(); - VGPRPressure = T->getPressure().getArchVGPRNum(); + VGPRPressure = T->getPressure().getArchVGPRNum( + DAG->MF.getSubtarget().getAddressableNumArchVGPRs()); } } ReadyQueue &Q = Zone.Available; @@ -1279,9 +1283,10 @@ void GCNSchedStage::checkScheduling() { LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n"); unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize(); - + unsigned AddressableArchVGPR = ST.getAddressableNumArchVGPRs(); if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && - PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) { + PressureAfter.getVGPRNum(ST.hasGFX90AInsts(), AddressableArchVGPR) <= + S.VGPRCriticalLimit) { DAG.Pressure[RegionIdx] = PressureAfter; DAG.RegionsWithMinOcc[RegionIdx] = PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) == @@ -1331,9 +1336,10 @@ void GCNSchedStage::checkScheduling() { unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs()); unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF); - if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs || - PressureAfter.getArchVGPRNum() > MaxArchVGPRs || - PressureAfter.getAGPRNum() > MaxArchVGPRs || + if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts(), AddressableArchVGPR) > + MaxVGPRs || + PressureAfter.getArchVGPRNum(AddressableArchVGPR) > MaxArchVGPRs || + PressureAfter.getAGPRNum(AddressableArchVGPR) > MaxArchVGPRs || PressureAfter.getSGPRNum() > MaxSGPRs) { DAG.RegionsWithHighRP[RegionIdx] = true; DAG.RegionsWithExcessRP[RegionIdx] = true; @@ -1471,12 +1477,13 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) { // For dynamic VGPR mode, we don't want to waste any VGPR blocks. if (DAG.MFI.isDynamicVGPREnabled()) { + unsigned AddressableArchVGPR = ST.getAddressableNumArchVGPRs(); unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( &ST, DAG.MFI.getDynamicVGPRBlockSize(), - PressureBefore.getVGPRNum(false)); + PressureBefore.getVGPRNum(false, AddressableArchVGPR)); unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( &ST, DAG.MFI.getDynamicVGPRBlockSize(), - PressureAfter.getVGPRNum(false)); + PressureAfter.getVGPRNum(false, AddressableArchVGPR)); if (BlocksAfter > BlocksBefore) return true; } diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp index 6b13b06590102..844908a5ce8d9 100644 --- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp +++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp @@ -211,7 +211,9 @@ bool SIFormMemoryClausesImpl::checkPressure(const MachineInstr &MI, // tracking does not account for the alignment requirements for SGPRs, or the // fragmentation of registers the allocator will need to satisfy. if (Occupancy >= MFI->getMinAllowedOccupancy() && - MaxPressure.getVGPRNum(ST->hasGFX90AInsts()) <= MaxVGPRs / 2 && + MaxPressure.getVGPRNum(ST->hasGFX90AInsts(), + ST->getAddressableNumArchVGPRs()) <= + MaxVGPRs / 2 && MaxPressure.getSGPRNum() <= MaxSGPRs / 2) { LastRecordedOccupancy = Occupancy; return true; From 99a540244f4c848c9d228b10c6fa33de605c64d4 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Mon, 28 Jul 2025 13:05:45 -0700 Subject: [PATCH 02/11] Handle gfx908 case Change-Id: Ic16c8a4ffdf58027de164c598cfac70fc453bb00 --- .../Target/AMDGPU/GCNIterativeScheduler.cpp | 34 ++--- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 127 +++++++++--------- llvm/lib/Target/AMDGPU/GCNRegPressure.h | 24 ++-- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 77 ++++++----- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 9 ++ .../lib/Target/AMDGPU/SIFormMemoryClauses.cpp | 5 +- .../AMDGPU/debug-value-scheduler-liveins.mir | 2 +- 7 files changed, 154 insertions(+), 124 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index f253a841f16a6..050e47270498b 100644 --- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -451,7 +451,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) { const unsigned DynamicVGPRBlockSize = MF.getInfo()->getDynamicVGPRBlockSize(); const auto Occ = - Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize); + Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize, MF); LLVM_DEBUG(dbgs() << "Trying to improve occupancy, target = " << TargetOcc << ", current = " << Occ << '\n'); @@ -460,7 +460,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) { // Always build the DAG to add mutations BuildDAG DAG(*R, *this); - if (R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize) >= NewOcc) + if (R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize, MF) >= NewOcc) continue; LLVM_DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3); @@ -471,7 +471,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) { LLVM_DEBUG(dbgs() << "Occupancy improvement attempt:\n"; printSchedRP(dbgs(), R->MaxPressure, MaxRP)); - NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST, DynamicVGPRBlockSize)); + NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST, DynamicVGPRBlockSize, MF)); if (NewOcc <= Occ) break; @@ -496,7 +496,7 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( sortRegionsByPressure(TgtOcc); auto Occ = - Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize); + Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize, MF); bool IsReentry = false; if (TryMaximizeOccupancy && Occ < TgtOcc) { @@ -527,21 +527,22 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( const auto RP = getRegionPressure(*R); LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP)); - if (RP.getOccupancy(ST, DynamicVGPRBlockSize) < TgtOcc) { + if (RP.getOccupancy(ST, DynamicVGPRBlockSize, MF) < TgtOcc) { LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc); - if (R->BestSchedule.get() && R->BestSchedule->MaxPressure.getOccupancy( - ST, DynamicVGPRBlockSize) >= TgtOcc) { + if (R->BestSchedule.get() && + R->BestSchedule->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize, + MF) >= TgtOcc) { LLVM_DEBUG(dbgs() << ", scheduling minimal register\n"); scheduleBest(*R); } else { LLVM_DEBUG(dbgs() << ", restoring\n"); Ovr.restoreOrder(); - assert(R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize) >= + assert(R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize, MF) >= TgtOcc); } } - FinalOccupancy = - std::min(FinalOccupancy, RP.getOccupancy(ST, DynamicVGPRBlockSize)); + FinalOccupancy = std::min(FinalOccupancy, + RP.getOccupancy(ST, DynamicVGPRBlockSize, MF)); } } MFI->limitOccupancy(FinalOccupancy); @@ -591,7 +592,7 @@ void GCNIterativeScheduler::scheduleILP( sortRegionsByPressure(TgtOcc); auto Occ = - Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize); + Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize, MF); bool IsReentry = false; if (TryMaximizeOccupancy && Occ < TgtOcc) { @@ -612,18 +613,19 @@ void GCNIterativeScheduler::scheduleILP( const auto RP = getSchedulePressure(*R, ILPSchedule); LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP)); - if (RP.getOccupancy(ST, DynamicVGPRBlockSize) < TgtOcc) { + if (RP.getOccupancy(ST, DynamicVGPRBlockSize, MF) < TgtOcc) { LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc); - if (R->BestSchedule.get() && R->BestSchedule->MaxPressure.getOccupancy( - ST, DynamicVGPRBlockSize) >= TgtOcc) { + if (R->BestSchedule.get() && + R->BestSchedule->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize, + MF) >= TgtOcc) { LLVM_DEBUG(dbgs() << ", scheduling minimal register\n"); scheduleBest(*R); } } else { scheduleRegion(*R, ILPSchedule, RP); LLVM_DEBUG(printSchedResult(dbgs(), R, RP)); - FinalOccupancy = - std::min(FinalOccupancy, RP.getOccupancy(ST, DynamicVGPRBlockSize)); + FinalOccupancy = std::min(FinalOccupancy, + RP.getOccupancy(ST, DynamicVGPRBlockSize, MF)); } } MFI->limitOccupancy(FinalOccupancy); diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 286c8d9529731..786b45902ae48 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -99,21 +99,21 @@ void GCNRegPressure::inc(unsigned Reg, bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O, unsigned MaxOccupancy) const { const GCNSubtarget &ST = MF.getSubtarget(); - unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs(); + unsigned ArchVGPRThreshold = ST.getArchVGPRAllocationThreshold(MF); unsigned DynamicVGPRBlockSize = MF.getInfo()->getDynamicVGPRBlockSize(); const auto SGPROcc = std::min(MaxOccupancy, ST.getOccupancyWithNumSGPRs(getSGPRNum())); const auto VGPROcc = std::min( - MaxOccupancy, - ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs), - DynamicVGPRBlockSize)); + MaxOccupancy, ST.getOccupancyWithNumVGPRs( + getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold), + DynamicVGPRBlockSize)); const auto OtherSGPROcc = std::min(MaxOccupancy, ST.getOccupancyWithNumSGPRs(O.getSGPRNum())); const auto OtherVGPROcc = std::min( MaxOccupancy, ST.getOccupancyWithNumVGPRs( - O.getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs), + O.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold), DynamicVGPRBlockSize)); const auto Occ = std::min(SGPROcc, VGPROcc); @@ -139,34 +139,37 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O, // Unified excess pressure conditions, accounting for VGPRs used for SGPR // spills - unsigned ExcessVGPR = - std::max(static_cast(getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs) + - VGPRForSGPRSpills - MaxVGPRs), - 0); + unsigned ExcessVGPR = std::max( + static_cast(getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) + + VGPRForSGPRSpills - MaxVGPRs), + 0); unsigned OtherExcessVGPR = std::max( - static_cast(O.getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs) + + static_cast(O.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) + OtherVGPRForSGPRSpills - MaxVGPRs), 0); // Arch VGPR excess pressure conditions, accounting for VGPRs used for SGPR // spills + unsigned AddressableArchVGPRs = ST.getAddressableNumArchVGPRs(); unsigned ExcessArchVGPR = - std::max(static_cast(getVGPRNum(false, MaxArchVGPRs) + - VGPRForSGPRSpills - MaxArchVGPRs), + std::max(static_cast(getVGPRNum(false, ArchVGPRThreshold) + + VGPRForSGPRSpills - AddressableArchVGPRs), 0); unsigned OtherExcessArchVGPR = - std::max(static_cast(O.getVGPRNum(false, MaxArchVGPRs) + - OtherVGPRForSGPRSpills - MaxArchVGPRs), + std::max(static_cast(O.getVGPRNum(false, ArchVGPRThreshold) + + OtherVGPRForSGPRSpills - AddressableArchVGPRs), 0); // AGPR excess pressure conditions unsigned ExcessAGPR = - std::max(static_cast(ST.hasGFX90AInsts() - ? (getAGPRNum(MaxArchVGPRs) - MaxArchVGPRs) - : (getAGPRNum(MaxArchVGPRs) - MaxVGPRs)), + std::max(static_cast( + ST.hasGFX90AInsts() + ? (getAGPRNum(ArchVGPRThreshold) - AddressableArchVGPRs) + : (getAGPRNum(ArchVGPRThreshold) - MaxVGPRs)), 0); unsigned OtherExcessAGPR = std::max( - static_cast(ST.hasGFX90AInsts() - ? (O.getAGPRNum(MaxArchVGPRs) - MaxArchVGPRs) - : (O.getAGPRNum(MaxArchVGPRs) - MaxVGPRs)), + static_cast( + ST.hasGFX90AInsts() + ? (O.getAGPRNum(ArchVGPRThreshold) - AddressableArchVGPRs) + : (O.getAGPRNum(ArchVGPRThreshold) - MaxVGPRs)), 0); bool ExcessRP = ExcessSGPR || ExcessVGPR || ExcessArchVGPR || ExcessAGPR; @@ -187,20 +190,20 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O, return VGPRDiff > 0; if (SGPRDiff != 0) { unsigned PureExcessVGPR = - std::max( - static_cast(getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs) - - MaxVGPRs), - 0) + - std::max( - static_cast(getVGPRNum(false, MaxArchVGPRs) - MaxArchVGPRs), - 0); + std::max(static_cast( + getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) - + MaxVGPRs), + 0) + + std::max(static_cast(getVGPRNum(false, ArchVGPRThreshold) - + AddressableArchVGPRs), + 0); unsigned OtherPureExcessVGPR = - std::max( - static_cast(O.getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs) - - MaxVGPRs), - 0) + - std::max(static_cast(O.getVGPRNum(false, MaxArchVGPRs) - - MaxArchVGPRs), + std::max(static_cast( + O.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) - + MaxVGPRs), + 0) + + std::max(static_cast(O.getVGPRNum(false, ArchVGPRThreshold) - + AddressableArchVGPRs), 0); // If we have a special case where there is a tie in excess VGPR, but one @@ -231,8 +234,8 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O, if (SW != OtherSW) return SW < OtherSW; } else { - auto VW = getVGPRTuplesWeight(MaxArchVGPRs); - auto OtherVW = O.getVGPRTuplesWeight(MaxArchVGPRs); + auto VW = getVGPRTuplesWeight(ArchVGPRThreshold); + auto OtherVW = O.getVGPRTuplesWeight(ArchVGPRThreshold); if (VW != OtherVW) return VW < OtherVW; } @@ -240,32 +243,33 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O, // Give final precedence to lower general RP. return SGPRImportant ? (getSGPRNum() < O.getSGPRNum()) - : (getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs) < - O.getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs)); + : (getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) < + O.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold)); } Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST, - unsigned DynamicVGPRBlockSize) { - return Printable([&RP, ST, DynamicVGPRBlockSize](raw_ostream &OS) { - OS << "VGPRs: " << RP.getArchVGPRNum(ST->getAddressableNumArchVGPRs()) - << ' ' << "AGPRs: " << RP.getAGPRNum(ST->getAddressableNumArchVGPRs()); - if (ST) - OS << "(O" - << ST->getOccupancyWithNumVGPRs( - RP.getVGPRNum(ST->hasGFX90AInsts(), - ST->getAddressableNumArchVGPRs()), - DynamicVGPRBlockSize) - << ')'; - OS << ", SGPRs: " << RP.getSGPRNum(); - if (ST) - OS << "(O" << ST->getOccupancyWithNumSGPRs(RP.getSGPRNum()) << ')'; - OS << ", LVGPR WT: " - << RP.getVGPRTuplesWeight(ST->getAddressableNumArchVGPRs()) - << ", LSGPR WT: " << RP.getSGPRTuplesWeight(); - if (ST) - OS << " -> Occ: " << RP.getOccupancy(*ST, DynamicVGPRBlockSize); - OS << '\n'; - }); + unsigned DynamicVGPRBlockSize, + const MachineFunction *MF) { + unsigned ArchVGPRThreshold = ST->getArchVGPRAllocationThreshold(*MF); + return Printable( + [&RP, ST, DynamicVGPRBlockSize, ArchVGPRThreshold, MF](raw_ostream &OS) { + OS << "VGPRs: " << RP.getArchVGPRNum(ArchVGPRThreshold) << ' ' + << "AGPRs: " << RP.getAGPRNum(ArchVGPRThreshold); + if (ST) + OS << "(O" + << ST->getOccupancyWithNumVGPRs( + RP.getVGPRNum(ST->hasGFX90AInsts(), ArchVGPRThreshold), + DynamicVGPRBlockSize) + << ')'; + OS << ", SGPRs: " << RP.getSGPRNum(); + if (ST) + OS << "(O" << ST->getOccupancyWithNumSGPRs(RP.getSGPRNum()) << ')'; + OS << ", LVGPR WT: " << RP.getVGPRTuplesWeight(ArchVGPRThreshold) + << ", LSGPR WT: " << RP.getSGPRTuplesWeight(); + if (ST) + OS << " -> Occ: " << RP.getOccupancy(*ST, DynamicVGPRBlockSize, *MF); + OS << '\n'; + }); } static LaneBitmask getDefRegMask(const MachineOperand &MO, @@ -899,9 +903,10 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) { auto printRP = [&MF](const GCNRegPressure &RP) { return Printable([&RP, &MF](raw_ostream &OS) { OS << format(PFX " %-5d", RP.getSGPRNum()) - << format(" %-5d", - RP.getVGPRNum(false, MF.getSubtarget() - .getAddressableNumArchVGPRs())); + << format( + " %-5d", + RP.getVGPRNum(false, MF.getSubtarget() + .getArchVGPRAllocationThreshold(MF))); }); }; diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index a8c1c3bfd8703..98eb35eaaca8e 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -108,13 +108,13 @@ struct GCNRegPressure { } unsigned getSGPRTuplesWeight() const { return Value[TOTAL_KINDS + SGPR]; } - unsigned getOccupancy(const GCNSubtarget &ST, - unsigned DynamicVGPRBlockSize) const { - return std::min( - ST.getOccupancyWithNumSGPRs(getSGPRNum()), - ST.getOccupancyWithNumVGPRs( - getVGPRNum(ST.hasGFX90AInsts(), ST.getAddressableNumArchVGPRs()), - DynamicVGPRBlockSize)); + unsigned getOccupancy(const GCNSubtarget &ST, unsigned DynamicVGPRBlockSize, + const MachineFunction &MF) const { + return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()), + ST.getOccupancyWithNumVGPRs( + getVGPRNum(ST.hasGFX90AInsts(), + ST.getArchVGPRAllocationThreshold(MF)), + DynamicVGPRBlockSize)); } void inc(unsigned Reg, @@ -123,9 +123,10 @@ struct GCNRegPressure { const MachineRegisterInfo &MRI); bool higherOccupancy(const GCNSubtarget &ST, const GCNRegPressure &O, - unsigned DynamicVGPRBlockSize) const { - return getOccupancy(ST, DynamicVGPRBlockSize) > - O.getOccupancy(ST, DynamicVGPRBlockSize); + unsigned DynamicVGPRBlockSize, + const MachineFunction &MF) const { + return getOccupancy(ST, DynamicVGPRBlockSize, MF) > + O.getOccupancy(ST, DynamicVGPRBlockSize, MF); } /// Compares \p this GCNRegpressure to \p O, returning true if \p this is @@ -551,7 +552,8 @@ bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2); Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST = nullptr, - unsigned DynamicVGPRBlockSize = 0); + unsigned DynamicVGPRBlockSize = 0, + const MachineFunction *MF = nullptr); Printable print(const GCNRPTracker::LiveRegSet &LiveRegs, const MachineRegisterInfo &MRI); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 772c979809b75..2b61ad4e7a8d5 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -190,13 +190,14 @@ static void getRegisterPressures( TempUpwardTracker.recede(*MI); NewPressure = TempUpwardTracker.getPressure(); } - unsigned AddressableArchVGPR = - DAG->MF.getSubtarget().getAddressableNumArchVGPRs(); + unsigned ArchVGPRThreshold = + DAG->MF.getSubtarget().getArchVGPRAllocationThreshold( + DAG->MF); Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum(); Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = - NewPressure.getArchVGPRNum(AddressableArchVGPR); + NewPressure.getArchVGPRNum(ArchVGPRThreshold); Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = - NewPressure.getAGPRNum(AddressableArchVGPR); + NewPressure.getAGPRNum(ArchVGPRThreshold); } void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, @@ -343,7 +344,8 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, : static_cast(&DownwardTracker); SGPRPressure = T->getPressure().getSGPRNum(); VGPRPressure = T->getPressure().getArchVGPRNum( - DAG->MF.getSubtarget().getAddressableNumArchVGPRs()); + DAG->MF.getSubtarget().getArchVGPRAllocationThreshold( + DAG->MF)); } } ReadyQueue &Q = Zone.Available; @@ -1144,8 +1146,9 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() { if (DAG.MinOccupancy > InitialOccupancy) { for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX) DAG.RegionsWithMinOcc[IDX] = - DAG.Pressure[IDX].getOccupancy( - DAG.ST, DAG.MFI.getDynamicVGPRBlockSize()) == DAG.MinOccupancy; + DAG.Pressure[IDX].getOccupancy(DAG.ST, + DAG.MFI.getDynamicVGPRBlockSize(), + DAG.MF) == DAG.MinOccupancy; LLVM_DEBUG(dbgs() << StageID << " stage successfully increased occupancy to " @@ -1197,8 +1200,10 @@ bool GCNSchedStage::initGCNRegion() { dbgs() << "Pressure before scheduling:\nRegion live-ins:" << print(DAG.LiveIns[RegionIdx], DAG.MRI) << "Region live-in pressure: " - << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx])) - << "Region register pressure: " << print(PressureBefore)); + << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]), + &ST, 0, &MF) + << "Region register pressure: " + << print(PressureBefore, &ST, 0, &MF)); S.HasHighPressure = false; S.KnownExcessRP = isRegionWithExcessRP(); @@ -1279,17 +1284,18 @@ void GCNSchedStage::checkScheduling() { // Check the results of scheduling. PressureAfter = DAG.getRealRegPressure(RegionIdx); - LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter)); + LLVM_DEBUG(dbgs() << "Pressure after scheduling: " + << print(PressureAfter, &ST, 0, &MF)); LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n"); unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize(); - unsigned AddressableArchVGPR = ST.getAddressableNumArchVGPRs(); + unsigned ArchVGPRThreshold = ST.getArchVGPRAllocationThreshold(MF); if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && - PressureAfter.getVGPRNum(ST.hasGFX90AInsts(), AddressableArchVGPR) <= + PressureAfter.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) <= S.VGPRCriticalLimit) { DAG.Pressure[RegionIdx] = PressureAfter; DAG.RegionsWithMinOcc[RegionIdx] = - PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) == + PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize, DAG.MF) == DAG.MinOccupancy; // Early out if we have achieved the occupancy target. @@ -1299,10 +1305,12 @@ void GCNSchedStage::checkScheduling() { unsigned TargetOccupancy = std::min( S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second); - unsigned WavesAfter = std::min( - TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize)); - unsigned WavesBefore = std::min( - TargetOccupancy, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize)); + unsigned WavesAfter = + std::min(TargetOccupancy, + PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize, DAG.MF)); + unsigned WavesBefore = + std::min(TargetOccupancy, + PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize, DAG.MF)); LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore << ", after " << WavesAfter << ".\n"); @@ -1336,10 +1344,10 @@ void GCNSchedStage::checkScheduling() { unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs()); unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF); - if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts(), AddressableArchVGPR) > + if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) > MaxVGPRs || - PressureAfter.getArchVGPRNum(AddressableArchVGPR) > MaxArchVGPRs || - PressureAfter.getAGPRNum(AddressableArchVGPR) > MaxArchVGPRs || + PressureAfter.getArchVGPRNum(ArchVGPRThreshold) > MaxArchVGPRs || + PressureAfter.getAGPRNum(ArchVGPRThreshold) > MaxArchVGPRs || PressureAfter.getSGPRNum() > MaxSGPRs) { DAG.RegionsWithHighRP[RegionIdx] = true; DAG.RegionsWithExcessRP[RegionIdx] = true; @@ -1352,7 +1360,7 @@ void GCNSchedStage::checkScheduling() { } else { DAG.Pressure[RegionIdx] = PressureAfter; DAG.RegionsWithMinOcc[RegionIdx] = - PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) == + PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize, DAG.MF) == DAG.MinOccupancy; } } @@ -1477,13 +1485,13 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) { // For dynamic VGPR mode, we don't want to waste any VGPR blocks. if (DAG.MFI.isDynamicVGPREnabled()) { - unsigned AddressableArchVGPR = ST.getAddressableNumArchVGPRs(); + unsigned ArchVGPRThreshold = ST.getArchVGPRAllocationThreshold(MF); unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( &ST, DAG.MFI.getDynamicVGPRBlockSize(), - PressureBefore.getVGPRNum(false, AddressableArchVGPR)); + PressureBefore.getVGPRNum(false, ArchVGPRThreshold)); unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( &ST, DAG.MFI.getDynamicVGPRBlockSize(), - PressureAfter.getVGPRNum(false, AddressableArchVGPR)); + PressureAfter.getVGPRNum(false, ArchVGPRThreshold)); if (BlocksAfter > BlocksBefore) return true; } @@ -1507,8 +1515,8 @@ bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) { // If RP is not reduced in the unclustered reschedule stage, revert to the // old schedule. - if ((WavesAfter <= - PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) && + if ((WavesAfter <= PressureBefore.getOccupancy( + ST, DAG.MFI.getDynamicVGPRBlockSize(), DAG.MF) && mayCauseSpilling(WavesAfter)) || GCNSchedStage::shouldRevertScheduling(WavesAfter)) { LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n"); @@ -1530,9 +1538,10 @@ bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) { ScheduleMetrics MAfter = getScheduleMetrics(DAG); unsigned OldMetric = MBefore.getMetric(); unsigned NewMetric = MAfter.getMetric(); - unsigned WavesBefore = std::min( - S.getTargetOccupancy(), - PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize())); + unsigned WavesBefore = + std::min(S.getTargetOccupancy(), + PressureBefore.getOccupancy( + ST, DAG.MFI.getDynamicVGPRBlockSize(), DAG.MF)); unsigned Profit = ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore * ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) / @@ -1586,8 +1595,8 @@ bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) { void GCNSchedStage::revertScheduling() { DAG.RegionsWithMinOcc[RegionIdx] = - PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) == - DAG.MinOccupancy; + PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize(), + DAG.MF) == DAG.MinOccupancy; LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n"); DAG.RegionEnd = DAG.RegionBegin; int SkippedDebugInstr = 0; @@ -2025,8 +2034,10 @@ void PreRARematStage::rematerialize() { } DAG.Pressure[I] = RP; AchievedOcc = std::min( - AchievedOcc, RP.getOccupancy(ST, MF.getInfo() - ->getDynamicVGPRBlockSize())); + AchievedOcc, + RP.getOccupancy( + ST, MF.getInfo()->getDynamicVGPRBlockSize(), + DAG.MF)); } REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n"); } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 10ded0e1d1c3a..a259b90545ee9 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1629,6 +1629,15 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(this); } + unsigned getArchVGPRAllocationThreshold(const MachineFunction &MF) const { + if (hasGFX90AInsts() || !hasMAIInsts()) + return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(this); + + const Function &F = MF.getFunction(); + std::pair Waves = getWavesPerEU(F); + return getMaxNumVGPRs(Waves.first, 0); + } + /// \returns Addressable number of VGPRs supported by the subtarget. unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const { return AMDGPU::IsaInfo::getAddressableNumVGPRs(this, DynamicVGPRBlockSize); diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp index 844908a5ce8d9..f4cf8f4e03df8 100644 --- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp +++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp @@ -199,7 +199,8 @@ bool SIFormMemoryClausesImpl::checkPressure(const MachineInstr &MI, GCNRegPressure MaxPressure = RPT.moveMaxPressure(); unsigned Occupancy = MaxPressure.getOccupancy( *ST, - MI.getMF()->getInfo()->getDynamicVGPRBlockSize()); + MI.getMF()->getInfo()->getDynamicVGPRBlockSize(), + *MI.getMF()); // Don't push over half the register budget. We don't want to introduce // spilling just to form a soft clause. @@ -212,7 +213,7 @@ bool SIFormMemoryClausesImpl::checkPressure(const MachineInstr &MI, // fragmentation of registers the allocator will need to satisfy. if (Occupancy >= MFI->getMinAllowedOccupancy() && MaxPressure.getVGPRNum(ST->hasGFX90AInsts(), - ST->getAddressableNumArchVGPRs()) <= + ST->getArchVGPRAllocationThreshold(*MI.getMF())) <= MaxVGPRs / 2 && MaxPressure.getSGPRNum() <= MaxSGPRs / 2) { LastRecordedOccupancy = Occupancy; diff --git a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-liveins.mir b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-liveins.mir index 2a08c52e447ba..72181346764fb 100644 --- a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-liveins.mir +++ b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-liveins.mir @@ -6,7 +6,7 @@ # CHECK-NEXT: test_get_liveins:%bb.0 # CHECK: ********** MI Scheduling ********** # CHECK-NEXT: test_get_liveins:%bb.1 -# CHECK: Region live-in pressure: VGPRs: 1 AGPRs: 0, SGPRs: 0, LVGPR WT: 0, LSGPR WT: 0 +# CHECK: Region live-in pressure: VGPRs: 1 AGPRs: 0(O10), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 10 # CHECK: ScheduleDAGMILive::schedule starting --- From 1f24d721d1bb0ef89fe91a787cf0941edc63816b Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Mon, 28 Jul 2025 13:19:52 -0700 Subject: [PATCH 03/11] Cleanup signature for getOccupancy Change-Id: I0b74f6ee1d93bd5e6fc3e285c0c6e91a8090d28e --- .../Target/AMDGPU/GCNIterativeScheduler.cpp | 39 ++++++------------- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 2 +- llvm/lib/Target/AMDGPU/GCNRegPressure.h | 14 ++++--- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 33 +++++----------- .../lib/Target/AMDGPU/SIFormMemoryClauses.cpp | 5 +-- 5 files changed, 31 insertions(+), 62 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index 050e47270498b..2c833abedbfb7 100644 --- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -447,11 +447,7 @@ void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) { // BestSchedules aren't deleted on fail. unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) { // TODO: assert Regions are sorted descending by pressure - const auto &ST = MF.getSubtarget(); - const unsigned DynamicVGPRBlockSize = - MF.getInfo()->getDynamicVGPRBlockSize(); - const auto Occ = - Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize, MF); + const auto Occ = Regions.front()->MaxPressure.getOccupancy(MF); LLVM_DEBUG(dbgs() << "Trying to improve occupancy, target = " << TargetOcc << ", current = " << Occ << '\n'); @@ -460,7 +456,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) { // Always build the DAG to add mutations BuildDAG DAG(*R, *this); - if (R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize, MF) >= NewOcc) + if (R->MaxPressure.getOccupancy(MF) >= NewOcc) continue; LLVM_DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3); @@ -471,7 +467,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) { LLVM_DEBUG(dbgs() << "Occupancy improvement attempt:\n"; printSchedRP(dbgs(), R->MaxPressure, MaxRP)); - NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST, DynamicVGPRBlockSize, MF)); + NewOcc = std::min(NewOcc, MaxRP.getOccupancy(MF)); if (NewOcc <= Occ) break; @@ -489,14 +485,11 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) { void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( bool TryMaximizeOccupancy) { - const auto &ST = MF.getSubtarget(); SIMachineFunctionInfo *MFI = MF.getInfo(); auto TgtOcc = MFI->getMinAllowedOccupancy(); - unsigned DynamicVGPRBlockSize = MFI->getDynamicVGPRBlockSize(); sortRegionsByPressure(TgtOcc); - auto Occ = - Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize, MF); + auto Occ = Regions.front()->MaxPressure.getOccupancy(MF); bool IsReentry = false; if (TryMaximizeOccupancy && Occ < TgtOcc) { @@ -527,22 +520,19 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( const auto RP = getRegionPressure(*R); LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP)); - if (RP.getOccupancy(ST, DynamicVGPRBlockSize, MF) < TgtOcc) { + if (RP.getOccupancy(MF) < TgtOcc) { LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc); if (R->BestSchedule.get() && - R->BestSchedule->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize, - MF) >= TgtOcc) { + R->BestSchedule->MaxPressure.getOccupancy(MF) >= TgtOcc) { LLVM_DEBUG(dbgs() << ", scheduling minimal register\n"); scheduleBest(*R); } else { LLVM_DEBUG(dbgs() << ", restoring\n"); Ovr.restoreOrder(); - assert(R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize, MF) >= - TgtOcc); + assert(R->MaxPressure.getOccupancy(MF) >= TgtOcc); } } - FinalOccupancy = std::min(FinalOccupancy, - RP.getOccupancy(ST, DynamicVGPRBlockSize, MF)); + FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(MF)); } } MFI->limitOccupancy(FinalOccupancy); @@ -585,14 +575,11 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) { void GCNIterativeScheduler::scheduleILP( bool TryMaximizeOccupancy) { - const auto &ST = MF.getSubtarget(); SIMachineFunctionInfo *MFI = MF.getInfo(); auto TgtOcc = MFI->getMinAllowedOccupancy(); - unsigned DynamicVGPRBlockSize = MFI->getDynamicVGPRBlockSize(); sortRegionsByPressure(TgtOcc); - auto Occ = - Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize, MF); + auto Occ = Regions.front()->MaxPressure.getOccupancy(MF); bool IsReentry = false; if (TryMaximizeOccupancy && Occ < TgtOcc) { @@ -613,19 +600,17 @@ void GCNIterativeScheduler::scheduleILP( const auto RP = getSchedulePressure(*R, ILPSchedule); LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP)); - if (RP.getOccupancy(ST, DynamicVGPRBlockSize, MF) < TgtOcc) { + if (RP.getOccupancy(MF) < TgtOcc) { LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc); if (R->BestSchedule.get() && - R->BestSchedule->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize, - MF) >= TgtOcc) { + R->BestSchedule->MaxPressure.getOccupancy(MF) >= TgtOcc) { LLVM_DEBUG(dbgs() << ", scheduling minimal register\n"); scheduleBest(*R); } } else { scheduleRegion(*R, ILPSchedule, RP); LLVM_DEBUG(printSchedResult(dbgs(), R, RP)); - FinalOccupancy = std::min(FinalOccupancy, - RP.getOccupancy(ST, DynamicVGPRBlockSize, MF)); + FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(MF)); } } MFI->limitOccupancy(FinalOccupancy); diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 786b45902ae48..9ee171e1b9999 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -267,7 +267,7 @@ Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST, OS << ", LVGPR WT: " << RP.getVGPRTuplesWeight(ArchVGPRThreshold) << ", LSGPR WT: " << RP.getSGPRTuplesWeight(); if (ST) - OS << " -> Occ: " << RP.getOccupancy(*ST, DynamicVGPRBlockSize, *MF); + OS << " -> Occ: " << RP.getOccupancy(*MF); OS << '\n'; }); } diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 98eb35eaaca8e..9eb86017adafc 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -18,6 +18,7 @@ #define LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H #include "GCNSubtarget.h" +#include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/RegisterPressure.h" #include @@ -108,8 +109,11 @@ struct GCNRegPressure { } unsigned getSGPRTuplesWeight() const { return Value[TOTAL_KINDS + SGPR]; } - unsigned getOccupancy(const GCNSubtarget &ST, unsigned DynamicVGPRBlockSize, - const MachineFunction &MF) const { + unsigned getOccupancy(const MachineFunction &MF) const { + const GCNSubtarget &ST = MF.getSubtarget(); + unsigned DynamicVGPRBlockSize = + MF.getInfo()->getDynamicVGPRBlockSize(); + return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()), ST.getOccupancyWithNumVGPRs( getVGPRNum(ST.hasGFX90AInsts(), @@ -122,11 +126,9 @@ struct GCNRegPressure { LaneBitmask NewMask, const MachineRegisterInfo &MRI); - bool higherOccupancy(const GCNSubtarget &ST, const GCNRegPressure &O, - unsigned DynamicVGPRBlockSize, + bool higherOccupancy(const GCNRegPressure &O, const MachineFunction &MF) const { - return getOccupancy(ST, DynamicVGPRBlockSize, MF) > - O.getOccupancy(ST, DynamicVGPRBlockSize, MF); + return getOccupancy(MF) > O.getOccupancy(MF); } /// Compares \p this GCNRegpressure to \p O, returning true if \p this is diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 2b61ad4e7a8d5..ef3dcea2fcac5 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1146,9 +1146,7 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() { if (DAG.MinOccupancy > InitialOccupancy) { for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX) DAG.RegionsWithMinOcc[IDX] = - DAG.Pressure[IDX].getOccupancy(DAG.ST, - DAG.MFI.getDynamicVGPRBlockSize(), - DAG.MF) == DAG.MinOccupancy; + DAG.Pressure[IDX].getOccupancy(DAG.MF) == DAG.MinOccupancy; LLVM_DEBUG(dbgs() << StageID << " stage successfully increased occupancy to " @@ -1288,15 +1286,13 @@ void GCNSchedStage::checkScheduling() { << print(PressureAfter, &ST, 0, &MF)); LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n"); - unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize(); unsigned ArchVGPRThreshold = ST.getArchVGPRAllocationThreshold(MF); if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && PressureAfter.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) <= S.VGPRCriticalLimit) { DAG.Pressure[RegionIdx] = PressureAfter; DAG.RegionsWithMinOcc[RegionIdx] = - PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize, DAG.MF) == - DAG.MinOccupancy; + PressureAfter.getOccupancy(DAG.MF) == DAG.MinOccupancy; // Early out if we have achieved the occupancy target. LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n"); @@ -1306,11 +1302,9 @@ void GCNSchedStage::checkScheduling() { unsigned TargetOccupancy = std::min( S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second); unsigned WavesAfter = - std::min(TargetOccupancy, - PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize, DAG.MF)); + std::min(TargetOccupancy, PressureAfter.getOccupancy(DAG.MF)); unsigned WavesBefore = - std::min(TargetOccupancy, - PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize, DAG.MF)); + std::min(TargetOccupancy, PressureBefore.getOccupancy(DAG.MF)); LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore << ", after " << WavesAfter << ".\n"); @@ -1360,8 +1354,7 @@ void GCNSchedStage::checkScheduling() { } else { DAG.Pressure[RegionIdx] = PressureAfter; DAG.RegionsWithMinOcc[RegionIdx] = - PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize, DAG.MF) == - DAG.MinOccupancy; + PressureAfter.getOccupancy(DAG.MF) == DAG.MinOccupancy; } } @@ -1515,8 +1508,7 @@ bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) { // If RP is not reduced in the unclustered reschedule stage, revert to the // old schedule. - if ((WavesAfter <= PressureBefore.getOccupancy( - ST, DAG.MFI.getDynamicVGPRBlockSize(), DAG.MF) && + if ((WavesAfter <= PressureBefore.getOccupancy(DAG.MF) && mayCauseSpilling(WavesAfter)) || GCNSchedStage::shouldRevertScheduling(WavesAfter)) { LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n"); @@ -1539,9 +1531,7 @@ bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) { unsigned OldMetric = MBefore.getMetric(); unsigned NewMetric = MAfter.getMetric(); unsigned WavesBefore = - std::min(S.getTargetOccupancy(), - PressureBefore.getOccupancy( - ST, DAG.MFI.getDynamicVGPRBlockSize(), DAG.MF)); + std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(DAG.MF)); unsigned Profit = ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore * ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) / @@ -1595,8 +1585,7 @@ bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) { void GCNSchedStage::revertScheduling() { DAG.RegionsWithMinOcc[RegionIdx] = - PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize(), - DAG.MF) == DAG.MinOccupancy; + PressureBefore.getOccupancy(DAG.MF) == DAG.MinOccupancy; LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n"); DAG.RegionEnd = DAG.RegionBegin; int SkippedDebugInstr = 0; @@ -2033,11 +2022,7 @@ void PreRARematStage::rematerialize() { } } DAG.Pressure[I] = RP; - AchievedOcc = std::min( - AchievedOcc, - RP.getOccupancy( - ST, MF.getInfo()->getDynamicVGPRBlockSize(), - DAG.MF)); + AchievedOcc = std::min(AchievedOcc, RP.getOccupancy(DAG.MF)); } REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n"); } diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp index f4cf8f4e03df8..03d5ac6dec025 100644 --- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp +++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp @@ -197,10 +197,7 @@ bool SIFormMemoryClausesImpl::checkPressure(const MachineInstr &MI, // pointer becomes dead and could otherwise be reused for destination. RPT.advanceToNext(); GCNRegPressure MaxPressure = RPT.moveMaxPressure(); - unsigned Occupancy = MaxPressure.getOccupancy( - *ST, - MI.getMF()->getInfo()->getDynamicVGPRBlockSize(), - *MI.getMF()); + unsigned Occupancy = MaxPressure.getOccupancy(*MI.getMF()); // Don't push over half the register budget. We don't want to introduce // spilling just to form a soft clause. From c3351970a84afa0293f2160b7dce6bb3d48f8ef3 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Mon, 28 Jul 2025 19:13:14 -0700 Subject: [PATCH 04/11] Factor out getAVGPRSAs*GPRsNum Change-Id: Ia3b8507f95763079ee3c2224655990a299c8854d --- llvm/lib/Target/AMDGPU/GCNRegPressure.h | 39 +++++++++++++++++-------- 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 9eb86017adafc..c7449b43a35b4 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -57,6 +57,24 @@ struct GCNRegPressure { return std::max(Value[VGPR] + Value[AVGPR], Value[AGPR]); } + inline static unsigned getAVGPRsAsVGPRsNum(unsigned NumArchVGPRs, + unsigned NumAVGPRs, + unsigned AddressableArchVGPR) { + + return NumArchVGPRs < AddressableArchVGPR + ? std::min((AddressableArchVGPR - NumArchVGPRs), NumAVGPRs) + : 0; + } + + inline static unsigned getAVGPRsAsAGPRsNum(unsigned NumArchVGPRs, + unsigned NumAGPRs, + unsigned NumAVGPRs, + unsigned AddressableArchVGPR) { + unsigned AVGPRsAsVGPRs = + getAVGPRsAsVGPRsNum(NumArchVGPRs, NumAVGPRs, AddressableArchVGPR); + return NumAVGPRs > AVGPRsAsVGPRs ? NumAVGPRs - AVGPRsAsVGPRs : 0; + } + /// Returns the aggregated VGPR pressure, assuming \p NumArchVGPRs ArchVGPRs /// \p NumAGPRs AGPRS, and \p NumAVGPRs AVGPRs for a target with a unified /// VGPR file. @@ -68,11 +86,10 @@ struct GCNRegPressure { // Until we hit the VGPRThreshold, we will assign AV as VGPR. After that // point, we will assign as AGPR. unsigned AVGPRsAsVGPRs = - NumArchVGPRs < AddressableArchVGPR - ? std::min((AddressableArchVGPR - NumArchVGPRs), NumAVGPRs) - : 0; - unsigned AVGPRsAsAGPRs = - NumAVGPRs > AVGPRsAsVGPRs ? NumAVGPRs - AVGPRsAsVGPRs : 0; + getAVGPRsAsVGPRsNum(NumArchVGPRs, NumAVGPRs, AddressableArchVGPR); + unsigned AVGPRsAsAGPRs = getAVGPRsAsAGPRsNum( + NumArchVGPRs, NumAGPRs, NumAVGPRs, AddressableArchVGPR); + NumAVGPRs > AVGPRsAsVGPRs ? NumAVGPRs - AVGPRsAsVGPRs : 0; return alignTo(NumArchVGPRs + AVGPRsAsVGPRs, AMDGPU::IsaInfo::getArchVGPRAllocGranule()) + NumAGPRs + AVGPRsAsAGPRs; @@ -96,13 +113,11 @@ struct GCNRegPressure { unsigned getVGPRTuplesWeight(unsigned AddressableArchVGPR) const { unsigned AVGPRsAsVGPRs = - Value[TOTAL_KINDS + VGPR] < AddressableArchVGPR - ? std::min(AddressableArchVGPR - Value[TOTAL_KINDS + VGPR], - Value[TOTAL_KINDS + AVGPR]) - : 0; - unsigned AVGPRsAsAGPRs = Value[TOTAL_KINDS + AVGPR] > AVGPRsAsVGPRs - ? Value[TOTAL_KINDS + AVGPR] - AVGPRsAsVGPRs - : 0; + getAVGPRsAsVGPRsNum(Value[TOTAL_KINDS + VGPR], + Value[TOTAL_KINDS + AVGPR], AddressableArchVGPR); + unsigned AVGPRsAsAGPRs = getAVGPRsAsAGPRsNum( + Value[TOTAL_KINDS + VGPR], Value[TOTAL_KINDS + AGPR], + Value[TOTAL_KINDS + AVGPR], AddressableArchVGPR); return std::max(Value[TOTAL_KINDS + VGPR] + AVGPRsAsVGPRs, Value[TOTAL_KINDS + AGPR] + AVGPRsAsAGPRs); From 33a70f2a3faa35ebda87554ce22244cd5e6101d3 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Mon, 28 Jul 2025 19:14:44 -0700 Subject: [PATCH 05/11] Formatting Change-Id: I14486056bef5e9a97842be68a7f5abe82ecc37fe --- llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index 2c833abedbfb7..87f5b9f16868a 100644 --- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -484,7 +484,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) { } void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( - bool TryMaximizeOccupancy) { + bool TryMaximizeOccupancy) { SIMachineFunctionInfo *MFI = MF.getInfo(); auto TgtOcc = MFI->getMinAllowedOccupancy(); @@ -573,8 +573,7 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) { /////////////////////////////////////////////////////////////////////////////// // ILP scheduler port -void GCNIterativeScheduler::scheduleILP( - bool TryMaximizeOccupancy) { +void GCNIterativeScheduler::scheduleILP(bool TryMaximizeOccupancy) { SIMachineFunctionInfo *MFI = MF.getInfo(); auto TgtOcc = MFI->getMinAllowedOccupancy(); From 572732449576479c7394638b1e21a92a39559d35 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Mon, 28 Jul 2025 19:34:17 -0700 Subject: [PATCH 06/11] Use getMaxNumVectorRegs instead of getArchVGPRAllocationThreshold Change-Id: I36e92840e35774cb419389ee6dadc26dd376ebaa --- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 14 +++++++------ llvm/lib/Target/AMDGPU/GCNRegPressure.h | 12 +++++------ llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 21 ++++++++++++------- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 9 -------- .../lib/Target/AMDGPU/SIFormMemoryClauses.cpp | 5 +++-- 5 files changed, 30 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 9ee171e1b9999..966e810115195 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -99,7 +99,8 @@ void GCNRegPressure::inc(unsigned Reg, bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O, unsigned MaxOccupancy) const { const GCNSubtarget &ST = MF.getSubtarget(); - unsigned ArchVGPRThreshold = ST.getArchVGPRAllocationThreshold(MF); + unsigned ArchVGPRThreshold = + ST.getRegisterInfo()->getMaxNumVectorRegs(MF).first; unsigned DynamicVGPRBlockSize = MF.getInfo()->getDynamicVGPRBlockSize(); @@ -250,7 +251,8 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O, Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST, unsigned DynamicVGPRBlockSize, const MachineFunction *MF) { - unsigned ArchVGPRThreshold = ST->getArchVGPRAllocationThreshold(*MF); + unsigned ArchVGPRThreshold = + ST->getRegisterInfo()->getMaxNumVectorRegs(*MF).first; return Printable( [&RP, ST, DynamicVGPRBlockSize, ArchVGPRThreshold, MF](raw_ostream &OS) { OS << "VGPRs: " << RP.getArchVGPRNum(ArchVGPRThreshold) << ' ' @@ -903,10 +905,10 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) { auto printRP = [&MF](const GCNRegPressure &RP) { return Printable([&RP, &MF](raw_ostream &OS) { OS << format(PFX " %-5d", RP.getSGPRNum()) - << format( - " %-5d", - RP.getVGPRNum(false, MF.getSubtarget() - .getArchVGPRAllocationThreshold(MF))); + << format(" %-5d", RP.getVGPRNum(false, MF.getSubtarget() + .getRegisterInfo() + ->getMaxNumVectorRegs(MF) + .first)); }); }; diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index c7449b43a35b4..d61e0348dabb4 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -89,7 +89,6 @@ struct GCNRegPressure { getAVGPRsAsVGPRsNum(NumArchVGPRs, NumAVGPRs, AddressableArchVGPR); unsigned AVGPRsAsAGPRs = getAVGPRsAsAGPRsNum( NumArchVGPRs, NumAGPRs, NumAVGPRs, AddressableArchVGPR); - NumAVGPRs > AVGPRsAsVGPRs ? NumAVGPRs - AVGPRsAsVGPRs : 0; return alignTo(NumArchVGPRs + AVGPRsAsVGPRs, AMDGPU::IsaInfo::getArchVGPRAllocGranule()) + NumAGPRs + AVGPRsAsAGPRs; @@ -129,11 +128,12 @@ struct GCNRegPressure { unsigned DynamicVGPRBlockSize = MF.getInfo()->getDynamicVGPRBlockSize(); - return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()), - ST.getOccupancyWithNumVGPRs( - getVGPRNum(ST.hasGFX90AInsts(), - ST.getArchVGPRAllocationThreshold(MF)), - DynamicVGPRBlockSize)); + return std::min( + ST.getOccupancyWithNumSGPRs(getSGPRNum()), + ST.getOccupancyWithNumVGPRs( + getVGPRNum(ST.hasGFX90AInsts(), + ST.getRegisterInfo()->getMaxNumVectorRegs(MF).first), + DynamicVGPRBlockSize)); } void inc(unsigned Reg, diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index ef3dcea2fcac5..52359135f8893 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -190,9 +190,10 @@ static void getRegisterPressures( TempUpwardTracker.recede(*MI); NewPressure = TempUpwardTracker.getPressure(); } - unsigned ArchVGPRThreshold = - DAG->MF.getSubtarget().getArchVGPRAllocationThreshold( - DAG->MF); + unsigned ArchVGPRThreshold = DAG->MF.getSubtarget() + .getRegisterInfo() + ->getMaxNumVectorRegs(DAG->MF) + .first; Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum(); Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = NewPressure.getArchVGPRNum(ArchVGPRThreshold); @@ -343,9 +344,11 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, ? static_cast(&UpwardTracker) : static_cast(&DownwardTracker); SGPRPressure = T->getPressure().getSGPRNum(); - VGPRPressure = T->getPressure().getArchVGPRNum( - DAG->MF.getSubtarget().getArchVGPRAllocationThreshold( - DAG->MF)); + VGPRPressure = + T->getPressure().getArchVGPRNum(DAG->MF.getSubtarget() + .getRegisterInfo() + ->getMaxNumVectorRegs(DAG->MF) + .first); } } ReadyQueue &Q = Zone.Available; @@ -1286,7 +1289,8 @@ void GCNSchedStage::checkScheduling() { << print(PressureAfter, &ST, 0, &MF)); LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n"); - unsigned ArchVGPRThreshold = ST.getArchVGPRAllocationThreshold(MF); + unsigned ArchVGPRThreshold = + ST.getRegisterInfo()->getMaxNumVectorRegs(MF).first; if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && PressureAfter.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) <= S.VGPRCriticalLimit) { @@ -1478,7 +1482,8 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) { // For dynamic VGPR mode, we don't want to waste any VGPR blocks. if (DAG.MFI.isDynamicVGPREnabled()) { - unsigned ArchVGPRThreshold = ST.getArchVGPRAllocationThreshold(MF); + unsigned ArchVGPRThreshold = + ST.getRegisterInfo()->getMaxNumVectorRegs(MF).first; unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( &ST, DAG.MFI.getDynamicVGPRBlockSize(), PressureBefore.getVGPRNum(false, ArchVGPRThreshold)); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index a259b90545ee9..10ded0e1d1c3a 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1629,15 +1629,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(this); } - unsigned getArchVGPRAllocationThreshold(const MachineFunction &MF) const { - if (hasGFX90AInsts() || !hasMAIInsts()) - return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(this); - - const Function &F = MF.getFunction(); - std::pair Waves = getWavesPerEU(F); - return getMaxNumVGPRs(Waves.first, 0); - } - /// \returns Addressable number of VGPRs supported by the subtarget. unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const { return AMDGPU::IsaInfo::getAddressableNumVGPRs(this, DynamicVGPRBlockSize); diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp index 03d5ac6dec025..cdc80ca9267d6 100644 --- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp +++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp @@ -209,8 +209,9 @@ bool SIFormMemoryClausesImpl::checkPressure(const MachineInstr &MI, // tracking does not account for the alignment requirements for SGPRs, or the // fragmentation of registers the allocator will need to satisfy. if (Occupancy >= MFI->getMinAllowedOccupancy() && - MaxPressure.getVGPRNum(ST->hasGFX90AInsts(), - ST->getArchVGPRAllocationThreshold(*MI.getMF())) <= + MaxPressure.getVGPRNum( + ST->hasGFX90AInsts(), + ST->getRegisterInfo()->getMaxNumVectorRegs(*MI.getMF()).first) <= MaxVGPRs / 2 && MaxPressure.getSGPRNum() <= MaxSGPRs / 2) { LastRecordedOccupancy = Occupancy; From 5a0696d975c90c5d634311b56d3224d351dcccdf Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 29 Jul 2025 08:24:20 -0700 Subject: [PATCH 07/11] Add test Change-Id: I68bc69d5bafa3d8161c7b507721a9cde3e99d2b1 --- llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir | 348 ++++++++++++++++++++ 1 file changed, 348 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir diff --git a/llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir b/llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir new file mode 100644 index 0000000000000..358942e73a7c6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir @@ -0,0 +1,348 @@ +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=machine-scheduler --debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s +# REQUIRES: asserts + +--- | + define void @avgpr_rp_occ1() #0 { + entry: + unreachable + } + + define void @avgpr_rp_occ2() #1 { + entry: + unreachable + } + + define void @avgpr_rp_occ3() #2 { + entry: + unreachable + } + + define void @avgpr_rp_occ4() #3 { + entry: + unreachable + } + + define void @avgpr_rp_occ5() #4 { + entry: + unreachable + } + + define void @avgpr_rp_occ6() #5 { + entry: + unreachable + } + + define void @avgpr_rp_occ7() #6 { + entry: + unreachable + } + + define void @avgpr_rp_occ8() #7 { + entry: + unreachable + } + + attributes #0 = {"amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="64,64"} + attributes #1 = {"amdgpu-waves-per-eu"="2,2" "amdgpu-flat-work-group-size"="64,64"} + attributes #2 = {"amdgpu-waves-per-eu"="3,3" "amdgpu-flat-work-group-size"="64,64"} + attributes #3 = {"amdgpu-waves-per-eu"="4,4" "amdgpu-flat-work-group-size"="64,64"} + attributes #4 = {"amdgpu-waves-per-eu"="5,5" "amdgpu-flat-work-group-size"="64,64"} + attributes #5 = {"amdgpu-waves-per-eu"="6,6" "amdgpu-flat-work-group-size"="64,64"} + attributes #6 = {"amdgpu-waves-per-eu"="7,7" "amdgpu-flat-work-group-size"="64,64"} + attributes #7 = {"amdgpu-waves-per-eu"="8,8" "amdgpu-flat-work-group-size"="64,64"} + + +... + +# CHECK: avgpr_rp_occ1:%bb.0 +# CHECK: Pressure before scheduling: +# CHECK-NEXT: Region live-ins: +# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8 +# CHECK-NEXT: Region register pressure: VGPRs: 256 AGPRs: 192(O1), SGPRs: 0(O10), LVGPR WT: 256, LSGPR WT: 0 -> Occ: 1 + +--- +name: avgpr_rp_occ1 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } + workGroupIDX: { reg: '$sgpr6' } + privateSegmentWaveByteOffset: { reg: '$sgpr7' } + workItemIDX: { reg: '$vgpr0' } + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + bb.0: + liveins: $vgpr0, $sgpr4_sgpr5 + %1:vreg_1024 = IMPLICIT_DEF + %2:vreg_1024 = IMPLICIT_DEF + %3:vreg_1024 = IMPLICIT_DEF + %4:vreg_1024 = IMPLICIT_DEF + %5:vreg_1024 = IMPLICIT_DEF + %6:vreg_1024 = IMPLICIT_DEF + %7:vreg_1024 = IMPLICIT_DEF + %8:av_1024 = IMPLICIT_DEF + %9:av_1024 = IMPLICIT_DEF + %10:av_1024 = IMPLICIT_DEF + %11:av_1024 = IMPLICIT_DEF + %12:av_1024 = IMPLICIT_DEF + %13:av_1024 = IMPLICIT_DEF + %14:av_1024 = IMPLICIT_DEF + SCHED_BARRIER 0 + KILL %1, %2, %3, %4, %5, %6, %7 + + bb.1: + KILL %8, %9, %10, %11, %12, %13, %14 + S_ENDPGM 0 +... + +# CHECK: avgpr_rp_occ2:%bb.0 +# CHECK: Pressure before scheduling: +# CHECK-NEXT: Region live-ins: +# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8 +# CHECK-NEXT: Region register pressure: VGPRs: 128 AGPRs: 64(O2), SGPRs: 0(O10), LVGPR WT: 128, LSGPR WT: 0 -> Occ: 2 + +--- +name: avgpr_rp_occ2 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } + workGroupIDX: { reg: '$sgpr6' } + privateSegmentWaveByteOffset: { reg: '$sgpr7' } + workItemIDX: { reg: '$vgpr0' } + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + bb.0: + liveins: $vgpr0, $sgpr4_sgpr5 + %1:vreg_1024 = IMPLICIT_DEF + %2:vreg_1024 = IMPLICIT_DEF + %3:vreg_1024 = IMPLICIT_DEF + %4:av_1024 = IMPLICIT_DEF + %5:av_1024 = IMPLICIT_DEF + %6:av_1024 = IMPLICIT_DEF + SCHED_BARRIER 0 + KILL %1, %2, %3 + + bb.1: + KILL %4, %5, %6 + S_ENDPGM 0 +... + +# CHECK: avgpr_rp_occ3:%bb.0 +# CHECK: Pressure before scheduling: +# CHECK-NEXT: Region live-ins: +# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8 +# CHECK-NEXT: Region register pressure: VGPRs: 84 AGPRs: 44(O4), SGPRs: 0(O10), LVGPR WT: 84, LSGPR WT: 0 -> Occ: 4 + +--- +name: avgpr_rp_occ3 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } + workGroupIDX: { reg: '$sgpr6' } + privateSegmentWaveByteOffset: { reg: '$sgpr7' } + workItemIDX: { reg: '$vgpr0' } + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + bb.0: + liveins: $vgpr0, $sgpr4_sgpr5 + %1:vreg_1024 = IMPLICIT_DEF + %2:vreg_1024 = IMPLICIT_DEF + %3:av_1024 = IMPLICIT_DEF + %4:av_1024 = IMPLICIT_DEF + SCHED_BARRIER 0 + KILL %1, %2 + + bb.1: + KILL %3, %4 + S_ENDPGM 0 +... + +# CHECK: avgpr_rp_occ4:%bb.0 +# CHECK: Pressure before scheduling: +# CHECK-NEXT: Region live-ins: +# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8 +# CHECK-NEXT: Region register pressure: VGPRs: 64 AGPRs: 64(O4), SGPRs: 0(O10), LVGPR WT: 64, LSGPR WT: 0 -> Occ: 4 + +--- +name: avgpr_rp_occ4 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } + workGroupIDX: { reg: '$sgpr6' } + privateSegmentWaveByteOffset: { reg: '$sgpr7' } + workItemIDX: { reg: '$vgpr0' } + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + bb.0: + liveins: $vgpr0, $sgpr4_sgpr5 + %1:vreg_1024 = IMPLICIT_DEF + %2:vreg_1024 = IMPLICIT_DEF + %3:av_1024 = IMPLICIT_DEF + %4:av_1024 = IMPLICIT_DEF + SCHED_BARRIER 0 + KILL %1, %2 + + bb.1: + KILL %3, %4 + S_ENDPGM 0 +... + +# CHECK: avgpr_rp_occ5:%bb.0 +# CHECK: Pressure before scheduling: +# CHECK-NEXT: Region live-ins: +# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8 +# CHECK-NEXT: Region register pressure: VGPRs: 48 AGPRs: 80(O4), SGPRs: 0(O10), LVGPR WT: 64, LSGPR WT: 0 -> Occ: 4 + +--- +name: avgpr_rp_occ5 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } + workGroupIDX: { reg: '$sgpr6' } + privateSegmentWaveByteOffset: { reg: '$sgpr7' } + workItemIDX: { reg: '$vgpr0' } + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + bb.0: + liveins: $vgpr0, $sgpr4_sgpr5 + %1:vreg_1024 = IMPLICIT_DEF + %2:vreg_1024 = IMPLICIT_DEF + %3:av_1024 = IMPLICIT_DEF + %4:av_1024 = IMPLICIT_DEF + SCHED_BARRIER 0 + KILL %1, %2 + + bb.1: + KILL %3, %4 + S_ENDPGM 0 +... + +# CHECK: avgpr_rp_occ6:%bb.0 +# CHECK: Pressure before scheduling: +# CHECK-NEXT: Region live-ins: +# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8 +# CHECK-NEXT: Region register pressure: VGPRs: 40 AGPRs: 88(O4), SGPRs: 0(O10), LVGPR WT: 64, LSGPR WT: 0 -> Occ: 4 + +--- +name: avgpr_rp_occ6 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } + workGroupIDX: { reg: '$sgpr6' } + privateSegmentWaveByteOffset: { reg: '$sgpr7' } + workItemIDX: { reg: '$vgpr0' } + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + bb.0: + liveins: $vgpr0, $sgpr4_sgpr5 + %1:vreg_1024 = IMPLICIT_DEF + %2:vreg_1024 = IMPLICIT_DEF + %3:av_1024 = IMPLICIT_DEF + %4:av_1024 = IMPLICIT_DEF + SCHED_BARRIER 0 + KILL %1, %2 + + bb.1: + KILL %3, %4 + S_ENDPGM 0 +... + +# CHECK: avgpr_rp_occ7:%bb.0 +# CHECK: Pressure before scheduling: +# CHECK-NEXT: Region live-ins: +# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8 +# CHECK-NEXT: Region register pressure: VGPRs: 36 AGPRs: 92(O4), SGPRs: 0(O10), LVGPR WT: 64, LSGPR WT: 0 -> Occ: 4 + +--- +name: avgpr_rp_occ7 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } + workGroupIDX: { reg: '$sgpr6' } + privateSegmentWaveByteOffset: { reg: '$sgpr7' } + workItemIDX: { reg: '$vgpr0' } + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + bb.0: + liveins: $vgpr0, $sgpr4_sgpr5 + %1:vreg_1024 = IMPLICIT_DEF + %2:vreg_1024 = IMPLICIT_DEF + %3:av_1024 = IMPLICIT_DEF + %4:av_1024 = IMPLICIT_DEF + SCHED_BARRIER 0 + KILL %1, %2 + + bb.1: + KILL %3, %4 + S_ENDPGM 0 +... + +# CHECK: avgpr_rp_occ8:%bb.0 +# CHECK: Pressure before scheduling: +# CHECK-NEXT: Region live-ins: +# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8 +# CHECK-NEXT: Region register pressure: VGPRs: 32 AGPRs: 96(O4), SGPRs: 0(O10), LVGPR WT: 64, LSGPR WT: 0 -> Occ: 4 + +--- +name: avgpr_rp_occ8 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } + workGroupIDX: { reg: '$sgpr6' } + privateSegmentWaveByteOffset: { reg: '$sgpr7' } + workItemIDX: { reg: '$vgpr0' } + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + bb.0: + liveins: $vgpr0, $sgpr4_sgpr5 + %1:vreg_1024 = IMPLICIT_DEF + %2:vreg_1024 = IMPLICIT_DEF + %3:av_1024 = IMPLICIT_DEF + %4:av_1024 = IMPLICIT_DEF + SCHED_BARRIER 0 + KILL %1, %2 + + bb.1: + KILL %3, %4 + S_ENDPGM 0 +... + From 38e255d764360972d6c61b3dddf329c48d3c46fd Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 29 Jul 2025 09:05:44 -0700 Subject: [PATCH 08/11] Rebase for getMaxNumVectorRegs move Change-Id: I17c9239229b94c42c35b5683d77f8dfe3f70bafc --- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 7 +++---- llvm/lib/Target/AMDGPU/GCNRegPressure.h | 2 +- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 10 ++++------ llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp | 2 +- 4 files changed, 9 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 966e810115195..bd03ccf5322e5 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -100,7 +100,7 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O, unsigned MaxOccupancy) const { const GCNSubtarget &ST = MF.getSubtarget(); unsigned ArchVGPRThreshold = - ST.getRegisterInfo()->getMaxNumVectorRegs(MF).first; + ST.getMaxNumVectorRegs(MF.getFunction()).first; unsigned DynamicVGPRBlockSize = MF.getInfo()->getDynamicVGPRBlockSize(); @@ -252,7 +252,7 @@ Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST, unsigned DynamicVGPRBlockSize, const MachineFunction *MF) { unsigned ArchVGPRThreshold = - ST->getRegisterInfo()->getMaxNumVectorRegs(*MF).first; + ST->getMaxNumVectorRegs(MF->getFunction()).first; return Printable( [&RP, ST, DynamicVGPRBlockSize, ArchVGPRThreshold, MF](raw_ostream &OS) { OS << "VGPRs: " << RP.getArchVGPRNum(ArchVGPRThreshold) << ' ' @@ -906,8 +906,7 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) { return Printable([&RP, &MF](raw_ostream &OS) { OS << format(PFX " %-5d", RP.getSGPRNum()) << format(" %-5d", RP.getVGPRNum(false, MF.getSubtarget() - .getRegisterInfo() - ->getMaxNumVectorRegs(MF) + .getMaxNumVectorRegs(MF.getFunction()) .first)); }); }; diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index d61e0348dabb4..0e03834380525 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -132,7 +132,7 @@ struct GCNRegPressure { ST.getOccupancyWithNumSGPRs(getSGPRNum()), ST.getOccupancyWithNumVGPRs( getVGPRNum(ST.hasGFX90AInsts(), - ST.getRegisterInfo()->getMaxNumVectorRegs(MF).first), + ST.getMaxNumVectorRegs(MF.getFunction()).first), DynamicVGPRBlockSize)); } diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 52359135f8893..80e6c49c42fbc 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -191,8 +191,7 @@ static void getRegisterPressures( NewPressure = TempUpwardTracker.getPressure(); } unsigned ArchVGPRThreshold = DAG->MF.getSubtarget() - .getRegisterInfo() - ->getMaxNumVectorRegs(DAG->MF) + .getMaxNumVectorRegs(DAG->MF.getFunction()) .first; Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum(); Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = @@ -346,8 +345,7 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, SGPRPressure = T->getPressure().getSGPRNum(); VGPRPressure = T->getPressure().getArchVGPRNum(DAG->MF.getSubtarget() - .getRegisterInfo() - ->getMaxNumVectorRegs(DAG->MF) + .getMaxNumVectorRegs(DAG->MF.getFunction()) .first); } } @@ -1290,7 +1288,7 @@ void GCNSchedStage::checkScheduling() { LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n"); unsigned ArchVGPRThreshold = - ST.getRegisterInfo()->getMaxNumVectorRegs(MF).first; + ST.getMaxNumVectorRegs(MF.getFunction()).first; if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && PressureAfter.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) <= S.VGPRCriticalLimit) { @@ -1483,7 +1481,7 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) { // For dynamic VGPR mode, we don't want to waste any VGPR blocks. if (DAG.MFI.isDynamicVGPREnabled()) { unsigned ArchVGPRThreshold = - ST.getRegisterInfo()->getMaxNumVectorRegs(MF).first; + ST.getMaxNumVectorRegs(MF.getFunction()).first; unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( &ST, DAG.MFI.getDynamicVGPRBlockSize(), PressureBefore.getVGPRNum(false, ArchVGPRThreshold)); diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp index cdc80ca9267d6..e29ac72c7ba31 100644 --- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp +++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp @@ -211,7 +211,7 @@ bool SIFormMemoryClausesImpl::checkPressure(const MachineInstr &MI, if (Occupancy >= MFI->getMinAllowedOccupancy() && MaxPressure.getVGPRNum( ST->hasGFX90AInsts(), - ST->getRegisterInfo()->getMaxNumVectorRegs(*MI.getMF()).first) <= + ST->getMaxNumVectorRegs(MI.getMF()->getFunction()).first) <= MaxVGPRs / 2 && MaxPressure.getSGPRNum() <= MaxSGPRs / 2) { LastRecordedOccupancy = Occupancy; From dcecf426736fad35ae746e9d5e8b29602fdf797a Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 29 Jul 2025 09:15:16 -0700 Subject: [PATCH 09/11] Formatting Change-Id: I992cdc7ab89d244eaed82d4e671238878376c8d2 --- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 9 ++++----- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 14 ++++++-------- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index bd03ccf5322e5..4564163b137be 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -99,8 +99,7 @@ void GCNRegPressure::inc(unsigned Reg, bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O, unsigned MaxOccupancy) const { const GCNSubtarget &ST = MF.getSubtarget(); - unsigned ArchVGPRThreshold = - ST.getMaxNumVectorRegs(MF.getFunction()).first; + unsigned ArchVGPRThreshold = ST.getMaxNumVectorRegs(MF.getFunction()).first; unsigned DynamicVGPRBlockSize = MF.getInfo()->getDynamicVGPRBlockSize(); @@ -251,8 +250,7 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O, Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST, unsigned DynamicVGPRBlockSize, const MachineFunction *MF) { - unsigned ArchVGPRThreshold = - ST->getMaxNumVectorRegs(MF->getFunction()).first; + unsigned ArchVGPRThreshold = ST->getMaxNumVectorRegs(MF->getFunction()).first; return Printable( [&RP, ST, DynamicVGPRBlockSize, ArchVGPRThreshold, MF](raw_ostream &OS) { OS << "VGPRs: " << RP.getArchVGPRNum(ArchVGPRThreshold) << ' ' @@ -906,7 +904,8 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) { return Printable([&RP, &MF](raw_ostream &OS) { OS << format(PFX " %-5d", RP.getSGPRNum()) << format(" %-5d", RP.getVGPRNum(false, MF.getSubtarget() - .getMaxNumVectorRegs(MF.getFunction()) + .getMaxNumVectorRegs( + MF.getFunction()) .first)); }); }; diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 80e6c49c42fbc..3cf9a7c0f972e 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -343,10 +343,10 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, ? static_cast(&UpwardTracker) : static_cast(&DownwardTracker); SGPRPressure = T->getPressure().getSGPRNum(); - VGPRPressure = - T->getPressure().getArchVGPRNum(DAG->MF.getSubtarget() - .getMaxNumVectorRegs(DAG->MF.getFunction()) - .first); + VGPRPressure = T->getPressure().getArchVGPRNum( + DAG->MF.getSubtarget() + .getMaxNumVectorRegs(DAG->MF.getFunction()) + .first); } } ReadyQueue &Q = Zone.Available; @@ -1287,8 +1287,7 @@ void GCNSchedStage::checkScheduling() { << print(PressureAfter, &ST, 0, &MF)); LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n"); - unsigned ArchVGPRThreshold = - ST.getMaxNumVectorRegs(MF.getFunction()).first; + unsigned ArchVGPRThreshold = ST.getMaxNumVectorRegs(MF.getFunction()).first; if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && PressureAfter.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) <= S.VGPRCriticalLimit) { @@ -1480,8 +1479,7 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) { // For dynamic VGPR mode, we don't want to waste any VGPR blocks. if (DAG.MFI.isDynamicVGPREnabled()) { - unsigned ArchVGPRThreshold = - ST.getMaxNumVectorRegs(MF.getFunction()).first; + unsigned ArchVGPRThreshold = ST.getMaxNumVectorRegs(MF.getFunction()).first; unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( &ST, DAG.MFI.getDynamicVGPRBlockSize(), PressureBefore.getVGPRNum(false, ArchVGPRThreshold)); From d59fba1f01a00df11a46d0ae236901dfb30ea899 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 29 Jul 2025 11:48:29 -0700 Subject: [PATCH 10/11] Fix test + handling of ArchVGPR pressure Change-Id: I15cd9b4e9e38d7000a403bed56918819ae858658 --- llvm/lib/Target/AMDGPU/GCNRegPressure.h | 14 +- llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir | 161 ++++++++++++++++++-- 2 files changed, 155 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 0e03834380525..8b80cc42c9bb0 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -97,15 +97,17 @@ struct GCNRegPressure { /// \returns the ArchVGPR32 pressure, plus the AVGPRS which we assume will be /// allocated as VGPR unsigned getArchVGPRNum(unsigned AddressableArchVGPR) const { - return std::min(Value[VGPR] + Value[AVGPR], AddressableArchVGPR); + unsigned AVGPRsAsVGPRs = + getAVGPRsAsVGPRsNum(Value[VGPR], Value[AVGPR], AddressableArchVGPR); + + return Value[VGPR] + AVGPRsAsVGPRs; } /// \returns the AccVGPR32 pressure unsigned getAGPRNum(unsigned AddressableArchVGPR) const { - unsigned VGPRsForAGPRs = - Value[VGPR] + Value[AVGPR] > AddressableArchVGPR - ? (Value[VGPR] + Value[AVGPR] - AddressableArchVGPR) - : 0; - return Value[AGPR] + VGPRsForAGPRs; + unsigned AVGPRsAsAGPRs = getAVGPRsAsAGPRsNum( + Value[VGPR], Value[AGPR], Value[AVGPR], AddressableArchVGPR); + + return Value[AGPR] + AVGPRsAsAGPRs; } /// \returns the AVGPR32 pressure unsigned getAVGPRNum() const { return Value[AVGPR]; } diff --git a/llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir b/llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir index 358942e73a7c6..a5183ce0d2661 100644 --- a/llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir +++ b/llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir @@ -42,6 +42,22 @@ unreachable } + + define void @vgpr_rp_occ1() #0 { + entry: + unreachable + } + + define void @vgpr_rp_occ2() #1 { + entry: + unreachable + } + + define void @vgpr_rp_occ3() #2 { + entry: + unreachable + } + attributes #0 = {"amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="64,64"} attributes #1 = {"amdgpu-waves-per-eu"="2,2" "amdgpu-flat-work-group-size"="64,64"} attributes #2 = {"amdgpu-waves-per-eu"="3,3" "amdgpu-flat-work-group-size"="64,64"} @@ -194,8 +210,8 @@ machineFunctionInfo: body: | bb.0: liveins: $vgpr0, $sgpr4_sgpr5 - %1:vreg_1024 = IMPLICIT_DEF - %2:vreg_1024 = IMPLICIT_DEF + %1:av_1024 = IMPLICIT_DEF + %2:av_1024 = IMPLICIT_DEF %3:av_1024 = IMPLICIT_DEF %4:av_1024 = IMPLICIT_DEF SCHED_BARRIER 0 @@ -210,7 +226,7 @@ body: | # CHECK: Pressure before scheduling: # CHECK-NEXT: Region live-ins: # CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8 -# CHECK-NEXT: Region register pressure: VGPRs: 48 AGPRs: 80(O4), SGPRs: 0(O10), LVGPR WT: 64, LSGPR WT: 0 -> Occ: 4 +# CHECK-NEXT: Region register pressure: VGPRs: 48 AGPRs: 80(O4), SGPRs: 0(O10), LVGPR WT: 80, LSGPR WT: 0 -> Occ: 4 --- name: avgpr_rp_occ5 @@ -229,8 +245,8 @@ machineFunctionInfo: body: | bb.0: liveins: $vgpr0, $sgpr4_sgpr5 - %1:vreg_1024 = IMPLICIT_DEF - %2:vreg_1024 = IMPLICIT_DEF + %1:av_1024 = IMPLICIT_DEF + %2:av_1024 = IMPLICIT_DEF %3:av_1024 = IMPLICIT_DEF %4:av_1024 = IMPLICIT_DEF SCHED_BARRIER 0 @@ -245,7 +261,7 @@ body: | # CHECK: Pressure before scheduling: # CHECK-NEXT: Region live-ins: # CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8 -# CHECK-NEXT: Region register pressure: VGPRs: 40 AGPRs: 88(O4), SGPRs: 0(O10), LVGPR WT: 64, LSGPR WT: 0 -> Occ: 4 +# CHECK-NEXT: Region register pressure: VGPRs: 40 AGPRs: 88(O4), SGPRs: 0(O10), LVGPR WT: 88, LSGPR WT: 0 -> Occ: 4 --- name: avgpr_rp_occ6 @@ -264,8 +280,8 @@ machineFunctionInfo: body: | bb.0: liveins: $vgpr0, $sgpr4_sgpr5 - %1:vreg_1024 = IMPLICIT_DEF - %2:vreg_1024 = IMPLICIT_DEF + %1:av_1024 = IMPLICIT_DEF + %2:av_1024 = IMPLICIT_DEF %3:av_1024 = IMPLICIT_DEF %4:av_1024 = IMPLICIT_DEF SCHED_BARRIER 0 @@ -280,7 +296,7 @@ body: | # CHECK: Pressure before scheduling: # CHECK-NEXT: Region live-ins: # CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8 -# CHECK-NEXT: Region register pressure: VGPRs: 36 AGPRs: 92(O4), SGPRs: 0(O10), LVGPR WT: 64, LSGPR WT: 0 -> Occ: 4 +# CHECK-NEXT: Region register pressure: VGPRs: 36 AGPRs: 92(O4), SGPRs: 0(O10), LVGPR WT: 92, LSGPR WT: 0 -> Occ: 4 --- name: avgpr_rp_occ7 @@ -299,8 +315,8 @@ machineFunctionInfo: body: | bb.0: liveins: $vgpr0, $sgpr4_sgpr5 - %1:vreg_1024 = IMPLICIT_DEF - %2:vreg_1024 = IMPLICIT_DEF + %1:av_1024 = IMPLICIT_DEF + %2:av_1024 = IMPLICIT_DEF %3:av_1024 = IMPLICIT_DEF %4:av_1024 = IMPLICIT_DEF SCHED_BARRIER 0 @@ -315,7 +331,7 @@ body: | # CHECK: Pressure before scheduling: # CHECK-NEXT: Region live-ins: # CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8 -# CHECK-NEXT: Region register pressure: VGPRs: 32 AGPRs: 96(O4), SGPRs: 0(O10), LVGPR WT: 64, LSGPR WT: 0 -> Occ: 4 +# CHECK-NEXT: Region register pressure: VGPRs: 32 AGPRs: 96(O4), SGPRs: 0(O10), LVGPR WT: 96, LSGPR WT: 0 -> Occ: 4 --- name: avgpr_rp_occ8 @@ -334,8 +350,8 @@ machineFunctionInfo: body: | bb.0: liveins: $vgpr0, $sgpr4_sgpr5 - %1:vreg_1024 = IMPLICIT_DEF - %2:vreg_1024 = IMPLICIT_DEF + %1:av_1024 = IMPLICIT_DEF + %2:av_1024 = IMPLICIT_DEF %3:av_1024 = IMPLICIT_DEF %4:av_1024 = IMPLICIT_DEF SCHED_BARRIER 0 @@ -346,3 +362,120 @@ body: | S_ENDPGM 0 ... +# CHECK: vgpr_rp_occ1:%bb.0 +# CHECK: Pressure before scheduling: +# CHECK-NEXT: Region live-ins: +# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8 +# CHECK-NEXT: Region register pressure: VGPRs: 448 AGPRs: 0(O1), SGPRs: 0(O10), LVGPR WT: 448, LSGPR WT: 0 -> Occ: 1 + +--- +name: vgpr_rp_occ1 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } + workGroupIDX: { reg: '$sgpr6' } + privateSegmentWaveByteOffset: { reg: '$sgpr7' } + workItemIDX: { reg: '$vgpr0' } + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + bb.0: + liveins: $vgpr0, $sgpr4_sgpr5 + %1:vreg_1024 = IMPLICIT_DEF + %2:vreg_1024 = IMPLICIT_DEF + %3:vreg_1024 = IMPLICIT_DEF + %4:vreg_1024 = IMPLICIT_DEF + %5:vreg_1024 = IMPLICIT_DEF + %6:vreg_1024 = IMPLICIT_DEF + %7:vreg_1024 = IMPLICIT_DEF + %8:vreg_1024 = IMPLICIT_DEF + %9:vreg_1024 = IMPLICIT_DEF + %10:vreg_1024 = IMPLICIT_DEF + %11:vreg_1024 = IMPLICIT_DEF + %12:vreg_1024 = IMPLICIT_DEF + %13:vreg_1024 = IMPLICIT_DEF + %14:vreg_1024 = IMPLICIT_DEF + SCHED_BARRIER 0 + KILL %1, %2, %3, %4, %5, %6, %7 + + bb.1: + KILL %8, %9, %10, %11, %12, %13, %14 + S_ENDPGM 0 +... + +# CHECK: vgpr_rp_occ2:%bb.0 +# CHECK: Pressure before scheduling: +# CHECK-NEXT: Region live-ins: +# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8 +# CHECK-NEXT: Region register pressure: VGPRs: 192 AGPRs: 0(O2), SGPRs: 0(O10), LVGPR WT: 192, LSGPR WT: 0 -> Occ: 2 + +--- +name: vgpr_rp_occ2 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } + workGroupIDX: { reg: '$sgpr6' } + privateSegmentWaveByteOffset: { reg: '$sgpr7' } + workItemIDX: { reg: '$vgpr0' } + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + bb.0: + liveins: $vgpr0, $sgpr4_sgpr5 + %1:vreg_1024 = IMPLICIT_DEF + %2:vreg_1024 = IMPLICIT_DEF + %3:vreg_1024 = IMPLICIT_DEF + %4:vreg_1024 = IMPLICIT_DEF + %5:vreg_1024 = IMPLICIT_DEF + %6:vreg_1024 = IMPLICIT_DEF + SCHED_BARRIER 0 + KILL %1, %2, %3 + + bb.1: + KILL %4, %5, %6 + S_ENDPGM 0 +... + +# CHECK: vgpr_rp_occ3:%bb.0 +# CHECK: Pressure before scheduling: +# CHECK-NEXT: Region live-ins: +# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8 +# CHECK-NEXT: Region register pressure: VGPRs: 128 AGPRs: 0(O4), SGPRs: 0(O10), LVGPR WT: 128, LSGPR WT: 0 -> Occ: 4 + + +--- +name: vgpr_rp_occ3 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } + workGroupIDX: { reg: '$sgpr6' } + privateSegmentWaveByteOffset: { reg: '$sgpr7' } + workItemIDX: { reg: '$vgpr0' } + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + bb.0: + liveins: $vgpr0, $sgpr4_sgpr5 + %1:vreg_1024 = IMPLICIT_DEF + %2:vreg_1024 = IMPLICIT_DEF + %3:vreg_1024 = IMPLICIT_DEF + %4:vreg_1024 = IMPLICIT_DEF + SCHED_BARRIER 0 + KILL %1, %2 + + bb.1: + KILL %3, %4 + S_ENDPGM 0 +... From 5f9d402fa866aa1667702df692410c35dadc70af Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 29 Jul 2025 15:37:45 -0700 Subject: [PATCH 11/11] Fix print with unspecified ST/MF Change-Id: I172e8c013f41daea997266dea9c20335c75c9b83 --- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 4564163b137be..dd007e6cd6b31 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -250,7 +250,10 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O, Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST, unsigned DynamicVGPRBlockSize, const MachineFunction *MF) { - unsigned ArchVGPRThreshold = ST->getMaxNumVectorRegs(MF->getFunction()).first; + unsigned ArchVGPRThreshold = std::numeric_limits::max(); + if (ST && MF) + ArchVGPRThreshold = ST->getMaxNumVectorRegs(MF->getFunction()).first; + return Printable( [&RP, ST, DynamicVGPRBlockSize, ArchVGPRThreshold, MF](raw_ostream &OS) { OS << "VGPRs: " << RP.getArchVGPRNum(ArchVGPRThreshold) << ' '