From 52cbccd2cd6185be0499998ec969952865218169 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes@amd.com>
Date: Wed, 23 Jul 2025 15:41:11 -0700
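Subject: [PATCH 01/11] [AMDGPU] More accurately account for AVGPR pressure

AVGPR pressure was previously always counted as ArchVGPR pressure. Count
AVGPRs as ArchVGPRs only up to the subtarget's addressable ArchVGPR limit
and count the remainder as AGPRs. The GCNRegPressure VGPR/AGPR queries now
take that limit as a parameter, and all callers are updated to pass it.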

Change-Id: I6f129c2723b52a391a96178e390f60535164ac9b
---
 llvm/lib/Target/AMDGPU/GCNRegPressure.cpp     | 114 +++++++++++-------
 llvm/lib/Target/AMDGPU/GCNRegPressure.h       |  94 ++++++++++-----
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   |  27 +++--
 .../lib/Target/AMDGPU/SIFormMemoryClauses.cpp |   4 +-
 4 files changed, 152 insertions(+), 87 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 334afd3a2a5b4..286c8d9529731 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -99,20 +99,22 @@ void GCNRegPressure::inc(unsigned Reg,
 bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
                           unsigned MaxOccupancy) const {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs();
   unsigned DynamicVGPRBlockSize =
       MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
 
   const auto SGPROcc = std::min(MaxOccupancy,
                                 ST.getOccupancyWithNumSGPRs(getSGPRNum()));
   const auto VGPROcc = std::min(
-      MaxOccupancy, ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()),
-                                                DynamicVGPRBlockSize));
+      MaxOccupancy,
+      ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs),
+                                  DynamicVGPRBlockSize));
   const auto OtherSGPROcc = std::min(MaxOccupancy,
                                 ST.getOccupancyWithNumSGPRs(O.getSGPRNum()));
-  const auto OtherVGPROcc =
-      std::min(MaxOccupancy,
-               ST.getOccupancyWithNumVGPRs(O.getVGPRNum(ST.hasGFX90AInsts()),
-                                           DynamicVGPRBlockSize));
+  const auto OtherVGPROcc = std::min(
+      MaxOccupancy, ST.getOccupancyWithNumVGPRs(
+                        O.getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs),
+                        DynamicVGPRBlockSize));
 
   const auto Occ = std::min(SGPROcc, VGPROcc);
   const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc);
@@ -135,35 +137,36 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
   unsigned OtherVGPRForSGPRSpills =
       (OtherExcessSGPR + (WaveSize - 1)) / WaveSize;
 
-  unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs();
-
   // Unified excess pressure conditions, accounting for VGPRs used for SGPR
   // spills
   unsigned ExcessVGPR =
-      std::max(static_cast<int>(getVGPRNum(ST.hasGFX90AInsts()) +
+      std::max(static_cast<int>(getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs) +
                                 VGPRForSGPRSpills - MaxVGPRs),
                0);
-  unsigned OtherExcessVGPR =
-      std::max(static_cast<int>(O.getVGPRNum(ST.hasGFX90AInsts()) +
-                                OtherVGPRForSGPRSpills - MaxVGPRs),
-               0);
+  unsigned OtherExcessVGPR = std::max(
+      static_cast<int>(O.getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs) +
+                       OtherVGPRForSGPRSpills - MaxVGPRs),
+      0);
   // Arch VGPR excess pressure conditions, accounting for VGPRs used for SGPR
   // spills
-  unsigned ExcessArchVGPR = std::max(
-      static_cast<int>(getVGPRNum(false) + VGPRForSGPRSpills - MaxArchVGPRs),
-      0);
+  unsigned ExcessArchVGPR =
+      std::max(static_cast<int>(getVGPRNum(false, MaxArchVGPRs) +
+                                VGPRForSGPRSpills - MaxArchVGPRs),
+               0);
   unsigned OtherExcessArchVGPR =
-      std::max(static_cast<int>(O.getVGPRNum(false) + OtherVGPRForSGPRSpills -
-                                MaxArchVGPRs),
+      std::max(static_cast<int>(O.getVGPRNum(false, MaxArchVGPRs) +
+                                OtherVGPRForSGPRSpills - MaxArchVGPRs),
                0);
   // AGPR excess pressure conditions
-  unsigned ExcessAGPR = std::max(
-      static_cast<int>(ST.hasGFX90AInsts() ? (getAGPRNum() - MaxArchVGPRs)
-                                           : (getAGPRNum() - MaxVGPRs)),
-      0);
+  unsigned ExcessAGPR =
+      std::max(static_cast<int>(ST.hasGFX90AInsts()
+                                    ? (getAGPRNum(MaxArchVGPRs) - MaxArchVGPRs)
+                                    : (getAGPRNum(MaxArchVGPRs) - MaxVGPRs)),
+               0);
   unsigned OtherExcessAGPR = std::max(
-      static_cast<int>(ST.hasGFX90AInsts() ? (O.getAGPRNum() - MaxArchVGPRs)
-                                           : (O.getAGPRNum() - MaxVGPRs)),
+      static_cast<int>(ST.hasGFX90AInsts()
+                           ? (O.getAGPRNum(MaxArchVGPRs) - MaxArchVGPRs)
+                           : (O.getAGPRNum(MaxArchVGPRs) - MaxVGPRs)),
       0);
 
   bool ExcessRP = ExcessSGPR || ExcessVGPR || ExcessArchVGPR || ExcessAGPR;
@@ -184,14 +187,21 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
       return VGPRDiff > 0;
     if (SGPRDiff != 0) {
       unsigned PureExcessVGPR =
-          std::max(static_cast<int>(getVGPRNum(ST.hasGFX90AInsts()) - MaxVGPRs),
-                   0) +
-          std::max(static_cast<int>(getVGPRNum(false) - MaxArchVGPRs), 0);
+          std::max(
+              static_cast<int>(getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs) -
+                               MaxVGPRs),
+              0) +
+          std::max(
+              static_cast<int>(getVGPRNum(false, MaxArchVGPRs) - MaxArchVGPRs),
+              0);
       unsigned OtherPureExcessVGPR =
           std::max(
-              static_cast<int>(O.getVGPRNum(ST.hasGFX90AInsts()) - MaxVGPRs),
+              static_cast<int>(O.getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs) -
+                               MaxVGPRs),
               0) +
-          std::max(static_cast<int>(O.getVGPRNum(false) - MaxArchVGPRs), 0);
+          std::max(static_cast<int>(O.getVGPRNum(false, MaxArchVGPRs) -
+                                    MaxArchVGPRs),
+                   0);
 
       // If we have a special case where there is a tie in excess VGPR, but one
       // of the pressures has VGPR usage from SGPR spills, prefer the pressure
@@ -221,33 +231,36 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
       if (SW != OtherSW)
         return SW < OtherSW;
     } else {
-      auto VW = getVGPRTuplesWeight();
-      auto OtherVW = O.getVGPRTuplesWeight();
+      auto VW = getVGPRTuplesWeight(MaxArchVGPRs);
+      auto OtherVW = O.getVGPRTuplesWeight(MaxArchVGPRs);
       if (VW != OtherVW)
         return VW < OtherVW;
     }
   }
 
   // Give final precedence to lower general RP.
-  return SGPRImportant ? (getSGPRNum() < O.getSGPRNum()):
-                         (getVGPRNum(ST.hasGFX90AInsts()) <
-                          O.getVGPRNum(ST.hasGFX90AInsts()));
+  return SGPRImportant ? (getSGPRNum() < O.getSGPRNum())
+                       : (getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs) <
+                          O.getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs));
 }
 
 Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST,
                       unsigned DynamicVGPRBlockSize) {
   return Printable([&RP, ST, DynamicVGPRBlockSize](raw_ostream &OS) {
-    OS << "VGPRs: " << RP.getArchVGPRNum() << ' '
-       << "AGPRs: " << RP.getAGPRNum();
+    OS << "VGPRs: " << RP.getArchVGPRNum(ST->getAddressableNumArchVGPRs())
+       << ' ' << "AGPRs: " << RP.getAGPRNum(ST->getAddressableNumArchVGPRs());
     if (ST)
       OS << "(O"
-         << ST->getOccupancyWithNumVGPRs(RP.getVGPRNum(ST->hasGFX90AInsts()),
-                                         DynamicVGPRBlockSize)
+         << ST->getOccupancyWithNumVGPRs(
+                RP.getVGPRNum(ST->hasGFX90AInsts(),
+                              ST->getAddressableNumArchVGPRs()),
+                DynamicVGPRBlockSize)
          << ')';
     OS << ", SGPRs: " << RP.getSGPRNum();
     if (ST)
       OS << "(O" << ST->getOccupancyWithNumSGPRs(RP.getSGPRNum()) << ')';
-    OS << ", LVGPR WT: " << RP.getVGPRTuplesWeight()
+    OS << ", LVGPR WT: "
+       << RP.getVGPRTuplesWeight(ST->getAddressableNumArchVGPRs())
        << ", LSGPR WT: " << RP.getSGPRTuplesWeight();
     if (ST)
       OS << " -> Occ: " << RP.getOccupancy(*ST, DynamicVGPRBlockSize);
@@ -398,8 +411,9 @@ void GCNRPTarget::setRegLimits(unsigned NumSGPRs, unsigned NumVGPRs,
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   unsigned DynamicVGPRBlockSize =
       MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
+  AddressableNumArchVGPRs = ST.getAddressableNumArchVGPRs();
   MaxSGPRs = std::min(ST.getAddressableNumSGPRs(), NumSGPRs);
-  MaxVGPRs = std::min(ST.getAddressableNumArchVGPRs(), NumVGPRs);
+  MaxVGPRs = std::min(AddressableNumArchVGPRs, NumVGPRs);
   MaxUnifiedVGPRs =
       ST.hasGFX90AInsts()
           ? std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs)
@@ -414,15 +428,21 @@ bool GCNRPTarget::isSaveBeneficial(Register Reg,
 
   if (SRI->isSGPRClass(RC))
     return RP.getSGPRNum() > MaxSGPRs;
-  unsigned NumVGPRs =
-      SRI->isAGPRClass(RC) ? RP.getAGPRNum() : RP.getArchVGPRNum();
+
+  bool ShouldUseAGPR =
+      SRI->isAGPRClass(RC) ||
+      (SRI->isVectorSuperClass(RC) &&
+       RP.getArchVGPRNum(AddressableNumArchVGPRs) >= AddressableNumArchVGPRs);
+  unsigned NumVGPRs = ShouldUseAGPR
+                          ? RP.getAGPRNum(AddressableNumArchVGPRs)
+                          : RP.getArchVGPRNum(AddressableNumArchVGPRs);
   return isVGPRBankSaveBeneficial(NumVGPRs);
 }
 
 bool GCNRPTarget::satisfied() const {
   if (RP.getSGPRNum() > MaxSGPRs)
     return false;
-  if (RP.getVGPRNum(false) > MaxVGPRs &&
+  if (RP.getVGPRNum(false, AddressableNumArchVGPRs) > MaxVGPRs &&
       (!CombineVGPRSavings || !satisifiesVGPRBanksTarget()))
     return false;
   return satisfiesUnifiedTarget();
@@ -876,10 +896,12 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) {
 
   OS << "---\nname: " << MF.getName() << "\nbody:             |\n";
 
-  auto printRP = [](const GCNRegPressure &RP) {
-    return Printable([&RP](raw_ostream &OS) {
+  auto printRP = [&MF](const GCNRegPressure &RP) {
+    return Printable([&RP, &MF](raw_ostream &OS) {
       OS << format(PFX "  %-5d", RP.getSGPRNum())
-         << format(" %-5d", RP.getVGPRNum(false));
+         << format(" %-5d",
+                   RP.getVGPRNum(false, MF.getSubtarget<GCNSubtarget>()
+                                            .getAddressableNumArchVGPRs()));
     });
   };
 
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index ea33a229110c1..a8c1c3bfd8703 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -43,13 +43,13 @@ struct GCNRegPressure {
 
   /// \returns the SGPR32 pressure
   unsigned getSGPRNum() const { return Value[SGPR]; }
-  /// \returns the aggregated ArchVGPR32, AccVGPR32, and Pseudo AVGPR pressure
-  /// dependent upon \p UnifiedVGPRFile
-  unsigned getVGPRNum(bool UnifiedVGPRFile) const {
+  unsigned getVGPRNum(bool UnifiedVGPRFile,
+                      unsigned AddressableArchVGPR) const {
     if (UnifiedVGPRFile) {
-      return Value[AGPR]
-                 ? getUnifiedVGPRNum(Value[VGPR], Value[AGPR], Value[AVGPR])
-                 : Value[VGPR] + Value[AVGPR];
+      return Value[AGPR] || Value[AVGPR]
+                 ? getUnifiedVGPRNum(Value[VGPR], Value[AGPR], Value[AVGPR],
+                                     AddressableArchVGPR)
+                 : Value[VGPR];
     }
     // AVGPR assignment priority is based on the width of the register. Account
     // AVGPR pressure as VGPR.
@@ -61,33 +61,60 @@ struct GCNRegPressure {
   /// VGPR file.
   inline static unsigned getUnifiedVGPRNum(unsigned NumArchVGPRs,
                                            unsigned NumAGPRs,
-                                           unsigned NumAVGPRs) {
-
-    // Assume AVGPRs will be assigned as VGPRs.
-    return alignTo(NumArchVGPRs + NumAVGPRs,
+                                           unsigned NumAVGPRs,
+                                           unsigned AddressableArchVGPR) {
+
+    // Until we hit the VGPRThreshold, we will assign AV as VGPR. After that
+    // point, we will assign as AGPR.
+    unsigned AVGPRsAsVGPRs =
+        NumArchVGPRs < AddressableArchVGPR
+            ? std::min((AddressableArchVGPR - NumArchVGPRs), NumAVGPRs)
+            : 0;
+    unsigned AVGPRsAsAGPRs =
+        NumAVGPRs > AVGPRsAsVGPRs ? NumAVGPRs - AVGPRsAsVGPRs : 0;
+    return alignTo(NumArchVGPRs + AVGPRsAsVGPRs,
                    AMDGPU::IsaInfo::getArchVGPRAllocGranule()) +
-           NumAGPRs;
+           NumAGPRs + AVGPRsAsAGPRs;
   }
 
   /// \returns the ArchVGPR32 pressure, plus the AVGPRS which we assume will be
   /// allocated as VGPR
-  unsigned getArchVGPRNum() const { return Value[VGPR] + Value[AVGPR]; }
+  unsigned getArchVGPRNum(unsigned AddressableArchVGPR) const {
+    return std::min(Value[VGPR] + Value[AVGPR], AddressableArchVGPR);
+  }
   /// \returns the AccVGPR32 pressure
-  unsigned getAGPRNum() const { return Value[AGPR]; }
+  unsigned getAGPRNum(unsigned AddressableArchVGPR) const {
+    unsigned VGPRsForAGPRs =
+        Value[VGPR] + Value[AVGPR] > AddressableArchVGPR
+            ? (Value[VGPR] + Value[AVGPR] - AddressableArchVGPR)
+            : 0;
+    return Value[AGPR] + VGPRsForAGPRs;
+  }
   /// \returns the AVGPR32 pressure
   unsigned getAVGPRNum() const { return Value[AVGPR]; }
 
-  unsigned getVGPRTuplesWeight() const {
-    return std::max(Value[TOTAL_KINDS + VGPR] + Value[TOTAL_KINDS + AVGPR],
-                    Value[TOTAL_KINDS + AGPR]);
+  unsigned getVGPRTuplesWeight(unsigned AddressableArchVGPR) const {
+    unsigned AVGPRsAsVGPRs =
+        Value[TOTAL_KINDS + VGPR] < AddressableArchVGPR
+            ? std::min(AddressableArchVGPR - Value[TOTAL_KINDS + VGPR],
+                       Value[TOTAL_KINDS + AVGPR])
+            : 0;
+    unsigned AVGPRsAsAGPRs = Value[TOTAL_KINDS + AVGPR] > AVGPRsAsVGPRs
+                                 ? Value[TOTAL_KINDS + AVGPR] - AVGPRsAsVGPRs
+                                 : 0;
+
+    return std::max(Value[TOTAL_KINDS + VGPR] + AVGPRsAsVGPRs,
+                    Value[TOTAL_KINDS + AGPR] + AVGPRsAsAGPRs);
   }
   unsigned getSGPRTuplesWeight() const { return Value[TOTAL_KINDS + SGPR]; }
 
   unsigned getOccupancy(const GCNSubtarget &ST,
                         unsigned DynamicVGPRBlockSize) const {
-    return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()),
-                    ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()),
-                                                DynamicVGPRBlockSize));
+    return std::min(
+        ST.getOccupancyWithNumSGPRs(getSGPRNum()),
+        ST.getOccupancyWithNumVGPRs(
+            getVGPRNum(ST.hasGFX90AInsts(), ST.getAddressableNumArchVGPRs()),
+            DynamicVGPRBlockSize));
   }
 
   void inc(unsigned Reg,
@@ -151,7 +178,7 @@ struct GCNRegPressure {
   friend GCNRegPressure max(const GCNRegPressure &P1,
                             const GCNRegPressure &P2);
 
-  friend Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST,
+  friend Printable print(const GCNRegPressure &RP,
                          unsigned DynamicVGPRBlockSize);
 };
 
@@ -220,16 +247,19 @@ class GCNRPTarget {
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   friend raw_ostream &operator<<(raw_ostream &OS, const GCNRPTarget &Target) {
     OS << "Actual/Target: " << Target.RP.getSGPRNum() << '/' << Target.MaxSGPRs
-       << " SGPRs, " << Target.RP.getArchVGPRNum() << '/' << Target.MaxVGPRs
-       << " ArchVGPRs, " << Target.RP.getAGPRNum() << '/' << Target.MaxVGPRs
-       << " AGPRs";
+       << " SGPRs, " << Target.RP.getArchVGPRNum(Target.AddressableNumArchVGPRs)
+       << '/' << Target.MaxVGPRs << " ArchVGPRs, "
+       << Target.RP.getAGPRNum(Target.AddressableNumArchVGPRs) << '/'
+       << Target.MaxVGPRs << " AGPRs";
 
     if (Target.MaxUnifiedVGPRs) {
-      OS << ", " << Target.RP.getVGPRNum(true) << '/' << Target.MaxUnifiedVGPRs
-         << " VGPRs (unified)";
+      OS << ", " << Target.RP.getVGPRNum(true, Target.AddressableNumArchVGPRs)
+         << '/' << Target.MaxUnifiedVGPRs << " VGPRs (unified)";
     } else if (Target.CombineVGPRSavings) {
-      OS << ", " << Target.RP.getArchVGPRNum() + Target.RP.getAGPRNum() << '/'
-         << 2 * Target.MaxVGPRs << " VGPRs (combined target)";
+      OS << ", "
+         << Target.RP.getArchVGPRNum(Target.AddressableNumArchVGPRs) +
+                Target.RP.getAGPRNum(Target.AddressableNumArchVGPRs)
+         << '/' << 2 * Target.MaxVGPRs << " VGPRs (combined target)";
     }
     return OS;
   }
@@ -238,7 +268,6 @@ class GCNRPTarget {
 private:
   /// Current register pressure.
   GCNRegPressure RP;
-
   /// Target number of SGPRs.
   unsigned MaxSGPRs;
   /// Target number of ArchVGPRs and AGPRs.
@@ -246,6 +275,8 @@ class GCNRPTarget {
   /// Target number of overall VGPRs for subtargets with unified RFs. Always 0
   /// for subtargets with non-unified RFs.
   unsigned MaxUnifiedVGPRs;
+  /// The maximum number of arch vgprs allowed by the subtarget.
+  unsigned AddressableNumArchVGPRs;
   /// Whether we consider that the register allocator will be able to swap
   /// between ArchVGPRs and AGPRs by copying them to a super register class.
   /// Concretely, this allows savings in one of the VGPR banks to help toward
@@ -254,12 +285,15 @@ class GCNRPTarget {
 
   inline bool satisifiesVGPRBanksTarget() const {
     assert(CombineVGPRSavings && "only makes sense with combined savings");
-    return RP.getArchVGPRNum() + RP.getAGPRNum() <= 2 * MaxVGPRs;
+    return RP.getArchVGPRNum(AddressableNumArchVGPRs) +
+               RP.getAGPRNum(AddressableNumArchVGPRs) <=
+           2 * MaxVGPRs;
   }
 
   /// Always satisified when the subtarget doesn't have a unified RF.
   inline bool satisfiesUnifiedTarget() const {
-    return !MaxUnifiedVGPRs || RP.getVGPRNum(true) <= MaxUnifiedVGPRs;
+    return !MaxUnifiedVGPRs ||
+           RP.getVGPRNum(true, AddressableNumArchVGPRs) <= MaxUnifiedVGPRs;
   }
 
   inline bool isVGPRBankSaveBeneficial(unsigned NumVGPRs) const {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index ce1ce687d0038..772c979809b75 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -190,10 +190,13 @@ static void getRegisterPressures(
     TempUpwardTracker.recede(*MI);
     NewPressure = TempUpwardTracker.getPressure();
   }
+  unsigned AddressableArchVGPR =
+      DAG->MF.getSubtarget<GCNSubtarget>().getAddressableNumArchVGPRs();
   Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
   Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
-      NewPressure.getArchVGPRNum();
-  Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
+      NewPressure.getArchVGPRNum(AddressableArchVGPR);
+  Pressure[AMDGPU::RegisterPressureSets::AGPR_32] =
+      NewPressure.getAGPRNum(AddressableArchVGPR);
 }
 
 void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
@@ -339,7 +342,8 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                             ? static_cast<GCNRPTracker *>(&UpwardTracker)
                             : static_cast<GCNRPTracker *>(&DownwardTracker);
       SGPRPressure = T->getPressure().getSGPRNum();
-      VGPRPressure = T->getPressure().getArchVGPRNum();
+      VGPRPressure = T->getPressure().getArchVGPRNum(
+          DAG->MF.getSubtarget<GCNSubtarget>().getAddressableNumArchVGPRs());
     }
   }
   ReadyQueue &Q = Zone.Available;
@@ -1279,9 +1283,10 @@ void GCNSchedStage::checkScheduling() {
   LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
 
   unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
-
+  unsigned AddressableArchVGPR = ST.getAddressableNumArchVGPRs();
   if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
-      PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
+      PressureAfter.getVGPRNum(ST.hasGFX90AInsts(), AddressableArchVGPR) <=
+          S.VGPRCriticalLimit) {
     DAG.Pressure[RegionIdx] = PressureAfter;
     DAG.RegionsWithMinOcc[RegionIdx] =
         PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) ==
@@ -1331,9 +1336,10 @@ void GCNSchedStage::checkScheduling() {
   unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
   unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
 
-  if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
-      PressureAfter.getArchVGPRNum() > MaxArchVGPRs ||
-      PressureAfter.getAGPRNum() > MaxArchVGPRs ||
+  if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts(), AddressableArchVGPR) >
+          MaxVGPRs ||
+      PressureAfter.getArchVGPRNum(AddressableArchVGPR) > MaxArchVGPRs ||
+      PressureAfter.getAGPRNum(AddressableArchVGPR) > MaxArchVGPRs ||
       PressureAfter.getSGPRNum() > MaxSGPRs) {
     DAG.RegionsWithHighRP[RegionIdx] = true;
     DAG.RegionsWithExcessRP[RegionIdx] = true;
@@ -1471,12 +1477,13 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
 
   // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
   if (DAG.MFI.isDynamicVGPREnabled()) {
+    unsigned AddressableArchVGPR = ST.getAddressableNumArchVGPRs();
     unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
         &ST, DAG.MFI.getDynamicVGPRBlockSize(),
-        PressureBefore.getVGPRNum(false));
+        PressureBefore.getVGPRNum(false, AddressableArchVGPR));
     unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
         &ST, DAG.MFI.getDynamicVGPRBlockSize(),
-        PressureAfter.getVGPRNum(false));
+        PressureAfter.getVGPRNum(false, AddressableArchVGPR));
     if (BlocksAfter > BlocksBefore)
       return true;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
index 6b13b06590102..844908a5ce8d9 100644
--- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -211,7 +211,9 @@ bool SIFormMemoryClausesImpl::checkPressure(const MachineInstr &MI,
   // tracking does not account for the alignment requirements for SGPRs, or the
   // fragmentation of registers the allocator will need to satisfy.
   if (Occupancy >= MFI->getMinAllowedOccupancy() &&
-      MaxPressure.getVGPRNum(ST->hasGFX90AInsts()) <= MaxVGPRs / 2 &&
+      MaxPressure.getVGPRNum(ST->hasGFX90AInsts(),
+                             ST->getAddressableNumArchVGPRs()) <=
+          MaxVGPRs / 2 &&
       MaxPressure.getSGPRNum() <= MaxSGPRs / 2) {
     LastRecordedOccupancy = Occupancy;
     return true;

From 99a540244f4c848c9d228b10c6fa33de605c64d4 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes@amd.com>
Date: Mon, 28 Jul 2025 13:05:45 -0700
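Subject: [PATCH 02/11] Handle gfx908 case

Use ST.getArchVGPRAllocationThreshold(MF) rather than the addressable
ArchVGPR count as the point at which AVGPRs stop being counted as
ArchVGPRs, and pass the MachineFunction to GCNRegPressure::getOccupancy.
The excess-ArchVGPR limit (and, with GFX90A instructions, the excess-AGPR
limit) in GCNRegPressure::less is still the addressable ArchVGPR count.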

Change-Id: Ic16c8a4ffdf58027de164c598cfac70fc453bb00
---
 .../Target/AMDGPU/GCNIterativeScheduler.cpp   |  34 ++---
 llvm/lib/Target/AMDGPU/GCNRegPressure.cpp     | 127 +++++++++---------
 llvm/lib/Target/AMDGPU/GCNRegPressure.h       |  24 ++--
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   |  77 ++++++-----
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |   9 ++
 .../lib/Target/AMDGPU/SIFormMemoryClauses.cpp |   5 +-
 .../AMDGPU/debug-value-scheduler-liveins.mir  |   2 +-
 7 files changed, 154 insertions(+), 124 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index f253a841f16a6..050e47270498b 100644
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -451,7 +451,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
   const unsigned DynamicVGPRBlockSize =
       MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
   const auto Occ =
-      Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);
+      Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize, MF);
   LLVM_DEBUG(dbgs() << "Trying to improve occupancy, target = " << TargetOcc
                     << ", current = " << Occ << '\n');
 
@@ -460,7 +460,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
     // Always build the DAG to add mutations
     BuildDAG DAG(*R, *this);
 
-    if (R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize) >= NewOcc)
+    if (R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize, MF) >= NewOcc)
       continue;
 
     LLVM_DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3);
@@ -471,7 +471,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
     LLVM_DEBUG(dbgs() << "Occupancy improvement attempt:\n";
                printSchedRP(dbgs(), R->MaxPressure, MaxRP));
 
-    NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST, DynamicVGPRBlockSize));
+    NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST, DynamicVGPRBlockSize, MF));
     if (NewOcc <= Occ)
       break;
 
@@ -496,7 +496,7 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
 
   sortRegionsByPressure(TgtOcc);
   auto Occ =
-      Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);
+      Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize, MF);
 
   bool IsReentry = false;
   if (TryMaximizeOccupancy && Occ < TgtOcc) {
@@ -527,21 +527,22 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
       const auto RP = getRegionPressure(*R);
       LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
 
-      if (RP.getOccupancy(ST, DynamicVGPRBlockSize) < TgtOcc) {
+      if (RP.getOccupancy(ST, DynamicVGPRBlockSize, MF) < TgtOcc) {
         LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
-        if (R->BestSchedule.get() && R->BestSchedule->MaxPressure.getOccupancy(
-                                         ST, DynamicVGPRBlockSize) >= TgtOcc) {
+        if (R->BestSchedule.get() &&
+            R->BestSchedule->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize,
+                                                      MF) >= TgtOcc) {
           LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
           scheduleBest(*R);
         } else {
           LLVM_DEBUG(dbgs() << ", restoring\n");
           Ovr.restoreOrder();
-          assert(R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize) >=
+          assert(R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize, MF) >=
                  TgtOcc);
         }
       }
-      FinalOccupancy =
-          std::min(FinalOccupancy, RP.getOccupancy(ST, DynamicVGPRBlockSize));
+      FinalOccupancy = std::min(FinalOccupancy,
+                                RP.getOccupancy(ST, DynamicVGPRBlockSize, MF));
     }
   }
   MFI->limitOccupancy(FinalOccupancy);
@@ -591,7 +592,7 @@ void GCNIterativeScheduler::scheduleILP(
 
   sortRegionsByPressure(TgtOcc);
   auto Occ =
-      Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);
+      Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize, MF);
 
   bool IsReentry = false;
   if (TryMaximizeOccupancy && Occ < TgtOcc) {
@@ -612,18 +613,19 @@ void GCNIterativeScheduler::scheduleILP(
     const auto RP = getSchedulePressure(*R, ILPSchedule);
     LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
 
-    if (RP.getOccupancy(ST, DynamicVGPRBlockSize) < TgtOcc) {
+    if (RP.getOccupancy(ST, DynamicVGPRBlockSize, MF) < TgtOcc) {
       LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
-      if (R->BestSchedule.get() && R->BestSchedule->MaxPressure.getOccupancy(
-                                       ST, DynamicVGPRBlockSize) >= TgtOcc) {
+      if (R->BestSchedule.get() &&
+          R->BestSchedule->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize,
+                                                    MF) >= TgtOcc) {
         LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
         scheduleBest(*R);
       }
     } else {
       scheduleRegion(*R, ILPSchedule, RP);
       LLVM_DEBUG(printSchedResult(dbgs(), R, RP));
-      FinalOccupancy =
-          std::min(FinalOccupancy, RP.getOccupancy(ST, DynamicVGPRBlockSize));
+      FinalOccupancy = std::min(FinalOccupancy,
+                                RP.getOccupancy(ST, DynamicVGPRBlockSize, MF));
     }
   }
   MFI->limitOccupancy(FinalOccupancy);
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 286c8d9529731..786b45902ae48 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -99,21 +99,21 @@ void GCNRegPressure::inc(unsigned Reg,
 bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
                           unsigned MaxOccupancy) const {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs();
+  unsigned ArchVGPRThreshold = ST.getArchVGPRAllocationThreshold(MF);
   unsigned DynamicVGPRBlockSize =
       MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
 
   const auto SGPROcc = std::min(MaxOccupancy,
                                 ST.getOccupancyWithNumSGPRs(getSGPRNum()));
   const auto VGPROcc = std::min(
-      MaxOccupancy,
-      ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs),
-                                  DynamicVGPRBlockSize));
+      MaxOccupancy, ST.getOccupancyWithNumVGPRs(
+                        getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold),
+                        DynamicVGPRBlockSize));
   const auto OtherSGPROcc = std::min(MaxOccupancy,
                                 ST.getOccupancyWithNumSGPRs(O.getSGPRNum()));
   const auto OtherVGPROcc = std::min(
       MaxOccupancy, ST.getOccupancyWithNumVGPRs(
-                        O.getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs),
+                        O.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold),
                         DynamicVGPRBlockSize));
 
   const auto Occ = std::min(SGPROcc, VGPROcc);
@@ -139,34 +139,37 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
 
   // Unified excess pressure conditions, accounting for VGPRs used for SGPR
   // spills
-  unsigned ExcessVGPR =
-      std::max(static_cast<int>(getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs) +
-                                VGPRForSGPRSpills - MaxVGPRs),
-               0);
+  unsigned ExcessVGPR = std::max(
+      static_cast<int>(getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) +
+                       VGPRForSGPRSpills - MaxVGPRs),
+      0);
   unsigned OtherExcessVGPR = std::max(
-      static_cast<int>(O.getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs) +
+      static_cast<int>(O.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) +
                        OtherVGPRForSGPRSpills - MaxVGPRs),
       0);
   // Arch VGPR excess pressure conditions, accounting for VGPRs used for SGPR
   // spills
+  unsigned AddressableArchVGPRs = ST.getAddressableNumArchVGPRs();
   unsigned ExcessArchVGPR =
-      std::max(static_cast<int>(getVGPRNum(false, MaxArchVGPRs) +
-                                VGPRForSGPRSpills - MaxArchVGPRs),
+      std::max(static_cast<int>(getVGPRNum(false, ArchVGPRThreshold) +
+                                VGPRForSGPRSpills - AddressableArchVGPRs),
                0);
   unsigned OtherExcessArchVGPR =
-      std::max(static_cast<int>(O.getVGPRNum(false, MaxArchVGPRs) +
-                                OtherVGPRForSGPRSpills - MaxArchVGPRs),
+      std::max(static_cast<int>(O.getVGPRNum(false, ArchVGPRThreshold) +
+                                OtherVGPRForSGPRSpills - AddressableArchVGPRs),
                0);
   // AGPR excess pressure conditions
   unsigned ExcessAGPR =
-      std::max(static_cast<int>(ST.hasGFX90AInsts()
-                                    ? (getAGPRNum(MaxArchVGPRs) - MaxArchVGPRs)
-                                    : (getAGPRNum(MaxArchVGPRs) - MaxVGPRs)),
+      std::max(static_cast<int>(
+                   ST.hasGFX90AInsts()
+                       ? (getAGPRNum(ArchVGPRThreshold) - AddressableArchVGPRs)
+                       : (getAGPRNum(ArchVGPRThreshold) - MaxVGPRs)),
                0);
   unsigned OtherExcessAGPR = std::max(
-      static_cast<int>(ST.hasGFX90AInsts()
-                           ? (O.getAGPRNum(MaxArchVGPRs) - MaxArchVGPRs)
-                           : (O.getAGPRNum(MaxArchVGPRs) - MaxVGPRs)),
+      static_cast<int>(
+          ST.hasGFX90AInsts()
+              ? (O.getAGPRNum(ArchVGPRThreshold) - AddressableArchVGPRs)
+              : (O.getAGPRNum(ArchVGPRThreshold) - MaxVGPRs)),
       0);
 
   bool ExcessRP = ExcessSGPR || ExcessVGPR || ExcessArchVGPR || ExcessAGPR;
@@ -187,20 +190,20 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
       return VGPRDiff > 0;
     if (SGPRDiff != 0) {
       unsigned PureExcessVGPR =
-          std::max(
-              static_cast<int>(getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs) -
-                               MaxVGPRs),
-              0) +
-          std::max(
-              static_cast<int>(getVGPRNum(false, MaxArchVGPRs) - MaxArchVGPRs),
-              0);
+          std::max(static_cast<int>(
+                       getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) -
+                       MaxVGPRs),
+                   0) +
+          std::max(static_cast<int>(getVGPRNum(false, ArchVGPRThreshold) -
+                                    AddressableArchVGPRs),
+                   0);
       unsigned OtherPureExcessVGPR =
-          std::max(
-              static_cast<int>(O.getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs) -
-                               MaxVGPRs),
-              0) +
-          std::max(static_cast<int>(O.getVGPRNum(false, MaxArchVGPRs) -
-                                    MaxArchVGPRs),
+          std::max(static_cast<int>(
+                       O.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) -
+                       MaxVGPRs),
+                   0) +
+          std::max(static_cast<int>(O.getVGPRNum(false, ArchVGPRThreshold) -
+                                    AddressableArchVGPRs),
                    0);
 
       // If we have a special case where there is a tie in excess VGPR, but one
@@ -231,8 +234,8 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
       if (SW != OtherSW)
         return SW < OtherSW;
     } else {
-      auto VW = getVGPRTuplesWeight(MaxArchVGPRs);
-      auto OtherVW = O.getVGPRTuplesWeight(MaxArchVGPRs);
+      auto VW = getVGPRTuplesWeight(ArchVGPRThreshold);
+      auto OtherVW = O.getVGPRTuplesWeight(ArchVGPRThreshold);
       if (VW != OtherVW)
         return VW < OtherVW;
     }
@@ -240,32 +243,38 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
 
   // Give final precedence to lower general RP.
   return SGPRImportant ? (getSGPRNum() < O.getSGPRNum())
-                       : (getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs) <
-                          O.getVGPRNum(ST.hasGFX90AInsts(), MaxArchVGPRs));
+                       : (getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) <
+                          O.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold));
 }
 
+/// Returns a Printable that writes the ArchVGPR, AGPR and SGPR counts and
+/// tuple weights of \p RP, annotated with the occupancy each implies.
 Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST,
-                      unsigned DynamicVGPRBlockSize) {
-  return Printable([&RP, ST, DynamicVGPRBlockSize](raw_ostream &OS) {
-    OS << "VGPRs: " << RP.getArchVGPRNum(ST->getAddressableNumArchVGPRs())
-       << ' ' << "AGPRs: " << RP.getAGPRNum(ST->getAddressableNumArchVGPRs());
-    if (ST)
-      OS << "(O"
-         << ST->getOccupancyWithNumVGPRs(
-                RP.getVGPRNum(ST->hasGFX90AInsts(),
-                              ST->getAddressableNumArchVGPRs()),
-                DynamicVGPRBlockSize)
-         << ')';
-    OS << ", SGPRs: " << RP.getSGPRNum();
-    if (ST)
-      OS << "(O" << ST->getOccupancyWithNumSGPRs(RP.getSGPRNum()) << ')';
-    OS << ", LVGPR WT: "
-       << RP.getVGPRTuplesWeight(ST->getAddressableNumArchVGPRs())
-       << ", LSGPR WT: " << RP.getSGPRTuplesWeight();
-    if (ST)
-      OS << " -> Occ: " << RP.getOccupancy(*ST, DynamicVGPRBlockSize);
-    OS << '\n';
-  });
+                      unsigned DynamicVGPRBlockSize,
+                      const MachineFunction *MF) {
+  // ST and MF default to nullptr in the declaration but are dereferenced
+  // unconditionally below, so reject null arguments up front.
+  assert(ST && MF && "printing GCNRegPressure requires a subtarget and MF");
+  unsigned ArchVGPRThreshold = ST->getArchVGPRAllocationThreshold(*MF);
+  return Printable(
+      [&RP, ST, DynamicVGPRBlockSize, ArchVGPRThreshold, MF](raw_ostream &OS) {
+        OS << "VGPRs: " << RP.getArchVGPRNum(ArchVGPRThreshold) << ' '
+           << "AGPRs: " << RP.getAGPRNum(ArchVGPRThreshold);
+        if (ST)
+          OS << "(O"
+             << ST->getOccupancyWithNumVGPRs(
+                    RP.getVGPRNum(ST->hasGFX90AInsts(), ArchVGPRThreshold),
+                    DynamicVGPRBlockSize)
+             << ')';
+        OS << ", SGPRs: " << RP.getSGPRNum();
+        if (ST)
+          OS << "(O" << ST->getOccupancyWithNumSGPRs(RP.getSGPRNum()) << ')';
+        OS << ", LVGPR WT: " << RP.getVGPRTuplesWeight(ArchVGPRThreshold)
+           << ", LSGPR WT: " << RP.getSGPRTuplesWeight();
+        if (ST)
+          OS << " -> Occ: " << RP.getOccupancy(*ST, DynamicVGPRBlockSize, *MF);
+        OS << '\n';
+      });
 }
 
 static LaneBitmask getDefRegMask(const MachineOperand &MO,
@@ -899,9 +903,10 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) {
   auto printRP = [&MF](const GCNRegPressure &RP) {
     return Printable([&RP, &MF](raw_ostream &OS) {
       OS << format(PFX "  %-5d", RP.getSGPRNum())
-         << format(" %-5d",
-                   RP.getVGPRNum(false, MF.getSubtarget<GCNSubtarget>()
-                                            .getAddressableNumArchVGPRs()));
+         << format(
+                " %-5d",
+                RP.getVGPRNum(false, MF.getSubtarget<GCNSubtarget>()
+                                         .getArchVGPRAllocationThreshold(MF)));
     });
   };
 
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index a8c1c3bfd8703..98eb35eaaca8e 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -108,13 +108,13 @@ struct GCNRegPressure {
   }
   unsigned getSGPRTuplesWeight() const { return Value[TOTAL_KINDS + SGPR]; }
 
-  unsigned getOccupancy(const GCNSubtarget &ST,
-                        unsigned DynamicVGPRBlockSize) const {
-    return std::min(
-        ST.getOccupancyWithNumSGPRs(getSGPRNum()),
-        ST.getOccupancyWithNumVGPRs(
-            getVGPRNum(ST.hasGFX90AInsts(), ST.getAddressableNumArchVGPRs()),
-            DynamicVGPRBlockSize));
+  unsigned getOccupancy(const GCNSubtarget &ST, unsigned DynamicVGPRBlockSize,
+                        const MachineFunction &MF) const {
+    return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()),
+                    ST.getOccupancyWithNumVGPRs(
+                        getVGPRNum(ST.hasGFX90AInsts(),
+                                   ST.getArchVGPRAllocationThreshold(MF)),
+                        DynamicVGPRBlockSize));
   }
 
   void inc(unsigned Reg,
@@ -123,9 +123,10 @@ struct GCNRegPressure {
            const MachineRegisterInfo &MRI);
 
   bool higherOccupancy(const GCNSubtarget &ST, const GCNRegPressure &O,
-                       unsigned DynamicVGPRBlockSize) const {
-    return getOccupancy(ST, DynamicVGPRBlockSize) >
-           O.getOccupancy(ST, DynamicVGPRBlockSize);
+                       unsigned DynamicVGPRBlockSize,
+                       const MachineFunction &MF) const {
+    return getOccupancy(ST, DynamicVGPRBlockSize, MF) >
+           O.getOccupancy(ST, DynamicVGPRBlockSize, MF);
   }
 
   /// Compares \p this GCNRegpressure to \p O, returning true if \p this is
@@ -551,7 +552,8 @@ bool isEqual(const GCNRPTracker::LiveRegSet &S1,
              const GCNRPTracker::LiveRegSet &S2);
 
 Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST = nullptr,
-                unsigned DynamicVGPRBlockSize = 0);
+                unsigned DynamicVGPRBlockSize = 0,
+                const MachineFunction *MF = nullptr);
 
 Printable print(const GCNRPTracker::LiveRegSet &LiveRegs,
                 const MachineRegisterInfo &MRI);
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 772c979809b75..2b61ad4e7a8d5 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -190,13 +190,14 @@ static void getRegisterPressures(
     TempUpwardTracker.recede(*MI);
     NewPressure = TempUpwardTracker.getPressure();
   }
-  unsigned AddressableArchVGPR =
-      DAG->MF.getSubtarget<GCNSubtarget>().getAddressableNumArchVGPRs();
+  unsigned ArchVGPRThreshold =
+      DAG->MF.getSubtarget<GCNSubtarget>().getArchVGPRAllocationThreshold(
+          DAG->MF);
   Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
   Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
-      NewPressure.getArchVGPRNum(AddressableArchVGPR);
+      NewPressure.getArchVGPRNum(ArchVGPRThreshold);
   Pressure[AMDGPU::RegisterPressureSets::AGPR_32] =
-      NewPressure.getAGPRNum(AddressableArchVGPR);
+      NewPressure.getAGPRNum(ArchVGPRThreshold);
 }
 
 void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
@@ -343,7 +344,8 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                             : static_cast<GCNRPTracker *>(&DownwardTracker);
       SGPRPressure = T->getPressure().getSGPRNum();
       VGPRPressure = T->getPressure().getArchVGPRNum(
-          DAG->MF.getSubtarget<GCNSubtarget>().getAddressableNumArchVGPRs());
+          DAG->MF.getSubtarget<GCNSubtarget>().getArchVGPRAllocationThreshold(
+              DAG->MF));
     }
   }
   ReadyQueue &Q = Zone.Available;
@@ -1144,8 +1146,9 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() {
   if (DAG.MinOccupancy > InitialOccupancy) {
     for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
       DAG.RegionsWithMinOcc[IDX] =
-          DAG.Pressure[IDX].getOccupancy(
-              DAG.ST, DAG.MFI.getDynamicVGPRBlockSize()) == DAG.MinOccupancy;
+          DAG.Pressure[IDX].getOccupancy(DAG.ST,
+                                         DAG.MFI.getDynamicVGPRBlockSize(),
+                                         DAG.MF) == DAG.MinOccupancy;
 
     LLVM_DEBUG(dbgs() << StageID
                       << " stage successfully increased occupancy to "
@@ -1197,8 +1200,10 @@ bool GCNSchedStage::initGCNRegion() {
       dbgs() << "Pressure before scheduling:\nRegion live-ins:"
              << print(DAG.LiveIns[RegionIdx], DAG.MRI)
              << "Region live-in pressure:  "
-             << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
-             << "Region register pressure: " << print(PressureBefore));
+             << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]),
+                      &ST, 0, &MF)
+             << "Region register pressure: "
+             << print(PressureBefore, &ST, 0, &MF));
 
   S.HasHighPressure = false;
   S.KnownExcessRP = isRegionWithExcessRP();
@@ -1279,17 +1284,18 @@ void GCNSchedStage::checkScheduling() {
   // Check the results of scheduling.
   PressureAfter = DAG.getRealRegPressure(RegionIdx);
 
-  LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
+  LLVM_DEBUG(dbgs() << "Pressure after scheduling: "
+                    << print(PressureAfter, &ST, 0, &MF));
   LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
 
   unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
-  unsigned AddressableArchVGPR = ST.getAddressableNumArchVGPRs();
+  unsigned ArchVGPRThreshold = ST.getArchVGPRAllocationThreshold(MF);
   if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
-      PressureAfter.getVGPRNum(ST.hasGFX90AInsts(), AddressableArchVGPR) <=
+      PressureAfter.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) <=
           S.VGPRCriticalLimit) {
     DAG.Pressure[RegionIdx] = PressureAfter;
     DAG.RegionsWithMinOcc[RegionIdx] =
-        PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) ==
+        PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize, DAG.MF) ==
         DAG.MinOccupancy;
 
     // Early out if we have achieved the occupancy target.
@@ -1299,10 +1305,12 @@ void GCNSchedStage::checkScheduling() {
 
   unsigned TargetOccupancy = std::min(
       S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
-  unsigned WavesAfter = std::min(
-      TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
-  unsigned WavesBefore = std::min(
-      TargetOccupancy, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
+  unsigned WavesAfter =
+      std::min(TargetOccupancy,
+               PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize, DAG.MF));
+  unsigned WavesBefore =
+      std::min(TargetOccupancy,
+               PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize, DAG.MF));
   LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
                     << ", after " << WavesAfter << ".\n");
 
@@ -1336,10 +1344,10 @@ void GCNSchedStage::checkScheduling() {
   unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
   unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
 
-  if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts(), AddressableArchVGPR) >
+  if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) >
           MaxVGPRs ||
-      PressureAfter.getArchVGPRNum(AddressableArchVGPR) > MaxArchVGPRs ||
-      PressureAfter.getAGPRNum(AddressableArchVGPR) > MaxArchVGPRs ||
+      PressureAfter.getArchVGPRNum(ArchVGPRThreshold) > MaxArchVGPRs ||
+      PressureAfter.getAGPRNum(ArchVGPRThreshold) > MaxArchVGPRs ||
       PressureAfter.getSGPRNum() > MaxSGPRs) {
     DAG.RegionsWithHighRP[RegionIdx] = true;
     DAG.RegionsWithExcessRP[RegionIdx] = true;
@@ -1352,7 +1360,7 @@ void GCNSchedStage::checkScheduling() {
   } else {
     DAG.Pressure[RegionIdx] = PressureAfter;
     DAG.RegionsWithMinOcc[RegionIdx] =
-        PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) ==
+        PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize, DAG.MF) ==
         DAG.MinOccupancy;
   }
 }
@@ -1477,13 +1485,13 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
 
   // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
   if (DAG.MFI.isDynamicVGPREnabled()) {
-    unsigned AddressableArchVGPR = ST.getAddressableNumArchVGPRs();
+    unsigned ArchVGPRThreshold = ST.getArchVGPRAllocationThreshold(MF);
     unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
         &ST, DAG.MFI.getDynamicVGPRBlockSize(),
-        PressureBefore.getVGPRNum(false, AddressableArchVGPR));
+        PressureBefore.getVGPRNum(false, ArchVGPRThreshold));
     unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
         &ST, DAG.MFI.getDynamicVGPRBlockSize(),
-        PressureAfter.getVGPRNum(false, AddressableArchVGPR));
+        PressureAfter.getVGPRNum(false, ArchVGPRThreshold));
     if (BlocksAfter > BlocksBefore)
       return true;
   }
@@ -1507,8 +1515,8 @@ bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
 bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
   // If RP is not reduced in the unclustered reschedule stage, revert to the
   // old schedule.
-  if ((WavesAfter <=
-           PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) &&
+  if ((WavesAfter <= PressureBefore.getOccupancy(
+                         ST, DAG.MFI.getDynamicVGPRBlockSize(), DAG.MF) &&
        mayCauseSpilling(WavesAfter)) ||
       GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
     LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
@@ -1530,9 +1538,10 @@ bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
   ScheduleMetrics MAfter = getScheduleMetrics(DAG);
   unsigned OldMetric = MBefore.getMetric();
   unsigned NewMetric = MAfter.getMetric();
-  unsigned WavesBefore = std::min(
-      S.getTargetOccupancy(),
-      PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()));
+  unsigned WavesBefore =
+      std::min(S.getTargetOccupancy(),
+               PressureBefore.getOccupancy(
+                   ST, DAG.MFI.getDynamicVGPRBlockSize(), DAG.MF));
   unsigned Profit =
       ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
        ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
@@ -1586,8 +1595,8 @@ bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
 
 void GCNSchedStage::revertScheduling() {
   DAG.RegionsWithMinOcc[RegionIdx] =
-      PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) ==
-      DAG.MinOccupancy;
+      PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize(),
+                                  DAG.MF) == DAG.MinOccupancy;
   LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
   DAG.RegionEnd = DAG.RegionBegin;
   int SkippedDebugInstr = 0;
@@ -2025,8 +2034,10 @@ void PreRARematStage::rematerialize() {
     }
     DAG.Pressure[I] = RP;
     AchievedOcc = std::min(
-        AchievedOcc, RP.getOccupancy(ST, MF.getInfo<SIMachineFunctionInfo>()
-                                             ->getDynamicVGPRBlockSize()));
+        AchievedOcc,
+        RP.getOccupancy(
+            ST, MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(),
+            DAG.MF));
   }
   REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
 }
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 10ded0e1d1c3a..a259b90545ee9 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1629,6 +1629,15 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(this);
   }
 
+  /// \returns the ArchVGPR threshold to pass to GCNRegPressure queries for
+  /// \p MF: the addressable ArchVGPR count when the subtarget has GFX90A insts
+  /// or lacks MAI insts, else getMaxNumVGPRs() at the first waves-per-EU value.
+  unsigned getArchVGPRAllocationThreshold(const MachineFunction &MF) const {
+    if (!hasMAIInsts() || hasGFX90AInsts())
+      return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(this);
+    return getMaxNumVGPRs(getWavesPerEU(MF.getFunction()).first, 0);
+  }
+
   /// \returns Addressable number of VGPRs supported by the subtarget.
   unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const {
     return AMDGPU::IsaInfo::getAddressableNumVGPRs(this, DynamicVGPRBlockSize);
diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
index 844908a5ce8d9..f4cf8f4e03df8 100644
--- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -199,7 +199,8 @@ bool SIFormMemoryClausesImpl::checkPressure(const MachineInstr &MI,
   GCNRegPressure MaxPressure = RPT.moveMaxPressure();
   unsigned Occupancy = MaxPressure.getOccupancy(
       *ST,
-      MI.getMF()->getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
+      MI.getMF()->getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(),
+      *MI.getMF());
 
   // Don't push over half the register budget. We don't want to introduce
   // spilling just to form a soft clause.
@@ -212,7 +213,7 @@ bool SIFormMemoryClausesImpl::checkPressure(const MachineInstr &MI,
   // fragmentation of registers the allocator will need to satisfy.
   if (Occupancy >= MFI->getMinAllowedOccupancy() &&
       MaxPressure.getVGPRNum(ST->hasGFX90AInsts(),
-                             ST->getAddressableNumArchVGPRs()) <=
+                             ST->getArchVGPRAllocationThreshold(*MI.getMF())) <=
           MaxVGPRs / 2 &&
       MaxPressure.getSGPRNum() <= MaxSGPRs / 2) {
     LastRecordedOccupancy = Occupancy;
diff --git a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-liveins.mir b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-liveins.mir
index 2a08c52e447ba..72181346764fb 100644
--- a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-liveins.mir
+++ b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-liveins.mir
@@ -6,7 +6,7 @@
 # CHECK-NEXT: test_get_liveins:%bb.0
 # CHECK: ********** MI Scheduling **********
 # CHECK-NEXT: test_get_liveins:%bb.1
-# CHECK: Region live-in pressure:  VGPRs: 1 AGPRs: 0, SGPRs: 0, LVGPR WT: 0, LSGPR WT: 0
+# CHECK: Region live-in pressure:  VGPRs: 1 AGPRs: 0(O10), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 10
 # CHECK: ScheduleDAGMILive::schedule starting
 
 ---

From 1f24d721d1bb0ef89fe91a787cf0941edc63816b Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes@amd.com>
Date: Mon, 28 Jul 2025 13:19:52 -0700
Subject: [PATCH 03/11] Clean up signature for getOccupancy

Change-Id: I0b74f6ee1d93bd5e6fc3e285c0c6e91a8090d28e
---
 .../Target/AMDGPU/GCNIterativeScheduler.cpp   | 39 ++++++-------------
 llvm/lib/Target/AMDGPU/GCNRegPressure.cpp     |  2 +-
 llvm/lib/Target/AMDGPU/GCNRegPressure.h       | 14 ++++---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   | 33 +++++-----------
 .../lib/Target/AMDGPU/SIFormMemoryClauses.cpp |  5 +--
 5 files changed, 31 insertions(+), 62 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index 050e47270498b..2c833abedbfb7 100644
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -447,11 +447,7 @@ void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
 // BestSchedules aren't deleted on fail.
 unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
   // TODO: assert Regions are sorted descending by pressure
-  const auto &ST = MF.getSubtarget<GCNSubtarget>();
-  const unsigned DynamicVGPRBlockSize =
-      MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
-  const auto Occ =
-      Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize, MF);
+  const auto Occ = Regions.front()->MaxPressure.getOccupancy(MF);
   LLVM_DEBUG(dbgs() << "Trying to improve occupancy, target = " << TargetOcc
                     << ", current = " << Occ << '\n');
 
@@ -460,7 +456,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
     // Always build the DAG to add mutations
     BuildDAG DAG(*R, *this);
 
-    if (R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize, MF) >= NewOcc)
+    if (R->MaxPressure.getOccupancy(MF) >= NewOcc)
       continue;
 
     LLVM_DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3);
@@ -471,7 +467,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
     LLVM_DEBUG(dbgs() << "Occupancy improvement attempt:\n";
                printSchedRP(dbgs(), R->MaxPressure, MaxRP));
 
-    NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST, DynamicVGPRBlockSize, MF));
+    NewOcc = std::min(NewOcc, MaxRP.getOccupancy(MF));
     if (NewOcc <= Occ)
       break;
 
@@ -489,14 +485,11 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
 
 void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
   bool TryMaximizeOccupancy) {
-  const auto &ST = MF.getSubtarget<GCNSubtarget>();
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   auto TgtOcc = MFI->getMinAllowedOccupancy();
-  unsigned DynamicVGPRBlockSize = MFI->getDynamicVGPRBlockSize();
 
   sortRegionsByPressure(TgtOcc);
-  auto Occ =
-      Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize, MF);
+  auto Occ = Regions.front()->MaxPressure.getOccupancy(MF);
 
   bool IsReentry = false;
   if (TryMaximizeOccupancy && Occ < TgtOcc) {
@@ -527,22 +520,19 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
       const auto RP = getRegionPressure(*R);
       LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
 
-      if (RP.getOccupancy(ST, DynamicVGPRBlockSize, MF) < TgtOcc) {
+      if (RP.getOccupancy(MF) < TgtOcc) {
         LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
         if (R->BestSchedule.get() &&
-            R->BestSchedule->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize,
-                                                      MF) >= TgtOcc) {
+            R->BestSchedule->MaxPressure.getOccupancy(MF) >= TgtOcc) {
           LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
           scheduleBest(*R);
         } else {
           LLVM_DEBUG(dbgs() << ", restoring\n");
           Ovr.restoreOrder();
-          assert(R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize, MF) >=
-                 TgtOcc);
+          assert(R->MaxPressure.getOccupancy(MF) >= TgtOcc);
         }
       }
-      FinalOccupancy = std::min(FinalOccupancy,
-                                RP.getOccupancy(ST, DynamicVGPRBlockSize, MF));
+      FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(MF));
     }
   }
   MFI->limitOccupancy(FinalOccupancy);
@@ -585,14 +575,11 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) {
 
 void GCNIterativeScheduler::scheduleILP(
   bool TryMaximizeOccupancy) {
-  const auto &ST = MF.getSubtarget<GCNSubtarget>();
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   auto TgtOcc = MFI->getMinAllowedOccupancy();
-  unsigned DynamicVGPRBlockSize = MFI->getDynamicVGPRBlockSize();
 
   sortRegionsByPressure(TgtOcc);
-  auto Occ =
-      Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize, MF);
+  auto Occ = Regions.front()->MaxPressure.getOccupancy(MF);
 
   bool IsReentry = false;
   if (TryMaximizeOccupancy && Occ < TgtOcc) {
@@ -613,19 +600,17 @@ void GCNIterativeScheduler::scheduleILP(
     const auto RP = getSchedulePressure(*R, ILPSchedule);
     LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
 
-    if (RP.getOccupancy(ST, DynamicVGPRBlockSize, MF) < TgtOcc) {
+    if (RP.getOccupancy(MF) < TgtOcc) {
       LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
       if (R->BestSchedule.get() &&
-          R->BestSchedule->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize,
-                                                    MF) >= TgtOcc) {
+          R->BestSchedule->MaxPressure.getOccupancy(MF) >= TgtOcc) {
         LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
         scheduleBest(*R);
       }
     } else {
       scheduleRegion(*R, ILPSchedule, RP);
       LLVM_DEBUG(printSchedResult(dbgs(), R, RP));
-      FinalOccupancy = std::min(FinalOccupancy,
-                                RP.getOccupancy(ST, DynamicVGPRBlockSize, MF));
+      FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(MF));
     }
   }
   MFI->limitOccupancy(FinalOccupancy);
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 786b45902ae48..9ee171e1b9999 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -267,7 +267,7 @@ Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST,
         OS << ", LVGPR WT: " << RP.getVGPRTuplesWeight(ArchVGPRThreshold)
            << ", LSGPR WT: " << RP.getSGPRTuplesWeight();
         if (ST)
-          OS << " -> Occ: " << RP.getOccupancy(*ST, DynamicVGPRBlockSize, *MF);
+          OS << " -> Occ: " << RP.getOccupancy(*MF);
         OS << '\n';
       });
 }
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 98eb35eaaca8e..9eb86017adafc 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -18,6 +18,7 @@
 #define LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
 
 #include "GCNSubtarget.h"
+#include "SIMachineFunctionInfo.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/RegisterPressure.h"
 #include <algorithm>
@@ -108,8 +109,11 @@ struct GCNRegPressure {
   }
   unsigned getSGPRTuplesWeight() const { return Value[TOTAL_KINDS + SGPR]; }
 
-  unsigned getOccupancy(const GCNSubtarget &ST, unsigned DynamicVGPRBlockSize,
-                        const MachineFunction &MF) const {
+  unsigned getOccupancy(const MachineFunction &MF) const {
+    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+    unsigned DynamicVGPRBlockSize =
+        MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
+
     return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()),
                     ST.getOccupancyWithNumVGPRs(
                         getVGPRNum(ST.hasGFX90AInsts(),
@@ -122,11 +126,9 @@ struct GCNRegPressure {
            LaneBitmask NewMask,
            const MachineRegisterInfo &MRI);
 
-  bool higherOccupancy(const GCNSubtarget &ST, const GCNRegPressure &O,
-                       unsigned DynamicVGPRBlockSize,
+  bool higherOccupancy(const GCNRegPressure &O,
                        const MachineFunction &MF) const {
-    return getOccupancy(ST, DynamicVGPRBlockSize, MF) >
-           O.getOccupancy(ST, DynamicVGPRBlockSize, MF);
+    return getOccupancy(MF) > O.getOccupancy(MF);
   }
 
   /// Compares \p this GCNRegpressure to \p O, returning true if \p this is
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 2b61ad4e7a8d5..ef3dcea2fcac5 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1146,9 +1146,7 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() {
   if (DAG.MinOccupancy > InitialOccupancy) {
     for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
       DAG.RegionsWithMinOcc[IDX] =
-          DAG.Pressure[IDX].getOccupancy(DAG.ST,
-                                         DAG.MFI.getDynamicVGPRBlockSize(),
-                                         DAG.MF) == DAG.MinOccupancy;
+          DAG.Pressure[IDX].getOccupancy(DAG.MF) == DAG.MinOccupancy;
 
     LLVM_DEBUG(dbgs() << StageID
                       << " stage successfully increased occupancy to "
@@ -1288,15 +1286,13 @@ void GCNSchedStage::checkScheduling() {
                     << print(PressureAfter, &ST, 0, &MF));
   LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
 
-  unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
   unsigned ArchVGPRThreshold = ST.getArchVGPRAllocationThreshold(MF);
   if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
       PressureAfter.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) <=
           S.VGPRCriticalLimit) {
     DAG.Pressure[RegionIdx] = PressureAfter;
     DAG.RegionsWithMinOcc[RegionIdx] =
-        PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize, DAG.MF) ==
-        DAG.MinOccupancy;
+        PressureAfter.getOccupancy(DAG.MF) == DAG.MinOccupancy;
 
     // Early out if we have achieved the occupancy target.
     LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
@@ -1306,11 +1302,9 @@ void GCNSchedStage::checkScheduling() {
   unsigned TargetOccupancy = std::min(
       S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
   unsigned WavesAfter =
-      std::min(TargetOccupancy,
-               PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize, DAG.MF));
+      std::min(TargetOccupancy, PressureAfter.getOccupancy(DAG.MF));
   unsigned WavesBefore =
-      std::min(TargetOccupancy,
-               PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize, DAG.MF));
+      std::min(TargetOccupancy, PressureBefore.getOccupancy(DAG.MF));
   LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
                     << ", after " << WavesAfter << ".\n");
 
@@ -1360,8 +1354,7 @@ void GCNSchedStage::checkScheduling() {
   } else {
     DAG.Pressure[RegionIdx] = PressureAfter;
     DAG.RegionsWithMinOcc[RegionIdx] =
-        PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize, DAG.MF) ==
-        DAG.MinOccupancy;
+        PressureAfter.getOccupancy(DAG.MF) == DAG.MinOccupancy;
   }
 }
 
@@ -1515,8 +1508,7 @@ bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
 bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
   // If RP is not reduced in the unclustered reschedule stage, revert to the
   // old schedule.
-  if ((WavesAfter <= PressureBefore.getOccupancy(
-                         ST, DAG.MFI.getDynamicVGPRBlockSize(), DAG.MF) &&
+  if ((WavesAfter <= PressureBefore.getOccupancy(DAG.MF) &&
        mayCauseSpilling(WavesAfter)) ||
       GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
     LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
@@ -1539,9 +1531,7 @@ bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
   unsigned OldMetric = MBefore.getMetric();
   unsigned NewMetric = MAfter.getMetric();
   unsigned WavesBefore =
-      std::min(S.getTargetOccupancy(),
-               PressureBefore.getOccupancy(
-                   ST, DAG.MFI.getDynamicVGPRBlockSize(), DAG.MF));
+      std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(DAG.MF));
   unsigned Profit =
       ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
        ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
@@ -1595,8 +1585,7 @@ bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
 
 void GCNSchedStage::revertScheduling() {
   DAG.RegionsWithMinOcc[RegionIdx] =
-      PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize(),
-                                  DAG.MF) == DAG.MinOccupancy;
+      PressureBefore.getOccupancy(DAG.MF) == DAG.MinOccupancy;
   LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
   DAG.RegionEnd = DAG.RegionBegin;
   int SkippedDebugInstr = 0;
@@ -2033,11 +2022,7 @@ void PreRARematStage::rematerialize() {
       }
     }
     DAG.Pressure[I] = RP;
-    AchievedOcc = std::min(
-        AchievedOcc,
-        RP.getOccupancy(
-            ST, MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(),
-            DAG.MF));
+    AchievedOcc = std::min(AchievedOcc, RP.getOccupancy(DAG.MF));
   }
   REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
 }
diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
index f4cf8f4e03df8..03d5ac6dec025 100644
--- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -197,10 +197,7 @@ bool SIFormMemoryClausesImpl::checkPressure(const MachineInstr &MI,
   // pointer becomes dead and could otherwise be reused for destination.
   RPT.advanceToNext();
   GCNRegPressure MaxPressure = RPT.moveMaxPressure();
-  unsigned Occupancy = MaxPressure.getOccupancy(
-      *ST,
-      MI.getMF()->getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(),
-      *MI.getMF());
+  unsigned Occupancy = MaxPressure.getOccupancy(*MI.getMF());
 
   // Don't push over half the register budget. We don't want to introduce
   // spilling just to form a soft clause.

From c3351970a84afa0293f2160b7dce6bb3d48f8ef3 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes@amd.com>
Date: Mon, 28 Jul 2025 19:13:14 -0700
Subject: [PATCH 04/11] Factor out getAVGPRsAs*GPRsNum

Change-Id: Ia3b8507f95763079ee3c2224655990a299c8854d
---
 llvm/lib/Target/AMDGPU/GCNRegPressure.h | 39 +++++++++++++++++--------
 1 file changed, 27 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 9eb86017adafc..c7449b43a35b4 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -57,6 +57,24 @@ struct GCNRegPressure {
     return std::max(Value[VGPR] + Value[AVGPR], Value[AGPR]);
   }
 
+  inline static unsigned getAVGPRsAsVGPRsNum(unsigned NumArchVGPRs,
+                                             unsigned NumAVGPRs,
+                                             unsigned AddressableArchVGPR) {
+
+    return NumArchVGPRs < AddressableArchVGPR
+               ? std::min((AddressableArchVGPR - NumArchVGPRs), NumAVGPRs)
+               : 0;
+  }
+
+  inline static unsigned getAVGPRsAsAGPRsNum(unsigned NumArchVGPRs,
+                                             unsigned NumAGPRs,
+                                             unsigned NumAVGPRs,
+                                             unsigned AddressableArchVGPR) {
+    unsigned AVGPRsAsVGPRs =
+        getAVGPRsAsVGPRsNum(NumArchVGPRs, NumAVGPRs, AddressableArchVGPR);
+    return NumAVGPRs > AVGPRsAsVGPRs ? NumAVGPRs - AVGPRsAsVGPRs : 0;
+  }
+
   /// Returns the aggregated VGPR pressure, assuming \p NumArchVGPRs ArchVGPRs
   /// \p NumAGPRs AGPRS, and \p NumAVGPRs AVGPRs for a target with a unified
   /// VGPR file.
@@ -68,11 +86,10 @@ struct GCNRegPressure {
     // Until we hit the VGPRThreshold, we will assign AV as VGPR. After that
     // point, we will assign as AGPR.
     unsigned AVGPRsAsVGPRs =
-        NumArchVGPRs < AddressableArchVGPR
-            ? std::min((AddressableArchVGPR - NumArchVGPRs), NumAVGPRs)
-            : 0;
-    unsigned AVGPRsAsAGPRs =
-        NumAVGPRs > AVGPRsAsVGPRs ? NumAVGPRs - AVGPRsAsVGPRs : 0;
+        getAVGPRsAsVGPRsNum(NumArchVGPRs, NumAVGPRs, AddressableArchVGPR);
+    unsigned AVGPRsAsAGPRs = getAVGPRsAsAGPRsNum(
+        NumArchVGPRs, NumAGPRs, NumAVGPRs, AddressableArchVGPR);
+    NumAVGPRs > AVGPRsAsVGPRs ? NumAVGPRs - AVGPRsAsVGPRs : 0;
     return alignTo(NumArchVGPRs + AVGPRsAsVGPRs,
                    AMDGPU::IsaInfo::getArchVGPRAllocGranule()) +
            NumAGPRs + AVGPRsAsAGPRs;
@@ -96,13 +113,11 @@ struct GCNRegPressure {
 
   unsigned getVGPRTuplesWeight(unsigned AddressableArchVGPR) const {
     unsigned AVGPRsAsVGPRs =
-        Value[TOTAL_KINDS + VGPR] < AddressableArchVGPR
-            ? std::min(AddressableArchVGPR - Value[TOTAL_KINDS + VGPR],
-                       Value[TOTAL_KINDS + AVGPR])
-            : 0;
-    unsigned AVGPRsAsAGPRs = Value[TOTAL_KINDS + AVGPR] > AVGPRsAsVGPRs
-                                 ? Value[TOTAL_KINDS + AVGPR] - AVGPRsAsVGPRs
-                                 : 0;
+        getAVGPRsAsVGPRsNum(Value[TOTAL_KINDS + VGPR],
+                            Value[TOTAL_KINDS + AVGPR], AddressableArchVGPR);
+    unsigned AVGPRsAsAGPRs = getAVGPRsAsAGPRsNum(
+        Value[TOTAL_KINDS + VGPR], Value[TOTAL_KINDS + AGPR],
+        Value[TOTAL_KINDS + AVGPR], AddressableArchVGPR);
 
     return std::max(Value[TOTAL_KINDS + VGPR] + AVGPRsAsVGPRs,
                     Value[TOTAL_KINDS + AGPR] + AVGPRsAsAGPRs);

From 33a70f2a3faa35ebda87554ce22244cd5e6101d3 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes@amd.com>
Date: Mon, 28 Jul 2025 19:14:44 -0700
Subject: [PATCH 05/11] Formatting

Change-Id: I14486056bef5e9a97842be68a7f5abe82ecc37fe
---
 llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index 2c833abedbfb7..87f5b9f16868a 100644
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -484,7 +484,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
 }
 
 void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
-  bool TryMaximizeOccupancy) {
+    bool TryMaximizeOccupancy) {
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   auto TgtOcc = MFI->getMinAllowedOccupancy();
 
@@ -573,8 +573,7 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) {
 ///////////////////////////////////////////////////////////////////////////////
 // ILP scheduler port
 
-void GCNIterativeScheduler::scheduleILP(
-  bool TryMaximizeOccupancy) {
+void GCNIterativeScheduler::scheduleILP(bool TryMaximizeOccupancy) {
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   auto TgtOcc = MFI->getMinAllowedOccupancy();
 

From 572732449576479c7394638b1e21a92a39559d35 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes@amd.com>
Date: Mon, 28 Jul 2025 19:34:17 -0700
Subject: [PATCH 06/11] Use getMaxNumVectorRegs instead of
 getArchVGPRAllocationThreshold

Change-Id: I36e92840e35774cb419389ee6dadc26dd376ebaa
---
 llvm/lib/Target/AMDGPU/GCNRegPressure.cpp     | 14 +++++++------
 llvm/lib/Target/AMDGPU/GCNRegPressure.h       | 12 +++++------
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   | 21 ++++++++++++-------
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |  9 --------
 .../lib/Target/AMDGPU/SIFormMemoryClauses.cpp |  5 +++--
 5 files changed, 30 insertions(+), 31 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 9ee171e1b9999..966e810115195 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -99,7 +99,8 @@ void GCNRegPressure::inc(unsigned Reg,
 bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
                           unsigned MaxOccupancy) const {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  unsigned ArchVGPRThreshold = ST.getArchVGPRAllocationThreshold(MF);
+  unsigned ArchVGPRThreshold =
+      ST.getRegisterInfo()->getMaxNumVectorRegs(MF).first;
   unsigned DynamicVGPRBlockSize =
       MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
 
@@ -250,7 +251,8 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
 Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST,
                       unsigned DynamicVGPRBlockSize,
                       const MachineFunction *MF) {
-  unsigned ArchVGPRThreshold = ST->getArchVGPRAllocationThreshold(*MF);
+  unsigned ArchVGPRThreshold =
+      ST->getRegisterInfo()->getMaxNumVectorRegs(*MF).first;
   return Printable(
       [&RP, ST, DynamicVGPRBlockSize, ArchVGPRThreshold, MF](raw_ostream &OS) {
         OS << "VGPRs: " << RP.getArchVGPRNum(ArchVGPRThreshold) << ' '
@@ -903,10 +905,10 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) {
   auto printRP = [&MF](const GCNRegPressure &RP) {
     return Printable([&RP, &MF](raw_ostream &OS) {
       OS << format(PFX "  %-5d", RP.getSGPRNum())
-         << format(
-                " %-5d",
-                RP.getVGPRNum(false, MF.getSubtarget<GCNSubtarget>()
-                                         .getArchVGPRAllocationThreshold(MF)));
+         << format(" %-5d", RP.getVGPRNum(false, MF.getSubtarget<GCNSubtarget>()
+                                                     .getRegisterInfo()
+                                                     ->getMaxNumVectorRegs(MF)
+                                                     .first));
     });
   };
 
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index c7449b43a35b4..d61e0348dabb4 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -89,7 +89,6 @@ struct GCNRegPressure {
         getAVGPRsAsVGPRsNum(NumArchVGPRs, NumAVGPRs, AddressableArchVGPR);
     unsigned AVGPRsAsAGPRs = getAVGPRsAsAGPRsNum(
         NumArchVGPRs, NumAGPRs, NumAVGPRs, AddressableArchVGPR);
-    NumAVGPRs > AVGPRsAsVGPRs ? NumAVGPRs - AVGPRsAsVGPRs : 0;
     return alignTo(NumArchVGPRs + AVGPRsAsVGPRs,
                    AMDGPU::IsaInfo::getArchVGPRAllocGranule()) +
            NumAGPRs + AVGPRsAsAGPRs;
@@ -129,11 +128,12 @@ struct GCNRegPressure {
     unsigned DynamicVGPRBlockSize =
         MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
 
-    return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()),
-                    ST.getOccupancyWithNumVGPRs(
-                        getVGPRNum(ST.hasGFX90AInsts(),
-                                   ST.getArchVGPRAllocationThreshold(MF)),
-                        DynamicVGPRBlockSize));
+    return std::min(
+        ST.getOccupancyWithNumSGPRs(getSGPRNum()),
+        ST.getOccupancyWithNumVGPRs(
+            getVGPRNum(ST.hasGFX90AInsts(),
+                       ST.getRegisterInfo()->getMaxNumVectorRegs(MF).first),
+            DynamicVGPRBlockSize));
   }
 
   void inc(unsigned Reg,
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index ef3dcea2fcac5..52359135f8893 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -190,9 +190,10 @@ static void getRegisterPressures(
     TempUpwardTracker.recede(*MI);
     NewPressure = TempUpwardTracker.getPressure();
   }
-  unsigned ArchVGPRThreshold =
-      DAG->MF.getSubtarget<GCNSubtarget>().getArchVGPRAllocationThreshold(
-          DAG->MF);
+  unsigned ArchVGPRThreshold = DAG->MF.getSubtarget<GCNSubtarget>()
+                                   .getRegisterInfo()
+                                   ->getMaxNumVectorRegs(DAG->MF)
+                                   .first;
   Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
   Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
       NewPressure.getArchVGPRNum(ArchVGPRThreshold);
@@ -343,9 +344,11 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                             ? static_cast<GCNRPTracker *>(&UpwardTracker)
                             : static_cast<GCNRPTracker *>(&DownwardTracker);
       SGPRPressure = T->getPressure().getSGPRNum();
-      VGPRPressure = T->getPressure().getArchVGPRNum(
-          DAG->MF.getSubtarget<GCNSubtarget>().getArchVGPRAllocationThreshold(
-              DAG->MF));
+      VGPRPressure =
+          T->getPressure().getArchVGPRNum(DAG->MF.getSubtarget<GCNSubtarget>()
+                                              .getRegisterInfo()
+                                              ->getMaxNumVectorRegs(DAG->MF)
+                                              .first);
     }
   }
   ReadyQueue &Q = Zone.Available;
@@ -1286,7 +1289,8 @@ void GCNSchedStage::checkScheduling() {
                     << print(PressureAfter, &ST, 0, &MF));
   LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
 
-  unsigned ArchVGPRThreshold = ST.getArchVGPRAllocationThreshold(MF);
+  unsigned ArchVGPRThreshold =
+      ST.getRegisterInfo()->getMaxNumVectorRegs(MF).first;
   if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
       PressureAfter.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) <=
           S.VGPRCriticalLimit) {
@@ -1478,7 +1482,8 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
 
   // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
   if (DAG.MFI.isDynamicVGPREnabled()) {
-    unsigned ArchVGPRThreshold = ST.getArchVGPRAllocationThreshold(MF);
+    unsigned ArchVGPRThreshold =
+        ST.getRegisterInfo()->getMaxNumVectorRegs(MF).first;
     unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
         &ST, DAG.MFI.getDynamicVGPRBlockSize(),
         PressureBefore.getVGPRNum(false, ArchVGPRThreshold));
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index a259b90545ee9..10ded0e1d1c3a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1629,15 +1629,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(this);
   }
 
-  unsigned getArchVGPRAllocationThreshold(const MachineFunction &MF) const {
-    if (hasGFX90AInsts() || !hasMAIInsts())
-      return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(this);
-
-    const Function &F = MF.getFunction();
-    std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
-    return getMaxNumVGPRs(Waves.first, 0);
-  }
-
   /// \returns Addressable number of VGPRs supported by the subtarget.
   unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const {
     return AMDGPU::IsaInfo::getAddressableNumVGPRs(this, DynamicVGPRBlockSize);
diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
index 03d5ac6dec025..cdc80ca9267d6 100644
--- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -209,8 +209,9 @@ bool SIFormMemoryClausesImpl::checkPressure(const MachineInstr &MI,
   // tracking does not account for the alignment requirements for SGPRs, or the
   // fragmentation of registers the allocator will need to satisfy.
   if (Occupancy >= MFI->getMinAllowedOccupancy() &&
-      MaxPressure.getVGPRNum(ST->hasGFX90AInsts(),
-                             ST->getArchVGPRAllocationThreshold(*MI.getMF())) <=
+      MaxPressure.getVGPRNum(
+          ST->hasGFX90AInsts(),
+          ST->getRegisterInfo()->getMaxNumVectorRegs(*MI.getMF()).first) <=
           MaxVGPRs / 2 &&
       MaxPressure.getSGPRNum() <= MaxSGPRs / 2) {
     LastRecordedOccupancy = Occupancy;

From 5a0696d975c90c5d634311b56d3224d351dcccdf Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes@amd.com>
Date: Tue, 29 Jul 2025 08:24:20 -0700
Subject: [PATCH 07/11] Add test

Change-Id: I68bc69d5bafa3d8161c7b507721a9cde3e99d2b1
---
 llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir | 348 ++++++++++++++++++++
 1 file changed, 348 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir

diff --git a/llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir b/llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir
new file mode 100644
index 0000000000000..358942e73a7c6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir
@@ -0,0 +1,348 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950  -run-pass=machine-scheduler --debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+--- |
+  define void @avgpr_rp_occ1() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @avgpr_rp_occ2() #1 {
+  entry:
+    unreachable
+  }
+
+  define void @avgpr_rp_occ3() #2 {
+  entry:
+    unreachable
+  }
+
+  define void @avgpr_rp_occ4() #3 {
+  entry:
+    unreachable
+  }
+
+  define void @avgpr_rp_occ5() #4 {
+  entry:
+    unreachable
+  }
+
+  define void @avgpr_rp_occ6() #5 {
+  entry:
+    unreachable
+  }
+
+  define void @avgpr_rp_occ7() #6 {
+  entry:
+    unreachable
+  }
+
+  define void @avgpr_rp_occ8() #7 {
+  entry:
+    unreachable
+  }
+
+  attributes #0 = {"amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="64,64"}
+  attributes #1 = {"amdgpu-waves-per-eu"="2,2" "amdgpu-flat-work-group-size"="64,64"}
+  attributes #2 = {"amdgpu-waves-per-eu"="3,3" "amdgpu-flat-work-group-size"="64,64"}
+  attributes #3 = {"amdgpu-waves-per-eu"="4,4" "amdgpu-flat-work-group-size"="64,64"}
+  attributes #4 = {"amdgpu-waves-per-eu"="5,5" "amdgpu-flat-work-group-size"="64,64"}
+  attributes #5 = {"amdgpu-waves-per-eu"="6,6" "amdgpu-flat-work-group-size"="64,64"}
+  attributes #6 = {"amdgpu-waves-per-eu"="7,7" "amdgpu-flat-work-group-size"="64,64"}
+  attributes #7 = {"amdgpu-waves-per-eu"="8,8" "amdgpu-flat-work-group-size"="64,64"}
+
+
+...
+
+# CHECK: avgpr_rp_occ1:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure:  VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 256 AGPRs: 192(O1), SGPRs: 0(O10), LVGPR WT: 256, LSGPR WT: 0 -> Occ: 1
+
+---
+name:            avgpr_rp_occ1
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:av_1024 = IMPLICIT_DEF
+    %9:av_1024 = IMPLICIT_DEF
+    %10:av_1024 = IMPLICIT_DEF
+    %11:av_1024 = IMPLICIT_DEF
+    %12:av_1024 = IMPLICIT_DEF
+    %13:av_1024 = IMPLICIT_DEF
+    %14:av_1024 = IMPLICIT_DEF
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7
+
+  bb.1:
+    KILL %8, %9, %10, %11, %12, %13, %14
+    S_ENDPGM 0
+...
+
+# CHECK: avgpr_rp_occ2:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure:  VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 128 AGPRs: 64(O2), SGPRs: 0(O10), LVGPR WT: 128, LSGPR WT: 0 -> Occ: 2
+
+---
+name:            avgpr_rp_occ2
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:av_1024 = IMPLICIT_DEF
+    %5:av_1024 = IMPLICIT_DEF
+    %6:av_1024 = IMPLICIT_DEF
+    SCHED_BARRIER 0
+    KILL %1, %2, %3
+
+  bb.1:
+    KILL %4, %5, %6
+    S_ENDPGM 0
+...
+
+# CHECK: avgpr_rp_occ3:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure:  VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 84 AGPRs: 44(O4), SGPRs: 0(O10), LVGPR WT: 84, LSGPR WT: 0 -> Occ: 4
+
+---
+name:            avgpr_rp_occ3
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:av_1024 = IMPLICIT_DEF
+    %4:av_1024 = IMPLICIT_DEF
+    SCHED_BARRIER 0
+    KILL %1, %2
+
+  bb.1:
+    KILL %3, %4
+    S_ENDPGM 0
+...
+
+# CHECK: avgpr_rp_occ4:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure:  VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 64 AGPRs: 64(O4), SGPRs: 0(O10), LVGPR WT: 64, LSGPR WT: 0 -> Occ: 4
+
+---
+name:            avgpr_rp_occ4
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:av_1024 = IMPLICIT_DEF
+    %4:av_1024 = IMPLICIT_DEF
+    SCHED_BARRIER 0
+    KILL %1, %2 
+
+  bb.1:
+    KILL %3, %4 
+    S_ENDPGM 0
+...
+
+# CHECK: avgpr_rp_occ5:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure:  VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 48 AGPRs: 80(O4), SGPRs: 0(O10), LVGPR WT: 64, LSGPR WT: 0 -> Occ: 4
+
+---
+name:            avgpr_rp_occ5
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:av_1024 = IMPLICIT_DEF
+    %4:av_1024 = IMPLICIT_DEF
+    SCHED_BARRIER 0
+    KILL %1, %2 
+
+  bb.1:
+    KILL %3, %4 
+    S_ENDPGM 0
+...
+
+# CHECK: avgpr_rp_occ6:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure:  VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 40 AGPRs: 88(O4), SGPRs: 0(O10), LVGPR WT: 64, LSGPR WT: 0 -> Occ: 4
+
+---
+name:            avgpr_rp_occ6
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:av_1024 = IMPLICIT_DEF
+    %4:av_1024 = IMPLICIT_DEF
+    SCHED_BARRIER 0
+    KILL %1, %2 
+
+  bb.1:
+    KILL %3, %4 
+    S_ENDPGM 0
+...
+
+# CHECK: avgpr_rp_occ7:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure:  VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 36 AGPRs: 92(O4), SGPRs: 0(O10), LVGPR WT: 64, LSGPR WT: 0 -> Occ: 4
+
+---
+name:            avgpr_rp_occ7
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:av_1024 = IMPLICIT_DEF
+    %4:av_1024 = IMPLICIT_DEF
+    SCHED_BARRIER 0
+    KILL %1, %2 
+
+  bb.1:
+    KILL %3, %4 
+    S_ENDPGM 0
+...
+
+# CHECK: avgpr_rp_occ8:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure:  VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 32 AGPRs: 96(O4), SGPRs: 0(O10), LVGPR WT: 64, LSGPR WT: 0 -> Occ: 4
+
+---
+name:            avgpr_rp_occ8
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:av_1024 = IMPLICIT_DEF
+    %4:av_1024 = IMPLICIT_DEF
+    SCHED_BARRIER 0
+    KILL %1, %2 
+
+  bb.1:
+    KILL %3, %4 
+    S_ENDPGM 0
+...
+

From 38e255d764360972d6c61b3dddf329c48d3c46fd Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes@amd.com>
Date: Tue, 29 Jul 2025 09:05:44 -0700
Subject: [PATCH 08/11] Rebase for getMaxNumVectorRegs move

Change-Id: I17c9239229b94c42c35b5683d77f8dfe3f70bafc
---
 llvm/lib/Target/AMDGPU/GCNRegPressure.cpp      |  7 +++----
 llvm/lib/Target/AMDGPU/GCNRegPressure.h        |  2 +-
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp    | 10 ++++------
 llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp |  2 +-
 4 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 966e810115195..bd03ccf5322e5 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -100,7 +100,7 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
                           unsigned MaxOccupancy) const {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   unsigned ArchVGPRThreshold =
-      ST.getRegisterInfo()->getMaxNumVectorRegs(MF).first;
+      ST.getMaxNumVectorRegs(MF.getFunction()).first;
   unsigned DynamicVGPRBlockSize =
       MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
 
@@ -252,7 +252,7 @@ Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST,
                       unsigned DynamicVGPRBlockSize,
                       const MachineFunction *MF) {
   unsigned ArchVGPRThreshold =
-      ST->getRegisterInfo()->getMaxNumVectorRegs(*MF).first;
+      ST->getMaxNumVectorRegs(MF->getFunction()).first;
   return Printable(
       [&RP, ST, DynamicVGPRBlockSize, ArchVGPRThreshold, MF](raw_ostream &OS) {
         OS << "VGPRs: " << RP.getArchVGPRNum(ArchVGPRThreshold) << ' '
@@ -906,8 +906,7 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) {
     return Printable([&RP, &MF](raw_ostream &OS) {
       OS << format(PFX "  %-5d", RP.getSGPRNum())
          << format(" %-5d", RP.getVGPRNum(false, MF.getSubtarget<GCNSubtarget>()
-                                                     .getRegisterInfo()
-                                                     ->getMaxNumVectorRegs(MF)
+                                                     .getMaxNumVectorRegs(MF.getFunction())
                                                      .first));
     });
   };
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index d61e0348dabb4..0e03834380525 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -132,7 +132,7 @@ struct GCNRegPressure {
         ST.getOccupancyWithNumSGPRs(getSGPRNum()),
         ST.getOccupancyWithNumVGPRs(
             getVGPRNum(ST.hasGFX90AInsts(),
-                       ST.getRegisterInfo()->getMaxNumVectorRegs(MF).first),
+                       ST.getMaxNumVectorRegs(MF.getFunction()).first),
             DynamicVGPRBlockSize));
   }
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 52359135f8893..80e6c49c42fbc 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -191,8 +191,7 @@ static void getRegisterPressures(
     NewPressure = TempUpwardTracker.getPressure();
   }
   unsigned ArchVGPRThreshold = DAG->MF.getSubtarget<GCNSubtarget>()
-                                   .getRegisterInfo()
-                                   ->getMaxNumVectorRegs(DAG->MF)
+                                   .getMaxNumVectorRegs(DAG->MF.getFunction())
                                    .first;
   Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
   Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
@@ -346,8 +345,7 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
       SGPRPressure = T->getPressure().getSGPRNum();
       VGPRPressure =
           T->getPressure().getArchVGPRNum(DAG->MF.getSubtarget<GCNSubtarget>()
-                                              .getRegisterInfo()
-                                              ->getMaxNumVectorRegs(DAG->MF)
+                                              .getMaxNumVectorRegs(DAG->MF.getFunction())
                                               .first);
     }
   }
@@ -1290,7 +1288,7 @@ void GCNSchedStage::checkScheduling() {
   LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
 
   unsigned ArchVGPRThreshold =
-      ST.getRegisterInfo()->getMaxNumVectorRegs(MF).first;
+      ST.getMaxNumVectorRegs(MF.getFunction()).first;
   if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
       PressureAfter.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) <=
           S.VGPRCriticalLimit) {
@@ -1483,7 +1481,7 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
   // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
   if (DAG.MFI.isDynamicVGPREnabled()) {
     unsigned ArchVGPRThreshold =
-        ST.getRegisterInfo()->getMaxNumVectorRegs(MF).first;
+        ST.getMaxNumVectorRegs(MF.getFunction()).first;
     unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
         &ST, DAG.MFI.getDynamicVGPRBlockSize(),
         PressureBefore.getVGPRNum(false, ArchVGPRThreshold));
diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
index cdc80ca9267d6..e29ac72c7ba31 100644
--- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -211,7 +211,7 @@ bool SIFormMemoryClausesImpl::checkPressure(const MachineInstr &MI,
   if (Occupancy >= MFI->getMinAllowedOccupancy() &&
       MaxPressure.getVGPRNum(
           ST->hasGFX90AInsts(),
-          ST->getRegisterInfo()->getMaxNumVectorRegs(*MI.getMF()).first) <=
+          ST->getMaxNumVectorRegs(MI.getMF()->getFunction()).first) <=
           MaxVGPRs / 2 &&
       MaxPressure.getSGPRNum() <= MaxSGPRs / 2) {
     LastRecordedOccupancy = Occupancy;

From dcecf426736fad35ae746e9d5e8b29602fdf797a Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes@amd.com>
Date: Tue, 29 Jul 2025 09:15:16 -0700
Subject: [PATCH 09/11] Formatting

Change-Id: I992cdc7ab89d244eaed82d4e671238878376c8d2
---
 llvm/lib/Target/AMDGPU/GCNRegPressure.cpp   |  9 ++++-----
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 14 ++++++--------
 2 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index bd03ccf5322e5..4564163b137be 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -99,8 +99,7 @@ void GCNRegPressure::inc(unsigned Reg,
 bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
                           unsigned MaxOccupancy) const {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  unsigned ArchVGPRThreshold =
-      ST.getMaxNumVectorRegs(MF.getFunction()).first;
+  unsigned ArchVGPRThreshold = ST.getMaxNumVectorRegs(MF.getFunction()).first;
   unsigned DynamicVGPRBlockSize =
       MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
 
@@ -251,8 +250,7 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
 Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST,
                       unsigned DynamicVGPRBlockSize,
                       const MachineFunction *MF) {
-  unsigned ArchVGPRThreshold =
-      ST->getMaxNumVectorRegs(MF->getFunction()).first;
+  unsigned ArchVGPRThreshold = ST->getMaxNumVectorRegs(MF->getFunction()).first;
   return Printable(
       [&RP, ST, DynamicVGPRBlockSize, ArchVGPRThreshold, MF](raw_ostream &OS) {
         OS << "VGPRs: " << RP.getArchVGPRNum(ArchVGPRThreshold) << ' '
@@ -906,7 +904,8 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) {
     return Printable([&RP, &MF](raw_ostream &OS) {
       OS << format(PFX "  %-5d", RP.getSGPRNum())
          << format(" %-5d", RP.getVGPRNum(false, MF.getSubtarget<GCNSubtarget>()
-                                                     .getMaxNumVectorRegs(MF.getFunction())
+                                                     .getMaxNumVectorRegs(
+                                                         MF.getFunction())
                                                      .first));
     });
   };
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 80e6c49c42fbc..3cf9a7c0f972e 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -343,10 +343,10 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                             ? static_cast<GCNRPTracker *>(&UpwardTracker)
                             : static_cast<GCNRPTracker *>(&DownwardTracker);
       SGPRPressure = T->getPressure().getSGPRNum();
-      VGPRPressure =
-          T->getPressure().getArchVGPRNum(DAG->MF.getSubtarget<GCNSubtarget>()
-                                              .getMaxNumVectorRegs(DAG->MF.getFunction())
-                                              .first);
+      VGPRPressure = T->getPressure().getArchVGPRNum(
+          DAG->MF.getSubtarget<GCNSubtarget>()
+              .getMaxNumVectorRegs(DAG->MF.getFunction())
+              .first);
     }
   }
   ReadyQueue &Q = Zone.Available;
@@ -1287,8 +1287,7 @@ void GCNSchedStage::checkScheduling() {
                     << print(PressureAfter, &ST, 0, &MF));
   LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
 
-  unsigned ArchVGPRThreshold =
-      ST.getMaxNumVectorRegs(MF.getFunction()).first;
+  unsigned ArchVGPRThreshold = ST.getMaxNumVectorRegs(MF.getFunction()).first;
   if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
       PressureAfter.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) <=
           S.VGPRCriticalLimit) {
@@ -1480,8 +1479,7 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
 
   // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
   if (DAG.MFI.isDynamicVGPREnabled()) {
-    unsigned ArchVGPRThreshold =
-        ST.getMaxNumVectorRegs(MF.getFunction()).first;
+    unsigned ArchVGPRThreshold = ST.getMaxNumVectorRegs(MF.getFunction()).first;
     unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
         &ST, DAG.MFI.getDynamicVGPRBlockSize(),
         PressureBefore.getVGPRNum(false, ArchVGPRThreshold));

From d59fba1f01a00df11a46d0ae236901dfb30ea899 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes@amd.com>
Date: Tue, 29 Jul 2025 11:48:29 -0700
Subject: [PATCH 10/11] Fix test + handling of ArchVGPR pressure

Change-Id: I15cd9b4e9e38d7000a403bed56918819ae858658
---
 llvm/lib/Target/AMDGPU/GCNRegPressure.h     |  14 +-
 llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir | 161 ++++++++++++++++++--
 2 files changed, 155 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 0e03834380525..8b80cc42c9bb0 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -97,15 +97,17 @@ struct GCNRegPressure {
   /// \returns the ArchVGPR32 pressure, plus the AVGPRS which we assume will be
   /// allocated as VGPR
   unsigned getArchVGPRNum(unsigned AddressableArchVGPR) const {
-    return std::min(Value[VGPR] + Value[AVGPR], AddressableArchVGPR);
+    unsigned AVGPRsAsVGPRs =
+        getAVGPRsAsVGPRsNum(Value[VGPR], Value[AVGPR], AddressableArchVGPR);
+
+    return Value[VGPR] + AVGPRsAsVGPRs;
   }
   /// \returns the AccVGPR32 pressure
   unsigned getAGPRNum(unsigned AddressableArchVGPR) const {
-    unsigned VGPRsForAGPRs =
-        Value[VGPR] + Value[AVGPR] > AddressableArchVGPR
-            ? (Value[VGPR] + Value[AVGPR] - AddressableArchVGPR)
-            : 0;
-    return Value[AGPR] + VGPRsForAGPRs;
+    unsigned AVGPRsAsAGPRs = getAVGPRsAsAGPRsNum(
+        Value[VGPR], Value[AGPR], Value[AVGPR], AddressableArchVGPR);
+
+    return Value[AGPR] + AVGPRsAsAGPRs;
   }
   /// \returns the AVGPR32 pressure
   unsigned getAVGPRNum() const { return Value[AVGPR]; }
diff --git a/llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir b/llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir
index 358942e73a7c6..a5183ce0d2661 100644
--- a/llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir
+++ b/llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir
@@ -42,6 +42,22 @@
     unreachable
   }
 
+
+  define void @vgpr_rp_occ1() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @vgpr_rp_occ2() #1 {
+  entry:
+    unreachable
+  }
+
+  define void @vgpr_rp_occ3() #2 {
+  entry:
+    unreachable
+  }
+
   attributes #0 = {"amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="64,64"}
   attributes #1 = {"amdgpu-waves-per-eu"="2,2" "amdgpu-flat-work-group-size"="64,64"}
   attributes #2 = {"amdgpu-waves-per-eu"="3,3" "amdgpu-flat-work-group-size"="64,64"}
@@ -194,8 +210,8 @@ machineFunctionInfo:
 body:             |
   bb.0:
    liveins: $vgpr0, $sgpr4_sgpr5
-    %1:vreg_1024 = IMPLICIT_DEF
-    %2:vreg_1024 = IMPLICIT_DEF
+    %1:av_1024 = IMPLICIT_DEF
+    %2:av_1024 = IMPLICIT_DEF
     %3:av_1024 = IMPLICIT_DEF
     %4:av_1024 = IMPLICIT_DEF
     SCHED_BARRIER 0
@@ -210,7 +226,7 @@ body:             |
 # CHECK: Pressure before scheduling:
 # CHECK-NEXT: Region live-ins:
 # CHECK-NEXT: Region live-in pressure:  VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
-# CHECK-NEXT: Region register pressure: VGPRs: 48 AGPRs: 80(O4), SGPRs: 0(O10), LVGPR WT: 64, LSGPR WT: 0 -> Occ: 4
+# CHECK-NEXT: Region register pressure: VGPRs: 48 AGPRs: 80(O4), SGPRs: 0(O10), LVGPR WT: 80, LSGPR WT: 0 -> Occ: 4
 
 ---
 name:            avgpr_rp_occ5
@@ -229,8 +245,8 @@ machineFunctionInfo:
 body:             |
   bb.0:
    liveins: $vgpr0, $sgpr4_sgpr5
-    %1:vreg_1024 = IMPLICIT_DEF
-    %2:vreg_1024 = IMPLICIT_DEF
+    %1:av_1024 = IMPLICIT_DEF
+    %2:av_1024 = IMPLICIT_DEF
     %3:av_1024 = IMPLICIT_DEF
     %4:av_1024 = IMPLICIT_DEF
     SCHED_BARRIER 0
@@ -245,7 +261,7 @@ body:             |
 # CHECK: Pressure before scheduling:
 # CHECK-NEXT: Region live-ins:
 # CHECK-NEXT: Region live-in pressure:  VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
-# CHECK-NEXT: Region register pressure: VGPRs: 40 AGPRs: 88(O4), SGPRs: 0(O10), LVGPR WT: 64, LSGPR WT: 0 -> Occ: 4
+# CHECK-NEXT: Region register pressure: VGPRs: 40 AGPRs: 88(O4), SGPRs: 0(O10), LVGPR WT: 88, LSGPR WT: 0 -> Occ: 4
 
 ---
 name:            avgpr_rp_occ6
@@ -264,8 +280,8 @@ machineFunctionInfo:
 body:             |
   bb.0:
    liveins: $vgpr0, $sgpr4_sgpr5
-    %1:vreg_1024 = IMPLICIT_DEF
-    %2:vreg_1024 = IMPLICIT_DEF
+    %1:av_1024 = IMPLICIT_DEF
+    %2:av_1024 = IMPLICIT_DEF
     %3:av_1024 = IMPLICIT_DEF
     %4:av_1024 = IMPLICIT_DEF
     SCHED_BARRIER 0
@@ -280,7 +296,7 @@ body:             |
 # CHECK: Pressure before scheduling:
 # CHECK-NEXT: Region live-ins:
 # CHECK-NEXT: Region live-in pressure:  VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
-# CHECK-NEXT: Region register pressure: VGPRs: 36 AGPRs: 92(O4), SGPRs: 0(O10), LVGPR WT: 64, LSGPR WT: 0 -> Occ: 4
+# CHECK-NEXT: Region register pressure: VGPRs: 36 AGPRs: 92(O4), SGPRs: 0(O10), LVGPR WT: 92, LSGPR WT: 0 -> Occ: 4
 
 ---
 name:            avgpr_rp_occ7
@@ -299,8 +315,8 @@ machineFunctionInfo:
 body:             |
   bb.0:
    liveins: $vgpr0, $sgpr4_sgpr5
-    %1:vreg_1024 = IMPLICIT_DEF
-    %2:vreg_1024 = IMPLICIT_DEF
+    %1:av_1024 = IMPLICIT_DEF
+    %2:av_1024 = IMPLICIT_DEF
     %3:av_1024 = IMPLICIT_DEF
     %4:av_1024 = IMPLICIT_DEF
     SCHED_BARRIER 0
@@ -315,7 +331,7 @@ body:             |
 # CHECK: Pressure before scheduling:
 # CHECK-NEXT: Region live-ins:
 # CHECK-NEXT: Region live-in pressure:  VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
-# CHECK-NEXT: Region register pressure: VGPRs: 32 AGPRs: 96(O4), SGPRs: 0(O10), LVGPR WT: 64, LSGPR WT: 0 -> Occ: 4
+# CHECK-NEXT: Region register pressure: VGPRs: 32 AGPRs: 96(O4), SGPRs: 0(O10), LVGPR WT: 96, LSGPR WT: 0 -> Occ: 4
 
 ---
 name:            avgpr_rp_occ8
@@ -334,8 +350,8 @@ machineFunctionInfo:
 body:             |
   bb.0:
    liveins: $vgpr0, $sgpr4_sgpr5
-    %1:vreg_1024 = IMPLICIT_DEF
-    %2:vreg_1024 = IMPLICIT_DEF
+    %1:av_1024 = IMPLICIT_DEF
+    %2:av_1024 = IMPLICIT_DEF
     %3:av_1024 = IMPLICIT_DEF
     %4:av_1024 = IMPLICIT_DEF
     SCHED_BARRIER 0
@@ -346,3 +362,120 @@ body:             |
     S_ENDPGM 0
 ...
 
+# CHECK: vgpr_rp_occ1:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure:  VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 448 AGPRs: 0(O1), SGPRs: 0(O10), LVGPR WT: 448, LSGPR WT: 0 -> Occ: 1
+
+---
+name:            vgpr_rp_occ1
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_1024 = IMPLICIT_DEF
+    %9:vreg_1024 = IMPLICIT_DEF
+    %10:vreg_1024 = IMPLICIT_DEF
+    %11:vreg_1024 = IMPLICIT_DEF
+    %12:vreg_1024 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    %14:vreg_1024 = IMPLICIT_DEF
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7
+
+  bb.1:
+    KILL %8, %9, %10, %11, %12, %13, %14
+    S_ENDPGM 0
+...
+
+# CHECK: vgpr_rp_occ2:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure:  VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 192 AGPRs: 0(O2), SGPRs: 0(O10), LVGPR WT: 192, LSGPR WT: 0 -> Occ: 2
+
+---
+name:            vgpr_rp_occ2
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    SCHED_BARRIER 0
+    KILL %1, %2, %3
+
+  bb.1:
+    KILL %4, %5, %6
+    S_ENDPGM 0
+...
+
+# CHECK: vgpr_rp_occ3:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure:  VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 128 AGPRs: 0(O4), SGPRs: 0(O10), LVGPR WT: 128, LSGPR WT: 0 -> Occ: 4
+
+
+---
+name:            vgpr_rp_occ3
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    SCHED_BARRIER 0
+    KILL %1, %2
+
+  bb.1:
+    KILL %3, %4
+    S_ENDPGM 0
+...

From 5f9d402fa866aa1667702df692410c35dadc70af Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes@amd.com>
Date: Tue, 29 Jul 2025 15:37:45 -0700
Subject: [PATCH 11/11] Fix print with unspecified ST/MF

Change-Id: I172e8c013f41daea997266dea9c20335c75c9b83
---
 llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 4564163b137be..dd007e6cd6b31 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -250,7 +250,10 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
 Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST,
                       unsigned DynamicVGPRBlockSize,
                       const MachineFunction *MF) {
-  unsigned ArchVGPRThreshold = ST->getMaxNumVectorRegs(MF->getFunction()).first;
+  unsigned ArchVGPRThreshold = std::numeric_limits<unsigned int>::max();
+  if (ST && MF)
+    ArchVGPRThreshold = ST->getMaxNumVectorRegs(MF->getFunction()).first;
+
   return Printable(
       [&RP, ST, DynamicVGPRBlockSize, ArchVGPRThreshold, MF](raw_ostream &OS) {
         OS << "VGPRs: " << RP.getArchVGPRNum(ArchVGPRThreshold) << ' '