[AMDGPU] introduce S_WAITCNT_FENCE_soft emitted by memory legalizer #150167

Status: Closed — wants to merge 3 commits
33 changes: 33 additions & 0 deletions llvm/lib/Target/AMDGPU/SIDefines.h
@@ -10,6 +10,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_SIDEFINES_H
#define LLVM_LIB_TARGET_AMDGPU_SIDEFINES_H

#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/MC/MCInstrDesc.h"

namespace llvm {
@@ -419,6 +420,38 @@ enum CPol {

} // namespace CPol

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
NONE,
SINGLETHREAD,
WAVEFRONT,
WORKGROUP,
AGENT,
SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operation. Can be ORed together.
enum class SIAtomicAddrSpace {
NONE = 0u,
GLOBAL = 1u << 0,
LDS = 1u << 1,
SCRATCH = 1u << 2,
GDS = 1u << 3,
OTHER = 1u << 4,

/// The address spaces that can be accessed by a FLAT instruction.
FLAT = GLOBAL | LDS | SCRATCH,

/// The address spaces that support atomic instructions.
ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

/// All address spaces.
ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

namespace SendMsg { // Encoding of SIMM16 used in s_sendmsg* insns.

enum Id { // Message ID, width(4) [3:0].
34 changes: 34 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -37,7 +37,9 @@
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;
using namespace AMDGPU;

#define DEBUG_TYPE "si-insert-waitcnts"

@@ -1381,6 +1383,32 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
Modified = true;
} else
WaitcntInstr = &II;
} else if (Opcode == AMDGPU::S_WAITCNT_FENCE_soft) {
// Each direct load to LDS is also a store to LDS, but we do not have a
// separate counter for it. Instead these operations increment LOAD_CNT
// and need to be waited for at a release fence. So we treat a release
// fence as if it depends on any previous LDS DMA stores.
unsigned Ordering =
TII->getNamedOperand(II, AMDGPU::OpName::Ordering)->getImm();
unsigned Scope =
TII->getNamedOperand(II, AMDGPU::OpName::Scope)->getImm();
unsigned AddrSpace =
TII->getNamedOperand(II, AMDGPU::OpName::AddrSpace)->getImm();
if (isReleaseOrStronger((AtomicOrdering)Ordering) &&
Review comment (Contributor):

Thinking about it, this part bothers me a bit, because now InsertWaitCnt has to be aware of atomic orderings and deal with them accordingly. It blurs the separation of concerns between this pass and the MemoryLegalizer.

I know there is a good argument for doing that, but I think this is too generic for what we need at this stage. It's something that needs a lot of planning beforehand (and it's an item on my to-do list, though lower priority).
Can we consider adding a simple s_wait_lds_dma_soft instead, targeted exactly at this use case, and emit that? I would prefer doing the minimum amount of changes and then removing that pseudo later in favor of a generic one, rather than locking us into a specific approach right now.

I think what I'm afraid of is that this sets a precedent, and over time I suspect we'll rely more and more on this pseudo here and elsewhere (e.g. instead of fixing something properly, we just check the pseudo elsewhere and hack a fix there instead), and end up with the memory model implementation being spread over multiple files, which will make it difficult to manage.

Review comment (@ssahasra, Collaborator — Author, Jul 24, 2025):

> I know there is a good argument for doing that, but I think this is too generic for what we need at this stage. It's something that needs a lot of planning beforehand (and it's an item on my to-do list, though lower priority).

Quoting from #147257:

> Something still doesn't feel right with this PR for me, I feel like this isn't the right approach but I struggle to suggest something better.
>
> Longer term we should really just have a single waitcnt pseudo for the MemoryLegalizer that is target-independent, it'd fix issues like these if we had special sentinel values for different things.

In all sincerity, could it be possible that there is an analysis paralysis happening here? Are we overthinking this situation? What could be more effective as "a single waitcnt pseudo" than a fence? What is an example of something less generic than a fence and yet effective for all uses?

> Can we consider adding a simple s_wait_lds_dma_soft instead, targeted exactly at this use case, and emit that?

I consider that too specific. I contend that the distinction between the memory legalizer and the waitcount inserter absolutely needs to be blurred. They cannot exist separately; they implement the memory model together and complement each other in that process. One specific problem with having an S_WAIT_LDS_DMA_soft is that it is needed only on release operations but not on acquire operations.

> I would prefer doing the minimum amount of changes and then removing that pseudo later in favor of a generic one, rather than locking us into a specific approach right now.

Do you have any specific examples of potential concerns? This approach is not locking us into anything more than information that is already relevant to the memory model, which is orderings, scopes, and address spaces. It can't possibly lock us into anything incompatible with future work.

> I think what I'm afraid of is that this sets a precedent, and over time I suspect we'll rely more and more on this pseudo here and elsewhere (e.g. instead of fixing something properly, we just check the pseudo elsewhere and hack a fix there instead), and end up with the memory model implementation being spread over multiple files, which will make it difficult to manage.

That is precisely my intention. It is a mistake to think that only the memory legalizer is relevant to the memory model. It can produce "safe cache operations and waits", but it can't produce efficient ones. The real memory model has to be spread across two files, or perhaps we should merge those two files. But I don't see this as a major blocker for what I am proposing here.

Review comment (Contributor):

> In all sincerity, could it be possible that there is an analysis paralysis happening here? Are we overthinking this situation?

Yes, definitely. Sorry about that.

I'm going to try to lay out my thoughts in a simpler way, so I don't start contradicting myself again:

  • I agree the MemoryLegalizer and InsertWaitCnt are inseparable, but there is still some separation of concerns: InsertWaitCnt doesn't look at atomic orderings, for example. That can be blurred in the future, but as the owner of the MemoryLegalizer I would like that to be a separate, more carefully planned task, rather than something done to fix a specific issue.
    • Furthermore, I see the legalizer's role as "implement the memory model in a conservative way", while InsertWaitCnt's is "optimize the waitcnts while preserving semantics" (plus inserting new waits, of course).
  • When I imagined a generic pseudo for waitcnt insertion, I did not imagine something that includes information like the atomic ordering. I imagined something with ad-hoc bits that carry only the information we need, and nothing else.
    • For example, we have wondered a few times whether the memory legalizer needed to insert waits on vm_vsrc. That can't be conveyed using the AS/Ordering alone. We need something more specific, something where we can feel free to add new flags for any reason we see fit.
  • My worry is that if the operation is too generic, and carries info like atomic ordering, it opens the door to implementing some memory model fixes outside the legalizer (e.g. waitcnts for specific fences would now be handled by InsertWaitCnt without the legalizer's knowledge), which I do not want, as this is best kept in one place.
    • I guess it's valid to see this as irrational, since I don't have proof that it could or will happen.

So in my opinion, @kerbowa's approach in #138802 fits best.
Yes, it's not ideal, but there are a lot of things not ideal with the way things are laid out right now. I'd rather keep on the same trajectory by adding a specific pseudo, and then refactor it all in one batch, than try something new to fix a specific problem.

Again, sorry for derailing this a bit. The discussion spread over weeks and multiple PRs, so I lost context and contradicted myself a few times.

Scope >= (unsigned)AMDGPU::SIAtomicScope::WORKGROUP &&
any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) {
LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_FENCE_soft: " << II
<< "Before: " << Wait.LoadCnt << '\n';);
ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait);
LLVM_DEBUG(dbgs() << "After: " << Wait.LoadCnt << '\n';);
}
// It is possible (but unlikely) that this is the only wait instruction,
// in which case, we exit this loop without a WaitcntInstr to consume
// `Wait`. But that works because `Wait` was passed in by reference, and
// the callee eventually calls createNewWaitcnt on it. We test this
// possibility in an artificial MIR test since such a situation cannot be
// recreated by running the memory legalizer.
II.eraseFromParent();
} else {
assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
@@ -1552,6 +1580,11 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
ScoreBrackets.simplifyWaitcnt(OldWait);
Wait = Wait.combined(OldWait);
UpdatableInstr = &CombinedStoreDsCntInstr;
} else if (Opcode == AMDGPU::S_WAITCNT_FENCE_soft) {
// Architectures higher than GFX10 do not have direct loads to
// LDS, so no work required here yet.
II.eraseFromParent();
continue;
} else {
std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
assert(CT.has_value());
@@ -2444,6 +2477,7 @@ static bool isWaitInstr(MachineInstr &Inst) {
Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
Opcode == AMDGPU::S_WAITCNT_FENCE_soft ||
counterTypeForInstr(Opcode).has_value();
}

71 changes: 39 additions & 32 deletions llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -57,38 +57,6 @@ enum class Position {
AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
NONE,
SINGLETHREAD,
WAVEFRONT,
WORKGROUP,
AGENT,
SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operation. Can be ORed together.
enum class SIAtomicAddrSpace {
NONE = 0u,
GLOBAL = 1u << 0,
LDS = 1u << 1,
SCRATCH = 1u << 2,
GDS = 1u << 3,
OTHER = 1u << 4,

/// The address spaces that can be accessed by a FLAT instruction.
FLAT = GLOBAL | LDS | SCRATCH,

/// The address spaces that support atomic instructions.
ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

/// All address spaces.
ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

class SIMemOpInfo final {
private:

@@ -1160,6 +1128,19 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
Changed = true;
}

// Emit a soft wait count as a place holder for SIInsertWaitcnts, which will
// later add additional waits. To minimize clutter, we do this only when
// required. For now this just means a release operation at workgroup scope
// that synchronizes LDS, required by direct loads to LDS.
if (isReleaseOrStronger(Order) && Scope == SIAtomicScope::WORKGROUP &&
Review comment (Contributor): that should go into some helper function.

any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_FENCE_soft))
.addImm((unsigned)Order)
.addImm((unsigned)Scope)
.addImm((unsigned)AddrSpace);
Changed = true;
}

if (Pos == Position::AFTER)
--MI;

@@ -2068,6 +2049,19 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
Changed = true;
}

// Emit a soft wait count as a place holder for SIInsertWaitcnts, which will
// later add additional waits. To minimize clutter, we do this only when
// required. For now this just means a release operation at workgroup scope
// that synchronizes LDS, required by direct loads to LDS.
if (isReleaseOrStronger(Order) && Scope == SIAtomicScope::WORKGROUP &&
any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_FENCE_soft))
.addImm((unsigned)Order)
.addImm((unsigned)Scope)
.addImm((unsigned)AddrSpace);
Changed = true;
}

if (VSCnt) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
@@ -2385,6 +2379,19 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
Changed = true;
}

// Emit a soft wait count as a place holder for SIInsertWaitcnts, which will
// later add additional waits. To minimize clutter, we do this only when
// required. For now this just means a release operation at workgroup scope
// that synchronizes LDS, required by direct loads to LDS.
if (isReleaseOrStronger(Order) && Scope == SIAtomicScope::WORKGROUP &&
any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_FENCE_soft))
.addImm((unsigned)Order)
.addImm((unsigned)Scope)
.addImm((unsigned)AddrSpace);
Changed = true;
}

if (Pos == Position::AFTER)
--MI;

6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1621,6 +1621,12 @@ let OtherPredicates = [HasImageInsts] in {
def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
}

def S_WAITCNT_FENCE_soft : SPseudoInstSI <
(outs), (ins i32imm:$Ordering, i32imm:$Scope, i32imm:$AddrSpace)> {
let hasSideEffects = 0;
let UseNamedOperandTable = 1;
}

def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_sethalt timm:$simm16)]>;
def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">;