diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 3902d4c3b1027..9d30951cac1a3 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -10,6 +10,7 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_SIDEFINES_H
 #define LLVM_LIB_TARGET_AMDGPU_SIDEFINES_H
 
+#include "llvm/ADT/BitmaskEnum.h"
 #include "llvm/MC/MCInstrDesc.h"
 
 namespace llvm {
@@ -419,6 +420,38 @@ enum CPol {
 
 } // namespace CPol
 
+/// The atomic synchronization scopes supported by the AMDGPU target.
+enum class SIAtomicScope {
+  NONE,
+  SINGLETHREAD,
+  WAVEFRONT,
+  WORKGROUP,
+  AGENT,
+  SYSTEM
+};
+
+/// The distinct address spaces supported by the AMDGPU target for
+/// atomic memory operations. Can be ORed together.
+enum class SIAtomicAddrSpace {
+  NONE = 0u,
+  GLOBAL = 1u << 0,
+  LDS = 1u << 1,
+  SCRATCH = 1u << 2,
+  GDS = 1u << 3,
+  OTHER = 1u << 4,
+
+  /// The address spaces that can be accessed by a FLAT instruction.
+  FLAT = GLOBAL | LDS | SCRATCH,
+
+  /// The address spaces that support atomic instructions.
+  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
+
+  /// All address spaces.
+  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
+
+  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
+};
+
 namespace SendMsg { // Encoding of SIMM16 used in s_sendmsg* insns.
 
 enum Id { // Message ID, width(4) [3:0].
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 9faf4974e3fd6..ca1c534896b5a 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -37,7 +37,9 @@
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/Support/DebugCounter.h"
 #include "llvm/TargetParser/TargetParser.h"
+
 using namespace llvm;
+using namespace AMDGPU;
 
 #define DEBUG_TYPE "si-insert-waitcnts"
 
@@ -1381,6 +1383,32 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
         Modified = true;
       } else
         WaitcntInstr = &II;
+    } else if (Opcode == AMDGPU::S_WAITCNT_FENCE_soft) {
+      // Each direct load to LDS is also a store to LDS, but we do not have a
+      // separate counter for it. Instead, these operations increment LOAD_CNT
+      // and need to be waited for at a release fence. So we treat a release
+      // fence as if it depends on any previous LDS DMA stores.
+      unsigned Ordering =
+          TII->getNamedOperand(II, AMDGPU::OpName::Ordering)->getImm();
+      unsigned Scope =
+          TII->getNamedOperand(II, AMDGPU::OpName::Scope)->getImm();
+      unsigned AddrSpace =
+          TII->getNamedOperand(II, AMDGPU::OpName::AddrSpace)->getImm();
+      if (isReleaseOrStronger((AtomicOrdering)Ordering) &&
+          Scope >= (unsigned)AMDGPU::SIAtomicScope::WORKGROUP &&
+          any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) {
+        LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_FENCE_soft: " << II
+                          << "Before: " << Wait.LoadCnt << '\n';);
+        ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait);
+        LLVM_DEBUG(dbgs() << "After: " << Wait.LoadCnt << '\n';);
+      }
+      // It is possible (but unlikely) that this is the only wait instruction,
+      // in which case we exit this loop without a WaitcntInstr to consume
+      // `Wait`. But that works because `Wait` was passed in by reference, and
+      // the caller eventually calls createNewWaitcnt on it. We test this
+      // possibility in an artificial MIR test since such a situation cannot be
+      // recreated by running the memory legalizer.
+      II.eraseFromParent();
     } else {
       assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
       assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
@@ -1552,6 +1580,11 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
       ScoreBrackets.simplifyWaitcnt(OldWait);
       Wait = Wait.combined(OldWait);
       UpdatableInstr = &CombinedStoreDsCntInstr;
+    } else if (Opcode == AMDGPU::S_WAITCNT_FENCE_soft) {
+      // Architectures higher than GFX10 do not have direct loads to
+      // LDS, so no work required here yet.
+      II.eraseFromParent();
+      continue;
     } else {
       std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
       assert(CT.has_value());
@@ -2444,6 +2477,7 @@ static bool isWaitInstr(MachineInstr &Inst) {
          Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
         Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
         Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
+        Opcode == AMDGPU::S_WAITCNT_FENCE_soft ||
         counterTypeForInstr(Opcode).has_value();
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 3212060f303a5..6e998098a1ab2 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -57,38 +57,6 @@ enum class Position {
   AFTER
 };
 
-/// The atomic synchronization scopes supported by the AMDGPU target.
-enum class SIAtomicScope {
-  NONE,
-  SINGLETHREAD,
-  WAVEFRONT,
-  WORKGROUP,
-  AGENT,
-  SYSTEM
-};
-
-/// The distinct address spaces supported by the AMDGPU target for
-/// atomic memory operation. Can be ORed together.
-enum class SIAtomicAddrSpace {
-  NONE = 0u,
-  GLOBAL = 1u << 0,
-  LDS = 1u << 1,
-  SCRATCH = 1u << 2,
-  GDS = 1u << 3,
-  OTHER = 1u << 4,
-
-  /// The address spaces that can be accessed by a FLAT instruction.
-  FLAT = GLOBAL | LDS | SCRATCH,
-
-  /// The address spaces that support atomic instructions.
-  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
-
-  /// All address spaces.
-  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
-
-  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
-};
-
 class SIMemOpInfo final {
 private:
 
@@ -1160,6 +1128,19 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
     Changed = true;
   }
 
+  // Emit a soft wait count as a place holder for SIInsertWaitcnts, which will
+  // later add additional waits. To minimize clutter, we do this only when
+  // required. For now this just means a release operation at workgroup scope
+  // that synchronizes LDS, required by direct loads to LDS.
+  if (isReleaseOrStronger(Order) && Scope == SIAtomicScope::WORKGROUP &&
+      any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) {
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_FENCE_soft))
+        .addImm((unsigned)Order)
+        .addImm((unsigned)Scope)
+        .addImm((unsigned)AddrSpace);
+    Changed = true;
+  }
+
   if (Pos == Position::AFTER)
     --MI;
 
@@ -2068,6 +2049,19 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
     Changed = true;
   }
 
+  // Emit a soft wait count as a place holder for SIInsertWaitcnts, which will
+  // later add additional waits. To minimize clutter, we do this only when
+  // required. For now this just means a release operation at workgroup scope
+  // that synchronizes LDS, required by direct loads to LDS.
+ if (isReleaseOrStronger(Order) && Scope == SIAtomicScope::WORKGROUP && + any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_FENCE_soft)) + .addImm((unsigned)Order) + .addImm((unsigned)Scope) + .addImm((unsigned)AddrSpace); + Changed = true; + } + if (VSCnt) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft)) .addReg(AMDGPU::SGPR_NULL, RegState::Undef) @@ -2385,6 +2379,19 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, Changed = true; } + // Emit a soft wait count as a place holder for SIInsertWaitcnts, which will + // later add additional waits. To minimize clutter, we do this only when + // required. For now this just means a release operation at workgroup scope + // that synchronizes LDS, required by direct loads to LDS. + if (isReleaseOrStronger(Order) && Scope == SIAtomicScope::WORKGROUP && + any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_FENCE_soft)) + .addImm((unsigned)Order) + .addImm((unsigned)Scope) + .addImm((unsigned)AddrSpace); + Changed = true; + } + if (Pos == Position::AFTER) --MI; diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index e103ccc2f00e6..df1a7bd4424bc 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1621,6 +1621,12 @@ let OtherPredicates = [HasImageInsts] in { def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">; } +def S_WAITCNT_FENCE_soft : SPseudoInstSI < + (outs), (ins i32imm:$Ordering, i32imm:$Scope, i32imm:$AddrSpace)> { + let hasSideEffects = 0; + let UseNamedOperandTable = 1; +} + def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_sethalt timm:$simm16)]>; def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll index 66037615f0ba0..1f01c64de546c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll @@ -536,30 +536,36 @@ entry: define amdgpu_kernel void @workgroup_one_as_release() #0 { ; GFX6-LABEL: name: workgroup_one_as_release ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_one_as_release ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: workgroup_one_as_release ; GFX10WGP: bb.0.entry: ; GFX10WGP-NEXT: S_WAITCNT_soft 16240 + ; GFX10WGP-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: workgroup_one_as_release ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_one_as_release ; GFX11WGP: bb.0.entry: ; GFX11WGP-NEXT: S_WAITCNT_soft 1015 + ; GFX11WGP-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX11WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX11WGP-NEXT: S_ENDPGM 0 ; ; GFX11CU-LABEL: name: workgroup_one_as_release ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("workgroup-one-as") release @@ -569,32 +575,38 @@ entry: define amdgpu_kernel void 
@workgroup_one_as_acq_rel() #0 { ; GFX6-LABEL: name: workgroup_one_as_acq_rel ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_one_as_acq_rel ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: workgroup_one_as_acq_rel ; GFX10WGP: bb.0.entry: ; GFX10WGP-NEXT: S_WAITCNT_soft 16240 + ; GFX10WGP-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: workgroup_one_as_acq_rel ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_one_as_acq_rel ; GFX11WGP: bb.0.entry: ; GFX11WGP-NEXT: S_WAITCNT_soft 1015 + ; GFX11WGP-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX11WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX11WGP-NEXT: BUFFER_GL0_INV implicit $exec ; GFX11WGP-NEXT: S_ENDPGM 0 ; ; GFX11CU-LABEL: name: workgroup_one_as_acq_rel ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("workgroup-one-as") acq_rel @@ -604,32 +616,38 @@ entry: define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 { ; GFX6-LABEL: name: workgroup_one_as_seq_cst ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_one_as_seq_cst ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: workgroup_one_as_seq_cst ; GFX10WGP: bb.0.entry: ; GFX10WGP-NEXT: S_WAITCNT_soft 16240 + ; GFX10WGP-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: workgroup_one_as_seq_cst ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_one_as_seq_cst ; GFX11WGP: bb.0.entry: ; GFX11WGP-NEXT: S_WAITCNT_soft 1015 + ; GFX11WGP-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX11WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX11WGP-NEXT: BUFFER_GL0_INV implicit $exec ; GFX11WGP-NEXT: S_ENDPGM 0 ; ; GFX11CU-LABEL: name: workgroup_one_as_seq_cst ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("workgroup-one-as") seq_cst @@ -1283,33 +1301,39 @@ define amdgpu_kernel void @workgroup_release() #0 { ; GFX6-LABEL: name: workgroup_release ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 127 + ; GFX6-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_release ; GFX8: bb.0.entry: ; GFX8-NEXT: S_WAITCNT_soft 127 + ; GFX8-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: workgroup_release ; GFX10WGP: bb.0.entry: ; GFX10WGP-NEXT: S_WAITCNT_soft 112 + ; GFX10WGP-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: workgroup_release ; GFX10CU: bb.0.entry: ; GFX10CU-NEXT: S_WAITCNT_soft 49279 + ; GFX10CU-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_release ; GFX11WGP: bb.0.entry: ; GFX11WGP-NEXT: S_WAITCNT_soft 7 + ; GFX11WGP-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX11WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 
; GFX11WGP-NEXT: S_ENDPGM 0 ; ; GFX11CU-LABEL: name: workgroup_release ; GFX11CU: bb.0.entry: ; GFX11CU-NEXT: S_WAITCNT_soft 64519 + ; GFX11CU-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("workgroup") release @@ -1320,16 +1344,19 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 { ; GFX6-LABEL: name: workgroup_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 127 + ; GFX6-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_acq_rel ; GFX8: bb.0.entry: ; GFX8-NEXT: S_WAITCNT_soft 127 + ; GFX8-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: workgroup_acq_rel ; GFX10WGP: bb.0.entry: ; GFX10WGP-NEXT: S_WAITCNT_soft 112 + ; GFX10WGP-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec ; GFX10WGP-NEXT: S_ENDPGM 0 @@ -1337,11 +1364,13 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 { ; GFX10CU-LABEL: name: workgroup_acq_rel ; GFX10CU: bb.0.entry: ; GFX10CU-NEXT: S_WAITCNT_soft 49279 + ; GFX10CU-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_acq_rel ; GFX11WGP: bb.0.entry: ; GFX11WGP-NEXT: S_WAITCNT_soft 7 + ; GFX11WGP-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX11WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX11WGP-NEXT: BUFFER_GL0_INV implicit $exec ; GFX11WGP-NEXT: S_ENDPGM 0 @@ -1349,6 +1378,7 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 { ; GFX11CU-LABEL: name: workgroup_acq_rel ; GFX11CU: bb.0.entry: ; GFX11CU-NEXT: S_WAITCNT_soft 64519 + ; GFX11CU-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("workgroup") acq_rel @@ -1359,16 +1389,19 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 { ; GFX6-LABEL: name: workgroup_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 127 + ; GFX6-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_seq_cst ; GFX8: bb.0.entry: ; GFX8-NEXT: S_WAITCNT_soft 127 + ; GFX8-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: workgroup_seq_cst ; GFX10WGP: bb.0.entry: ; GFX10WGP-NEXT: S_WAITCNT_soft 112 + ; GFX10WGP-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec ; GFX10WGP-NEXT: S_ENDPGM 0 @@ -1376,11 +1409,13 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 { ; GFX10CU-LABEL: name: workgroup_seq_cst ; GFX10CU: bb.0.entry: ; GFX10CU-NEXT: S_WAITCNT_soft 49279 + ; GFX10CU-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_seq_cst ; GFX11WGP: bb.0.entry: ; GFX11WGP-NEXT: S_WAITCNT_soft 7 + ; GFX11WGP-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX11WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX11WGP-NEXT: BUFFER_GL0_INV implicit $exec ; GFX11WGP-NEXT: S_ENDPGM 0 @@ -1388,6 +1423,7 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 { ; GFX11CU-LABEL: name: workgroup_seq_cst ; GFX11CU: bb.0.entry: ; GFX11CU-NEXT: S_WAITCNT_soft 64519 + ; GFX11CU-NEXT: S_WAITCNT_FENCE_soft 5, 3, 15 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("workgroup") seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir new file mode 100644 index 0000000000000..4b129b22e87bf --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir @@ -0,0 +1,193 @@ +# 
NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GCN %s + + +# Expected vmcnt(0) since the direct load is the only load. +--- +name: dma_then_fence +body: | + bb.0: + ; GCN-LABEL: name: dma_then_fence + ; GCN: S_WAITCNT 0 + ; GCN-NEXT: $m0 = S_MOV_B32 0 + ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3) + ; GCN-NEXT: S_WAITCNT 3952 + ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $m0 = S_MOV_B32 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + S_WAITCNT_FENCE_soft 5, 3, 15 + $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 + +... + +# Expected vmcnt(1) since the global load is not processed by SIInsertWaitcnts. + +--- +name: dma_then_global_load +body: | + bb.0: + ; GCN-LABEL: name: dma_then_global_load + ; GCN: S_WAITCNT 0 + ; GCN-NEXT: $m0 = S_MOV_B32 0 + ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3) + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3953 + ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $m0 = S_MOV_B32 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + S_WAITCNT_FENCE_soft 5, 3, 15 + $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 + +... + +# Expected vmcnt(1) since there is no direct load to LDS, and the global load is not processed by SIInsertWaitcnts. + +--- +name: no_dma_just_fence +body: | + bb.0: + ; GCN-LABEL: name: no_dma_just_fence + ; GCN: S_WAITCNT 0 + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + S_WAITCNT_FENCE_soft 5, 3, 15 + $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 + +... + +# Expected vmcnt(1) since the global load is not processed by SIInsertWaitcnts. 
+ +--- +name: dma_then_system_fence +body: | + bb.0: + ; GCN-LABEL: name: dma_then_system_fence + ; GCN: S_WAITCNT 0 + ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3) + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3953 + ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + S_WAITCNT_FENCE_soft 5, 5, 15 + $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 + +... + +# Expected no vmcnt since the fence does not specify LDS + +--- +name: dma_then_no_lds +body: | + bb.0: + ; GCN-LABEL: name: dma_then_no_lds + ; GCN: S_WAITCNT 0 + ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3) + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + S_WAITCNT_FENCE_soft 5, 5, 5 + $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 + +... + +# Expected no vmcnt since the fence is not a release + +--- +name: no_release +body: | + bb.0: + ; GCN-LABEL: name: no_release + ; GCN: S_WAITCNT 0 + ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3) + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + S_WAITCNT_FENCE_soft 4, 5, 15 + $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 + +... + +# Expected no vmcnt since direct loads to LDS don't matter at wavefront scope. 
+ +--- +name: dma_then_wavefront_fence +body: | + bb.0: + ; GCN-LABEL: name: dma_then_wavefront_fence + ; GCN: S_WAITCNT 0 + ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3) + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + S_WAITCNT_FENCE_soft 5, 2, 15 + $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 + +... + +# The computed vmcnt(1) gets merged with the existing vmcnt(0). + +--- +name: merge_with_prev_wait +body: | + bb.0: + ; GCN-LABEL: name: merge_with_prev_wait + ; GCN: S_WAITCNT 0 + ; GCN-NEXT: $m0 = S_MOV_B32 0 + ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3) + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3952 + ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $m0 = S_MOV_B32 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + S_WAITCNT 3952 + S_WAITCNT_FENCE_soft 5, 3, 15 + $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 + +... + +# The computed vmcnt(1) gets merged with the existing vmcnt(0). + +--- +name: merge_with_next_wait +body: | + bb.0: + ; GCN-LABEL: name: merge_with_next_wait + ; GCN: S_WAITCNT 0 + ; GCN-NEXT: $m0 = S_MOV_B32 0 + ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3) + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3952 + ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $m0 = S_MOV_B32 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + S_WAITCNT_FENCE_soft 5, 3, 15 + S_WAITCNT 3952 + $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 + +... 
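For readers decoding the raw S_WAITCNT immediates in the checks above: on pre-GFX12 targets such as gfx942, 3952 encodes vmcnt(0) with expcnt/lgkmcnt left at their "no wait" values, and 3953 encodes vmcnt(1). Below is a minimal standalone sketch of that decoding, assuming the AMDGPU::decodeVmcnt/decodeExpcnt/decodeLgkmcnt helpers from Utils/AMDGPUBaseInfo.h and AMDGPU::getIsaVersion from llvm/TargetParser/TargetParser.h; it is illustrative only and not part of this patch.

// Sketch: decode the S_WAITCNT literals used in the MIR checks above (gfx942).
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/TargetParser/TargetParser.h"
#include <cstdio>

int main() {
  llvm::AMDGPU::IsaVersion IV = llvm::AMDGPU::getIsaVersion("gfx942");
  for (unsigned Enc : {3952u, 3953u}) {
    // 3952 -> vmcnt(0) expcnt(7) lgkmcnt(15): only the vmcnt field requests a
    // real wait. 3953 -> vmcnt(1): one vmem op may remain outstanding, which
    // is why the global load issued after the LDS DMA is not waited for.
    std::printf("%u -> vmcnt(%u) expcnt(%u) lgkmcnt(%u)\n", Enc,
                llvm::AMDGPU::decodeVmcnt(IV, Enc),
                llvm::AMDGPU::decodeExpcnt(IV, Enc),
                llvm::AMDGPU::decodeLgkmcnt(IV, Enc));
  }
  return 0;
}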
diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll new file mode 100644 index 0000000000000..d23509b5aa812 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll @@ -0,0 +1,543 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GFX900 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s --check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck %s --check-prefixes=GFX90A-TGSPLIT +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=GFX942 +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -mattr=+tgsplit < %s | FileCheck %s --check-prefixes=GFX942-TGSPLIT +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10WGP +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck %s -check-prefixes=GFX10CU + +; In each of these tests, an LDS DMA operation is followed by a release pattern +; at workgroup scope. The fence in such a release (implicit or explicit) should +; wait for the store component in the LDS DMA. The additional noalias metadata +; is just meant to ensure that the wait counts are not generated due to some +; unintended aliasing. + +declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux) + +define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc, +; GFX900-LABEL: barrier_release: +; GFX900: ; %bb.0: ; %main_body +; GFX900-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX900-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 m0, s12 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds +; GFX900-NEXT: v_mov_b32_e32 v0, s13 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_barrier +; GFX900-NEXT: ds_read_b32 v0, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_store_dword v1, v0, s[14:15] +; GFX900-NEXT: s_endpgm +; +; GFX90A-LABEL: barrier_release: +; GFX90A: ; %bb.1: +; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_branch .LBB0_0 +; GFX90A-NEXT: .p2align 8 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: .LBB0_0: ; %main_body +; GFX90A-NEXT: s_mov_b32 m0, s12 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX90A-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds +; GFX90A-NEXT: v_mov_b32_e32 v0, s13 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_barrier +; GFX90A-NEXT: ds_read_b32 v0, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: barrier_release: +; GFX90A-TGSPLIT: ; %bb.1: +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_branch .LBB0_0 +; GFX90A-TGSPLIT-NEXT: .p2align 8 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: +; GFX90A-TGSPLIT-NEXT: .LBB0_0: ; %main_body +; GFX90A-TGSPLIT-NEXT: s_mov_b32 m0, s12 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds 
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s13 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_barrier +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-LABEL: barrier_release: +; GFX942: ; %bb.1: +; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_branch .LBB0_0 +; GFX942-NEXT: .p2align 8 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: .LBB0_0: ; %main_body +; GFX942-NEXT: s_mov_b32 m0, s12 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX942-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds +; GFX942-NEXT: v_mov_b32_e32 v0, s13 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_barrier +; GFX942-NEXT: ds_read_b32 v0, v0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_store_dword v1, v0, s[0:1] +; GFX942-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: barrier_release: +; GFX942-TGSPLIT: ; %bb.1: +; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_branch .LBB0_0 +; GFX942-TGSPLIT-NEXT: .p2align 8 +; GFX942-TGSPLIT-NEXT: ; %bb.2: +; GFX942-TGSPLIT-NEXT: .LBB0_0: ; %main_body +; GFX942-TGSPLIT-NEXT: s_mov_b32 m0, s12 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX942-TGSPLIT-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s13 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_barrier +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX10WGP-LABEL: barrier_release: +; GFX10WGP: ; %bb.0: ; %main_body +; GFX10WGP-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10WGP-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX10WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10WGP-NEXT: s_mov_b32 m0, s12 +; GFX10WGP-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds +; GFX10WGP-NEXT: v_mov_b32_e32 v0, s13 +; GFX10WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10WGP-NEXT: s_barrier +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10WGP-NEXT: ds_read_b32 v0, v0 +; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10WGP-NEXT: global_store_dword v1, v0, s[14:15] +; GFX10WGP-NEXT: s_endpgm +; +; GFX10CU-LABEL: barrier_release: +; GFX10CU: ; %bb.0: ; %main_body +; GFX10CU-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10CU-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX10CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX10CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10CU-NEXT: s_mov_b32 m0, s12 +; GFX10CU-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds +; GFX10CU-NEXT: v_mov_b32_e32 v0, s13 +; GFX10CU-NEXT: s_waitcnt vmcnt(0) +; GFX10CU-NEXT: s_barrier +; GFX10CU-NEXT: ds_read_b32 v0, v0 +; GFX10CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10CU-NEXT: global_store_dword v1, v0, s[14:15] +; GFX10CU-NEXT: s_endpgm + ptr addrspace(3) inreg %lds1, + ptr addrspace(3) 
inreg %lds2, + ptr addrspace(1) %dummy2) { +main_body: + call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds1, i32 4, i32 2048, i32 0, i32 0, i32 0), !alias.scope !102 + fence syncscope("workgroup") release + tail call void @llvm.amdgcn.s.barrier() + fence syncscope("workgroup") acquire + %load = load i32, ptr addrspace(3) %lds2, align 4, !noalias !105 + store i32 %load, ptr addrspace(1) %dummy2, align 4, !noalias !105 + ret void +} + +define amdgpu_kernel void @fence_fence(<4 x i32> inreg %rsrc, +; GFX900-LABEL: fence_fence: +; GFX900: ; %bb.0: ; %main_body +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX900-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x3c +; GFX900-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 m0, s6 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds +; GFX900-NEXT: v_mov_b32_e32 v1, 1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: global_store_dword v0, v1, s[8:9] +; GFX900-NEXT: global_load_dword v1, v0, s[8:9] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v1, s7 +; GFX900-NEXT: ds_read_b32 v1, v1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_store_dword v0, v1, s[10:11] +; GFX900-NEXT: s_endpgm +; +; GFX90A-LABEL: fence_fence: +; GFX90A: ; %bb.1: +; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_branch .LBB1_0 +; GFX90A-NEXT: .p2align 8 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: .LBB1_0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX90A-NEXT: s_mov_b32 m0, s12 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX90A-NEXT: v_mov_b32_e32 v1, 1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, s13 +; GFX90A-NEXT: ds_read_b32 v1, v1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: fence_fence: +; GFX90A-TGSPLIT: ; %bb.1: +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_branch .LBB1_0 +; GFX90A-TGSPLIT-NEXT: .p2align 8 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: +; GFX90A-TGSPLIT-NEXT: .LBB1_0: ; %main_body +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX90A-TGSPLIT-NEXT: s_mov_b32 m0, s12 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s13 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-LABEL: fence_fence: +; GFX942: ; %bb.1: +; 
GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_branch .LBB1_0 +; GFX942-NEXT: .p2align 8 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: .LBB1_0: ; %main_body +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX942-NEXT: s_mov_b32 m0, s12 +; GFX942-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX942-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_store_dword v0, v1, s[0:1] sc0 +; GFX942-NEXT: global_load_dword v1, v0, s[0:1] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s13 +; GFX942-NEXT: ds_read_b32 v1, v1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_store_dword v0, v1, s[2:3] +; GFX942-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: fence_fence: +; GFX942-TGSPLIT: ; %bb.1: +; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_branch .LBB1_0 +; GFX942-TGSPLIT-NEXT: .p2align 8 +; GFX942-TGSPLIT-NEXT: ; %bb.2: +; GFX942-TGSPLIT-NEXT: .LBB1_0: ; %main_body +; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX942-TGSPLIT-NEXT: s_mov_b32 m0, s12 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 +; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s13 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX10WGP-LABEL: fence_fence: +; GFX10WGP: ; %bb.0: ; %main_body +; GFX10WGP-NEXT: s_clause 0x2 +; GFX10WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10WGP-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x3c +; GFX10WGP-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX10WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX10WGP-NEXT: v_mov_b32_e32 v2, 1 +; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10WGP-NEXT: s_mov_b32 m0, s6 +; GFX10WGP-NEXT: buffer_load_dword v0, s[0:3], 0 offen lds +; GFX10WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10WGP-NEXT: global_store_dword v1, v2, s[8:9] +; GFX10WGP-NEXT: global_load_dword v0, v1, s[8:9] glc +; GFX10WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10WGP-NEXT: v_mov_b32_e32 v0, s7 +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10WGP-NEXT: ds_read_b32 v0, v0 +; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10WGP-NEXT: global_store_dword v1, v0, s[10:11] +; GFX10WGP-NEXT: s_endpgm +; +; GFX10CU-LABEL: fence_fence: +; GFX10CU: ; %bb.0: ; %main_body +; GFX10CU-NEXT: s_clause 0x2 +; GFX10CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10CU-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x3c +; GFX10CU-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX10CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX10CU-NEXT: v_mov_b32_e32 v2, 1 +; GFX10CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10CU-NEXT: s_mov_b32 m0, s6 +; GFX10CU-NEXT: buffer_load_dword v0, s[0:3], 0 offen lds +; 
GFX10CU-NEXT: s_waitcnt vmcnt(0) +; GFX10CU-NEXT: global_store_dword v1, v2, s[8:9] +; GFX10CU-NEXT: global_load_dword v0, v1, s[8:9] +; GFX10CU-NEXT: s_waitcnt vmcnt(0) +; GFX10CU-NEXT: v_mov_b32_e32 v0, s7 +; GFX10CU-NEXT: ds_read_b32 v0, v0 +; GFX10CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10CU-NEXT: global_store_dword v1, v0, s[10:11] +; GFX10CU-NEXT: s_endpgm + ptr addrspace(3) inreg %lds1, + ptr addrspace(3) inreg %lds2, + ptr addrspace(1) %flag, + ptr addrspace(1) %dummy2) { +main_body: + call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds1, i32 4, i32 2048, i32 0, i32 0, i32 0), !alias.scope !102 + fence syncscope("workgroup") release + store atomic i32 1, ptr addrspace(1) %flag syncscope("workgroup") monotonic, align 4, !noalias !105 + %unused_flag = load atomic i32, ptr addrspace(1) %flag syncscope("workgroup") monotonic, align 4, !noalias !105 + fence syncscope("workgroup") acquire + %load = load i32, ptr addrspace(3) %lds2, align 4, !noalias !105 + store i32 %load, ptr addrspace(1) %dummy2, align 4, !noalias !105 + ret void +} + +define amdgpu_kernel void @release_acquire(<4 x i32> inreg %rsrc, +; GFX900-LABEL: release_acquire: +; GFX900: ; %bb.0: ; %main_body +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX900-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x3c +; GFX900-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 m0, s6 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds +; GFX900-NEXT: v_mov_b32_e32 v1, 1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: global_store_dword v0, v1, s[8:9] +; GFX900-NEXT: global_load_dword v1, v0, s[8:9] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v1, s7 +; GFX900-NEXT: ds_read_b32 v1, v1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_store_dword v0, v1, s[10:11] +; GFX900-NEXT: s_endpgm +; +; GFX90A-LABEL: release_acquire: +; GFX90A: ; %bb.1: +; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_branch .LBB2_0 +; GFX90A-NEXT: .p2align 8 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: .LBB2_0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX90A-NEXT: s_mov_b32 m0, s12 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX90A-NEXT: v_mov_b32_e32 v1, 1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, s13 +; GFX90A-NEXT: ds_read_b32 v1, v1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: release_acquire: +; GFX90A-TGSPLIT: ; %bb.1: +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_branch .LBB2_0 +; GFX90A-TGSPLIT-NEXT: .p2align 8 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: +; GFX90A-TGSPLIT-NEXT: .LBB2_0: ; %main_body +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX90A-TGSPLIT-NEXT: s_mov_b32 m0, s12 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: 
buffer_load_dword v1, s[8:11], 0 offen lds +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s13 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-LABEL: release_acquire: +; GFX942: ; %bb.1: +; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_branch .LBB2_0 +; GFX942-NEXT: .p2align 8 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: .LBB2_0: ; %main_body +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX942-NEXT: s_mov_b32 m0, s12 +; GFX942-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX942-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_store_dword v0, v1, s[0:1] sc0 +; GFX942-NEXT: global_load_dword v1, v0, s[0:1] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s13 +; GFX942-NEXT: ds_read_b32 v1, v1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_store_dword v0, v1, s[2:3] +; GFX942-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: release_acquire: +; GFX942-TGSPLIT: ; %bb.1: +; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_branch .LBB2_0 +; GFX942-TGSPLIT-NEXT: .p2align 8 +; GFX942-TGSPLIT-NEXT: ; %bb.2: +; GFX942-TGSPLIT-NEXT: .LBB2_0: ; %main_body +; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX942-TGSPLIT-NEXT: s_mov_b32 m0, s12 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 +; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s13 +; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX10WGP-LABEL: release_acquire: +; GFX10WGP: ; %bb.0: ; %main_body +; GFX10WGP-NEXT: s_clause 0x2 +; GFX10WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10WGP-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x3c +; GFX10WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10WGP-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX10WGP-NEXT: v_mov_b32_e32 v2, 1 +; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10WGP-NEXT: s_mov_b32 m0, s6 +; GFX10WGP-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds +; GFX10WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10WGP-NEXT: global_store_dword v0, v2, s[8:9] +; GFX10WGP-NEXT: global_load_dword v1, v0, s[8:9] glc +; GFX10WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10WGP-NEXT: ds_read_b32 v1, v1 +; GFX10WGP-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX10WGP-NEXT: global_store_dword v0, v1, s[10:11] +; GFX10WGP-NEXT: s_endpgm +; +; GFX10CU-LABEL: release_acquire: +; GFX10CU: ; %bb.0: ; %main_body +; GFX10CU-NEXT: s_clause 0x2 +; GFX10CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10CU-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x3c +; GFX10CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10CU-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX10CU-NEXT: v_mov_b32_e32 v2, 1 +; GFX10CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10CU-NEXT: s_mov_b32 m0, s6 +; GFX10CU-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds +; GFX10CU-NEXT: s_waitcnt vmcnt(0) +; GFX10CU-NEXT: global_store_dword v0, v2, s[8:9] +; GFX10CU-NEXT: global_load_dword v1, v0, s[8:9] +; GFX10CU-NEXT: s_waitcnt vmcnt(0) +; GFX10CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10CU-NEXT: ds_read_b32 v1, v1 +; GFX10CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10CU-NEXT: global_store_dword v0, v1, s[10:11] +; GFX10CU-NEXT: s_endpgm + ptr addrspace(3) inreg %lds1, + ptr addrspace(3) inreg %lds2, + ptr addrspace(1) %flag, + ptr addrspace(1) %dummy2) { +main_body: + call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds1, i32 4, i32 2048, i32 0, i32 0, i32 0), !alias.scope !102 + store atomic i32 1, ptr addrspace(1) %flag syncscope("workgroup") release, align 4, !noalias !105 + %unused_flag = load atomic i32, ptr addrspace(1) %flag syncscope("workgroup") acquire, align 4, !noalias !105 + %load = load i32, ptr addrspace(3) %lds2, align 4, !noalias !105 + store i32 %load, ptr addrspace(1) %dummy2, align 4, !noalias !105 + ret void +} + +!100 = !{!100} +!101 = !{!101, !100} +!102 = !{!101} +!103 = !{!103, !100} +!104 = !{!103} +!105 = !{!101, !103} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir index 56dd95e373dc6..7a8e00acede74 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir @@ -321,7 +321,9 @@ body: | ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_FENCE_soft 7, 3, 2 ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_FENCE_soft 7, 3, 2 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -433,7 +435,9 @@ body: | ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_FENCE_soft 7, 3, 2 ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") seq_cst (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: 
S_WAITCNT_FENCE_soft 7, 3, 2 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -545,7 +549,9 @@ body: | ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_FENCE_soft 7, 3, 2 ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_FENCE_soft 7, 3, 2 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -814,6 +820,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_FENCE_soft 5, 3, 2 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") release (s32) into `ptr addrspace(3) poison`, addrspace 3) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -838,6 +845,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_FENCE_soft 5, 3, 2 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") seq_cst (s32) into `ptr addrspace(3) poison`, addrspace 3) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -910,6 +918,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_FENCE_soft 5, 3, 2 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") release (s32) into `ptr addrspace(3) poison`, addrspace 3) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -934,6 +943,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_FENCE_soft 5, 3, 2 ; GCN-NEXT: 
DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") seq_cst (s32) into `ptr addrspace(3) poison`, addrspace 3) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -1006,6 +1016,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_FENCE_soft 5, 3, 2 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") release (s32) into `ptr addrspace(3) poison`, addrspace 3) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -1030,6 +1041,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_FENCE_soft 5, 3, 2 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") seq_cst (s32) into `ptr addrspace(3) poison`, addrspace 3) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4)
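The three immediates on S_WAITCNT_FENCE_soft are the raw values of llvm::AtomicOrdering, SIAtomicScope and SIAtomicAddrSpace, so "5, 3, 2" above reads as release / workgroup / LDS, "7, 3, 2" as seq_cst / workgroup / LDS, and "5, 3, 15" in the fence tests as release / workgroup / all address spaces that support atomics. A minimal sketch of that mapping follows, assuming the standard llvm::AtomicOrdering numbering (Release = 5, SequentiallyConsistent = 7); decodeFenceOperands is a hypothetical helper for illustration only, not an API added by this patch.

// Sketch: interpret the S_WAITCNT_FENCE_soft operands shown in the MIR checks.
// decodeFenceOperands is hypothetical; the enums come from SIDefines.h above.
#include "SIDefines.h"                   // llvm::SIAtomicScope, llvm::SIAtomicAddrSpace
#include "llvm/Support/AtomicOrdering.h" // llvm::AtomicOrdering

struct DecodedFence {
  llvm::AtomicOrdering Ordering; // 5 == Release, 7 == SequentiallyConsistent
  llvm::SIAtomicScope Scope;     // 3 == WORKGROUP, 5 == SYSTEM
  llvm::SIAtomicAddrSpace AS;    // 2 == LDS, 15 == ATOMIC (GLOBAL|LDS|SCRATCH|GDS)
};

static DecodedFence decodeFenceOperands(unsigned Ordering, unsigned Scope,
                                        unsigned AddrSpace) {
  return {static_cast<llvm::AtomicOrdering>(Ordering),
          static_cast<llvm::SIAtomicScope>(Scope),
          static_cast<llvm::SIAtomicAddrSpace>(AddrSpace)};
}

// "S_WAITCNT_FENCE_soft 5, 3, 2" therefore means: release ordering, workgroup
// scope, LDS address space - the pattern the new SIInsertWaitcnts handling
// turns into a wait on outstanding LDS DMA stores on pre-GFX12 targets.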