diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index dd3f2fe25a239..d3918abc3fa20 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1381,6 +1381,20 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( Modified = true; } else WaitcntInstr = &II; + } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) { + assert(ST->hasVMemToLDSLoad()); + LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II + << "Before: " << Wait.LoadCnt << '\n';); + ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait); + LLVM_DEBUG(dbgs() << "After: " << Wait.LoadCnt << '\n';); + + // It is possible (but unlikely) that this is the only wait instruction, + // in which case, we exit this loop without a WaitcntInstr to consume + // `Wait`. But that works because `Wait` was passed in by reference, and + // the callee eventually calls createNewWaitcnt on it. We test this + // possibility in an artificial MIR test since such a situation cannot be + // recreated by running the memory legalizer. + II.eraseFromParent(); } else { assert(Opcode == AMDGPU::S_WAITCNT_VSCNT); assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL); @@ -1552,6 +1566,11 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( ScoreBrackets.simplifyWaitcnt(OldWait); Wait = Wait.combined(OldWait); UpdatableInstr = &CombinedStoreDsCntInstr; + } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) { + // Architectures higher than GFX10 do not have direct loads to + // LDS, so no work required here yet. 
+ II.eraseFromParent(); + continue; } else { std::optional CT = counterTypeForInstr(Opcode); assert(CT.has_value()); @@ -2442,6 +2461,7 @@ static bool isWaitInstr(MachineInstr &Inst) { Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) || Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT || Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT || + Opcode == AMDGPU::S_WAITCNT_lds_direct || counterTypeForInstr(Opcode).has_value(); } diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 0e8a420fbb70a..0ee465716dc37 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -1170,6 +1170,16 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, Changed = true; } + // On architectures that support direct loads to LDS, emit an unknown waitcnt + // at workgroup-scoped release operations that specify the LDS address space. + // SIInsertWaitcnts will later replace this with a vmcnt(). + if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) && + Scope == SIAtomicScope::WORKGROUP && + (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct)); + Changed = true; + } + if (Pos == Position::AFTER) --MI; @@ -2078,6 +2088,16 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, Changed = true; } + // On architectures that support direct loads to LDS, emit an unknown waitcnt + // at workgroup-scoped release operations that specify the LDS address space. + // SIInsertWaitcnts will later replace this with a vmcnt(). 
+ if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) && + Scope == SIAtomicScope::WORKGROUP && + (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct)); + Changed = true; + } + if (VSCnt) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft)) .addReg(AMDGPU::SGPR_NULL, RegState::Undef) diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index e103ccc2f00e6..8303410115f93 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1621,6 +1621,13 @@ let OtherPredicates = [HasImageInsts] in { def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">; } +// Represents the point at which a wave must wait for all outstanding direct loads to LDS. +// Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts. + +def S_WAITCNT_lds_direct : SPseudoInstSI<(outs), (ins)> { + let hasSideEffects = 0; +} + def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_sethalt timm:$simm16)]>; def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll index 66037615f0ba0..002c03aa7967d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll @@ -545,11 +545,13 @@ define amdgpu_kernel void @workgroup_one_as_release() #0 { ; GFX10WGP-LABEL: name: workgroup_one_as_release ; GFX10WGP: bb.0.entry: ; GFX10WGP-NEXT: S_WAITCNT_soft 16240 + ; GFX10WGP-NEXT: S_WAITCNT_lds_direct ; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: workgroup_one_as_release ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: 
S_WAITCNT_lds_direct ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_one_as_release @@ -578,12 +580,14 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 { ; GFX10WGP-LABEL: name: workgroup_one_as_acq_rel ; GFX10WGP: bb.0.entry: ; GFX10WGP-NEXT: S_WAITCNT_soft 16240 + ; GFX10WGP-NEXT: S_WAITCNT_lds_direct ; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: workgroup_one_as_acq_rel ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_lds_direct ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_one_as_acq_rel @@ -613,12 +617,14 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 { ; GFX10WGP-LABEL: name: workgroup_one_as_seq_cst ; GFX10WGP: bb.0.entry: ; GFX10WGP-NEXT: S_WAITCNT_soft 16240 + ; GFX10WGP-NEXT: S_WAITCNT_lds_direct ; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: workgroup_one_as_seq_cst ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_lds_direct ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_one_as_seq_cst @@ -1293,12 +1299,14 @@ define amdgpu_kernel void @workgroup_release() #0 { ; GFX10WGP-LABEL: name: workgroup_release ; GFX10WGP: bb.0.entry: ; GFX10WGP-NEXT: S_WAITCNT_soft 112 + ; GFX10WGP-NEXT: S_WAITCNT_lds_direct ; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: workgroup_release ; GFX10CU: bb.0.entry: ; GFX10CU-NEXT: S_WAITCNT_soft 49279 + ; GFX10CU-NEXT: S_WAITCNT_lds_direct ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_release @@ -1330,6 +1338,7 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 { ; GFX10WGP-LABEL: name: workgroup_acq_rel ; GFX10WGP: bb.0.entry: ; GFX10WGP-NEXT: S_WAITCNT_soft 112 + ; GFX10WGP-NEXT: S_WAITCNT_lds_direct ; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 
; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec ; GFX10WGP-NEXT: S_ENDPGM 0 @@ -1337,6 +1346,7 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 { ; GFX10CU-LABEL: name: workgroup_acq_rel ; GFX10CU: bb.0.entry: ; GFX10CU-NEXT: S_WAITCNT_soft 49279 + ; GFX10CU-NEXT: S_WAITCNT_lds_direct ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_acq_rel @@ -1369,6 +1379,7 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 { ; GFX10WGP-LABEL: name: workgroup_seq_cst ; GFX10WGP: bb.0.entry: ; GFX10WGP-NEXT: S_WAITCNT_soft 112 + ; GFX10WGP-NEXT: S_WAITCNT_lds_direct ; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec ; GFX10WGP-NEXT: S_ENDPGM 0 @@ -1376,6 +1387,7 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 { ; GFX10CU-LABEL: name: workgroup_seq_cst ; GFX10CU: bb.0.entry: ; GFX10CU-NEXT: S_WAITCNT_soft 49279 + ; GFX10CU-NEXT: S_WAITCNT_lds_direct ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir new file mode 100644 index 0000000000000..675a1c94bc435 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir @@ -0,0 +1,133 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GCN %s + + +# Expected vmcnt(0) since the direct load is the only load. 
+--- +name: dma_then_fence +body: | + bb.0: + ; GCN-LABEL: name: dma_then_fence + ; GCN: S_WAITCNT 0 + ; GCN-NEXT: $m0 = S_MOV_B32 0 + ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3) + ; GCN-NEXT: S_WAITCNT 3952 + ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $m0 = S_MOV_B32 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + S_WAITCNT_lds_direct + $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 + +... + +# Expected vmcnt(1) since the global load is not processed by SIInsertWaitcnts. + +--- +name: dma_then_global_load +body: | + bb.0: + ; GCN-LABEL: name: dma_then_global_load + ; GCN: S_WAITCNT 0 + ; GCN-NEXT: $m0 = S_MOV_B32 0 + ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3) + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3953 + ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $m0 = S_MOV_B32 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + S_WAITCNT_lds_direct + $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 + +... 
+ +# Expected no vmcnt since there is no direct load to LDS, and the global load is not processed by SIInsertWaitcnts. + +--- +name: no_dma_just_fence +body: | + bb.0: + ; GCN-LABEL: name: no_dma_just_fence + ; GCN: S_WAITCNT 0 + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + S_WAITCNT_lds_direct + $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 + +... + +# Expected vmcnt(1) since the global load is not processed by SIInsertWaitcnts. + +--- +name: dma_then_system_fence +body: | + bb.0: + ; GCN-LABEL: name: dma_then_system_fence + ; GCN: S_WAITCNT 0 + ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3) + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3953 + ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + S_WAITCNT_lds_direct + $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 + +... + +# The computed vmcnt(1) gets merged with the existing vmcnt(0). 
+ +--- +name: merge_with_prev_wait +body: | + bb.0: + ; GCN-LABEL: name: merge_with_prev_wait + ; GCN: S_WAITCNT 0 + ; GCN-NEXT: $m0 = S_MOV_B32 0 + ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3) + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3952 + ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $m0 = S_MOV_B32 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + S_WAITCNT 3952 + S_WAITCNT_lds_direct + $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 + +... + +# The computed vmcnt(1) gets merged with the existing vmcnt(0). 
+ +--- +name: merge_with_next_wait +body: | + bb.0: + ; GCN-LABEL: name: merge_with_next_wait + ; GCN: S_WAITCNT 0 + ; GCN-NEXT: $m0 = S_MOV_B32 0 + ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3) + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3952 + ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $m0 = S_MOV_B32 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + S_WAITCNT_lds_direct + S_WAITCNT 3952 + $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll new file mode 100644 index 0000000000000..d23509b5aa812 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll @@ -0,0 +1,543 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GFX900 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s --check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck %s --check-prefixes=GFX90A-TGSPLIT +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=GFX942 +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -mattr=+tgsplit < %s | FileCheck %s --check-prefixes=GFX942-TGSPLIT +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10WGP +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck %s -check-prefixes=GFX10CU + +; In each of these tests, an LDS DMA operation is followed by a release pattern +; at workgroup scope. The fence in such a release (implicit or explicit) should +; wait for the store component in the LDS DMA. The additional noalias metadata +; is just meant to ensure that the wait counts are not generated due to some +; unintended aliasing. 
+ +declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux) + +define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc, +; GFX900-LABEL: barrier_release: +; GFX900: ; %bb.0: ; %main_body +; GFX900-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX900-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 m0, s12 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds +; GFX900-NEXT: v_mov_b32_e32 v0, s13 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_barrier +; GFX900-NEXT: ds_read_b32 v0, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_store_dword v1, v0, s[14:15] +; GFX900-NEXT: s_endpgm +; +; GFX90A-LABEL: barrier_release: +; GFX90A: ; %bb.1: +; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_branch .LBB0_0 +; GFX90A-NEXT: .p2align 8 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: .LBB0_0: ; %main_body +; GFX90A-NEXT: s_mov_b32 m0, s12 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX90A-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds +; GFX90A-NEXT: v_mov_b32_e32 v0, s13 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_barrier +; GFX90A-NEXT: ds_read_b32 v0, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: barrier_release: +; GFX90A-TGSPLIT: ; %bb.1: +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_branch .LBB0_0 +; GFX90A-TGSPLIT-NEXT: .p2align 8 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: +; GFX90A-TGSPLIT-NEXT: 
.LBB0_0: ; %main_body +; GFX90A-TGSPLIT-NEXT: s_mov_b32 m0, s12 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s13 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_barrier +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-LABEL: barrier_release: +; GFX942: ; %bb.1: +; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_branch .LBB0_0 +; GFX942-NEXT: .p2align 8 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: .LBB0_0: ; %main_body +; GFX942-NEXT: s_mov_b32 m0, s12 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX942-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds +; GFX942-NEXT: v_mov_b32_e32 v0, s13 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_barrier +; GFX942-NEXT: ds_read_b32 v0, v0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_store_dword v1, v0, s[0:1] +; GFX942-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: barrier_release: +; GFX942-TGSPLIT: ; %bb.1: +; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_branch .LBB0_0 +; GFX942-TGSPLIT-NEXT: .p2align 8 +; GFX942-TGSPLIT-NEXT: ; %bb.2: +; GFX942-TGSPLIT-NEXT: .LBB0_0: ; %main_body +; GFX942-TGSPLIT-NEXT: s_mov_b32 m0, s12 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX942-TGSPLIT-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds +; 
GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s13 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_barrier +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX10WGP-LABEL: barrier_release: +; GFX10WGP: ; %bb.0: ; %main_body +; GFX10WGP-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10WGP-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX10WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10WGP-NEXT: s_mov_b32 m0, s12 +; GFX10WGP-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds +; GFX10WGP-NEXT: v_mov_b32_e32 v0, s13 +; GFX10WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10WGP-NEXT: s_barrier +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10WGP-NEXT: ds_read_b32 v0, v0 +; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10WGP-NEXT: global_store_dword v1, v0, s[14:15] +; GFX10WGP-NEXT: s_endpgm +; +; GFX10CU-LABEL: barrier_release: +; GFX10CU: ; %bb.0: ; %main_body +; GFX10CU-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10CU-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX10CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX10CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10CU-NEXT: s_mov_b32 m0, s12 +; GFX10CU-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds +; GFX10CU-NEXT: v_mov_b32_e32 v0, s13 +; GFX10CU-NEXT: s_waitcnt vmcnt(0) +; GFX10CU-NEXT: s_barrier +; GFX10CU-NEXT: ds_read_b32 v0, v0 +; GFX10CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10CU-NEXT: global_store_dword v1, v0, s[14:15] +; GFX10CU-NEXT: s_endpgm + ptr addrspace(3) inreg %lds1, + ptr addrspace(3) inreg %lds2, + ptr addrspace(1) %dummy2) { +main_body: + call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds1, i32 4, i32 2048, i32 0, i32 0, i32 0), !alias.scope !102 + fence syncscope("workgroup") release + tail call void 
@llvm.amdgcn.s.barrier() + fence syncscope("workgroup") acquire + %load = load i32, ptr addrspace(3) %lds2, align 4, !noalias !105 + store i32 %load, ptr addrspace(1) %dummy2, align 4, !noalias !105 + ret void +} + +define amdgpu_kernel void @fence_fence(<4 x i32> inreg %rsrc, +; GFX900-LABEL: fence_fence: +; GFX900: ; %bb.0: ; %main_body +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX900-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x3c +; GFX900-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 m0, s6 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds +; GFX900-NEXT: v_mov_b32_e32 v1, 1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: global_store_dword v0, v1, s[8:9] +; GFX900-NEXT: global_load_dword v1, v0, s[8:9] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v1, s7 +; GFX900-NEXT: ds_read_b32 v1, v1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_store_dword v0, v1, s[10:11] +; GFX900-NEXT: s_endpgm +; +; GFX90A-LABEL: fence_fence: +; GFX90A: ; %bb.1: +; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_branch .LBB1_0 +; GFX90A-NEXT: .p2align 8 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: .LBB1_0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX90A-NEXT: s_mov_b32 m0, s12 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX90A-NEXT: v_mov_b32_e32 v1, 1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, s13 +; GFX90A-NEXT: ds_read_b32 v1, v1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: 
global_store_dword v0, v1, s[2:3] +; GFX90A-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: fence_fence: +; GFX90A-TGSPLIT: ; %bb.1: +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_branch .LBB1_0 +; GFX90A-TGSPLIT-NEXT: .p2align 8 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: +; GFX90A-TGSPLIT-NEXT: .LBB1_0: ; %main_body +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX90A-TGSPLIT-NEXT: s_mov_b32 m0, s12 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s13 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-LABEL: fence_fence: +; GFX942: ; %bb.1: +; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_branch .LBB1_0 +; GFX942-NEXT: .p2align 8 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: .LBB1_0: ; %main_body +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX942-NEXT: s_mov_b32 m0, s12 +; GFX942-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX942-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_store_dword v0, v1, s[0:1] sc0 +; GFX942-NEXT: global_load_dword v1, v0, s[0:1] sc0 +; GFX942-NEXT: s_waitcnt 
vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s13 +; GFX942-NEXT: ds_read_b32 v1, v1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_store_dword v0, v1, s[2:3] +; GFX942-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: fence_fence: +; GFX942-TGSPLIT: ; %bb.1: +; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_branch .LBB1_0 +; GFX942-TGSPLIT-NEXT: .p2align 8 +; GFX942-TGSPLIT-NEXT: ; %bb.2: +; GFX942-TGSPLIT-NEXT: .LBB1_0: ; %main_body +; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX942-TGSPLIT-NEXT: s_mov_b32 m0, s12 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 +; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s13 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX10WGP-LABEL: fence_fence: +; GFX10WGP: ; %bb.0: ; %main_body +; GFX10WGP-NEXT: s_clause 0x2 +; GFX10WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10WGP-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x3c +; GFX10WGP-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX10WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX10WGP-NEXT: v_mov_b32_e32 v2, 1 +; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10WGP-NEXT: s_mov_b32 m0, s6 +; GFX10WGP-NEXT: buffer_load_dword v0, s[0:3], 0 offen lds +; GFX10WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10WGP-NEXT: global_store_dword v1, v2, s[8:9] +; GFX10WGP-NEXT: 
global_load_dword v0, v1, s[8:9] glc +; GFX10WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10WGP-NEXT: v_mov_b32_e32 v0, s7 +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10WGP-NEXT: ds_read_b32 v0, v0 +; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10WGP-NEXT: global_store_dword v1, v0, s[10:11] +; GFX10WGP-NEXT: s_endpgm +; +; GFX10CU-LABEL: fence_fence: +; GFX10CU: ; %bb.0: ; %main_body +; GFX10CU-NEXT: s_clause 0x2 +; GFX10CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10CU-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x3c +; GFX10CU-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX10CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX10CU-NEXT: v_mov_b32_e32 v2, 1 +; GFX10CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10CU-NEXT: s_mov_b32 m0, s6 +; GFX10CU-NEXT: buffer_load_dword v0, s[0:3], 0 offen lds +; GFX10CU-NEXT: s_waitcnt vmcnt(0) +; GFX10CU-NEXT: global_store_dword v1, v2, s[8:9] +; GFX10CU-NEXT: global_load_dword v0, v1, s[8:9] +; GFX10CU-NEXT: s_waitcnt vmcnt(0) +; GFX10CU-NEXT: v_mov_b32_e32 v0, s7 +; GFX10CU-NEXT: ds_read_b32 v0, v0 +; GFX10CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10CU-NEXT: global_store_dword v1, v0, s[10:11] +; GFX10CU-NEXT: s_endpgm + ptr addrspace(3) inreg %lds1, + ptr addrspace(3) inreg %lds2, + ptr addrspace(1) %flag, + ptr addrspace(1) %dummy2) { +main_body: + call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds1, i32 4, i32 2048, i32 0, i32 0, i32 0), !alias.scope !102 + fence syncscope("workgroup") release + store atomic i32 1, ptr addrspace(1) %flag syncscope("workgroup") monotonic, align 4, !noalias !105 + %unused_flag = load atomic i32, ptr addrspace(1) %flag syncscope("workgroup") monotonic, align 4, !noalias !105 + fence syncscope("workgroup") acquire + %load = load i32, ptr addrspace(3) %lds2, align 4, !noalias !105 + store i32 %load, ptr addrspace(1) %dummy2, align 4, !noalias !105 + ret void +} + +define amdgpu_kernel void @release_acquire(<4 x i32> inreg 
%rsrc, +; GFX900-LABEL: release_acquire: +; GFX900: ; %bb.0: ; %main_body +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX900-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x3c +; GFX900-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 m0, s6 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds +; GFX900-NEXT: v_mov_b32_e32 v1, 1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: global_store_dword v0, v1, s[8:9] +; GFX900-NEXT: global_load_dword v1, v0, s[8:9] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v1, s7 +; GFX900-NEXT: ds_read_b32 v1, v1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_store_dword v0, v1, s[10:11] +; GFX900-NEXT: s_endpgm +; +; GFX90A-LABEL: release_acquire: +; GFX90A: ; %bb.1: +; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_branch .LBB2_0 +; GFX90A-NEXT: .p2align 8 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: .LBB2_0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX90A-NEXT: s_mov_b32 m0, s12 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX90A-NEXT: v_mov_b32_e32 v1, 1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, s13 +; GFX90A-NEXT: ds_read_b32 v1, v1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: release_acquire: +; GFX90A-TGSPLIT: ; %bb.1: +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_branch .LBB2_0 +; GFX90A-TGSPLIT-NEXT: .p2align 8 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: +; GFX90A-TGSPLIT-NEXT: .LBB2_0: ; %main_body +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX90A-TGSPLIT-NEXT: s_mov_b32 m0, s12 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s13 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-LABEL: release_acquire: +; GFX942: ; %bb.1: +; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_branch .LBB2_0 +; GFX942-NEXT: .p2align 8 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: .LBB2_0: ; %main_body +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX942-NEXT: s_mov_b32 m0, s12 +; GFX942-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX942-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_store_dword v0, v1, s[0:1] sc0 +; GFX942-NEXT: global_load_dword v1, v0, s[0:1] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s13 +; GFX942-NEXT: ds_read_b32 v1, v1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_store_dword v0, v1, s[2:3] +; GFX942-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: release_acquire: +; 
GFX942-TGSPLIT: ; %bb.1: +; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_branch .LBB2_0 +; GFX942-TGSPLIT-NEXT: .p2align 8 +; GFX942-TGSPLIT-NEXT: ; %bb.2: +; GFX942-TGSPLIT-NEXT: .LBB2_0: ; %main_body +; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX942-TGSPLIT-NEXT: s_mov_b32 m0, s12 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 +; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s13 +; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX10WGP-LABEL: release_acquire: +; GFX10WGP: ; %bb.0: ; %main_body +; GFX10WGP-NEXT: s_clause 0x2 +; GFX10WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10WGP-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x3c +; GFX10WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10WGP-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX10WGP-NEXT: v_mov_b32_e32 v2, 1 +; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10WGP-NEXT: s_mov_b32 m0, s6 +; GFX10WGP-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds +; GFX10WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10WGP-NEXT: global_store_dword v0, v2, s[8:9] +; GFX10WGP-NEXT: global_load_dword v1, v0, s[8:9] glc +; GFX10WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10WGP-NEXT: ds_read_b32 v1, v1 +; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX10WGP-NEXT: global_store_dword v0, v1, s[10:11] +; GFX10WGP-NEXT: s_endpgm +; +; GFX10CU-LABEL: release_acquire: +; GFX10CU: ; %bb.0: ; %main_body +; GFX10CU-NEXT: s_clause 0x2 +; GFX10CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10CU-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x3c +; GFX10CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10CU-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX10CU-NEXT: v_mov_b32_e32 v2, 1 +; GFX10CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10CU-NEXT: s_mov_b32 m0, s6 +; GFX10CU-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds +; GFX10CU-NEXT: s_waitcnt vmcnt(0) +; GFX10CU-NEXT: global_store_dword v0, v2, s[8:9] +; GFX10CU-NEXT: global_load_dword v1, v0, s[8:9] +; GFX10CU-NEXT: s_waitcnt vmcnt(0) +; GFX10CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10CU-NEXT: ds_read_b32 v1, v1 +; GFX10CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10CU-NEXT: global_store_dword v0, v1, s[10:11] +; GFX10CU-NEXT: s_endpgm + ptr addrspace(3) inreg %lds1, + ptr addrspace(3) inreg %lds2, + ptr addrspace(1) %flag, + ptr addrspace(1) %dummy2) { +main_body: + call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds1, i32 4, i32 2048, i32 0, i32 0, i32 0), !alias.scope !102 + store atomic i32 1, ptr addrspace(1) %flag syncscope("workgroup") release, align 4, !noalias !105 + %unused_flag = load atomic i32, ptr addrspace(1) %flag syncscope("workgroup") acquire, align 4, !noalias !105 + %load = load i32, ptr addrspace(3) %lds2, align 4, !noalias !105 + store i32 %load, ptr addrspace(1) %dummy2, align 4, !noalias !105 + ret void +} + +!100 = !{!100} +!101 = !{!101, !100} +!102 = !{!101} +!103 = !{!103, !100} +!104 = !{!103} +!105 = !{!101, !103}