Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1381,6 +1381,20 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
Modified = true;
} else
WaitcntInstr = ⅈ
} else if (Opcode == AMDGPU::S_WAITCNT_LDS_DIRECT) {
assert(ST->hasVMemToLDSLoad());
LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_LDS_DIRECT: " << II
<< "Before: " << Wait.LoadCnt << '\n';);
ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait);
LLVM_DEBUG(dbgs() << "After: " << Wait.LoadCnt << '\n';);

// It is possible (but unlikely) that this is the only wait instruction,
// in which case, we exit this loop without a WaitcntInstr to consume
// `Wait`. But that works because `Wait` was passed in by reference, and
// the callee eventually calls createNewWaitcnt on it. We test this
// possibility in an articial MIR test since such a situation cannot be
// recreated by running the memory legalizer.
II.eraseFromParent();
} else {
assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
Expand Down Expand Up @@ -1552,6 +1566,11 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
ScoreBrackets.simplifyWaitcnt(OldWait);
Wait = Wait.combined(OldWait);
UpdatableInstr = &CombinedStoreDsCntInstr;
} else if (Opcode == AMDGPU::S_WAITCNT_LDS_DIRECT) {
// Architectures higher than GFX10 do not have direct loads to
// LDS, so no work required here yet.
II.eraseFromParent();
continue;
} else {
std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
assert(CT.has_value());
Expand Down Expand Up @@ -2442,6 +2461,7 @@ static bool isWaitInstr(MachineInstr &Inst) {
Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
Opcode == AMDGPU::S_WAITCNT_LDS_DIRECT ||
counterTypeForInstr(Opcode).has_value();
}

Expand Down
20 changes: 20 additions & 0 deletions llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1170,6 +1170,16 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
Changed = true;
}

// On architectures that support direct loads to LDS, emit an unknown waitcnt
// at workgroup-scoped release operations that specify the LDS address space.
// SIInsertWaitcnts will later replace this with a vmcnt().
if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
Scope == SIAtomicScope::WORKGROUP &&
any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) {
(AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {

Just to be consistent with the rest of the file

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Meh. Fixed it. Although I would rather have the rest of the file be consistent with this (highly recommended and correct) use of any(). In fact BitmaskEnum there should be able to provide a much clearer way to check for individual bits.

BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_LDS_DIRECT));
Changed = true;
}

if (Pos == Position::AFTER)
--MI;

Expand Down Expand Up @@ -2078,6 +2088,16 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
Changed = true;
}

// On architectures that support direct loads to LDS, emit an unknown waitcnt
// at workgroup-scoped release operations that specify the LDS address space.
// SIInsertWaitcnts will later replace this with a vmcnt().
if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
Scope == SIAtomicScope::WORKGROUP &&
any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here

BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_LDS_DIRECT));
Changed = true;
}

if (VSCnt) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/Target/AMDGPU/SOPInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1621,6 +1621,13 @@ let OtherPredicates = [HasImageInsts] in {
def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
}

// Represents the point at which a wave must wait for all outstanding direct loads to LDS.
// Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts.

def S_WAITCNT_LDS_DIRECT : SPseudoInstSI<(outs), (ins)> {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Synthetic variants of real instructions should use a lowercase suffix

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I have a great deal of uncertainty on what this pseudo should be called. It seems the "_soft" suffix signifies a definite wait count argument, which the SIInsertWaitcnts pass is allowed to relax. Are you suggesting "S_WAITCNT_lds_direct"? Because "_direct" as a suffix doesn't contain enough information, and there is no "S_WAITCNT_LDS".

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WAIT_LDS_DIRECT_PSEUDO (following a similar pattern to ATOMIC_FENCE and other SOP pseudos?)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

S_WAITCNT_something_lowercase

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am now using S_WAITCNT_lds_direct .... it represents a wait count, for direct loads to LDS, but it's not real. Having it called S_WAITCNT_something like everything else near it helps reinforce that its relevance is to the SIInsertWaitcnts pass, where it is included by the isWaitInstr() predicate.

let hasSideEffects = 0;
}

def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_sethalt timm:$simm16)]>;
def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -545,11 +545,13 @@ define amdgpu_kernel void @workgroup_one_as_release() #0 {
; GFX10WGP-LABEL: name: workgroup_one_as_release
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 16240
; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: workgroup_one_as_release
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_one_as_release
Expand Down Expand Up @@ -578,12 +580,14 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 {
; GFX10WGP-LABEL: name: workgroup_one_as_acq_rel
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 16240
; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: workgroup_one_as_acq_rel
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_one_as_acq_rel
Expand Down Expand Up @@ -613,12 +617,14 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 {
; GFX10WGP-LABEL: name: workgroup_one_as_seq_cst
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 16240
; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: workgroup_one_as_seq_cst
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_one_as_seq_cst
Expand Down Expand Up @@ -1293,12 +1299,14 @@ define amdgpu_kernel void @workgroup_release() #0 {
; GFX10WGP-LABEL: name: workgroup_release
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 112
; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: workgroup_release
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 49279
; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_release
Expand Down Expand Up @@ -1330,13 +1338,15 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 {
; GFX10WGP-LABEL: name: workgroup_acq_rel
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 112
; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: workgroup_acq_rel
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 49279
; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_acq_rel
Expand Down Expand Up @@ -1369,13 +1379,15 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 {
; GFX10WGP-LABEL: name: workgroup_seq_cst
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 112
; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: workgroup_seq_cst
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 49279
; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_seq_cst
Expand Down
133 changes: 133 additions & 0 deletions llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GCN %s


# Expected vmcnt(0) since the direct load is the only load.
---
name: dma_then_fence
body: |
bb.0:
; GCN-LABEL: name: dma_then_fence
; GCN: S_WAITCNT 0
; GCN-NEXT: $m0 = S_MOV_B32 0
; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
; GCN-NEXT: S_WAITCNT 3952
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$m0 = S_MOV_B32 0
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
S_WAITCNT_LDS_DIRECT
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
S_ENDPGM 0

...

# Expected vmcnt(1) since the global load is not processed by SIInsertWaitcnts.

---
name: dma_then_global_load
body: |
bb.0:
; GCN-LABEL: name: dma_then_global_load
; GCN: S_WAITCNT 0
; GCN-NEXT: $m0 = S_MOV_B32 0
; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
; GCN-NEXT: S_WAITCNT 3953
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$m0 = S_MOV_B32 0
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
S_WAITCNT_LDS_DIRECT
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
S_ENDPGM 0

...

# Expected no vmcnt since there is no direct load to LDS, and the global load is not processed by SIInsertWaitcnts.

---
name: no_dma_just_fence
body: |
bb.0:
; GCN-LABEL: name: no_dma_just_fence
; GCN: S_WAITCNT 0
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
S_WAITCNT_LDS_DIRECT
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
S_ENDPGM 0

...

# Expected vmcnt(1) since the global load is not processed by SIInsertWaitcnts.

---
name: dma_then_system_fence
body: |
bb.0:
; GCN-LABEL: name: dma_then_system_fence
; GCN: S_WAITCNT 0
; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
; GCN-NEXT: S_WAITCNT 3953
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
S_WAITCNT_LDS_DIRECT
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
S_ENDPGM 0

...

# The computed vmcnt(1) gets merged with the existing vmcnt(0).

---
name: merge_with_prev_wait
body: |
bb.0:
; GCN-LABEL: name: merge_with_prev_wait
; GCN: S_WAITCNT 0
; GCN-NEXT: $m0 = S_MOV_B32 0
; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
; GCN-NEXT: S_WAITCNT 3952
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$m0 = S_MOV_B32 0
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
S_WAITCNT 3952
S_WAITCNT_LDS_DIRECT
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
S_ENDPGM 0

...

# The computed vmcnt(1) gets merged with the existing vmcnt(0).

---
name: merge_with_next_wait
body: |
bb.0:
; GCN-LABEL: name: merge_with_next_wait
; GCN: S_WAITCNT 0
; GCN-NEXT: $m0 = S_MOV_B32 0
; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
; GCN-NEXT: S_WAITCNT 3952
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$m0 = S_MOV_B32 0
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
S_WAITCNT_LDS_DIRECT
S_WAITCNT 3952
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
S_ENDPGM 0

...
Loading