Skip to content

[AMDGPU] introduce S_WAITCNT_LDS_DIRECT in the memory legalizer #150887

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jul 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1381,6 +1381,20 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
Modified = true;
} else
WaitcntInstr = ⅈ
} else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
assert(ST->hasVMemToLDSLoad());
LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
<< "Before: " << Wait.LoadCnt << '\n';);
ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait);
LLVM_DEBUG(dbgs() << "After: " << Wait.LoadCnt << '\n';);

// It is possible (but unlikely) that this is the only wait instruction,
// in which case, we exit this loop without a WaitcntInstr to consume
// `Wait`. But that works because `Wait` was passed in by reference, and
// the callee eventually calls createNewWaitcnt on it. We test this
// possibility in an articial MIR test since such a situation cannot be
// recreated by running the memory legalizer.
II.eraseFromParent();
} else {
assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
Expand Down Expand Up @@ -1552,6 +1566,11 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
ScoreBrackets.simplifyWaitcnt(OldWait);
Wait = Wait.combined(OldWait);
UpdatableInstr = &CombinedStoreDsCntInstr;
} else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
// Architectures higher than GFX10 do not have direct loads to
// LDS, so no work required here yet.
II.eraseFromParent();
continue;
} else {
std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
assert(CT.has_value());
Expand Down Expand Up @@ -2442,6 +2461,7 @@ static bool isWaitInstr(MachineInstr &Inst) {
Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
Opcode == AMDGPU::S_WAITCNT_lds_direct ||
counterTypeForInstr(Opcode).has_value();
}

Expand Down
20 changes: 20 additions & 0 deletions llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1170,6 +1170,16 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
Changed = true;
}

// On architectures that support direct loads to LDS, emit an unknown waitcnt
// at workgroup-scoped release operations that specify the LDS address space.
// SIInsertWaitcnts will later replace this with a vmcnt().
if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
Scope == SIAtomicScope::WORKGROUP &&
(AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
Changed = true;
}

if (Pos == Position::AFTER)
--MI;

Expand Down Expand Up @@ -2078,6 +2088,16 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
Changed = true;
}

// On architectures that support direct loads to LDS, emit an unknown waitcnt
// at workgroup-scoped release operations that specify the LDS address space.
// SIInsertWaitcnts will later replace this with a vmcnt().
if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
Scope == SIAtomicScope::WORKGROUP &&
(AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
Changed = true;
}

if (VSCnt) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/Target/AMDGPU/SOPInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1621,6 +1621,13 @@ let OtherPredicates = [HasImageInsts] in {
def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
}

// Represents the point at which a wave must wait for all outstanding direct loads to LDS.
// Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts.

def S_WAITCNT_lds_direct : SPseudoInstSI<(outs), (ins)> {
let hasSideEffects = 0;
}

def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_sethalt timm:$simm16)]>;
def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -545,11 +545,13 @@ define amdgpu_kernel void @workgroup_one_as_release() #0 {
; GFX10WGP-LABEL: name: workgroup_one_as_release
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 16240
; GFX10WGP-NEXT: S_WAITCNT_lds_direct
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: workgroup_one_as_release
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_lds_direct
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_one_as_release
Expand Down Expand Up @@ -578,12 +580,14 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 {
; GFX10WGP-LABEL: name: workgroup_one_as_acq_rel
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 16240
; GFX10WGP-NEXT: S_WAITCNT_lds_direct
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: workgroup_one_as_acq_rel
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_lds_direct
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_one_as_acq_rel
Expand Down Expand Up @@ -613,12 +617,14 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 {
; GFX10WGP-LABEL: name: workgroup_one_as_seq_cst
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 16240
; GFX10WGP-NEXT: S_WAITCNT_lds_direct
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: workgroup_one_as_seq_cst
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_lds_direct
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_one_as_seq_cst
Expand Down Expand Up @@ -1293,12 +1299,14 @@ define amdgpu_kernel void @workgroup_release() #0 {
; GFX10WGP-LABEL: name: workgroup_release
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 112
; GFX10WGP-NEXT: S_WAITCNT_lds_direct
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: workgroup_release
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 49279
; GFX10CU-NEXT: S_WAITCNT_lds_direct
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_release
Expand Down Expand Up @@ -1330,13 +1338,15 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 {
; GFX10WGP-LABEL: name: workgroup_acq_rel
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 112
; GFX10WGP-NEXT: S_WAITCNT_lds_direct
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: workgroup_acq_rel
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 49279
; GFX10CU-NEXT: S_WAITCNT_lds_direct
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_acq_rel
Expand Down Expand Up @@ -1369,13 +1379,15 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 {
; GFX10WGP-LABEL: name: workgroup_seq_cst
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 112
; GFX10WGP-NEXT: S_WAITCNT_lds_direct
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: workgroup_seq_cst
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 49279
; GFX10CU-NEXT: S_WAITCNT_lds_direct
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_seq_cst
Expand Down
133 changes: 133 additions & 0 deletions llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GCN %s


# Expected vmcnt(0) since the direct load is the only load.
---
name: dma_then_fence
body: |
bb.0:
; GCN-LABEL: name: dma_then_fence
; GCN: S_WAITCNT 0
; GCN-NEXT: $m0 = S_MOV_B32 0
; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
; GCN-NEXT: S_WAITCNT 3952
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$m0 = S_MOV_B32 0
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
S_WAITCNT_lds_direct
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
S_ENDPGM 0

...

# Expected vmcnt(1) since the global load is not processed by SIInsertWaitcnts.

---
name: dma_then_global_load
body: |
bb.0:
; GCN-LABEL: name: dma_then_global_load
; GCN: S_WAITCNT 0
; GCN-NEXT: $m0 = S_MOV_B32 0
; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
; GCN-NEXT: S_WAITCNT 3953
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$m0 = S_MOV_B32 0
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
S_WAITCNT_lds_direct
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
S_ENDPGM 0

...

# Expected no vmcnt since there is no direct load to LDS, and the global load is not processed by SIInsertWaitcnts.

---
name: no_dma_just_fence
body: |
bb.0:
; GCN-LABEL: name: no_dma_just_fence
; GCN: S_WAITCNT 0
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
S_WAITCNT_lds_direct
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
S_ENDPGM 0

...

# Expected vmcnt(1) since the global load is not processed by SIInsertWaitcnts.

---
name: dma_then_system_fence
body: |
bb.0:
; GCN-LABEL: name: dma_then_system_fence
; GCN: S_WAITCNT 0
; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
; GCN-NEXT: S_WAITCNT 3953
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
S_WAITCNT_lds_direct
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
S_ENDPGM 0

...

# The computed vmcnt(1) gets merged with the existing vmcnt(0).

---
name: merge_with_prev_wait
body: |
bb.0:
; GCN-LABEL: name: merge_with_prev_wait
; GCN: S_WAITCNT 0
; GCN-NEXT: $m0 = S_MOV_B32 0
; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
; GCN-NEXT: S_WAITCNT 3952
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$m0 = S_MOV_B32 0
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
S_WAITCNT 3952
S_WAITCNT_lds_direct
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
S_ENDPGM 0

...

# The computed vmcnt(1) gets merged with the existing vmcnt(0).

---
name: merge_with_next_wait
body: |
bb.0:
; GCN-LABEL: name: merge_with_next_wait
; GCN: S_WAITCNT 0
; GCN-NEXT: $m0 = S_MOV_B32 0
; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
; GCN-NEXT: S_WAITCNT 3952
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$m0 = S_MOV_B32 0
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
S_WAITCNT_lds_direct
S_WAITCNT 3952
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
S_ENDPGM 0

...
Loading