-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU] introduce S_WAITCNT_LDS_DIRECT in the memory legalizer #150887
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1170,6 +1170,16 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, | |
Changed = true; | ||
} | ||
|
||
// On architectures that support direct loads to LDS, emit an unknown waitcnt | ||
// at workgroup-scoped release operations that specify the LDS address space. | ||
// SIInsertWaitcnts will later replace this with a vmcnt(). | ||
if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) && | ||
Scope == SIAtomicScope::WORKGROUP && | ||
any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) { | ||
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_LDS_DIRECT)); | ||
Changed = true; | ||
} | ||
|
||
if (Pos == Position::AFTER) | ||
--MI; | ||
|
||
|
@@ -2078,6 +2088,16 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, | |
Changed = true; | ||
} | ||
|
||
// On architectures that support direct loads to LDS, emit an unknown waitcnt | ||
// at workgroup-scoped release operations that specify the LDS address space. | ||
// SIInsertWaitcnts will later replace this with a vmcnt(). | ||
if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) && | ||
Scope == SIAtomicScope::WORKGROUP && | ||
any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) { | ||
|
||
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_LDS_DIRECT)); | ||
Changed = true; | ||
} | ||
|
||
if (VSCnt) { | ||
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft)) | ||
.addReg(AMDGPU::SGPR_NULL, RegState::Undef) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1621,6 +1621,13 @@ let OtherPredicates = [HasImageInsts] in { | |
def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">; | ||
} | ||
|
||
// Represents the point at which a wave must wait for all outstanding direct loads to LDS. | ||
// Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts. | ||
|
||
def S_WAITCNT_LDS_DIRECT : SPseudoInstSI<(outs), (ins)> { | ||
|
||
let hasSideEffects = 0; | ||
} | ||
|
||
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16", | ||
[(int_amdgcn_s_sethalt timm:$simm16)]>; | ||
def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 | ||
# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GCN %s | ||
|
||
|
||
# Expected vmcnt(0) since the direct load is the only load. | ||
--- | ||
name: dma_then_fence | ||
body: | | ||
bb.0: | ||
; GCN-LABEL: name: dma_then_fence | ||
; GCN: S_WAITCNT 0 | ||
; GCN-NEXT: $m0 = S_MOV_B32 0 | ||
; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3) | ||
; GCN-NEXT: S_WAITCNT 3952 | ||
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec | ||
; GCN-NEXT: S_ENDPGM 0 | ||
$m0 = S_MOV_B32 0 | ||
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) | ||
S_WAITCNT_LDS_DIRECT | ||
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec | ||
S_ENDPGM 0 | ||
|
||
... | ||
|
||
# Expected vmcnt(1) since the global load is not processed by SIInsertWaitcnts. | ||
|
||
--- | ||
name: dma_then_global_load | ||
body: | | ||
bb.0: | ||
; GCN-LABEL: name: dma_then_global_load | ||
; GCN: S_WAITCNT 0 | ||
; GCN-NEXT: $m0 = S_MOV_B32 0 | ||
; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3) | ||
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec | ||
; GCN-NEXT: S_WAITCNT 3953 | ||
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec | ||
; GCN-NEXT: S_ENDPGM 0 | ||
$m0 = S_MOV_B32 0 | ||
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) | ||
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec | ||
S_WAITCNT_LDS_DIRECT | ||
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec | ||
S_ENDPGM 0 | ||
|
||
... | ||
|
||
# Expected no vmcnt since there is no direct load to LDS, and the global load is not processed by SIInsertWaitcnts. | ||
|
||
--- | ||
name: no_dma_just_fence | ||
body: | | ||
bb.0: | ||
; GCN-LABEL: name: no_dma_just_fence | ||
; GCN: S_WAITCNT 0 | ||
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec | ||
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec | ||
; GCN-NEXT: S_ENDPGM 0 | ||
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec | ||
S_WAITCNT_LDS_DIRECT | ||
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec | ||
S_ENDPGM 0 | ||
|
||
... | ||
|
||
# Expected vmcnt(1) since the global load is not processed by SIInsertWaitcnts. | ||
|
||
--- | ||
name: dma_then_system_fence | ||
body: | | ||
bb.0: | ||
; GCN-LABEL: name: dma_then_system_fence | ||
; GCN: S_WAITCNT 0 | ||
; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3) | ||
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec | ||
; GCN-NEXT: S_WAITCNT 3953 | ||
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec | ||
; GCN-NEXT: S_ENDPGM 0 | ||
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) | ||
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec | ||
S_WAITCNT_LDS_DIRECT | ||
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec | ||
S_ENDPGM 0 | ||
|
||
... | ||
|
||
# The computed vmcnt(1) gets merged with the existing vmcnt(0). | ||
|
||
--- | ||
name: merge_with_prev_wait | ||
body: | | ||
bb.0: | ||
; GCN-LABEL: name: merge_with_prev_wait | ||
; GCN: S_WAITCNT 0 | ||
; GCN-NEXT: $m0 = S_MOV_B32 0 | ||
; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3) | ||
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec | ||
; GCN-NEXT: S_WAITCNT 3952 | ||
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec | ||
; GCN-NEXT: S_ENDPGM 0 | ||
$m0 = S_MOV_B32 0 | ||
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) | ||
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec | ||
S_WAITCNT 3952 | ||
S_WAITCNT_LDS_DIRECT | ||
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec | ||
S_ENDPGM 0 | ||
|
||
... | ||
|
||
# The computed vmcnt(1) gets merged with the existing vmcnt(0). | ||
|
||
--- | ||
name: merge_with_next_wait | ||
body: | | ||
bb.0: | ||
; GCN-LABEL: name: merge_with_next_wait | ||
; GCN: S_WAITCNT 0 | ||
; GCN-NEXT: $m0 = S_MOV_B32 0 | ||
; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3) | ||
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec | ||
; GCN-NEXT: S_WAITCNT 3952 | ||
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec | ||
; GCN-NEXT: S_ENDPGM 0 | ||
$m0 = S_MOV_B32 0 | ||
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) | ||
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec | ||
S_WAITCNT_LDS_DIRECT | ||
S_WAITCNT 3952 | ||
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec | ||
S_ENDPGM 0 | ||
|
||
... |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just to be consistent with the rest of the file
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Meh. Fixed it. Although I would rather have the rest of the file be consistent with this (highly recommended and correct) use of `any()`. In fact, `BitmaskEnum` should be able to provide a much clearer way to check for individual bits.