From 2efe0cd67fc0e7f2d035d7913cbf858493036fb0 Mon Sep 17 00:00:00 2001 From: Sameer Sahasrabuddhe Date: Tue, 17 Jun 2025 13:11:55 +0530 Subject: [PATCH 1/4] [AMDGCN] pre-checkin test for LDS DMA and release operations --- .../AMDGPU/lds-dma-workgroup-release.ll | 541 ++++++++++++++++++ 1 file changed, 541 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll new file mode 100644 index 0000000000000..98e42a2c4c402 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll @@ -0,0 +1,541 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GFX900 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s --check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck %s --check-prefixes=GFX90A-TGSPLIT +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=GFX942 +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -mattr=+tgsplit < %s | FileCheck %s --check-prefixes=GFX942-TGSPLIT +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10WGP +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck %s -check-prefixes=GFX10CU + +; In each of these tests, an LDS DMA operation is followed by a release pattern +; at workgroup scope. The fence in such a release (implicit or explicit) should +; wait for the store component in the LDS DMA. The additional noalias metadata +; is just meant to ensure that the wait counts are not generated due to some +; unintended aliasing. 
+ +declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux) + +define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc, +; GFX900-LABEL: barrier_release: +; GFX900: ; %bb.0: ; %main_body +; GFX900-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX900-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 m0, s12 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds +; GFX900-NEXT: v_mov_b32_e32 v0, s13 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_barrier +; GFX900-NEXT: ds_read_b32 v0, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_store_dword v1, v0, s[14:15] +; GFX900-NEXT: s_endpgm +; +; GFX90A-LABEL: barrier_release: +; GFX90A: ; %bb.1: +; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_branch .LBB0_0 +; GFX90A-NEXT: .p2align 8 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: .LBB0_0: ; %main_body +; GFX90A-NEXT: s_mov_b32 m0, s12 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX90A-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds +; GFX90A-NEXT: v_mov_b32_e32 v0, s13 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_barrier +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ds_read_b32 v0, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: barrier_release: +; GFX90A-TGSPLIT: ; %bb.1: +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_branch .LBB0_0 +; GFX90A-TGSPLIT-NEXT: .p2align 8 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: 
+; GFX90A-TGSPLIT-NEXT: .LBB0_0: ; %main_body +; GFX90A-TGSPLIT-NEXT: s_mov_b32 m0, s12 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s13 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_barrier +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-LABEL: barrier_release: +; GFX942: ; %bb.1: +; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_branch .LBB0_0 +; GFX942-NEXT: .p2align 8 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: .LBB0_0: ; %main_body +; GFX942-NEXT: s_mov_b32 m0, s12 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX942-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds +; GFX942-NEXT: v_mov_b32_e32 v0, s13 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_barrier +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: ds_read_b32 v0, v0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_store_dword v1, v0, s[0:1] +; GFX942-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: barrier_release: +; GFX942-TGSPLIT: ; %bb.1: +; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_branch .LBB0_0 +; GFX942-TGSPLIT-NEXT: .p2align 8 +; GFX942-TGSPLIT-NEXT: ; %bb.2: +; GFX942-TGSPLIT-NEXT: .LBB0_0: ; %main_body +; GFX942-TGSPLIT-NEXT: s_mov_b32 m0, s12 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX942-TGSPLIT-NEXT: 
buffer_load_dword v0, s[8:11], 0 offen lds +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s13 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_barrier +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX10WGP-LABEL: barrier_release: +; GFX10WGP: ; %bb.0: ; %main_body +; GFX10WGP-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10WGP-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX10WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10WGP-NEXT: s_mov_b32 m0, s12 +; GFX10WGP-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds +; GFX10WGP-NEXT: v_mov_b32_e32 v0, s13 +; GFX10WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10WGP-NEXT: s_barrier +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10WGP-NEXT: ds_read_b32 v0, v0 +; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10WGP-NEXT: global_store_dword v1, v0, s[14:15] +; GFX10WGP-NEXT: s_endpgm +; +; GFX10CU-LABEL: barrier_release: +; GFX10CU: ; %bb.0: ; %main_body +; GFX10CU-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10CU-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX10CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX10CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10CU-NEXT: s_mov_b32 m0, s12 +; GFX10CU-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds +; GFX10CU-NEXT: v_mov_b32_e32 v0, s13 +; GFX10CU-NEXT: s_barrier +; GFX10CU-NEXT: s_waitcnt vmcnt(0) +; GFX10CU-NEXT: ds_read_b32 v0, v0 +; GFX10CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10CU-NEXT: global_store_dword v1, v0, s[14:15] +; GFX10CU-NEXT: s_endpgm + ptr addrspace(3) inreg %lds1, + ptr addrspace(3) inreg %lds2, + ptr addrspace(1) %dummy2) { +main_body: + call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds1, i32 4, i32 2048, i32 0, i32 0, i32 0), !alias.scope !102 + fence 
syncscope("workgroup") release + tail call void @llvm.amdgcn.s.barrier() + fence syncscope("workgroup") acquire + %load = load i32, ptr addrspace(3) %lds2, align 4, !noalias !105 + store i32 %load, ptr addrspace(1) %dummy2, align 4, !noalias !105 + ret void +} + +define amdgpu_kernel void @fence_fence(<4 x i32> inreg %rsrc, +; GFX900-LABEL: fence_fence: +; GFX900: ; %bb.0: ; %main_body +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX900-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x3c +; GFX900-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 m0, s6 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds +; GFX900-NEXT: v_mov_b32_e32 v1, 1 +; GFX900-NEXT: global_store_dword v0, v1, s[8:9] +; GFX900-NEXT: global_load_dword v1, v0, s[8:9] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v1, s7 +; GFX900-NEXT: ds_read_b32 v1, v1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_store_dword v0, v1, s[10:11] +; GFX900-NEXT: s_endpgm +; +; GFX90A-LABEL: fence_fence: +; GFX90A: ; %bb.1: +; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_branch .LBB1_0 +; GFX90A-NEXT: .p2align 8 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: .LBB1_0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX90A-NEXT: s_mov_b32 m0, s12 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX90A-NEXT: v_mov_b32_e32 v1, 1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, s13 +; GFX90A-NEXT: ds_read_b32 v1, v1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: fence_fence: +; GFX90A-TGSPLIT: ; %bb.1: +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_branch .LBB1_0 +; GFX90A-TGSPLIT-NEXT: .p2align 8 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: +; GFX90A-TGSPLIT-NEXT: .LBB1_0: ; %main_body +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX90A-TGSPLIT-NEXT: s_mov_b32 m0, s12 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s13 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-LABEL: fence_fence: +; GFX942: ; %bb.1: +; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_branch .LBB1_0 +; GFX942-NEXT: .p2align 8 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: .LBB1_0: ; %main_body +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX942-NEXT: s_mov_b32 m0, s12 +; GFX942-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX942-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_store_dword v0, v1, s[0:1] sc0 +; GFX942-NEXT: global_load_dword v1, v0, s[0:1] sc0 +; GFX942-NEXT: s_waitcnt 
vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s13 +; GFX942-NEXT: ds_read_b32 v1, v1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_store_dword v0, v1, s[2:3] +; GFX942-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: fence_fence: +; GFX942-TGSPLIT: ; %bb.1: +; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_branch .LBB1_0 +; GFX942-TGSPLIT-NEXT: .p2align 8 +; GFX942-TGSPLIT-NEXT: ; %bb.2: +; GFX942-TGSPLIT-NEXT: .LBB1_0: ; %main_body +; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX942-TGSPLIT-NEXT: s_mov_b32 m0, s12 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 +; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s13 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX10WGP-LABEL: fence_fence: +; GFX10WGP: ; %bb.0: ; %main_body +; GFX10WGP-NEXT: s_clause 0x2 +; GFX10WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10WGP-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x3c +; GFX10WGP-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX10WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX10WGP-NEXT: v_mov_b32_e32 v2, 1 +; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10WGP-NEXT: s_mov_b32 m0, s6 +; GFX10WGP-NEXT: buffer_load_dword v0, s[0:3], 0 offen lds +; GFX10WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10WGP-NEXT: global_store_dword v1, v2, s[8:9] +; GFX10WGP-NEXT: 
global_load_dword v0, v1, s[8:9] glc +; GFX10WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10WGP-NEXT: v_mov_b32_e32 v0, s7 +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10WGP-NEXT: ds_read_b32 v0, v0 +; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10WGP-NEXT: global_store_dword v1, v0, s[10:11] +; GFX10WGP-NEXT: s_endpgm +; +; GFX10CU-LABEL: fence_fence: +; GFX10CU: ; %bb.0: ; %main_body +; GFX10CU-NEXT: s_clause 0x2 +; GFX10CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10CU-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x3c +; GFX10CU-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX10CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX10CU-NEXT: v_mov_b32_e32 v2, 1 +; GFX10CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10CU-NEXT: s_mov_b32 m0, s6 +; GFX10CU-NEXT: buffer_load_dword v0, s[0:3], 0 offen lds +; GFX10CU-NEXT: global_store_dword v1, v2, s[8:9] +; GFX10CU-NEXT: global_load_dword v0, v1, s[8:9] +; GFX10CU-NEXT: s_waitcnt vmcnt(0) +; GFX10CU-NEXT: v_mov_b32_e32 v0, s7 +; GFX10CU-NEXT: ds_read_b32 v0, v0 +; GFX10CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10CU-NEXT: global_store_dword v1, v0, s[10:11] +; GFX10CU-NEXT: s_endpgm + ptr addrspace(3) inreg %lds1, + ptr addrspace(3) inreg %lds2, + ptr addrspace(1) %flag, + ptr addrspace(1) %dummy2) { +main_body: + call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds1, i32 4, i32 2048, i32 0, i32 0, i32 0), !alias.scope !102 + fence syncscope("workgroup") release + store atomic i32 1, ptr addrspace(1) %flag syncscope("workgroup") monotonic, align 4, !noalias !105 + %unused_flag = load atomic i32, ptr addrspace(1) %flag syncscope("workgroup") monotonic, align 4, !noalias !105 + fence syncscope("workgroup") acquire + %load = load i32, ptr addrspace(3) %lds2, align 4, !noalias !105 + store i32 %load, ptr addrspace(1) %dummy2, align 4, !noalias !105 + ret void +} + +define amdgpu_kernel void @release_acquire(<4 x i32> inreg %rsrc, +; GFX900-LABEL: release_acquire: 
+; GFX900: ; %bb.0: ; %main_body +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX900-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x3c +; GFX900-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 m0, s6 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds +; GFX900-NEXT: v_mov_b32_e32 v1, 1 +; GFX900-NEXT: global_store_dword v0, v1, s[8:9] +; GFX900-NEXT: global_load_dword v1, v0, s[8:9] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v1, s7 +; GFX900-NEXT: ds_read_b32 v1, v1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_store_dword v0, v1, s[10:11] +; GFX900-NEXT: s_endpgm +; +; GFX90A-LABEL: release_acquire: +; GFX90A: ; %bb.1: +; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_branch .LBB2_0 +; GFX90A-NEXT: .p2align 8 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: .LBB2_0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX90A-NEXT: s_mov_b32 m0, s12 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX90A-NEXT: v_mov_b32_e32 v1, 1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, s13 +; GFX90A-NEXT: ds_read_b32 v1, v1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: release_acquire: +; GFX90A-TGSPLIT: ; %bb.1: +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_branch .LBB2_0 +; 
GFX90A-TGSPLIT-NEXT: .p2align 8 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: +; GFX90A-TGSPLIT-NEXT: .LBB2_0: ; %main_body +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX90A-TGSPLIT-NEXT: s_mov_b32 m0, s12 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s13 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-LABEL: release_acquire: +; GFX942: ; %bb.1: +; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_branch .LBB2_0 +; GFX942-NEXT: .p2align 8 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: .LBB2_0: ; %main_body +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX942-NEXT: s_mov_b32 m0, s12 +; GFX942-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX942-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_store_dword v0, v1, s[0:1] sc0 +; GFX942-NEXT: global_load_dword v1, v0, s[0:1] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s13 +; GFX942-NEXT: ds_read_b32 v1, v1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_store_dword v0, v1, s[2:3] +; GFX942-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: release_acquire: +; GFX942-TGSPLIT: ; %bb.1: +; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; 
GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_branch .LBB2_0 +; GFX942-TGSPLIT-NEXT: .p2align 8 +; GFX942-TGSPLIT-NEXT: ; %bb.2: +; GFX942-TGSPLIT-NEXT: .LBB2_0: ; %main_body +; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX942-TGSPLIT-NEXT: s_mov_b32 m0, s12 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 +; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s13 +; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX10WGP-LABEL: release_acquire: +; GFX10WGP: ; %bb.0: ; %main_body +; GFX10WGP-NEXT: s_clause 0x2 +; GFX10WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10WGP-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x3c +; GFX10WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10WGP-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX10WGP-NEXT: v_mov_b32_e32 v2, 1 +; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10WGP-NEXT: s_mov_b32 m0, s6 +; GFX10WGP-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds +; GFX10WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10WGP-NEXT: global_store_dword v0, v2, s[8:9] +; GFX10WGP-NEXT: global_load_dword v1, v0, s[8:9] glc +; GFX10WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10WGP-NEXT: ds_read_b32 v1, v1 +; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10WGP-NEXT: global_store_dword v0, v1, s[10:11] +; GFX10WGP-NEXT: s_endpgm +; +; GFX10CU-LABEL: 
release_acquire: +; GFX10CU: ; %bb.0: ; %main_body +; GFX10CU-NEXT: s_clause 0x2 +; GFX10CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10CU-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x3c +; GFX10CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10CU-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX10CU-NEXT: v_mov_b32_e32 v2, 1 +; GFX10CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10CU-NEXT: s_mov_b32 m0, s6 +; GFX10CU-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds +; GFX10CU-NEXT: global_store_dword v0, v2, s[8:9] +; GFX10CU-NEXT: global_load_dword v1, v0, s[8:9] +; GFX10CU-NEXT: s_waitcnt vmcnt(0) +; GFX10CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10CU-NEXT: ds_read_b32 v1, v1 +; GFX10CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10CU-NEXT: global_store_dword v0, v1, s[10:11] +; GFX10CU-NEXT: s_endpgm + ptr addrspace(3) inreg %lds1, + ptr addrspace(3) inreg %lds2, + ptr addrspace(1) %flag, + ptr addrspace(1) %dummy2) { +main_body: + call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds1, i32 4, i32 2048, i32 0, i32 0, i32 0), !alias.scope !102 + store atomic i32 1, ptr addrspace(1) %flag syncscope("workgroup") release, align 4, !noalias !105 + %unused_flag = load atomic i32, ptr addrspace(1) %flag syncscope("workgroup") acquire, align 4, !noalias !105 + %load = load i32, ptr addrspace(3) %lds2, align 4, !noalias !105 + store i32 %load, ptr addrspace(1) %dummy2, align 4, !noalias !105 + ret void +} + +!100 = !{!100} +!101 = !{!101, !100} +!102 = !{!101} +!103 = !{!103, !100} +!104 = !{!103} +!105 = !{!101, !103} From a997d3282b52ed3f2820c1570c4c979efa4ed3bb Mon Sep 17 00:00:00 2001 From: Sameer Sahasrabuddhe Date: Fri, 25 Jul 2025 20:16:21 +0530 Subject: [PATCH 2/4] [AMDGPU] introduce S_WAITCNT_LDS_DIRECT in the memory legalizer The new instruction represents the unknown number of waitcnts needed at a release operation to ensure that prior direct loads to LDS (formerly called LDS DMA) are completed. 
The instruction is replaced in SIInsertWaitcnts with a suitable value for vmcnt(). Co-authored-by: Austin Kerbow . --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 20 +++ llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 20 +++ llvm/lib/Target/AMDGPU/SOPInstructions.td | 7 + .../memory-legalizer-atomic-fence.ll | 12 ++ .../AMDGPU/insert-waitcnts-fence-soft.mir | 133 ++++++++++++++++++ .../AMDGPU/lds-dma-workgroup-release.ll | 20 +-- 6 files changed, 203 insertions(+), 9 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index dd3f2fe25a239..9a4360374621d 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1381,6 +1381,20 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( Modified = true; } else WaitcntInstr = &II; + } else if (Opcode == AMDGPU::S_WAITCNT_LDS_DIRECT) { + assert(ST->hasVMemToLDSLoad()); + LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_LDS_DIRECT: " << II + << "Before: " << Wait.LoadCnt << '\n';); + ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait); + LLVM_DEBUG(dbgs() << "After: " << Wait.LoadCnt << '\n';); + + // It is possible (but unlikely) that this is the only wait instruction, + // in which case, we exit this loop without a WaitcntInstr to consume + // `Wait`. But that works because `Wait` was passed in by reference, and + // the caller eventually calls createNewWaitcnt on it. We test this + // possibility in an artificial MIR test since such a situation cannot be + // recreated by running the memory legalizer. 
+ II.eraseFromParent(); } else { assert(Opcode == AMDGPU::S_WAITCNT_VSCNT); assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL); @@ -1552,6 +1566,11 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( ScoreBrackets.simplifyWaitcnt(OldWait); Wait = Wait.combined(OldWait); UpdatableInstr = &CombinedStoreDsCntInstr; + } else if (Opcode == AMDGPU::S_WAITCNT_LDS_DIRECT) { + // Architectures higher than GFX10 do not have direct loads to + // LDS, so no work required here yet. + II.eraseFromParent(); + continue; } else { std::optional CT = counterTypeForInstr(Opcode); assert(CT.has_value()); @@ -2442,6 +2461,7 @@ static bool isWaitInstr(MachineInstr &Inst) { Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) || Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT || Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT || + Opcode == AMDGPU::S_WAITCNT_LDS_DIRECT || counterTypeForInstr(Opcode).has_value(); } diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 0e8a420fbb70a..30c180c0e420e 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -1170,6 +1170,16 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, Changed = true; } + // On architectures that support direct loads to LDS, emit an unknown waitcnt + // at workgroup-scoped release operations that specify the LDS address space. + // SIInsertWaitcnts will later replace this with a vmcnt(). 
+ if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) && + Scope == SIAtomicScope::WORKGROUP && + any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_LDS_DIRECT)); + Changed = true; + } + if (Pos == Position::AFTER) --MI; @@ -2078,6 +2088,16 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, Changed = true; } + // On architectures that support direct loads to LDS, emit an unknown waitcnt + // at workgroup-scoped release operations that specify the LDS address space. + // SIInsertWaitcnts will later replace this with a vmcnt(). + if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) && + Scope == SIAtomicScope::WORKGROUP && + any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_LDS_DIRECT)); + Changed = true; + } + if (VSCnt) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft)) .addReg(AMDGPU::SGPR_NULL, RegState::Undef) diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index e103ccc2f00e6..09630e20840cf 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1621,6 +1621,13 @@ let OtherPredicates = [HasImageInsts] in { def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">; } +// Represents the point at which a wave must wait for all outstanding direct loads to LDS. +// Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts. 
+ +def S_WAITCNT_LDS_DIRECT : SPseudoInstSI<(outs), (ins)> { + let hasSideEffects = 0; +} + def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_sethalt timm:$simm16)]>; def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll index 66037615f0ba0..5fd8553820685 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll @@ -545,11 +545,13 @@ define amdgpu_kernel void @workgroup_one_as_release() #0 { ; GFX10WGP-LABEL: name: workgroup_one_as_release ; GFX10WGP: bb.0.entry: ; GFX10WGP-NEXT: S_WAITCNT_soft 16240 + ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT ; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: workgroup_one_as_release ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_one_as_release @@ -578,12 +580,14 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 { ; GFX10WGP-LABEL: name: workgroup_one_as_acq_rel ; GFX10WGP: bb.0.entry: ; GFX10WGP-NEXT: S_WAITCNT_soft 16240 + ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT ; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: workgroup_one_as_acq_rel ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_one_as_acq_rel @@ -613,12 +617,14 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 { ; GFX10WGP-LABEL: name: workgroup_one_as_seq_cst ; GFX10WGP: bb.0.entry: ; GFX10WGP-NEXT: S_WAITCNT_soft 16240 + ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT ; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10WGP-NEXT: 
BUFFER_GL0_INV implicit $exec ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: workgroup_one_as_seq_cst ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_one_as_seq_cst @@ -1293,12 +1299,14 @@ define amdgpu_kernel void @workgroup_release() #0 { ; GFX10WGP-LABEL: name: workgroup_release ; GFX10WGP: bb.0.entry: ; GFX10WGP-NEXT: S_WAITCNT_soft 112 + ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT ; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: workgroup_release ; GFX10CU: bb.0.entry: ; GFX10CU-NEXT: S_WAITCNT_soft 49279 + ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_release @@ -1330,6 +1338,7 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 { ; GFX10WGP-LABEL: name: workgroup_acq_rel ; GFX10WGP: bb.0.entry: ; GFX10WGP-NEXT: S_WAITCNT_soft 112 + ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT ; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec ; GFX10WGP-NEXT: S_ENDPGM 0 @@ -1337,6 +1346,7 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 { ; GFX10CU-LABEL: name: workgroup_acq_rel ; GFX10CU: bb.0.entry: ; GFX10CU-NEXT: S_WAITCNT_soft 49279 + ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_acq_rel @@ -1369,6 +1379,7 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 { ; GFX10WGP-LABEL: name: workgroup_seq_cst ; GFX10WGP: bb.0.entry: ; GFX10WGP-NEXT: S_WAITCNT_soft 112 + ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT ; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec ; GFX10WGP-NEXT: S_ENDPGM 0 @@ -1376,6 +1387,7 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 { ; GFX10CU-LABEL: name: workgroup_seq_cst ; GFX10CU: bb.0.entry: ; GFX10CU-NEXT: S_WAITCNT_soft 49279 + ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT ; GFX10CU-NEXT: S_ENDPGM 0 ; ; 
GFX11WGP-LABEL: name: workgroup_seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir new file mode 100644 index 0000000000000..b376360157141 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir @@ -0,0 +1,133 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GCN %s + + +# Expected vmcnt(0) since the direct load is the only load. +--- +name: dma_then_fence +body: | + bb.0: + ; GCN-LABEL: name: dma_then_fence + ; GCN: S_WAITCNT 0 + ; GCN-NEXT: $m0 = S_MOV_B32 0 + ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3) + ; GCN-NEXT: S_WAITCNT 3952 + ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $m0 = S_MOV_B32 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + S_WAITCNT_LDS_DIRECT + $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 + +... + +# Expected vmcnt(1) since the global load is not processed by SIInsertWaitcnts. 
+ +--- +name: dma_then_global_load +body: | + bb.0: + ; GCN-LABEL: name: dma_then_global_load + ; GCN: S_WAITCNT 0 + ; GCN-NEXT: $m0 = S_MOV_B32 0 + ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3) + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3953 + ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $m0 = S_MOV_B32 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + S_WAITCNT_LDS_DIRECT + $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 + +... + +# Expected no vmcnt since there is no direct load to LDS, and the global load is not processed by SIInsertWaitcnts. + +--- +name: no_dma_just_fence +body: | + bb.0: + ; GCN-LABEL: name: no_dma_just_fence + ; GCN: S_WAITCNT 0 + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + S_WAITCNT_LDS_DIRECT + $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 + +... + +# Expected vmcnt(1) since the global load is not processed by SIInsertWaitcnts. 
+ +--- +name: dma_then_system_fence +body: | + bb.0: + ; GCN-LABEL: name: dma_then_system_fence + ; GCN: S_WAITCNT 0 + ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3) + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3953 + ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + S_WAITCNT_LDS_DIRECT + $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 + +... + +# The computed vmcnt(1) gets merged with the existing vmcnt(0). 
+ +--- +name: merge_with_prev_wait +body: | + bb.0: + ; GCN-LABEL: name: merge_with_prev_wait + ; GCN: S_WAITCNT 0 + ; GCN-NEXT: $m0 = S_MOV_B32 0 + ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3) + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3952 + ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $m0 = S_MOV_B32 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + S_WAITCNT 3952 + S_WAITCNT_LDS_DIRECT + $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 + +... + +# The computed vmcnt(1) gets merged with the existing vmcnt(0). 
+ +--- +name: merge_with_next_wait +body: | + bb.0: + ; GCN-LABEL: name: merge_with_next_wait + ; GCN: S_WAITCNT 0 + ; GCN-NEXT: $m0 = S_MOV_B32 0 + ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3) + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3952 + ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $m0 = S_MOV_B32 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec + S_WAITCNT_LDS_DIRECT + S_WAITCNT 3952 + $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll index 98e42a2c4c402..d23509b5aa812 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll @@ -47,9 +47,8 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc, ; GFX90A-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds ; GFX90A-NEXT: v_mov_b32_e32 v0, s13 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_barrier -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ds_read_b32 v0, v0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -93,9 +92,8 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc, ; GFX942-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds ; GFX942-NEXT: v_mov_b32_e32 v0, s13 ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c -; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_barrier -; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: ds_read_b32 v0, v0 ; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) @@ -151,8 +149,8 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc, ; GFX10CU-NEXT: s_mov_b32 m0, s12 ; GFX10CU-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds ; GFX10CU-NEXT: v_mov_b32_e32 v0, s13 -; GFX10CU-NEXT: s_barrier ; GFX10CU-NEXT: s_waitcnt vmcnt(0) +; GFX10CU-NEXT: s_barrier ; GFX10CU-NEXT: ds_read_b32 v0, v0 ; GFX10CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10CU-NEXT: global_store_dword v1, v0, s[14:15] @@ -183,6 +181,7 @@ define amdgpu_kernel void @fence_fence(<4 x i32> inreg %rsrc, ; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds ; GFX900-NEXT: v_mov_b32_e32 v1, 1 +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v0, v1, s[8:9] ; GFX900-NEXT: global_load_dword v1, v0, s[8:9] 
; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -207,7 +206,7 @@ define amdgpu_kernel void @fence_fence(<4 x i32> inreg %rsrc, ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds ; GFX90A-NEXT: v_mov_b32_e32 v1, 1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NEXT: global_load_dword v1, v0, s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -258,7 +257,7 @@ define amdgpu_kernel void @fence_fence(<4 x i32> inreg %rsrc, ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds ; GFX942-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_store_dword v0, v1, s[0:1] sc0 ; GFX942-NEXT: global_load_dword v1, v0, s[0:1] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -330,6 +329,7 @@ define amdgpu_kernel void @fence_fence(<4 x i32> inreg %rsrc, ; GFX10CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10CU-NEXT: s_mov_b32 m0, s6 ; GFX10CU-NEXT: buffer_load_dword v0, s[0:3], 0 offen lds +; GFX10CU-NEXT: s_waitcnt vmcnt(0) ; GFX10CU-NEXT: global_store_dword v1, v2, s[8:9] ; GFX10CU-NEXT: global_load_dword v0, v1, s[8:9] ; GFX10CU-NEXT: s_waitcnt vmcnt(0) @@ -366,6 +366,7 @@ define amdgpu_kernel void @release_acquire(<4 x i32> inreg %rsrc, ; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds ; GFX900-NEXT: v_mov_b32_e32 v1, 1 +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v0, v1, s[8:9] ; GFX900-NEXT: global_load_dword v1, v0, s[8:9] ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -390,7 +391,7 @@ define amdgpu_kernel void @release_acquire(<4 x i32> inreg %rsrc, ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds ; GFX90A-NEXT: v_mov_b32_e32 v1, 1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NEXT: 
global_load_dword v1, v0, s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -441,7 +442,7 @@ define amdgpu_kernel void @release_acquire(<4 x i32> inreg %rsrc, ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds ; GFX942-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_store_dword v0, v1, s[0:1] sc0 ; GFX942-NEXT: global_load_dword v1, v0, s[0:1] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -512,6 +513,7 @@ define amdgpu_kernel void @release_acquire(<4 x i32> inreg %rsrc, ; GFX10CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10CU-NEXT: s_mov_b32 m0, s6 ; GFX10CU-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds +; GFX10CU-NEXT: s_waitcnt vmcnt(0) ; GFX10CU-NEXT: global_store_dword v0, v2, s[8:9] ; GFX10CU-NEXT: global_load_dword v1, v0, s[8:9] ; GFX10CU-NEXT: s_waitcnt vmcnt(0) From 73e20c7d04f51fdcdb67ffd79326679366b26117 Mon Sep 17 00:00:00 2001 From: Sameer Sahasrabuddhe Date: Mon, 28 Jul 2025 16:29:15 +0530 Subject: [PATCH 3/4] review comments: - try renaming to S_WAITCNT_lds_direct - be consistent (even at the cost of brevity?) 
--- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 8 +++---- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 10 ++++---- llvm/lib/Target/AMDGPU/SOPInstructions.td | 2 +- .../memory-legalizer-atomic-fence.ll | 24 +++++++++---------- .../AMDGPU/insert-waitcnts-fence-soft.mir | 12 +++++----- 5 files changed, 29 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 9a4360374621d..d3918abc3fa20 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1381,9 +1381,9 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( Modified = true; } else WaitcntInstr = &II; - } else if (Opcode == AMDGPU::S_WAITCNT_LDS_DIRECT) { + } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) { assert(ST->hasVMemToLDSLoad()); - LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_LDS_DIRECT: " << II + LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II << "Before: " << Wait.LoadCnt << '\n';); ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait); LLVM_DEBUG(dbgs() << "After: " << Wait.LoadCnt << '\n';); @@ -1566,7 +1566,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( ScoreBrackets.simplifyWaitcnt(OldWait); Wait = Wait.combined(OldWait); UpdatableInstr = &CombinedStoreDsCntInstr; - } else if (Opcode == AMDGPU::S_WAITCNT_LDS_DIRECT) { + } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) { // Architectures higher than GFX10 do not have direct loads to // LDS, so no work required here yet. 
II.eraseFromParent(); @@ -2461,7 +2461,7 @@ static bool isWaitInstr(MachineInstr &Inst) { Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) || Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT || Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT || - Opcode == AMDGPU::S_WAITCNT_LDS_DIRECT || + Opcode == AMDGPU::S_WAITCNT_lds_direct || counterTypeForInstr(Opcode).has_value(); } diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 30c180c0e420e..b99eba2280e1f 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -1175,8 +1175,9 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, // SIInsertWaitcnts will later replace this with a vmcnt(). if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) && Scope == SIAtomicScope::WORKGROUP && - any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) { - BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_LDS_DIRECT)); + ((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS) != + SIAtomicAddrSpace::NONE) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct)); Changed = true; } @@ -2093,8 +2094,9 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, // SIInsertWaitcnts will later replace this with a vmcnt(). 
if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) && Scope == SIAtomicScope::WORKGROUP && - any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) { - BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_LDS_DIRECT)); + ((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS) != + SIAtomicAddrSpace::NONE) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct)); Changed = true; } diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 09630e20840cf..8303410115f93 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1624,7 +1624,7 @@ let OtherPredicates = [HasImageInsts] in { // Represents the point at which a wave must wait for all outstanding direct loads to LDS. // Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts. -def S_WAITCNT_LDS_DIRECT : SPseudoInstSI<(outs), (ins)> { +def S_WAITCNT_lds_direct : SPseudoInstSI<(outs), (ins)> { let hasSideEffects = 0; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll index 5fd8553820685..002c03aa7967d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll @@ -545,13 +545,13 @@ define amdgpu_kernel void @workgroup_one_as_release() #0 { ; GFX10WGP-LABEL: name: workgroup_one_as_release ; GFX10WGP: bb.0.entry: ; GFX10WGP-NEXT: S_WAITCNT_soft 16240 - ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT + ; GFX10WGP-NEXT: S_WAITCNT_lds_direct ; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: workgroup_one_as_release ; GFX10CU: bb.0.entry: - ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT + ; GFX10CU-NEXT: S_WAITCNT_lds_direct ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_one_as_release @@ -580,14 +580,14 @@ define amdgpu_kernel void 
@workgroup_one_as_acq_rel() #0 { ; GFX10WGP-LABEL: name: workgroup_one_as_acq_rel ; GFX10WGP: bb.0.entry: ; GFX10WGP-NEXT: S_WAITCNT_soft 16240 - ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT + ; GFX10WGP-NEXT: S_WAITCNT_lds_direct ; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: workgroup_one_as_acq_rel ; GFX10CU: bb.0.entry: - ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT + ; GFX10CU-NEXT: S_WAITCNT_lds_direct ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_one_as_acq_rel @@ -617,14 +617,14 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 { ; GFX10WGP-LABEL: name: workgroup_one_as_seq_cst ; GFX10WGP: bb.0.entry: ; GFX10WGP-NEXT: S_WAITCNT_soft 16240 - ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT + ; GFX10WGP-NEXT: S_WAITCNT_lds_direct ; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: workgroup_one_as_seq_cst ; GFX10CU: bb.0.entry: - ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT + ; GFX10CU-NEXT: S_WAITCNT_lds_direct ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_one_as_seq_cst @@ -1299,14 +1299,14 @@ define amdgpu_kernel void @workgroup_release() #0 { ; GFX10WGP-LABEL: name: workgroup_release ; GFX10WGP: bb.0.entry: ; GFX10WGP-NEXT: S_WAITCNT_soft 112 - ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT + ; GFX10WGP-NEXT: S_WAITCNT_lds_direct ; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: workgroup_release ; GFX10CU: bb.0.entry: ; GFX10CU-NEXT: S_WAITCNT_soft 49279 - ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT + ; GFX10CU-NEXT: S_WAITCNT_lds_direct ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_release @@ -1338,7 +1338,7 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 { ; GFX10WGP-LABEL: name: workgroup_acq_rel ; GFX10WGP: bb.0.entry: ; GFX10WGP-NEXT: S_WAITCNT_soft 112 - ; 
GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT + ; GFX10WGP-NEXT: S_WAITCNT_lds_direct ; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec ; GFX10WGP-NEXT: S_ENDPGM 0 @@ -1346,7 +1346,7 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 { ; GFX10CU-LABEL: name: workgroup_acq_rel ; GFX10CU: bb.0.entry: ; GFX10CU-NEXT: S_WAITCNT_soft 49279 - ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT + ; GFX10CU-NEXT: S_WAITCNT_lds_direct ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_acq_rel @@ -1379,7 +1379,7 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 { ; GFX10WGP-LABEL: name: workgroup_seq_cst ; GFX10WGP: bb.0.entry: ; GFX10WGP-NEXT: S_WAITCNT_soft 112 - ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT + ; GFX10WGP-NEXT: S_WAITCNT_lds_direct ; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec ; GFX10WGP-NEXT: S_ENDPGM 0 @@ -1387,7 +1387,7 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 { ; GFX10CU-LABEL: name: workgroup_seq_cst ; GFX10CU: bb.0.entry: ; GFX10CU-NEXT: S_WAITCNT_soft 49279 - ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT + ; GFX10CU-NEXT: S_WAITCNT_lds_direct ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir index b376360157141..675a1c94bc435 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir +++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir @@ -16,7 +16,7 @@ body: | ; GCN-NEXT: S_ENDPGM 0 $m0 = S_MOV_B32 0 BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) - S_WAITCNT_LDS_DIRECT + S_WAITCNT_lds_direct $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec S_ENDPGM 0 @@ -39,7 +39,7 @@ body: | $m0 = S_MOV_B32 0 
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec - S_WAITCNT_LDS_DIRECT + S_WAITCNT_lds_direct $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec S_ENDPGM 0 @@ -57,7 +57,7 @@ body: | ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec ; GCN-NEXT: S_ENDPGM 0 $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec - S_WAITCNT_LDS_DIRECT + S_WAITCNT_lds_direct $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec S_ENDPGM 0 @@ -78,7 +78,7 @@ body: | ; GCN-NEXT: S_ENDPGM 0 BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec - S_WAITCNT_LDS_DIRECT + S_WAITCNT_lds_direct $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec S_ENDPGM 0 @@ -102,7 +102,7 @@ body: | BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec S_WAITCNT 3952 - S_WAITCNT_LDS_DIRECT + S_WAITCNT_lds_direct $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec S_ENDPGM 0 @@ -125,7 +125,7 @@ body: | $m0 = S_MOV_B32 0 BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec - S_WAITCNT_LDS_DIRECT + S_WAITCNT_lds_direct S_WAITCNT 3952 $vgpr1 = 
V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec S_ENDPGM 0 From 912e1207e38826911091bd72b172b41e40dca0d7 Mon Sep 17 00:00:00 2001 From: Sameer Sahasrabuddhe Date: Mon, 28 Jul 2025 20:59:49 +0530 Subject: [PATCH 4/4] remove unnecessary typecast --- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index b99eba2280e1f..0ee465716dc37 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -1175,8 +1175,7 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, // SIInsertWaitcnts will later replace this with a vmcnt(). if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) && Scope == SIAtomicScope::WORKGROUP && - ((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS) != - SIAtomicAddrSpace::NONE) { + (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct)); Changed = true; } @@ -2094,8 +2093,7 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, // SIInsertWaitcnts will later replace this with a vmcnt(). if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) && Scope == SIAtomicScope::WORKGROUP && - ((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS) != - SIAtomicAddrSpace::NONE) { + (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct)); Changed = true; }