Skip to content

Commit 1e9f0a1

Browse files
committed
[AMDGPU] Add option to preinflate to AVGPR
Change-Id: Ia488b12f06bdc3e462f1cd90baf64a3375f15c4c
1 parent ff4faaa commit 1e9f0a1

File tree

3 files changed

+206
-0
lines changed

3 files changed

+206
-0
lines changed

llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,11 @@ using namespace llvm;
4343

4444
#define DEBUG_TYPE "amdgpu-pre-ra-optimizations"
4545

46+
// Hidden command-line switch, off by default (cl::init(false)). When it is set
// and the subtarget reports hasGFX90AInsts(), run() calls
// MRI->recomputeRegClass() on each register that has a live interval and whose
// current class is an AGPR or VGPR class, then skips the remaining
// per-register handling for that register.
static cl::opt<bool>
47+
InflateToAVGPR("amdgpu-avgpr-inflation", cl::Hidden, cl::init(false),
48+
cl::desc("Enable register inflation to avgpr register class "
49+
"(which can be assigned to either AGPR or VGPR)."));
50+
4651
namespace {
4752

4853
class GCNPreRAOptimizationsImpl {
@@ -253,6 +258,13 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
253258
if (!LIS->hasInterval(Reg))
254259
continue;
255260
const TargetRegisterClass *RC = MRI->getRegClass(Reg);
261+
262+
if (InflateToAVGPR && ST.hasGFX90AInsts() &&
263+
(TRI->isAGPRClass(RC) || TRI->isVGPRClass(RC))) {
264+
MRI->recomputeRegClass(Reg);
265+
continue;
266+
}
267+
256268
if ((RC->MC->getSizeInBits() != 64 || !TRI->isSGPRClass(RC)) &&
257269
(ST.hasGFX90AInsts() || !TRI->isAGPRClass(RC)))
258270
continue;
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; The IR is fed on stdin so that llc prints the assembly on stdout for
; FileCheck; with a file operand and no -o, llc writes a .s file instead.
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -verify-machineinstrs < %s 2>&1 | FileCheck %s
3+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 --amdgpu-avgpr-inflation=1 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=INFLATE %s
4+
5+
6+
; Register-pressure test: eight <8 x half> LDS loads feed a chain of eight
; 32x32x16 f16 MFMAs after a sched_barrier, and attribute group #0 sets
; "amdgpu-num-vgpr"="24".
; Expected output without the option (CHECK prefix): the third ds_read_b128
; result is copied into a16-a19 with v_accvgpr_write_b32 and later copied back
; with v_accvgpr_read_b32 (the "Reload Reuse" lines).
; Expected output with --amdgpu-avgpr-inflation=1 (INFLATE prefix): two of the
; ds_read_b128 results are written straight to AGPRs (a[0:3] and a[16:19]) and
; no v_accvgpr copies appear.
define amdgpu_kernel void @attn_fwd(ptr addrspace(3) %in0, ptr addrspace(3) %in1, ptr addrspace(3) %in2, ptr addrspace(3) %in3, ptr addrspace(3) %in4, ptr addrspace(3) %in5, ptr addrspace(3) %in6, ptr addrspace(3) %in7, ptr addrspace(0) %out) #0 {
7+
; CHECK-LABEL: attn_fwd:
8+
; CHECK: ; %bb.0:
9+
; CHECK-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
10+
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
11+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
12+
; CHECK-NEXT: v_mov_b32_e32 v0, s8
13+
; CHECK-NEXT: v_mov_b32_e32 v4, s9
14+
; CHECK-NEXT: v_mov_b32_e32 v5, s10
15+
; CHECK-NEXT: ds_read_b128 v[0:3], v0
16+
; CHECK-NEXT: ds_read_b128 v[8:11], v4
17+
; CHECK-NEXT: ds_read_b128 v[4:7], v5
18+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
19+
; CHECK-NEXT: v_accvgpr_write_b32 a16, v7 ; Reload Reuse
20+
; CHECK-NEXT: v_accvgpr_write_b32 a17, v6 ; Reload Reuse
21+
; CHECK-NEXT: v_accvgpr_write_b32 a18, v5 ; Reload Reuse
22+
; CHECK-NEXT: v_accvgpr_write_b32 a19, v4 ; Reload Reuse
23+
; CHECK-NEXT: v_mov_b32_e32 v4, s11
24+
; CHECK-NEXT: ds_read_b128 v[12:15], v4
25+
; CHECK-NEXT: v_mov_b32_e32 v4, s12
26+
; CHECK-NEXT: ds_read_b128 v[16:19], v4
27+
; CHECK-NEXT: v_mov_b32_e32 v4, s13
28+
; CHECK-NEXT: v_mov_b32_e32 v5, s14
29+
; CHECK-NEXT: v_mov_b32_e32 v6, s15
30+
; CHECK-NEXT: ds_read_b128 v[20:23], v4
31+
; CHECK-NEXT: ds_read_b128 v[24:27], v5
32+
; CHECK-NEXT: ds_read_b128 v[4:7], v6
33+
; CHECK-NEXT: ; sched_barrier mask(0x00000000)
34+
; CHECK-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[0:3], 0
35+
; CHECK-NEXT: v_accvgpr_read_b32 v3, a16 ; Reload Reuse
36+
; CHECK-NEXT: v_accvgpr_read_b32 v2, a17 ; Reload Reuse
37+
; CHECK-NEXT: v_accvgpr_read_b32 v1, a18 ; Reload Reuse
38+
; CHECK-NEXT: v_accvgpr_read_b32 v0, a19 ; Reload Reuse
39+
; CHECK-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[8:11], v[8:11], a[0:15]
40+
; CHECK-NEXT: s_nop 0
41+
; CHECK-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[0:3], a[0:15]
42+
; CHECK-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
43+
; CHECK-NEXT: s_waitcnt lgkmcnt(4)
44+
; CHECK-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[12:15], v[12:15], a[0:15]
45+
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
46+
; CHECK-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[16:19], v[16:19], a[0:15]
47+
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
48+
; CHECK-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[20:23], v[20:23], a[0:15]
49+
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
50+
; CHECK-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[24:27], v[24:27], a[0:15]
51+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
52+
; CHECK-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[4:7], v[4:7], a[0:15]
53+
; CHECK-NEXT: s_nop 7
54+
; CHECK-NEXT: s_nop 3
55+
; CHECK-NEXT: flat_store_dwordx4 v[0:1], a[12:15] offset:48
56+
; CHECK-NEXT: flat_store_dwordx4 v[0:1], a[8:11] offset:32
57+
; CHECK-NEXT: flat_store_dwordx4 v[0:1], a[4:7] offset:16
58+
; CHECK-NEXT: flat_store_dwordx4 v[0:1], a[0:3]
59+
; CHECK-NEXT: s_endpgm
60+
;
61+
; INFLATE-LABEL: attn_fwd:
62+
; INFLATE: ; %bb.0:
63+
; INFLATE-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
64+
; INFLATE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
65+
; INFLATE-NEXT: s_waitcnt lgkmcnt(0)
66+
; INFLATE-NEXT: v_mov_b32_e32 v0, s8
67+
; INFLATE-NEXT: v_mov_b32_e32 v4, s9
68+
; INFLATE-NEXT: v_mov_b32_e32 v8, s10
69+
; INFLATE-NEXT: v_mov_b32_e32 v12, s11
70+
; INFLATE-NEXT: v_mov_b32_e32 v16, s12
71+
; INFLATE-NEXT: v_mov_b32_e32 v20, s13
72+
; INFLATE-NEXT: v_mov_b32_e32 v24, s14
73+
; INFLATE-NEXT: ds_read_b128 a[0:3], v0
74+
; INFLATE-NEXT: ds_read_b128 v[4:7], v4
75+
; INFLATE-NEXT: ds_read_b128 v[8:11], v8
76+
; INFLATE-NEXT: ds_read_b128 v[12:15], v12
77+
; INFLATE-NEXT: ds_read_b128 v[16:19], v16
78+
; INFLATE-NEXT: v_mov_b32_e32 v0, s15
79+
; INFLATE-NEXT: ds_read_b128 v[20:23], v20
80+
; INFLATE-NEXT: ds_read_b128 v[24:27], v24
81+
; INFLATE-NEXT: ds_read_b128 a[16:19], v0
82+
; INFLATE-NEXT: ; sched_barrier mask(0x00000000)
83+
; INFLATE-NEXT: s_waitcnt lgkmcnt(7)
84+
; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], 0
85+
; INFLATE-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
86+
; INFLATE-NEXT: s_waitcnt lgkmcnt(6)
87+
; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[4:7], v[4:7], a[0:15]
88+
; INFLATE-NEXT: s_waitcnt lgkmcnt(5)
89+
; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[8:11], v[8:11], a[0:15]
90+
; INFLATE-NEXT: s_waitcnt lgkmcnt(4)
91+
; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[12:15], v[12:15], a[0:15]
92+
; INFLATE-NEXT: s_waitcnt lgkmcnt(3)
93+
; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[16:19], v[16:19], a[0:15]
94+
; INFLATE-NEXT: s_waitcnt lgkmcnt(2)
95+
; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[20:23], v[20:23], a[0:15]
96+
; INFLATE-NEXT: s_waitcnt lgkmcnt(1)
97+
; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[24:27], v[24:27], a[0:15]
98+
; INFLATE-NEXT: s_waitcnt lgkmcnt(0)
99+
; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], a[16:19], a[16:19], a[0:15]
100+
; INFLATE-NEXT: s_nop 7
101+
; INFLATE-NEXT: s_nop 3
102+
; INFLATE-NEXT: flat_store_dwordx4 v[0:1], a[12:15] offset:48
103+
; INFLATE-NEXT: flat_store_dwordx4 v[0:1], a[8:11] offset:32
104+
; INFLATE-NEXT: flat_store_dwordx4 v[0:1], a[4:7] offset:16
105+
; INFLATE-NEXT: flat_store_dwordx4 v[0:1], a[0:3]
106+
; INFLATE-NEXT: s_endpgm
107+
%load0 = load <8 x half>, ptr addrspace(3) %in0, align 16
108+
%load1 = load <8 x half>, ptr addrspace(3) %in1, align 16
109+
%load2 = load <8 x half>, ptr addrspace(3) %in2, align 16
110+
%load3 = load <8 x half>, ptr addrspace(3) %in3, align 16
111+
%load4 = load <8 x half>, ptr addrspace(3) %in4, align 16
112+
%load5 = load <8 x half>, ptr addrspace(3) %in5, align 16
113+
%load6 = load <8 x half>, ptr addrspace(3) %in6, align 16
114+
%load7 = load <8 x half>, ptr addrspace(3) %in7, align 16
115+
tail call void @llvm.amdgcn.sched.barrier(i32 0)
116+
%mfma0 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load0, <8 x half> %load0, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
117+
%mfma1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load1, <8 x half> %load1, <16 x float> %mfma0, i32 0, i32 0, i32 0)
118+
%mfma2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load2, <8 x half> %load2, <16 x float> %mfma1, i32 0, i32 0, i32 0)
119+
%mfma3 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load3, <8 x half> %load3, <16 x float> %mfma2, i32 0, i32 0, i32 0)
120+
%mfma4 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load4, <8 x half> %load4, <16 x float> %mfma3, i32 0, i32 0, i32 0)
121+
%mfma5 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load5, <8 x half> %load5, <16 x float> %mfma4, i32 0, i32 0, i32 0)
122+
%mfma6 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load6, <8 x half> %load6, <16 x float> %mfma5, i32 0, i32 0, i32 0)
123+
%mfma7 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load7, <8 x half> %load7, <16 x float> %mfma6, i32 0, i32 0, i32 0)
124+
store <16 x float> %mfma7, ptr addrspace(0) %out
125+
ret void
126+
}
127+
128+
; Attribute group referenced by @attn_fwd: sets "amdgpu-num-vgpr" to "24" and
; "amdgpu-agpr-alloc" to "20,256". NOTE(review): presumably chosen so VGPRs are
; scarce while AGPRs are plentiful, which makes the two llc configurations
; produce different code - confirm the exact meaning of both attributes.
attributes #0 = { "amdgpu-num-vgpr"="24" "amdgpu-agpr-alloc"="20,256"}
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2+
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -verify-machineinstrs --run-pass=amdgpu-pre-ra-optimizations %s -o - | FileCheck %s
3+
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -verify-machineinstrs --amdgpu-avgpr-inflation=1 --run-pass=amdgpu-pre-ra-optimizations %s -o - | FileCheck %s -check-prefix=INFLATE
4+
5+
# %1 is defined as areg_128_align2 and is both defined and used (tied) by an
# INLINEASM whose operand is annotated regdef:AReg_128_Align2. The expected
# output keeps %1 as areg_128_align2 under both check prefixes, i.e. with and
# without --amdgpu-avgpr-inflation=1.
---
6+
name: agpr_constraint
7+
tracksRegLiveness: true
8+
body: |
9+
bb.0:
10+
; CHECK-LABEL: name: agpr_constraint
11+
; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
12+
; CHECK-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF]], 0, 0, implicit $exec
13+
; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 6291466 /* regdef:AReg_128_Align2 */, def dead [[DS_READ_B128_gfx9_]], 2147483657 /* reguse tiedto:$0 */, [[DS_READ_B128_gfx9_]](tied-def 3)
14+
; CHECK-NEXT: S_ENDPGM 0, amdgpu_allvgprs
15+
;
16+
; INFLATE-LABEL: name: agpr_constraint
17+
; INFLATE: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
18+
; INFLATE-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF]], 0, 0, implicit $exec
19+
; INFLATE-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 6291466 /* regdef:AReg_128_Align2 */, def dead [[DS_READ_B128_gfx9_]], 2147483657 /* reguse tiedto:$0 */, [[DS_READ_B128_gfx9_]](tied-def 3)
20+
; INFLATE-NEXT: S_ENDPGM 0, amdgpu_allvgprs
21+
%0:vgpr_32 = IMPLICIT_DEF
22+
%1:areg_128_align2 = DS_READ_B128_gfx9 %0, 0, 0, implicit $exec
23+
INLINEASM &"", 1 /* sideeffect attdialect */, 6291466 /* regdef:AReg_128_Align2 */, def %1, 2147483657 /* reguse tiedto:$0 */, %1(tied-def 3)
24+
S_ENDPGM 0, amdgpu_allvgprs
25+
...
26+
27+
# %1 is defined as vreg_128_align2 and is both defined and used (tied) by an
# INLINEASM whose operand is annotated regdef:VReg_128_Align2. The expected
# output keeps %1 as vreg_128_align2 under both check prefixes, i.e. with and
# without --amdgpu-avgpr-inflation=1.
---
28+
name: vgpr_constraint
29+
tracksRegLiveness: true
30+
body: |
31+
bb.0:
32+
; CHECK-LABEL: name: vgpr_constraint
33+
; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
34+
; CHECK-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF]], 0, 0, implicit $exec
35+
; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def dead [[DS_READ_B128_gfx9_]], 2147483657 /* reguse tiedto:$0 */, [[DS_READ_B128_gfx9_]](tied-def 3)
36+
; CHECK-NEXT: S_ENDPGM 0, amdgpu_allvgprs
37+
;
38+
; INFLATE-LABEL: name: vgpr_constraint
39+
; INFLATE: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
40+
; INFLATE-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF]], 0, 0, implicit $exec
41+
; INFLATE-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def dead [[DS_READ_B128_gfx9_]], 2147483657 /* reguse tiedto:$0 */, [[DS_READ_B128_gfx9_]](tied-def 3)
42+
; INFLATE-NEXT: S_ENDPGM 0, amdgpu_allvgprs
43+
%0:vgpr_32 = IMPLICIT_DEF
44+
%1:vreg_128_align2 = DS_READ_B128_gfx9 %0, 0, 0, implicit $exec
45+
INLINEASM &"", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def %1, 2147483657 /* reguse tiedto:$0 */, %1(tied-def 3)
46+
S_ENDPGM 0, amdgpu_allvgprs
47+
...
48+
49+
# %1 is defined as vreg_128_align2 by DS_READ_B128_gfx9 and has no other use.
# Without the option the expected class stays vreg_128_align2; with
# --amdgpu-avgpr-inflation=1 the expected class becomes av_128_align2.
---
50+
name: no_constraint
51+
tracksRegLiveness: true
52+
body: |
53+
bb.0:
54+
; CHECK-LABEL: name: no_constraint
55+
; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
56+
; CHECK-NEXT: dead [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF]], 0, 0, implicit $exec
57+
; CHECK-NEXT: S_ENDPGM 0, amdgpu_allvgprs
58+
;
59+
; INFLATE-LABEL: name: no_constraint
60+
; INFLATE: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
61+
; INFLATE-NEXT: dead [[DS_READ_B128_gfx9_:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF]], 0, 0, implicit $exec
62+
; INFLATE-NEXT: S_ENDPGM 0, amdgpu_allvgprs
63+
%0:vgpr_32 = IMPLICIT_DEF
64+
%1:vreg_128_align2 = DS_READ_B128_gfx9 %0, 0, 0, implicit $exec
65+
S_ENDPGM 0, amdgpu_allvgprs
66+
...

0 commit comments

Comments
 (0)