Skip to content

Commit 695660c

Browse files
authored
[AMDGPU] Provide control to force VGPR MFMA form (#148079)
This gives the user an override to force selection of the VGPR form of MFMA. Eventually we will drop this in favor of the compiler making better decisions, but for now it provides a mechanism for users to address the cases where MayNeedAGPRs favors the AGPR form and performance is degraded due to poor register allocation (RA).
1 parent abdd453 commit 695660c

File tree

3 files changed

+3994
-2
lines changed

3 files changed

+3994
-2
lines changed

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,16 @@ enum { MAX_LANES = 64 };
2929

3030
using namespace llvm;
3131

32+
// TODO -- delete this flag once we have more robust mechanisms to allocate the
33+
// optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases
34+
// where it is better to produce the VGPR form (e.g. if there are VGPR users
35+
// of the MFMA result).
36+
cl::opt<bool> MFMAVGPRForm(
37+
"amdgpu-mfma-vgpr-form", cl::Hidden,
38+
cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
39+
"unspecified, default to compiler heuristics"),
40+
cl::init(false));
41+
3242
const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
3343
const SITargetLowering *TLI = STI->getTargetLowering();
3444
return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine());
@@ -69,8 +79,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
6979
PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
7080
}
7181

72-
MayNeedAGPRs = ST.hasMAIInsts();
73-
if (ST.hasGFX90AInsts() &&
82+
MayNeedAGPRs = ST.hasMAIInsts() && !MFMAVGPRForm;
83+
if (!MFMAVGPRForm && ST.hasGFX90AInsts() &&
7484
ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
7585
!mayUseAGPRs(F))
7686
MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 --amdgpu-mfma-vgpr-form=0 < %s | FileCheck -enable-var-scope --check-prefixes=HEURRC %s
3+
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 --amdgpu-mfma-vgpr-form=1 < %s | FileCheck -enable-var-scope --check-prefixes=VGPRRC %s
4+
5+
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
6+
7+
define <4 x float> @default(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) {
8+
; HEURRC-LABEL: default:
9+
; HEURRC: ; %bb.0:
10+
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11+
; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
12+
; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
13+
; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
14+
; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
15+
; HEURRC-NEXT: s_nop 1
16+
; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
17+
; HEURRC-NEXT: s_nop 7
18+
; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
19+
; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
20+
; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
21+
; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3
22+
; HEURRC-NEXT: s_setpc_b64 s[30:31]
23+
;
24+
; VGPRRC-LABEL: default:
25+
; VGPRRC: ; %bb.0:
26+
; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27+
; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
28+
; VGPRRC-NEXT: s_setpc_b64 s[30:31]
29+
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
30+
ret <4 x float> %result
31+
}
32+
33+
define <4 x float> @request_agpr(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) #0 {
34+
; HEURRC-LABEL: request_agpr:
35+
; HEURRC: ; %bb.0:
36+
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37+
; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
38+
; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
39+
; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
40+
; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
41+
; HEURRC-NEXT: s_nop 1
42+
; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
43+
; HEURRC-NEXT: s_nop 7
44+
; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
45+
; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
46+
; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
47+
; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3
48+
; HEURRC-NEXT: s_setpc_b64 s[30:31]
49+
;
50+
; VGPRRC-LABEL: request_agpr:
51+
; VGPRRC: ; %bb.0:
52+
; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
53+
; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
54+
; VGPRRC-NEXT: s_setpc_b64 s[30:31]
55+
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
56+
ret <4 x float> %result
57+
}
58+
59+
define <4 x float> @request_no_agpr(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) #1 {
60+
; HEURRC-LABEL: request_no_agpr:
61+
; HEURRC: ; %bb.0:
62+
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
63+
; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
64+
; HEURRC-NEXT: s_setpc_b64 s[30:31]
65+
;
66+
; VGPRRC-LABEL: request_no_agpr:
67+
; VGPRRC: ; %bb.0:
68+
; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
69+
; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
70+
; VGPRRC-NEXT: s_setpc_b64 s[30:31]
71+
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
72+
ret <4 x float> %result
73+
}
74+
75+
attributes #0 = { "amdgpu-agpr-alloc"="32,256" }
76+
attributes #1 = { "amdgpu-agpr-alloc"="0,0" }

0 commit comments

Comments
 (0)