Skip to content

Commit 03fe1a4

Browse files
authored
[AMDGPU] Fix sgpr to vreg_1 copy analysis (#149181)
1 parent a8f5e9e commit 03fe1a4

File tree

3 files changed

+143
-7
lines changed

3 files changed

+143
-7
lines changed

llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -947,13 +947,18 @@ void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) {
947947

948948
// Copies and REG_SEQUENCE do not contribute to the final assembly
949949
// So, skip them but take care of the SGPR to VGPR copies bookkeeping.
950-
if (Inst->isCopy() || Inst->isRegSequence()) {
951-
if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
952-
if (!Inst->isCopy() ||
953-
!tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
954-
Info.NumSVCopies++;
955-
continue;
956-
}
950+
if (Inst->isRegSequence() &&
951+
TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
952+
Info.NumSVCopies++;
953+
continue;
954+
}
955+
if (Inst->isCopy()) {
956+
const TargetRegisterClass *SrcRC, *DstRC;
957+
std::tie(SrcRC, DstRC) = getCopyRegClasses(*Inst, *TRI, *MRI);
958+
if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI) &&
959+
!tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
960+
Info.NumSVCopies++;
961+
continue;
957962
}
958963
}
959964

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GCN %s
3+
4+
define amdgpu_kernel void @copy_to_vreg_1(i32 %0) {
5+
; GCN-LABEL: copy_to_vreg_1:
6+
; GCN: ; %bb.0: ; %._crit_edge
7+
; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
8+
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
9+
; GCN-NEXT: v_mov_b64_e32 v[2:3], 0
10+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
11+
; GCN-NEXT: s_sub_i32 s5, 1, s4
12+
; GCN-NEXT: s_cmp_lt_u32 s4, 2
13+
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
14+
; GCN-NEXT: s_and_b64 s[2:3], s[0:1], exec
15+
; GCN-NEXT: s_cselect_b32 s3, s5, 1
16+
; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0
17+
; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
18+
; GCN-NEXT: s_addc_u32 s0, 1, 0
19+
; GCN-NEXT: v_readfirstlane_b32 s2, v1
20+
; GCN-NEXT: s_cmp_ge_u32 s3, s4
21+
; GCN-NEXT: s_cselect_b32 s4, s0, s2
22+
; GCN-NEXT: v_mov_b32_e32 v1, 0
23+
; GCN-NEXT: s_cmp_lg_u64 0, 0
24+
; GCN-NEXT: s_mov_b64 s[0:1], 0
25+
; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
26+
; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0
27+
; GCN-NEXT: s_branch .LBB0_3
28+
; GCN-NEXT: .LBB0_1: ; %Flow
29+
; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1
30+
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
31+
; GCN-NEXT: s_xor_b64 s[8:9], exec, -1
32+
; GCN-NEXT: .LBB0_2: ; %Flow3
33+
; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1
34+
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
35+
; GCN-NEXT: s_and_b64 s[4:5], exec, s[8:9]
36+
; GCN-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
37+
; GCN-NEXT: s_mov_b32 s4, 0
38+
; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1]
39+
; GCN-NEXT: s_cbranch_execz .LBB0_8
40+
; GCN-NEXT: .LBB0_3: ; %.lr.ph27
41+
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
42+
; GCN-NEXT: s_cmp_lg_u32 s4, 0
43+
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
44+
; GCN-NEXT: s_or_b64 s[8:9], vcc, s[4:5]
45+
; GCN-NEXT: s_xor_b64 s[6:7], s[8:9], -1
46+
; GCN-NEXT: s_and_saveexec_b64 s[4:5], s[8:9]
47+
; GCN-NEXT: s_cbranch_execz .LBB0_5
48+
; GCN-NEXT: ; %bb.4: ; %pred.store.if
49+
; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1
50+
; GCN-NEXT: s_or_b64 s[6:7], s[6:7], exec
51+
; GCN-NEXT: global_store_byte v[2:3], v1, off
52+
; GCN-NEXT: .LBB0_5: ; %Flow2
53+
; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1
54+
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
55+
; GCN-NEXT: s_mov_b64 s[8:9], -1
56+
; GCN-NEXT: s_and_saveexec_b64 s[4:5], s[6:7]
57+
; GCN-NEXT: s_cbranch_execz .LBB0_2
58+
; GCN-NEXT: ; %bb.6: ; %pred.store.continue
59+
; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1
60+
; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[2:3]
61+
; GCN-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
62+
; GCN-NEXT: s_cbranch_execz .LBB0_1
63+
; GCN-NEXT: ; %bb.7: ; %pred.store.if41
64+
; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1
65+
; GCN-NEXT: global_store_byte v[2:3], v1, off
66+
; GCN-NEXT: s_branch .LBB0_1
67+
; GCN-NEXT: .LBB0_8: ; %DummyReturnBlock
68+
; GCN-NEXT: s_endpgm
69+
._crit_edge:
70+
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
71+
%div = udiv i32 1, %0
72+
br label %.lr.ph27
73+
74+
.lr.ph27: ; preds = %pred.store.if41, %pred.store.continue, %._crit_edge
75+
%iv = phi i32 [ %div, %._crit_edge ], [ 0, %pred.store.if41 ], [ 0, %pred.store.continue ]
76+
%cmp = icmp ugt i32 %iv, 0
77+
%broadcast.splatinsert37 = insertelement <4 x i1> zeroinitializer, i1 %cmp, i64 0
78+
%.zext = zext i32 %id.x to i64
79+
%broadcast.splatinsert39 = insertelement <4 x i64> zeroinitializer, i64 %.zext, i64 0
80+
%cmp.1 = icmp uge <4 x i64> %broadcast.splatinsert39, splat (i64 1)
81+
%or = or <4 x i1> %cmp.1, %broadcast.splatinsert37
82+
%extract = extractelement <4 x i1> %or, i64 0
83+
br i1 %extract, label %pred.store.if, label %pred.store.continue
84+
85+
pred.store.if: ; preds = %.lr.ph27
86+
store i8 0, ptr addrspace(1) null, align 64
87+
br label %pred.store.continue
88+
89+
pred.store.continue: ; preds = %pred.store.if, %.lr.ph27
90+
%extract.1 = extractelement <4 x i1> %or, i64 1
91+
br i1 %extract.1, label %pred.store.if41, label %.lr.ph27
92+
93+
pred.store.if41: ; preds = %pred.store.continue
94+
store i8 0, ptr addrspace(1) null, align 64
95+
br label %.lr.ph27
96+
}
97+
98+
declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #0
99+
100+
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# RUN: llc -mtriple=amdgcn -run-pass si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
2+
3+
---
4+
name: copy_to_vreg_1
5+
tracksRegLiveness: true
6+
body: |
7+
; GCN-LABEL: name: copy_to_vreg_1
8+
; GCN: bb.0:
9+
; GCN-NEXT: successors: %bb.1(0x80000000)
10+
; GCN-NEXT: liveins: $vgpr0, $vgpr1
11+
; GCN-NEXT: {{ $}}
12+
; GCN-NEXT: [[V_CVT_U32_F32_e64:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, killed $vgpr0, 0, 0, implicit $mode, implicit $exec
13+
; GCN-NEXT: [[IMPLICIT_DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
14+
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
15+
; GCN-NEXT: [[V_CMP_GT_U32_e64:%[0-9]+]]:sreg_64_xexec = samesign V_CMP_GT_U32_e64 [[V_CVT_U32_F32_e64]], killed [[COPY1]], implicit $exec
16+
; GCN-NEXT: [[VREG1:%[0-9]+]]:vreg_1 = COPY [[V_CMP_GT_U32_e64]]
17+
; GCN-NEXT: {{ $}}
18+
; GCN-NEXT: bb.1:
19+
; GCN-NEXT: S_ENDPGM 0
20+
bb.0:
21+
liveins: $vgpr0, $vgpr1
22+
%0:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, killed $vgpr0, 0, 0, implicit $mode, implicit $exec
23+
%1:sreg_32 = COPY %0:vgpr_32
24+
%2:sreg_32 = COPY $vgpr1
25+
samesign S_CMP_GT_U32 %1:sreg_32, killed %2:sreg_32, implicit-def $scc
26+
%3:sreg_64 = COPY $scc
27+
%4:vreg_1 = COPY %3:sreg_64
28+
29+
bb.1:
30+
S_ENDPGM 0
31+
...

0 commit comments

Comments
 (0)