Skip to content

Commit f761d73

Browse files
Shoreshen and shiltian authored
[AMDGPU] Add freeze for LowerSELECT (#148796)
Trying to solve #147635. Add a freeze of the condition in the legalizer when splitting an i64 select into two i32 selects. Several tests changed; still need to investigate why. --------- Co-authored-by: Shilei Tian <[email protected]>
1 parent de453e8 commit f761d73

20 files changed

+1194
-1054
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11131,7 +11131,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
1113111131
assert(VT.getSizeInBits() == 64);
1113211132

1113311133
SDLoc DL(Op);
11134-
SDValue Cond = Op.getOperand(0);
11134+
SDValue Cond = DAG.getFreeze(Op.getOperand(0));
1113511135

1113611136
SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
1113711137
SDValue One = DAG.getConstant(1, DL, MVT::i32);

llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7791,7 +7791,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
77917791
;
77927792
; GFX6-LABEL: sdiv_i64_pow2_shl_denom:
77937793
; GFX6: ; %bb.0:
7794-
; GFX6-NEXT: s_load_dword s0, s[4:5], 0xd
7794+
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
77957795
; GFX6-NEXT: s_mov_b32 s7, 0xf000
77967796
; GFX6-NEXT: s_mov_b32 s6, -1
77977797
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -7927,7 +7927,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
79277927
;
79287928
; GFX9-LABEL: sdiv_i64_pow2_shl_denom:
79297929
; GFX9: ; %bb.0:
7930-
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x34
7930+
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
79317931
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
79327932
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
79337933
; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0
@@ -8982,7 +8982,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
89828982
;
89838983
; GFX6-LABEL: srem_i64_pow2_shl_denom:
89848984
; GFX6: ; %bb.0:
8985-
; GFX6-NEXT: s_load_dword s0, s[4:5], 0xd
8985+
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
89868986
; GFX6-NEXT: s_mov_b32 s7, 0xf000
89878987
; GFX6-NEXT: s_mov_b32 s6, -1
89888988
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -9116,7 +9116,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
91169116
;
91179117
; GFX9-LABEL: srem_i64_pow2_shl_denom:
91189118
; GFX9: ; %bb.0:
9119-
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x34
9119+
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
91209120
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
91219121
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
91229122
; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0
@@ -10096,9 +10096,15 @@ define i64 @udiv_i64_9divbits(i8 %size) {
1009610096
}
1009710097

1009810098
define <2 x i64> @srem_zero_zero() {
10099-
; GCN-LABEL: kernel:
10100-
; GCN: ; %bb.0: ; %entry
10101-
; GCN-NEXT: s_endpgm
10099+
; GFX6-LABEL: srem_zero_zero:
10100+
; GFX6: ; %bb.0: ; %entry
10101+
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10102+
; GFX6-NEXT: s_setpc_b64 s[30:31]
10103+
;
10104+
; GFX9-LABEL: srem_zero_zero:
10105+
; GFX9: ; %bb.0: ; %entry
10106+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10107+
; GFX9-NEXT: s_setpc_b64 s[30:31]
1010210108
entry:
1010310109
%B = srem <2 x i64> zeroinitializer, zeroinitializer
1010410110
ret <2 x i64> %B

llvm/test/CodeGen/AMDGPU/div_i128.ll

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -521,16 +521,19 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
521521
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
522522
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
523523
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7]
524+
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
524525
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
525-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9]
526+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13]
527+
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
526528
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
527-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9]
529+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13]
528530
; GFX9-O0-NEXT: ; implicit-def: $sgpr12
529531
; GFX9-O0-NEXT: ; implicit-def: $sgpr12
530532
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
531533
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
534+
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
532535
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
533-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9]
536+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13]
534537
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10
535538
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9]
536539
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
@@ -2710,16 +2713,19 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
27102713
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
27112714
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
27122715
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7]
2716+
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
27132717
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
2714-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9]
2718+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13]
2719+
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
27152720
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
2716-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9]
2721+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13]
27172722
; GFX9-O0-NEXT: ; implicit-def: $sgpr12
27182723
; GFX9-O0-NEXT: ; implicit-def: $sgpr12
27192724
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
27202725
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
2726+
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
27212727
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
2722-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9]
2728+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13]
27232729
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10
27242730
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9]
27252731
; GFX9-O0-NEXT: ; implicit-def: $sgpr8

llvm/test/CodeGen/AMDGPU/fmaximum3.ll

Lines changed: 26 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3272,9 +3272,10 @@ define double @v_fmaximum3_f64_fabs0(double %a, double %b, double %c) {
32723272
; GFX9-LABEL: v_fmaximum3_f64_fabs0:
32733273
; GFX9: ; %bb.0:
32743274
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3275-
; GFX9-NEXT: v_max_f64 v[6:7], |v[0:1]|, v[2:3]
3275+
; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
3276+
; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
32763277
; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
3277-
; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, v[2:3]
3278+
; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
32783279
; GFX9-NEXT: s_nop 1
32793280
; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
32803281
; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
@@ -3306,9 +3307,10 @@ define double @v_fmaximum3_f64_fabs1(double %a, double %b, double %c) {
33063307
; GFX9-LABEL: v_fmaximum3_f64_fabs1:
33073308
; GFX9: ; %bb.0:
33083309
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3309-
; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], |v[2:3]|
3310+
; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
3311+
; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
33103312
; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
3311-
; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[2:3]|
3313+
; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
33123314
; GFX9-NEXT: s_nop 1
33133315
; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
33143316
; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
@@ -3343,11 +3345,12 @@ define double @v_fmaximum3_f64_fabs2(double %a, double %b, double %c) {
33433345
; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
33443346
; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
33453347
; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
3346-
; GFX9-NEXT: s_nop 1
3348+
; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5
3349+
; GFX9-NEXT: s_nop 0
33473350
; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
33483351
; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
3349-
; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], |v[4:5]|
3350-
; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]|
3352+
; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
3353+
; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
33513354
; GFX9-NEXT: s_nop 1
33523355
; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
33533356
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
@@ -3374,14 +3377,17 @@ define double @v_fmaximum3_f64_fabs_all(double %a, double %b, double %c) {
33743377
; GFX9-LABEL: v_fmaximum3_f64_fabs_all:
33753378
; GFX9: ; %bb.0:
33763379
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3377-
; GFX9-NEXT: v_max_f64 v[6:7], |v[0:1]|, |v[2:3]|
3380+
; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
3381+
; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
3382+
; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
33783383
; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
3379-
; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[2:3]|
3380-
; GFX9-NEXT: s_nop 1
3384+
; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
3385+
; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5
3386+
; GFX9-NEXT: s_nop 0
33813387
; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
33823388
; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
3383-
; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], |v[4:5]|
3384-
; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]|
3389+
; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
3390+
; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
33853391
; GFX9-NEXT: s_nop 1
33863392
; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
33873393
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
@@ -3446,14 +3452,17 @@ define double @v_fmaximum3_f64_fneg_fabs_all(double %a, double %b, double %c) {
34463452
; GFX9-LABEL: v_fmaximum3_f64_fneg_fabs_all:
34473453
; GFX9: ; %bb.0:
34483454
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3449-
; GFX9-NEXT: v_max_f64 v[6:7], -|v[0:1]|, -|v[2:3]|
3455+
; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
3456+
; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
3457+
; GFX9-NEXT: v_max_f64 v[6:7], -v[0:1], -v[2:3]
34503458
; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
3451-
; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -|v[0:1]|, -|v[2:3]|
3452-
; GFX9-NEXT: s_nop 1
3459+
; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[2:3]
3460+
; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5
3461+
; GFX9-NEXT: s_nop 0
34533462
; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
34543463
; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
3455-
; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], -|v[4:5]|
3456-
; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -|v[4:5]|
3464+
; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], -v[4:5]
3465+
; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5]
34573466
; GFX9-NEXT: s_nop 1
34583467
; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
34593468
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc

llvm/test/CodeGen/AMDGPU/fminimum3.ll

Lines changed: 26 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3272,9 +3272,10 @@ define double @v_fminimum3_f64_fabs0(double %a, double %b, double %c) {
32723272
; GFX9-LABEL: v_fminimum3_f64_fabs0:
32733273
; GFX9: ; %bb.0:
32743274
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3275-
; GFX9-NEXT: v_min_f64 v[6:7], |v[0:1]|, v[2:3]
3275+
; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
3276+
; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
32763277
; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
3277-
; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, v[2:3]
3278+
; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
32783279
; GFX9-NEXT: s_nop 1
32793280
; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
32803281
; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
@@ -3306,9 +3307,10 @@ define double @v_fminimum3_f64_fabs1(double %a, double %b, double %c) {
33063307
; GFX9-LABEL: v_fminimum3_f64_fabs1:
33073308
; GFX9: ; %bb.0:
33083309
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3309-
; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], |v[2:3]|
3310+
; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
3311+
; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
33103312
; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
3311-
; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[2:3]|
3313+
; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
33123314
; GFX9-NEXT: s_nop 1
33133315
; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
33143316
; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
@@ -3343,11 +3345,12 @@ define double @v_fminimum3_f64_fabs2(double %a, double %b, double %c) {
33433345
; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
33443346
; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
33453347
; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
3346-
; GFX9-NEXT: s_nop 1
3348+
; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5
3349+
; GFX9-NEXT: s_nop 0
33473350
; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
33483351
; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
3349-
; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], |v[4:5]|
3350-
; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]|
3352+
; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
3353+
; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
33513354
; GFX9-NEXT: s_nop 1
33523355
; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
33533356
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
@@ -3374,14 +3377,17 @@ define double @v_fminimum3_f64_fabs_all(double %a, double %b, double %c) {
33743377
; GFX9-LABEL: v_fminimum3_f64_fabs_all:
33753378
; GFX9: ; %bb.0:
33763379
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3377-
; GFX9-NEXT: v_min_f64 v[6:7], |v[0:1]|, |v[2:3]|
3380+
; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
3381+
; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
3382+
; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
33783383
; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
3379-
; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[2:3]|
3380-
; GFX9-NEXT: s_nop 1
3384+
; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
3385+
; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5
3386+
; GFX9-NEXT: s_nop 0
33813387
; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
33823388
; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
3383-
; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], |v[4:5]|
3384-
; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]|
3389+
; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
3390+
; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
33853391
; GFX9-NEXT: s_nop 1
33863392
; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
33873393
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
@@ -3446,14 +3452,17 @@ define double @v_fminimum3_f64_fneg_fabs_all(double %a, double %b, double %c) {
34463452
; GFX9-LABEL: v_fminimum3_f64_fneg_fabs_all:
34473453
; GFX9: ; %bb.0:
34483454
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3449-
; GFX9-NEXT: v_min_f64 v[6:7], -|v[0:1]|, -|v[2:3]|
3455+
; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
3456+
; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
3457+
; GFX9-NEXT: v_min_f64 v[6:7], -v[0:1], -v[2:3]
34503458
; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
3451-
; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -|v[0:1]|, -|v[2:3]|
3452-
; GFX9-NEXT: s_nop 1
3459+
; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[2:3]
3460+
; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5
3461+
; GFX9-NEXT: s_nop 0
34533462
; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
34543463
; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
3455-
; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], -|v[4:5]|
3456-
; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -|v[4:5]|
3464+
; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], -v[4:5]
3465+
; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5]
34573466
; GFX9-NEXT: s_nop 1
34583467
; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
34593468
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc

llvm/test/CodeGen/AMDGPU/fnearbyint.ll

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -223,8 +223,9 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) {
223223
; SI-NEXT: v_bfi_b32 v1, s8, v1, v6
224224
; SI-NEXT: v_mov_b32_e32 v7, s2
225225
; SI-NEXT: v_add_f64 v[4:5], s[2:3], v[0:1]
226+
; SI-NEXT: s_bitset0_b32 s3, 31
226227
; SI-NEXT: v_add_f64 v[0:1], v[4:5], -v[0:1]
227-
; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[2:3]
228+
; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[2:3], v[2:3]
228229
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
229230
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
230231
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -284,14 +285,16 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %
284285
; SI-NEXT: v_mov_b32_e32 v9, s5
285286
; SI-NEXT: v_mov_b32_e32 v10, s4
286287
; SI-NEXT: v_add_f64 v[2:3], s[6:7], v[0:1]
288+
; SI-NEXT: s_bitset0_b32 s7, 31
287289
; SI-NEXT: v_add_f64 v[2:3], v[2:3], -v[0:1]
288290
; SI-NEXT: v_bfi_b32 v1, s10, v6, v9
289-
; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[4:5]
291+
; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[4:5]
290292
; SI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
291293
; SI-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
292294
; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[0:1]
295+
; SI-NEXT: s_bitset0_b32 s5, 31
293296
; SI-NEXT: v_add_f64 v[0:1], v[6:7], -v[0:1]
294-
; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[4:5]
297+
; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[4:5]
295298
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
296299
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
297300
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
@@ -365,26 +368,30 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %
365368
; SI-NEXT: v_mov_b32_e32 v14, s5
366369
; SI-NEXT: v_mov_b32_e32 v15, s4
367370
; SI-NEXT: v_add_f64 v[0:1], s[2:3], v[4:5]
371+
; SI-NEXT: s_bitset0_b32 s3, 31
368372
; SI-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5]
369373
; SI-NEXT: v_bfi_b32 v5, s14, v10, v7
370-
; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[8:9]
374+
; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[2:3], v[8:9]
371375
; SI-NEXT: v_cndmask_b32_e32 v3, v1, v2, vcc
372376
; SI-NEXT: v_cndmask_b32_e32 v2, v0, v6, vcc
373377
; SI-NEXT: v_add_f64 v[0:1], s[0:1], v[4:5]
378+
; SI-NEXT: s_bitset0_b32 s1, 31
374379
; SI-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5]
375380
; SI-NEXT: v_bfi_b32 v5, s14, v10, v12
376-
; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[0:1]|, v[8:9]
381+
; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[0:1], v[8:9]
377382
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
378383
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc
379384
; SI-NEXT: v_add_f64 v[6:7], s[6:7], v[4:5]
385+
; SI-NEXT: s_bitset0_b32 s7, 31
380386
; SI-NEXT: v_add_f64 v[6:7], v[6:7], -v[4:5]
381387
; SI-NEXT: v_bfi_b32 v5, s14, v10, v14
382-
; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[8:9]
388+
; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[8:9]
383389
; SI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc
384390
; SI-NEXT: v_cndmask_b32_e32 v6, v6, v13, vcc
385391
; SI-NEXT: v_add_f64 v[10:11], s[4:5], v[4:5]
392+
; SI-NEXT: s_bitset0_b32 s5, 31
386393
; SI-NEXT: v_add_f64 v[4:5], v[10:11], -v[4:5]
387-
; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[8:9]
394+
; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[8:9]
388395
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v14, vcc
389396
; SI-NEXT: v_cndmask_b32_e32 v4, v4, v15, vcc
390397
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16

0 commit comments

Comments
 (0)