Skip to content

Commit 9b4ff42

Browse files
committed
[DAG] visitFREEZE - limit freezing of multiple operands (#149797)
This is a partial revert of #145939 (I've kept the BUILD_VECTOR(FREEZE(UNDEF), FREEZE(UNDEF), elt2, ...) canonicalization) as we're getting reports of infinite loops (#148084). The issue appears to be due to deep chains of nodes and how visitFREEZE replaces all instances of an operand with a common frozen version - other users of the original frozen node then get added back to the worklist but might no longer be able to confirm a node isn't poison due to recursion depth limits on isGuaranteedNotToBeUndefOrPoison. The issue still exists with the old implementation but by only allowing a single frozen operand it helps prevent cases of interdependent frozen nodes. I'm still working on supporting multiple operands as it's critical for topological DAG handling but need to get a fix in for trunk and 21.x. Fixes #148084
1 parent 4f07800 commit 9b4ff42

16 files changed

+1816
-1698
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16717,14 +16717,27 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
1671716717
// Fold freeze(op(x, ...)) -> op(freeze(x), ...).
1671816718
// Try to push freeze through instructions that propagate but don't produce
1671916719
// poison as far as possible. If an operand of freeze follows three
16720-
// conditions 1) one-use, and 2) does not produce poison then push
16720+
// conditions 1) one-use, 2) does not produce poison, and 3) has all but one
16721+
// guaranteed-non-poison operands (or is a BUILD_VECTOR or similar) then push
1672116722
// the freeze through to the operands that are not guaranteed non-poison.
1672216723
// NOTE: we will strip poison-generating flags, so ignore them here.
1672316724
if (DAG.canCreateUndefOrPoison(N0, /*PoisonOnly*/ false,
1672416725
/*ConsiderFlags*/ false) ||
1672516726
N0->getNumValues() != 1 || !N0->hasOneUse())
1672616727
return SDValue();
1672716728

16729+
// TODO: we should always allow multiple operands, however this increases the
16730+
// likelihood of infinite loops due to the ReplaceAllUsesOfValueWith call
16731+
// below causing later nodes that share frozen operands to fold again and no
16732+
// longer being able to confirm other operands are not poison due to recursion
16733+
// depth limits on isGuaranteedNotToBeUndefOrPoison.
16734+
bool AllowMultipleMaybePoisonOperands =
16735+
N0.getOpcode() == ISD::SELECT_CC || N0.getOpcode() == ISD::SETCC ||
16736+
N0.getOpcode() == ISD::BUILD_VECTOR ||
16737+
N0.getOpcode() == ISD::BUILD_PAIR ||
16738+
N0.getOpcode() == ISD::VECTOR_SHUFFLE ||
16739+
N0.getOpcode() == ISD::CONCAT_VECTORS || N0.getOpcode() == ISD::FMUL;
16740+
1672816741
// Avoid turning a BUILD_VECTOR that can be recognized as "all zeros", "all
1672916742
// ones" or "constant" into something that depends on FrozenUndef. We can
1673016743
// instead pick undef values to keep those properties, while at the same time
@@ -16751,8 +16764,16 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
1675116764
if (DAG.isGuaranteedNotToBeUndefOrPoison(Op, /*PoisonOnly*/ false,
1675216765
/*Depth*/ 1))
1675316766
continue;
16754-
if (MaybePoisonOperands.insert(Op).second)
16767+
bool HadMaybePoisonOperands = !MaybePoisonOperands.empty();
16768+
bool IsNewMaybePoisonOperand = MaybePoisonOperands.insert(Op).second;
16769+
if (IsNewMaybePoisonOperand)
1675516770
MaybePoisonOperandNumbers.push_back(OpNo);
16771+
if (!HadMaybePoisonOperands)
16772+
continue;
16773+
if (IsNewMaybePoisonOperand && !AllowMultipleMaybePoisonOperands) {
16774+
// Multiple maybe-poison ops when not allowed - bail out.
16775+
return SDValue();
16776+
}
1675616777
}
1675716778
// NOTE: the whole op may be not guaranteed to not be undef or poison because
1675816779
// it could create undef or poison due to its poison-generating flags.

llvm/test/CodeGen/AMDGPU/div_i128.ll

Lines changed: 24 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -475,28 +475,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
475475
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
476476
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
477477
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
478+
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
479+
; GFX9-O0-NEXT: s_nop 0
480+
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
478481
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
479482
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
480483
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
481484
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
482-
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
483-
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
484-
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
485-
; GFX9-O0-NEXT: s_nop 0
486-
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
487-
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
488-
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
489-
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
485+
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
490486
; GFX9-O0-NEXT: s_nop 0
491-
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
492-
; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
493-
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
487+
; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
488+
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
494489
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
495-
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
496-
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
490+
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
497491
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
498-
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
499-
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
492+
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
500493
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
501494
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
502495
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
@@ -507,7 +500,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
507500
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
508501
; GFX9-O0-NEXT: s_mov_b32 s14, s13
509502
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
510-
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
511503
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
512504
; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
513505
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -1043,10 +1035,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
10431035
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
10441036
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
10451037
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
1046-
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
1047-
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
1048-
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
1049-
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
1038+
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
1039+
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
1040+
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
1041+
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
10501042
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
10511043
; GFX9-O0-NEXT: s_mov_b32 s5, s6
10521044
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
@@ -2664,28 +2656,21 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
26642656
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
26652657
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
26662658
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
2659+
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
2660+
; GFX9-O0-NEXT: s_nop 0
2661+
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
26672662
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
26682663
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
26692664
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
26702665
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
2671-
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
2672-
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
2673-
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
2674-
; GFX9-O0-NEXT: s_nop 0
2675-
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
2676-
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
2677-
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
2678-
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
2666+
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
26792667
; GFX9-O0-NEXT: s_nop 0
2680-
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
2681-
; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
2682-
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
2668+
; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
2669+
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
26832670
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
2684-
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
2685-
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
2671+
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
26862672
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
2687-
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
2688-
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
2673+
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
26892674
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
26902675
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
26912676
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
@@ -2696,7 +2681,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
26962681
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
26972682
; GFX9-O0-NEXT: s_mov_b32 s14, s13
26982683
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
2699-
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
27002684
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
27012685
; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
27022686
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -3232,10 +3216,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
32323216
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
32333217
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
32343218
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
3235-
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
3236-
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
3237-
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
3238-
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
3219+
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
3220+
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
3221+
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
3222+
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
32393223
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
32403224
; GFX9-O0-NEXT: s_mov_b32 s5, s6
32413225
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)

llvm/test/CodeGen/AMDGPU/rem_i128.ll

Lines changed: 24 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -513,28 +513,21 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
513513
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
514514
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
515515
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
516+
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
517+
; GFX9-O0-NEXT: s_nop 0
518+
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
516519
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
517520
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
518521
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
519522
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
520-
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
521-
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
522-
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
523-
; GFX9-O0-NEXT: s_nop 0
524-
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
525-
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
526-
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
527-
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
523+
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
528524
; GFX9-O0-NEXT: s_nop 0
529-
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
530-
; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
531-
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
525+
; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
526+
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
532527
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
533-
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
534-
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
528+
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
535529
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
536-
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
537-
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
530+
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
538531
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
539532
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
540533
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
@@ -545,7 +538,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
545538
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
546539
; GFX9-O0-NEXT: s_mov_b32 s14, s13
547540
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
548-
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
549541
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
550542
; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
551543
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -1081,10 +1073,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
10811073
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
10821074
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
10831075
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
1084-
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
1085-
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
1086-
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
1087-
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
1076+
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
1077+
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
1078+
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
1079+
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
10881080
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
10891081
; GFX9-O0-NEXT: s_mov_b32 s5, s6
10901082
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
@@ -1897,28 +1889,21 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
18971889
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
18981890
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
18991891
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
1892+
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
1893+
; GFX9-O0-NEXT: s_nop 0
1894+
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
19001895
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
19011896
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
19021897
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
19031898
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
1904-
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
1905-
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
1906-
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
1907-
; GFX9-O0-NEXT: s_nop 0
1908-
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
1909-
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
1910-
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
1911-
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
1899+
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
19121900
; GFX9-O0-NEXT: s_nop 0
1913-
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
1914-
; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
1915-
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
1901+
; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
1902+
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
19161903
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
1917-
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
1918-
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
1904+
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
19191905
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
1920-
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
1921-
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
1906+
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
19221907
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
19231908
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
19241909
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
@@ -1929,7 +1914,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
19291914
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
19301915
; GFX9-O0-NEXT: s_mov_b32 s14, s13
19311916
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
1932-
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
19331917
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
19341918
; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
19351919
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -2465,10 +2449,10 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
24652449
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
24662450
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
24672451
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
2468-
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
2469-
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
2470-
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
2471-
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
2452+
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
2453+
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
2454+
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
2455+
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
24722456
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
24732457
; GFX9-O0-NEXT: s_mov_b32 s5, s6
24742458
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)

llvm/test/CodeGen/NVPTX/i1-select.ll

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -94,27 +94,27 @@ define i32 @test_select_i1_basic(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %fals
9494
define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %false) {
9595
; CHECK-LABEL: test_select_i1_basic_folding(
9696
; CHECK: {
97-
; CHECK-NEXT: .reg .pred %p<12>;
98-
; CHECK-NEXT: .reg .b32 %r<9>;
97+
; CHECK-NEXT: .reg .pred %p<13>;
98+
; CHECK-NEXT: .reg .b32 %r<7>;
9999
; CHECK-EMPTY:
100100
; CHECK-NEXT: // %bb.0:
101101
; CHECK-NEXT: ld.param.b32 %r1, [test_select_i1_basic_folding_param_0];
102102
; CHECK-NEXT: setp.eq.b32 %p1, %r1, 0;
103-
; CHECK-NEXT: ld.param.b32 %r3, [test_select_i1_basic_folding_param_1];
104-
; CHECK-NEXT: setp.ne.b32 %p2, %r3, 0;
105-
; CHECK-NEXT: setp.eq.b32 %p3, %r3, 0;
106-
; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_folding_param_2];
107-
; CHECK-NEXT: setp.eq.b32 %p4, %r5, 0;
108-
; CHECK-NEXT: ld.param.b32 %r6, [test_select_i1_basic_folding_param_3];
103+
; CHECK-NEXT: ld.param.b32 %r2, [test_select_i1_basic_folding_param_1];
104+
; CHECK-NEXT: setp.ne.b32 %p2, %r2, 0;
105+
; CHECK-NEXT: setp.eq.b32 %p3, %r2, 0;
106+
; CHECK-NEXT: ld.param.b32 %r3, [test_select_i1_basic_folding_param_2];
107+
; CHECK-NEXT: setp.eq.b32 %p4, %r3, 0;
108+
; CHECK-NEXT: ld.param.b32 %r4, [test_select_i1_basic_folding_param_3];
109109
; CHECK-NEXT: xor.pred %p6, %p1, %p3;
110-
; CHECK-NEXT: ld.param.b32 %r7, [test_select_i1_basic_folding_param_4];
110+
; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_folding_param_4];
111111
; CHECK-NEXT: and.pred %p7, %p6, %p4;
112-
; CHECK-NEXT: and.pred %p8, %p2, %p4;
113-
; CHECK-NEXT: and.pred %p9, %p3, %p7;
114-
; CHECK-NEXT: or.pred %p10, %p9, %p8;
115-
; CHECK-NEXT: xor.pred %p11, %p10, %p3;
116-
; CHECK-NEXT: selp.b32 %r8, %r6, %r7, %p11;
117-
; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
112+
; CHECK-NEXT: and.pred %p9, %p2, %p4;
113+
; CHECK-NEXT: and.pred %p10, %p3, %p7;
114+
; CHECK-NEXT: or.pred %p11, %p10, %p9;
115+
; CHECK-NEXT: xor.pred %p12, %p11, %p3;
116+
; CHECK-NEXT: selp.b32 %r6, %r4, %r5, %p12;
117+
; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
118118
; CHECK-NEXT: ret;
119119
%b1 = icmp eq i32 %v1, 0
120120
%b2 = icmp eq i32 %v2, 0

0 commit comments

Comments
 (0)