-
Notifications
You must be signed in to change notification settings - Fork 14.7k
[NVPTX] don't erase CopyToRegs when folding movs into loads #149393
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[NVPTX] don't erase CopyToRegs when folding movs into loads #149393
Conversation
@llvm/pr-subscribers-backend-nvptx Author: Princeton Ferro (Prince781) Changes: We may still need to keep CopyToReg even after folding uses into vector loads, since the original register may be used in other blocks. Partially reverts 1fdbe69. Patch is 91.59 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/149393.diff 6 Files Affected:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index d017c658c53a3..967a640f2842f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5008,11 +5008,9 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
return !U.getUser()->use_empty();
}
- // Handle CopyToReg nodes that will become dead after our replacement
- if (U.getUser()->getOpcode() == ISD::CopyToReg) {
- DeadCopyToRegs.push_back(U.getUser());
+ // Peek through CopyToReg nodes
+ if (U.getUser()->getOpcode() == ISD::CopyToReg)
return true;
- }
// Otherwise, this use prevents us from splitting a value.
return false;
@@ -5080,10 +5078,6 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
for (unsigned I : seq(NewLoad->getNumValues() - NewNumOutputs))
Results.push_back(NewLoad.getValue(NewNumOutputs + I));
- // Remove dead CopyToReg nodes by folding them into the chain they reference
- for (SDNode *CTR : DeadCopyToRegs)
- DCI.CombineTo(CTR, CTR->getOperand(0));
-
return DCI.DAG.getMergeValues(Results, DL);
}
@@ -6420,4 +6414,4 @@ void NVPTXTargetLowering::computeKnownBitsForTargetNode(
default:
break;
}
-}
\ No newline at end of file
+}
diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
index d0e2c1817f696..66166756aecf9 100644
--- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -50,6 +50,7 @@ define half @test_extract_0(<2 x half> %a) #0 {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_extract_0_param_0];
+; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
; CHECK-NEXT: ret;
%e = extractelement <2 x half> %a, i32 0
@@ -64,6 +65,7 @@ define half @test_extract_1(<2 x half> %a) #0 {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_extract_1_param_0];
+; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NEXT: st.param.b16 [func_retval0], %rs2;
; CHECK-NEXT: ret;
%e = extractelement <2 x half> %a, i32 1
@@ -79,8 +81,9 @@ define half @test_extract_i(<2 x half> %a, i64 %idx) #0 {
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_extract_i_param_1];
; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_extract_i_param_0];
+; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
+; CHECK-NEXT: ld.param.b64 %rd1, [test_extract_i_param_1];
; CHECK-NEXT: setp.eq.b64 %p1, %rd1, 0;
; CHECK-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1;
; CHECK-NEXT: st.param.b16 [func_retval0], %rs3;
@@ -108,7 +111,9 @@ define <2 x half> @test_fadd(<2 x half> %a, <2 x half> %b) #0 {
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fadd_param_0];
+; CHECK-NOF16-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fadd_param_1];
+; CHECK-NOF16-NEXT: mov.b32 %r2, {%rs3, %rs4};
; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2;
; CHECK-NOF16-NEXT: add.rn.f32 %r5, %r4, %r3;
@@ -144,6 +149,7 @@ define <2 x half> @test_fadd_imm_0(<2 x half> %a) #0 {
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fadd_imm_0_param_0];
+; CHECK-NOF16-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2;
; CHECK-NOF16-NEXT: add.rn.f32 %r3, %r2, 0f40000000;
; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3;
@@ -176,6 +182,7 @@ define <2 x half> @test_fadd_imm_1(<2 x half> %a) #0 {
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fadd_imm_1_param_0];
+; CHECK-NOF16-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2;
; CHECK-NOF16-NEXT: add.rn.f32 %r3, %r2, 0f40000000;
; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3;
@@ -208,7 +215,9 @@ define <2 x half> @test_fsub(<2 x half> %a, <2 x half> %b) #0 {
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fsub_param_0];
+; CHECK-NOF16-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fsub_param_1];
+; CHECK-NOF16-NEXT: mov.b32 %r2, {%rs3, %rs4};
; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2;
; CHECK-NOF16-NEXT: sub.rn.f32 %r5, %r4, %r3;
@@ -243,6 +252,7 @@ define <2 x half> @test_fneg(<2 x half> %a) #0 {
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fneg_param_0];
+; CHECK-NOF16-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2;
; CHECK-NOF16-NEXT: mov.b32 %r3, 0f00000000;
; CHECK-NOF16-NEXT: sub.rn.f32 %r4, %r3, %r2;
@@ -276,7 +286,9 @@ define <2 x half> @test_fmul(<2 x half> %a, <2 x half> %b) #0 {
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fmul_param_0];
+; CHECK-NOF16-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fmul_param_1];
+; CHECK-NOF16-NEXT: mov.b32 %r2, {%rs3, %rs4};
; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2;
; CHECK-NOF16-NEXT: mul.rn.f32 %r5, %r4, %r3;
@@ -300,7 +312,9 @@ define <2 x half> @test_fdiv(<2 x half> %a, <2 x half> %b) #0 {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fdiv_param_0];
+; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fdiv_param_1];
+; CHECK-NEXT: mov.b32 %r2, {%rs3, %rs4};
; CHECK-NEXT: cvt.f32.f16 %r3, %rs4;
; CHECK-NEXT: cvt.f32.f16 %r4, %rs2;
; CHECK-NEXT: div.rn.f32 %r5, %r4, %r3;
@@ -332,7 +346,9 @@ define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_frem_param_0];
+; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_frem_param_1];
+; CHECK-NEXT: mov.b32 %r2, {%rs3, %rs4};
; CHECK-NEXT: cvt.f32.f16 %r3, %rs4;
; CHECK-NEXT: cvt.f32.f16 %r4, %rs2;
; CHECK-NEXT: div.rn.f32 %r5, %r4, %r3;
@@ -533,11 +549,13 @@ define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
; CHECK-F16-NEXT: .reg .b32 %r<5>;
; CHECK-F16-EMPTY:
; CHECK-F16-NEXT: // %bb.0:
+; CHECK-F16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_param_0];
+; CHECK-F16-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-F16-NEXT: ld.param.b32 %r4, [test_select_cc_param_3];
; CHECK-F16-NEXT: ld.param.b32 %r3, [test_select_cc_param_2];
-; CHECK-F16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_param_0];
-; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r3, %r4;
; CHECK-F16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_param_1];
+; CHECK-F16-NEXT: mov.b32 %r2, {%rs3, %rs4};
+; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r3, %r4;
; CHECK-F16-NEXT: selp.b16 %rs5, %rs2, %rs4, %p2;
; CHECK-F16-NEXT: selp.b16 %rs6, %rs1, %rs3, %p1;
; CHECK-F16-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5};
@@ -551,15 +569,19 @@ define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_param_0];
+; CHECK-NOF16-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_param_3];
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3;
+; CHECK-NOF16-NEXT: mov.b32 %r4, {%rs3, %rs4};
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs5, %rs6}, [test_select_cc_param_2];
+; CHECK-NOF16-NEXT: mov.b32 %r3, {%rs5, %rs6};
+; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs7, %rs8}, [test_select_cc_param_1];
+; CHECK-NOF16-NEXT: mov.b32 %r2, {%rs7, %rs8};
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs5;
; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r6, %r5;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs4;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs6;
; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %r8, %r7;
-; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs7, %rs8}, [test_select_cc_param_1];
; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs2, %rs8, %p2;
; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs1, %rs7, %p1;
; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs10, %rs9};
@@ -577,11 +599,13 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
; CHECK-F16-NEXT: .reg .b64 %rd<3>;
; CHECK-F16-EMPTY:
; CHECK-F16-NEXT: // %bb.0:
+; CHECK-F16-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_0];
+; CHECK-F16-NEXT: mov.b64 %rd1, {%r3, %r4};
; CHECK-F16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3];
; CHECK-F16-NEXT: ld.param.b32 %r1, [test_select_cc_f32_f16_param_2];
-; CHECK-F16-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_0];
-; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r1, %r2;
; CHECK-F16-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_f32_f16_param_1];
+; CHECK-F16-NEXT: mov.b64 %rd2, {%r5, %r6};
+; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r1, %r2;
; CHECK-F16-NEXT: selp.f32 %r7, %r4, %r6, %p2;
; CHECK-F16-NEXT: selp.f32 %r8, %r3, %r5, %p1;
; CHECK-F16-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7};
@@ -596,17 +620,21 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_0];
+; CHECK-NOF16-NEXT: mov.b64 %rd1, {%r3, %r4};
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_f32_f16_param_3];
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT: mov.b32 %r2, {%rs1, %rs2};
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_f32_f16_param_2];
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3;
-; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r6, %r5;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs2;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs4;
-; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %r8, %r7;
-; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r9, %r10}, [test_select_cc_f32_f16_param_1];
-; CHECK-NOF16-NEXT: selp.f32 %r11, %r4, %r10, %p2;
-; CHECK-NOF16-NEXT: selp.f32 %r12, %r3, %r9, %p1;
+; CHECK-NOF16-NEXT: mov.b32 %r1, {%rs3, %rs4};
+; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_f32_f16_param_1];
+; CHECK-NOF16-NEXT: mov.b64 %rd2, {%r5, %r6};
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs1;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3;
+; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r8, %r7;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs2;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs4;
+; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %r10, %r9;
+; CHECK-NOF16-NEXT: selp.f32 %r11, %r4, %r6, %p2;
+; CHECK-NOF16-NEXT: selp.f32 %r12, %r3, %r5, %p1;
; CHECK-NOF16-NEXT: st.param.v2.b32 [func_retval0], {%r12, %r11};
; CHECK-NOF16-NEXT: ret;
<2 x half> %c, <2 x half> %d) #0 {
@@ -625,11 +653,15 @@ define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b,
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_f16_f32_param_0];
-; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f16_f32_param_2];
-; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_f16_f32_param_3];
-; CHECK-NEXT: setp.neu.f32 %p1, %r3, %r5;
-; CHECK-NEXT: setp.neu.f32 %p2, %r4, %r6;
+; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
+; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f16_f32_param_3];
+; CHECK-NEXT: mov.b64 %rd2, {%r3, %r4};
+; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_f16_f32_param_2];
+; CHECK-NEXT: mov.b64 %rd1, {%r5, %r6};
; CHECK-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_f16_f32_param_1];
+; CHECK-NEXT: mov.b32 %r2, {%rs3, %rs4};
+; CHECK-NEXT: setp.neu.f32 %p1, %r5, %r3;
+; CHECK-NEXT: setp.neu.f32 %p2, %r6, %r4;
; CHECK-NEXT: selp.b16 %rs5, %rs2, %rs4, %p2;
; CHECK-NEXT: selp.b16 %rs6, %rs1, %rs3, %p1;
; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5};
@@ -665,7 +697,9 @@ define <2 x i1> @test_fcmp_une(<2 x half> %a, <2 x half> %b) #0 {
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_une_param_0];
+; CHECK-NOF16-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_une_param_1];
+; CHECK-NOF16-NEXT: mov.b32 %r2, {%rs3, %rs4};
; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2;
; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r4, %r3;
@@ -706,7 +740,9 @@ define <2 x i1> @test_fcmp_ueq(<2 x half> %a, <2 x half> %b) #0 {
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ueq_param_0];
+; CHECK-NOF16-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ueq_param_1];
+; CHECK-NOF16-NEXT: mov.b32 %r2, {%rs3, %rs4};
; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2;
; CHECK-NOF16-NEXT: setp.equ.f32 %p1, %r4, %r3;
@@ -747,7 +783,9 @@ define <2 x i1> @test_fcmp_ugt(<2 x half> %a, <2 x half> %b) #0 {
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ugt_param_0];
+; CHECK-NOF16-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ugt_param_1];
+; CHECK-NOF16-NEXT: mov.b32 %r2, {%rs3, %rs4};
; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2;
; CHECK-NOF16-NEXT: setp.gtu.f32 %p1, %r4, %r3;
@@ -788,7 +826,9 @@ define <2 x i1> @test_fcmp_uge(<2 x half> %a, <2 x half> %b) #0 {
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_uge_param_0];
+; CHECK-NOF16-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_uge_param_1];
+; CHECK-NOF16-NEXT: mov.b32 %r2, {%rs3, %rs4};
; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2;
; CHECK-NOF16-NEXT: setp.geu.f32 %p1, %r4, %r3;
@@ -829,7 +869,9 @@ define <2 x i1> @test_fcmp_ult(<2 x half> %a, <2 x half> %b) #0 {
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ult_param_0];
+; CHECK-NOF16-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ult_param_1];
+; CHECK-NOF16-NEXT: mov.b32 %r2, {%rs3, %rs4};
; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2;
; CHECK-NOF16-NEXT: setp.ltu.f32 %p1, %r4, %r3;
@@ -870,7 +912,9 @@ define <2 x i1> @test_fcmp_ule(<2 x half> %a, <2 x half> %b) #0 {
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ule_param_0];
+; CHECK-NOF16-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ule_param_1];
+; CHECK-NOF16-NEXT: mov.b32 %r2, {%rs3, %rs4};
; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2;
; CHECK-NOF16-NEXT: setp.leu.f32 %p1, %r4, %r3;
@@ -912,7 +956,9 @@ define <2 x i1> @test_fcmp_uno(<2 x half> %a, <2 x half> %b) #0 {
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_uno_param_0];
+; CHECK-NOF16-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_uno_param_1];
+; CHECK-NOF16-NEXT: mov.b32 %r2, {%rs3, %rs4};
; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2;
; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r4, %r3;
@@ -953,7 +999,9 @@ define <2 x i1> @test_fcmp_one(<2 x half> %a, <2 x half> %b) #0 {
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_one_param_0];
+; CHECK-NOF16-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_one_param_1];
+; CHECK-NOF16-NEXT: mov.b32 %r2, {%rs3, %rs4};
; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2;
; CHECK-NOF16-NEXT: setp.ne.f32 %p1, %r4, %r3;
@@ -994,7 +1042,9 @@ define <2 x i1> @test_fcmp_oeq(<2 x half> %a, <2 x half> %b) #0 {
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_oeq_param_0];
+; CHECK-NOF16-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_oeq_param_1];
+; CHECK-NOF16-NEXT: mov.b32 %r2, {%rs3, %rs4};
; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2;
; CHECK-NOF16-NEXT: setp.eq.f32 %p1, %r4, %r3;
@@ -1035,7 +1085,9 @@ define <2 x i1> @test_fcmp_ogt(<2 x half> %a, <2 x half> %b) #0 {
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ogt_param_0];
+; CHECK-NOF16-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ogt_param_1];
+; CHECK-NOF16-NEXT: mov.b32 %r2, {%rs3, %rs4};
; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2;
; CHECK-NOF16-NEXT: setp.gt.f32 %p1, %r4, %r3;
@@ -1076,7 +1128,9 @@ define <2 x i1> @test_fcmp_oge(<2 x half> %a, <2 x half> %b) #0 {
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_oge_param_0];
+; CHECK-NOF16-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_oge_param_1];
+; CHECK-NOF16-NEXT: mov.b32 %r2, {%rs3, %rs4};
; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2;
; CHECK-NOF16-NEXT: setp.ge.f32 %p1, %r4, %r3;
@@ -1117,7 +1171,9 @@ define <2 x i1> @test_fcmp_olt(<2 x half> %a, <2 x half> %b) #0 {
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_olt_param_0];
+; CHECK-NOF16-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_olt_param_1];
+; CHECK-NOF16-NEXT: mov.b32 %r2, {%rs3, %rs4};
; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2;
; CHECK-NOF16-NEXT: setp.lt.f32 %p1, %r4, %r3;
@@ -1158,7 +1214,9 @@ define <2 x i1> @test_fcmp_ole(<2 x half> %a, <2 x half> %b) #0 {
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ole_param_0];
+; CHECK-NOF16-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ole_param_1];
+; CHECK-NOF16-NEXT: mov.b32 %r2, {%rs3, %rs4};
; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2;
; CHECK-NOF16-NEXT: setp.le.f32 %p1, %r4, %r3;
@@ -1199,7 +1257,9 @@ define <2 x i1> @test_fcmp_ord(<2 x half> %a, <2 x half> %b) #0 {
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ord_param_0];
+; CHECK-NOF16-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ord_param_1];
+; CHECK-NOF16-NEXT: mov.b32 %r2, {%rs3, %rs4};
; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2;
; CHECK-NOF16-NEXT: setp.num.f32 %p1, %r4, %r3;
@@ -1223,6 +1283,7 @@ define <2 x i...
[truncated]
|
9d31b01
to
a534d8d
Compare
You may want to add the regression test from my #149379 |
a534d8d
to
8502f11
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you!
fyi @bangtianliu as well
8502f11
to
20cc9c3
Compare
@Prince781 can this be merged? It should fix a lot of the stability issues we have had since #126337 |
I'm wondering why CI checks are failing with this revert. |
Ah, it's a FileCheck failure. |
We may still need to keep CopyToReg even after folding uses into vector loads, since the original register may be used in other blocks. Partially reverts 1fdbe69
20cc9c3
to
9e45f4e
Compare
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/160/builds/21440 Here is the relevant piece of the build log for reference:
|
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/180/builds/21583 Here is the relevant piece of the build log for reference:
|
Looks like we need to add |
Integrate llvm/llvm-project@92c55a3. Local changes adapt to llvm/llvm-project@03bd0f3, plus a fix to a preexisting issue in `vmvx_materialize_encoding.mlir` which for some reason was tolerated up until now. No reverts or cherry-picks: dropped the former local revert since llvm/llvm-project#149393 was landed. --------- Signed-off-by: Benoit Jacob <[email protected]>
Hi @Prince781 , any progress with fixing |
Integrate llvm/llvm-project@aa1b416. IR/test changes to tests with amdgpu.fat_raw_buffer_cast from updates in upstream related to inferring canonical layout for resetOffset llvm/llvm-project@9052a85 No reverts or cherry-picks: dropped the former local revert since llvm/llvm-project#149393 was landed. Signed-off-by: Stanley Winata <[email protected]>
Fixed broken 'pr126337.ll' NVPTX related test (by #149393)
Fixed broken 'pr126337.ll' NVPTX related test (by llvm#149393)
We may still need to keep CopyToReg even after folding uses into vector loads, since the original register may be used in other blocks.
Partially reverts 1fdbe69