-
Notifications
You must be signed in to change notification settings - Fork 14.8k
[SLP] Prefer copyable vectorization over alternate opcodes #153684
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[SLP] Prefer copyable vectorization over alternate opcodes #153684
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-backend-systemz — Author: Alexey Bataev (alexey-bataev). Changes: Currently, the SLP vectorizer prefers alternate opcode vectorization over copyable vectorization [remainder of description truncated]. Patch is 33.53 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/153684.diff. 15 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7362d5b0b5865..df3089847dbe3 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -11307,27 +11307,27 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
}
ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
- VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
+ VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
InstructionsState S = Legality.getInstructionsState();
if (!Legality.isLegal()) {
- if (Legality.trySplitVectorize()) {
- auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
- // Last chance to try to vectorize alternate node.
- if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
- return;
- }
- if (!S)
+ if (!S) {
Legality = getScalarsVectorizationLegality(
- VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
+ VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
+ S = Legality.getInstructionsState();
+ }
if (!Legality.isLegal()) {
+ if (Legality.trySplitVectorize()) {
+ auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
+ // Last chance to try to vectorize alternate node.
+ if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
+ return;
+ }
if (Legality.tryToFindDuplicates())
tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
UserTreeIdx);
-
newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
return;
}
- S = Legality.getInstructionsState();
}
// FIXME: investigate if there are profitable cases for VL.size() <= 4.
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/remark-zext-incoming-for-neg-icmp.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/remark-zext-incoming-for-neg-icmp.ll
index 09c11bbefd4a3..485807e84966b 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/remark-zext-incoming-for-neg-icmp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/remark-zext-incoming-for-neg-icmp.ll
@@ -16,11 +16,11 @@ define i32 @test(i32 %a, i8 %b, i8 %c) {
; CHECK-LABEL: define i32 @test(
; CHECK-SAME: i32 [[A:%.*]], i8 [[B:%.*]], i8 [[C:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i8> poison, i8 [[C]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i8> [[TMP1]], <i8 -1, i8 -2, i8 -3, i8 -4>
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i16>
; CHECK-NEXT: [[TMP9:%.*]] = sext <4 x i8> [[TMP4]] to <4 x i16>
; CHECK-NEXT: [[TMP5:%.*]] = icmp sle <4 x i16> [[TMP8]], [[TMP9]]
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/ext-not-resized-op-resized.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/ext-not-resized-op-resized.ll
index ca93cbd698ada..377d9b4751f58 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/ext-not-resized-op-resized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/ext-not-resized-op-resized.ll
@@ -4,12 +4,12 @@
define void @test(i64 %0, i1 %.cmp.i.2, i1 %1, ptr %a) {
; CHECK-LABEL: define void @test(
; CHECK-SAME: i64 [[TMP0:%.*]], i1 [[DOTCMP_I_2:%.*]], i1 [[TMP1:%.*]], ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP4]], splat (i64 63)
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i1> poison, i1 [[DOTCMP_I_2]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i1> [[TMP6]], i1 [[TMP1]], i32 1
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i1> [[TMP7]], <4 x i1> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 0>
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i64> [[TMP15]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP16]], splat (i64 63)
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i1> [[TMP7]], <4 x i1> poison, <4 x i32> <i32 1, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP5]] to <4 x i1>
; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP9]], <4 x i1> [[TMP10]], <4 x i1> [[TMP8]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
index 5ebbb76f3d6c3..f0b88e2647511 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
@@ -9,11 +9,11 @@ define void @test() #0 {
; CHECK: loop:
; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[OP_RDX1:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[TMP4:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[DUMMY_ADD:%.*]] = add i16 0, 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP2]], <i64 3, i64 2, i64 1, i64 0>
; CHECK-NEXT: [[TMP4]] = extractelement <4 x i64> [[TMP3]], i32 3
-; CHECK-NEXT: [[DUMMY_ADD:%.*]] = add i16 0, 0
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
; CHECK-NEXT: [[DUMMY_SHL:%.*]] = shl i64 [[TMP5]], 32
; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i64> splat (i64 1), [[TMP3]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
index 194c7021f60f5..fc9a7d8e658c3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
@@ -11,8 +11,8 @@ define void @mainTest(i32 %param, ptr %vals, i32 %len) {
; CHECK-NEXT: [[LOCAL_4_:%.*]] = phi i32 [ [[V44:%.*]], [[BCI_15]] ], [ 31, [[BCI_15_PREHEADER]] ]
; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP6:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER]] ]
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <16 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i32> [[TMP2]], <i32 -1, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: store atomic i32 [[LOCAL_0_]], ptr [[VALS:%.*]] unordered, align 4
+; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i32> [[TMP2]], <i32 -1, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP3]])
; CHECK-NEXT: [[OP_RDX]] = and i32 [[TMP4]], [[LOCAL_4_]]
; CHECK-NEXT: [[V44]] = add i32 [[LOCAL_4_]], 16
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll
index a48076adc8090..0667d58221966 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll
@@ -9,8 +9,8 @@ define void @test(i64 %p0, i64 %p1, i64 %p2, i64 %p3) {
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[P1:%.*]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[P2:%.*]], i32 2
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[P3:%.*]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[TMP3]], [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP3]], [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[TMP3]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[TMP3]], [[TMP3]]
; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = shl <4 x i64> [[TMP4]], [[TMP7]]
@@ -34,8 +34,8 @@ define void @test(i64 %p0, i64 %p1, i64 %p2, i64 %p3) {
; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[P1:%.*]], i32 1
; AVX2-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[P2:%.*]], i32 2
; AVX2-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[P3:%.*]], i32 3
-; AVX2-NEXT: [[TMP4:%.*]] = add <4 x i64> [[TMP3]], [[TMP3]]
; AVX2-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP3]], [[TMP3]]
+; AVX2-NEXT: [[TMP4:%.*]] = add <4 x i64> [[TMP3]], [[TMP3]]
; AVX2-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[TMP3]], [[TMP3]]
; AVX2-NEXT: [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]]
; AVX2-NEXT: [[TMP8:%.*]] = shl <4 x i64> [[TMP4]], [[TMP7]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll b/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll
index 89051c7aba42c..28b836e43efba 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll
@@ -10,14 +10,14 @@ define void @test_add_sdiv(ptr %arr1, ptr %arr2, i32 %a0, i32 %a1, i32 %a2, i32
; CHECK-NEXT: [[GEP2_3:%.*]] = getelementptr i32, ptr [[ARR2]], i32 3
; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[GEP1_2]], align 4
; CHECK-NEXT: [[V3:%.*]] = load i32, ptr [[GEP1_3]], align 4
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[A0:%.*]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[A1:%.*]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], <i32 1146, i32 146>
; CHECK-NEXT: [[Y2:%.*]] = add nsw i32 [[A2:%.*]], 42
; CHECK-NEXT: [[Y3:%.*]] = add nsw i32 [[A3:%.*]], 0
; CHECK-NEXT: [[RES2:%.*]] = sdiv i32 [[V2]], [[Y2]]
; CHECK-NEXT: [[RES3:%.*]] = add nsw i32 [[V3]], [[Y3]]
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARR1]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[A0:%.*]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[A1:%.*]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP3]], <i32 1146, i32 146>
; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]]
; CHECK-NEXT: store <2 x i32> [[TMP5]], ptr [[ARR2]], align 4
; CHECK-NEXT: store i32 [[RES2]], ptr [[GEP2_2]], align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
index 1fedde4cc9fd7..cee333341271b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
@@ -4,17 +4,15 @@
define void @test() {
; CHECK-LABEL: define void @test() {
; CHECK-NEXT: [[XOR108_I_I_I:%.*]] = xor i64 0, 1
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <12 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison, i64 0>, i64 [[XOR108_I_I_I]], i32 10
-; CHECK-NEXT: [[TMP2:%.*]] = lshr <12 x i64> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i64> poison, i64 [[XOR108_I_I_I]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <12 x i64> [[TMP2]], <12 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i64> [[TMP5]], <16 x i64> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i64> [[TMP6]], <16 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 3, i32 7, i32 8, i32 9, i32 3, i32 10, i32 11, i32 12, i32 3>
-; CHECK-NEXT: [[TMP8:%.*]] = trunc <16 x i64> [[TMP7]] to <16 x i1>
-; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i1> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = freeze <16 x i1> [[TMP9]]
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[XOR108_I_I_I]] to i1
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <14 x i1> <i1 false, i1 false, i1 false, i1 poison, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 poison, i1 false, i1 poison>, i1 [[TMP1]], i32 3
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <14 x i1> [[TMP2]], <14 x i1> poison, <14 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 3, i32 12, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = lshr <14 x i1> [[TMP3]], <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 poison>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <14 x i1> [[TMP4]], <14 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 3, i32 7, i32 8, i32 9, i32 3, i32 10, i32 11, i32 12, i32 3>
; CHECK-NEXT: [[TMP11:%.*]] = zext <16 x i1> [[TMP10]] to <16 x i16>
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <16 x i16> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i16> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = freeze <16 x i16> [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i16> [[TMP8]], zeroinitializer
; CHECK-NEXT: ret void
;
%xor108.i.i.i = xor i64 0, 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
index 20a42777cf8e4..8f59a2ad09a0c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
@@ -11,27 +11,11 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv(i
; CHECK: if.then22.i:
; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1
; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]]
-; CHECK-NEXT: [[SHR_I_I:%.*]] = lshr i32 [[CONV31_I]], 1
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[CONV31_I]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = lshr <2 x i32> [[TMP1]], <i32 2, i32 3>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP3]], <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[TMP5]], <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[TMP7:%.*]] = trunc i32 [[SUB_I]] to i8
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i8> poison, i8 [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[SHR_I_I]] to i8
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i8> [[TMP8]], i8 [[TMP9]], i32 1
-; CHECK-NEXT: [[TMP11:%.*]] = trunc <8 x i32> [[TMP6]] to <8 x i8>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i8> [[TMP11]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP10]], <16 x i8> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT: [[TMP13:%.*]] = trunc <4 x i32> [[TMP4]] to <4 x i8>
-; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP19]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[TMP15:%.*]] = trunc <2 x i32> [[TMP2]] to <2 x i8>
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x i8> [[TMP15]], <2 x i8> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP18]], <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[CONV31_I]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <16 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = lshr <16 x i32> [[TMP2]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP16:%.*]] = trunc <16 x i32> [[TMP3]] to <16 x i8>
; CHECK-NEXT: [[TMP17:%.*]] = and <16 x i8> [[TMP16]], splat (i8 1)
; CHECK-NEXT: store <16 x i8> [[TMP17]], ptr undef, align 1
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll b/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll
index 9c0f65ec27165..af533a6ccbb79 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll
@@ -30,9 +30,6 @@ define i32 @test(ptr %f, i1 %tobool.i.4, i32 %retval.0.i.219) {
; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i32> [ [[TMP8]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ]
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> <i32 poison, i32 1, i32 1, i32 poison>, i32 [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[RETVAL_0_I_219]], i32 3
-; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i32> [[TMP15]], [[TMP13]]
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -40,6 +37,9 @@ define i32 @test(ptr %f, i1 %tobool.i.4, i32 %retval.0.i.219) {
; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 poison>
; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP23]], <8 x i32> <i32 poison, i32 poison, i32 1, i32 1, i32 1, i32 poison, i32 poison, i32 1>, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 5, i32 6, i32 15>
; CHECK-NEXT: [[TMP19:%.*]] = add <8 x i32> [[TMP18]], [[TMP22]]
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> <i32 poison, i32 1, i32 1, i32 poison>, i32 [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4...
[truncated]
|
@llvm/pr-subscribers-backend-risc-v — Author: Alexey Bataev (alexey-bataev). Changes: Currently, the SLP vectorizer prefers alternate opcode vectorization over copyable vectorization [remainder of description truncated]. Patch is 33.53 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/153684.diff. 15 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7362d5b0b5865..df3089847dbe3 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -11307,27 +11307,27 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
}
ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
- VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
+ VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
InstructionsState S = Legality.getInstructionsState();
if (!Legality.isLegal()) {
- if (Legality.trySplitVectorize()) {
- auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
- // Last chance to try to vectorize alternate node.
- if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
- return;
- }
- if (!S)
+ if (!S) {
Legality = getScalarsVectorizationLegality(
- VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
+ VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
+ S = Legality.getInstructionsState();
+ }
if (!Legality.isLegal()) {
+ if (Legality.trySplitVectorize()) {
+ auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
+ // Last chance to try to vectorize alternate node.
+ if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
+ return;
+ }
if (Legality.tryToFindDuplicates())
tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
UserTreeIdx);
-
newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
return;
}
- S = Legality.getInstructionsState();
}
// FIXME: investigate if there are profitable cases for VL.size() <= 4.
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/remark-zext-incoming-for-neg-icmp.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/remark-zext-incoming-for-neg-icmp.ll
index 09c11bbefd4a3..485807e84966b 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/remark-zext-incoming-for-neg-icmp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/remark-zext-incoming-for-neg-icmp.ll
@@ -16,11 +16,11 @@ define i32 @test(i32 %a, i8 %b, i8 %c) {
; CHECK-LABEL: define i32 @test(
; CHECK-SAME: i32 [[A:%.*]], i8 [[B:%.*]], i8 [[C:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i8> poison, i8 [[C]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i8> [[TMP1]], <i8 -1, i8 -2, i8 -3, i8 -4>
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i16>
; CHECK-NEXT: [[TMP9:%.*]] = sext <4 x i8> [[TMP4]] to <4 x i16>
; CHECK-NEXT: [[TMP5:%.*]] = icmp sle <4 x i16> [[TMP8]], [[TMP9]]
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/ext-not-resized-op-resized.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/ext-not-resized-op-resized.ll
index ca93cbd698ada..377d9b4751f58 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/ext-not-resized-op-resized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/ext-not-resized-op-resized.ll
@@ -4,12 +4,12 @@
define void @test(i64 %0, i1 %.cmp.i.2, i1 %1, ptr %a) {
; CHECK-LABEL: define void @test(
; CHECK-SAME: i64 [[TMP0:%.*]], i1 [[DOTCMP_I_2:%.*]], i1 [[TMP1:%.*]], ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP4]], splat (i64 63)
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i1> poison, i1 [[DOTCMP_I_2]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i1> [[TMP6]], i1 [[TMP1]], i32 1
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i1> [[TMP7]], <4 x i1> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 0>
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i64> [[TMP15]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP16]], splat (i64 63)
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i1> [[TMP7]], <4 x i1> poison, <4 x i32> <i32 1, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP5]] to <4 x i1>
; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP9]], <4 x i1> [[TMP10]], <4 x i1> [[TMP8]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
index 5ebbb76f3d6c3..f0b88e2647511 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
@@ -9,11 +9,11 @@ define void @test() #0 {
; CHECK: loop:
; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[OP_RDX1:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[TMP4:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[DUMMY_ADD:%.*]] = add i16 0, 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP2]], <i64 3, i64 2, i64 1, i64 0>
; CHECK-NEXT: [[TMP4]] = extractelement <4 x i64> [[TMP3]], i32 3
-; CHECK-NEXT: [[DUMMY_ADD:%.*]] = add i16 0, 0
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
; CHECK-NEXT: [[DUMMY_SHL:%.*]] = shl i64 [[TMP5]], 32
; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i64> splat (i64 1), [[TMP3]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
index 194c7021f60f5..fc9a7d8e658c3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
@@ -11,8 +11,8 @@ define void @mainTest(i32 %param, ptr %vals, i32 %len) {
; CHECK-NEXT: [[LOCAL_4_:%.*]] = phi i32 [ [[V44:%.*]], [[BCI_15]] ], [ 31, [[BCI_15_PREHEADER]] ]
; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP6:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER]] ]
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <16 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i32> [[TMP2]], <i32 -1, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: store atomic i32 [[LOCAL_0_]], ptr [[VALS:%.*]] unordered, align 4
+; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i32> [[TMP2]], <i32 -1, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP3]])
; CHECK-NEXT: [[OP_RDX]] = and i32 [[TMP4]], [[LOCAL_4_]]
; CHECK-NEXT: [[V44]] = add i32 [[LOCAL_4_]], 16
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll
index a48076adc8090..0667d58221966 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll
@@ -9,8 +9,8 @@ define void @test(i64 %p0, i64 %p1, i64 %p2, i64 %p3) {
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[P1:%.*]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[P2:%.*]], i32 2
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[P3:%.*]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[TMP3]], [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP3]], [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[TMP3]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[TMP3]], [[TMP3]]
; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = shl <4 x i64> [[TMP4]], [[TMP7]]
@@ -34,8 +34,8 @@ define void @test(i64 %p0, i64 %p1, i64 %p2, i64 %p3) {
; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[P1:%.*]], i32 1
; AVX2-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[P2:%.*]], i32 2
; AVX2-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[P3:%.*]], i32 3
-; AVX2-NEXT: [[TMP4:%.*]] = add <4 x i64> [[TMP3]], [[TMP3]]
; AVX2-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP3]], [[TMP3]]
+; AVX2-NEXT: [[TMP4:%.*]] = add <4 x i64> [[TMP3]], [[TMP3]]
; AVX2-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[TMP3]], [[TMP3]]
; AVX2-NEXT: [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]]
; AVX2-NEXT: [[TMP8:%.*]] = shl <4 x i64> [[TMP4]], [[TMP7]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll b/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll
index 89051c7aba42c..28b836e43efba 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll
@@ -10,14 +10,14 @@ define void @test_add_sdiv(ptr %arr1, ptr %arr2, i32 %a0, i32 %a1, i32 %a2, i32
; CHECK-NEXT: [[GEP2_3:%.*]] = getelementptr i32, ptr [[ARR2]], i32 3
; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[GEP1_2]], align 4
; CHECK-NEXT: [[V3:%.*]] = load i32, ptr [[GEP1_3]], align 4
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[A0:%.*]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[A1:%.*]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], <i32 1146, i32 146>
; CHECK-NEXT: [[Y2:%.*]] = add nsw i32 [[A2:%.*]], 42
; CHECK-NEXT: [[Y3:%.*]] = add nsw i32 [[A3:%.*]], 0
; CHECK-NEXT: [[RES2:%.*]] = sdiv i32 [[V2]], [[Y2]]
; CHECK-NEXT: [[RES3:%.*]] = add nsw i32 [[V3]], [[Y3]]
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARR1]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[A0:%.*]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[A1:%.*]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP3]], <i32 1146, i32 146>
; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]]
; CHECK-NEXT: store <2 x i32> [[TMP5]], ptr [[ARR2]], align 4
; CHECK-NEXT: store i32 [[RES2]], ptr [[GEP2_2]], align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
index 1fedde4cc9fd7..cee333341271b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
@@ -4,17 +4,15 @@
define void @test() {
; CHECK-LABEL: define void @test() {
; CHECK-NEXT: [[XOR108_I_I_I:%.*]] = xor i64 0, 1
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <12 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison, i64 0>, i64 [[XOR108_I_I_I]], i32 10
-; CHECK-NEXT: [[TMP2:%.*]] = lshr <12 x i64> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i64> poison, i64 [[XOR108_I_I_I]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <12 x i64> [[TMP2]], <12 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i64> [[TMP5]], <16 x i64> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i64> [[TMP6]], <16 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 3, i32 7, i32 8, i32 9, i32 3, i32 10, i32 11, i32 12, i32 3>
-; CHECK-NEXT: [[TMP8:%.*]] = trunc <16 x i64> [[TMP7]] to <16 x i1>
-; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i1> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = freeze <16 x i1> [[TMP9]]
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[XOR108_I_I_I]] to i1
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <14 x i1> <i1 false, i1 false, i1 false, i1 poison, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 poison, i1 false, i1 poison>, i1 [[TMP1]], i32 3
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <14 x i1> [[TMP2]], <14 x i1> poison, <14 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 3, i32 12, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = lshr <14 x i1> [[TMP3]], <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 poison>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <14 x i1> [[TMP4]], <14 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 3, i32 7, i32 8, i32 9, i32 3, i32 10, i32 11, i32 12, i32 3>
; CHECK-NEXT: [[TMP11:%.*]] = zext <16 x i1> [[TMP10]] to <16 x i16>
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <16 x i16> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i16> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = freeze <16 x i16> [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i16> [[TMP8]], zeroinitializer
; CHECK-NEXT: ret void
;
%xor108.i.i.i = xor i64 0, 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
index 20a42777cf8e4..8f59a2ad09a0c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
@@ -11,27 +11,11 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv(i
; CHECK: if.then22.i:
; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1
; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]]
-; CHECK-NEXT: [[SHR_I_I:%.*]] = lshr i32 [[CONV31_I]], 1
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[CONV31_I]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = lshr <2 x i32> [[TMP1]], <i32 2, i32 3>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP3]], <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[TMP5]], <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[TMP7:%.*]] = trunc i32 [[SUB_I]] to i8
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i8> poison, i8 [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[SHR_I_I]] to i8
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i8> [[TMP8]], i8 [[TMP9]], i32 1
-; CHECK-NEXT: [[TMP11:%.*]] = trunc <8 x i32> [[TMP6]] to <8 x i8>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i8> [[TMP11]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP10]], <16 x i8> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT: [[TMP13:%.*]] = trunc <4 x i32> [[TMP4]] to <4 x i8>
-; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP19]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[TMP15:%.*]] = trunc <2 x i32> [[TMP2]] to <2 x i8>
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x i8> [[TMP15]], <2 x i8> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP18]], <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[CONV31_I]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <16 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = lshr <16 x i32> [[TMP2]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP16:%.*]] = trunc <16 x i32> [[TMP3]] to <16 x i8>
; CHECK-NEXT: [[TMP17:%.*]] = and <16 x i8> [[TMP16]], splat (i8 1)
; CHECK-NEXT: store <16 x i8> [[TMP17]], ptr undef, align 1
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll b/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll
index 9c0f65ec27165..af533a6ccbb79 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll
@@ -30,9 +30,6 @@ define i32 @test(ptr %f, i1 %tobool.i.4, i32 %retval.0.i.219) {
; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i32> [ [[TMP8]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ]
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> <i32 poison, i32 1, i32 1, i32 poison>, i32 [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[RETVAL_0_I_219]], i32 3
-; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i32> [[TMP15]], [[TMP13]]
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -40,6 +37,9 @@ define i32 @test(ptr %f, i1 %tobool.i.4, i32 %retval.0.i.219) {
; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 poison>
; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP23]], <8 x i32> <i32 poison, i32 poison, i32 1, i32 1, i32 1, i32 poison, i32 poison, i32 1>, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 5, i32 6, i32 15>
; CHECK-NEXT: [[TMP19:%.*]] = add <8 x i32> [[TMP18]], [[TMP22]]
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> <i32 poison, i32 1, i32 1, i32 poison>, i32 [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4...
[truncated]
|
Do we have benchmark numbers for this? |
No |
Will try to gather something tomorrow |
Please can you rephrase the patch summary - you confusingly switch between using alternative and alternate |
Fixed |
Currently, the SLP vectorizer prefers alternate opcode vectorization over
copyable vectorization. It is better to try copyable vectorization
first and only after that fall back to (last chance!) alternate opcode
vectorization.
Metric: size..text
Program size..text
exp ref diff
test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-builtin-bitops-1.test 12293.00 13268.00 7.9%
test-suite :: MultiSource/Benchmarks/Trimaran/enc-3des/enc-3des.test 14623.00 14911.00 2.0%
test-suite :: External/SPEC/CINT2017rate/500.perlbench_r/500.perlbench_r.test 2185493.00 2186821.00 0.1%
test-suite :: External/SPEC/CINT2017speed/600.perlbench_s/600.perlbench_s.test 2185493.00 2186821.00 0.1%
test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/miniGMG.test 43019.00 43035.00 0.0%
test-suite :: MultiSource/Benchmarks/mediabench/mpeg2/mpeg2dec/mpeg2decode.test 62288.00 62304.00 0.0%
test-suite :: MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/cjpeg.test 99246.00 99262.00 0.0%
test-suite :: External/SPEC/CFP2006/447.dealII/447.dealII.test 584011.00 584091.00 0.0%
test-suite :: External/SPEC/CFP2006/444.namd/444.namd.test 248113.00 248145.00 0.0%
test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test 1365397.00 1365461.00 0.0%
test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test 1365397.00 1365461.00 0.0%
test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 1079708.00 1079740.00 0.0%
test-suite :: External/SPEC/CINT2006/471.omnetpp/471.omnetpp.test 549073.00 549089.00 0.0%
test-suite :: MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4.test 996140.00 996156.00 0.0%
test-suite :: External/SPEC/CINT2006/483.xalancbmk/483.xalancbmk.test 2315473.00 2315505.00 0.0%
test-suite :: External/SPEC/CINT2017speed/623.xalancbmk_s/623.xalancbmk_s.test 2799338.00 2799354.00 0.0%
test-suite :: External/SPEC/CINT2017rate/523.xalancbmk_r/523.xalancbmk_r.test 2799338.00 2799354.00 0.0%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12653639.00 12653703.00 0.0%
test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 2040578.00 2040530.00 -0.0%
test-suite :: MultiSource/Applications/JM/lencod/lencod.test 839248.00 839216.00 -0.0%
test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test 141296.00 141280.00 -0.0%
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 671147.00 671067.00 -0.0%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 671147.00 671067.00 -0.0%
test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 774232.00 774104.00 -0.0%
test-suite :: MultiSource/Applications/sqlite3/sqlite3.test 506153.00 506057.00 -0.0%
test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test 393027.00 392867.00 -0.0%
test-suite :: MultiSource/Benchmarks/ASCI_Purple/SMG2000/smg2000.test 250045.00 249693.00 -0.1%
test-suite :: SingleSource/Benchmarks/Polybench/linear-algebra/kernels/2mm/2mm.test 10635.00 10619.00 -0.2%
test-suite :: MultiSource/Applications/ALAC/encode/alacconvert-encode.test 58164.00 58068.00 -0.2%
test-suite :: MultiSource/Applications/ALAC/decode/alacconvert-decode.test 58164.00 58068.00 -0.2%
test-suite :: MultiSource/Benchmarks/Prolangs-C/simulator/simulator.test 31906.00 31730.00 -0.6%
test-suite :: MultiSource/Benchmarks/VersaBench/ecbdes/ecbdes.test 2957.00 2939.00 -0.6%
SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-builtin-bitops-1 - Better vector code
MultiSource/Benchmarks/Trimaran - Better vector code, extra vector code
External/SPEC/CINT2017rate/500.perlbench_r
External/SPEC/CINT2017speed/600.perlbench_s - Better vector code
MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG - small variations
MultiSource/Benchmarks/mediabench/mpeg2/mpeg2dec - extra vector code
MultiSource/Benchmarks/mediabench/jpeg - small variations
External/SPEC/CFP2006/447.dealII - Extra vector code
External/SPEC/CFP2006/444.namd - Small variations
External/SPEC/CFP2017speed/638.imagick_s
External/SPEC/CFP2017rate/538.imagick_r - Small variations
MultiSource/Benchmarks/7zip - extra vector code, small variations
External/SPEC/CINT2006/471.omnetpp - extra vector code
MultiSource/Benchmarks/tramp3d-v4 - Better vector code
External/SPEC/CINT2006/483.xalancbmk - small variations
External/SPEC/CINT2017speed/623.xalancbmk_s
External/SPEC/CINT2017rate/523.xalancbmk_r - same
External/SPEC/CFP2017rate/526.blender_r - extra vector code
External/SPEC/CFP2017rate/510.parest_r - small variations
MultiSource/Applications/JM/lencod - extra vector code
External/SPEC/CFP2006/433.milc - extra vector code
External/SPEC/CINT2017rate/525.x264_r
External/SPEC/CINT2017speed/625.x264_s - extra vector code, small variations
External/SPEC/CINT2006/464.h264ref - Extra vector code, small variations
MultiSource/Applications/sqlite3 - Extra vector code
MultiSource/Applications/JM/ldecod - small variations
MultiSource/Benchmarks/ASCI_Purple/SMG2000 - small variations
SingleSource/Benchmarks/Polybench/linear-algebra/kernels/2mm - small variations
MultiSource/Applications/ALAC/encode - extra vector code
MultiSource/Applications/ALAC/decode - extra vector code
MultiSource/Benchmarks/Prolangs-C/simulator - small regressions, caused by early optimizations, should be improved with future patches
MultiSource/Benchmarks/VersaBench/ecbdes - small variations