-
Notifications
You must be signed in to change notification settings - Fork 14.7k
[VPlan] Introduce CSE pass #151872
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[VPlan] Introduce CSE pass #151872
Conversation
✅ With the latest revision this PR passed the undef deprecator. |
71938b9
to
771fefc
Compare
f8f64e9
to
70925b7
Compare
Current status: there are lots of failing tests to update, and there is a DenseMap-already-present-value-when-growing crash in Transforms/LoopVectorize/X86/induction-costs.ll, which is turning out to be very hard to debug. |
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-powerpc Author: Ramkumar Ramachandra (artagnon) ChangesIntroduce a simple and limited common-subexpression-elimination pass at the VPlan-level, running late after recipes are executed. The long-term vision is to get rid of the legacy non-VPlan-based cse routine in LV, but this patch doesn't yet fully subsume it. Patch is 207.23 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151872.diff 56 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d04317bd8822d..b78017027dbf1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7309,6 +7309,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
VPlanTransforms::narrowInterleaveGroups(
BestVPlan, BestVF,
TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
+ VPlanTransforms::cse(BestVPlan, *Legal->getWidestInductionType());
VPlanTransforms::removeDeadRecipes(BestVPlan);
VPlanTransforms::convertToConcreteRecipes(BestVPlan,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 39f5e3651e9bb..7929d30a1c9f5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -897,6 +897,11 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
return R && classof(R);
}
+ static inline bool classof(const VPSingleDefRecipe *U) {
+ auto *R = dyn_cast<VPRecipeBase>(U);
+ return R && classof(R);
+ }
+
void execute(VPTransformState &State) override = 0;
};
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 3ecffc7593d49..5e7815f26e9c0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1753,6 +1753,76 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
}
}
+/// Hash the underlying data of a VPSingleDefRecipe pointer, instead of hashing
+/// the pointer itself.
+namespace {
+struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
+ static bool isSentinel(const VPSingleDefRecipe *Def) {
+ return Def == getEmptyKey() || Def == getTombstoneKey();
+ }
+
+ static bool canHandle(const VPSingleDefRecipe *Def) {
+ return isa<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
+ VPWidenSelectRecipe, VPHistogramRecipe, VPPartialReductionRecipe,
+ VPReplicateRecipe, VPWidenIntrinsicRecipe>(Def);
+ }
+
+ static unsigned getHashValue(const VPSingleDefRecipe *Def) {
+ return hash_combine(Def->getVPDefID(), vputils::getOpcode(*Def),
+ vputils::isSingleScalar(Def),
+ hash_combine_range(Def->operands()));
+ }
+
+ static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
+ if (isSentinel(L) || isSentinel(R))
+ return L == R;
+ bool Result = L->getVPDefID() == R->getVPDefID() &&
+ vputils::getOpcode(*L) == vputils::getOpcode(*R) &&
+ vputils::isSingleScalar(L) == vputils::isSingleScalar(R) &&
+ equal(L->operands(), R->operands());
+ assert(!Result || getHashValue(L) == getHashValue(R));
+ return Result;
+ }
+};
+} // end anonymous namespace
+
+/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
+/// Plan.
+void VPlanTransforms::cse(VPlan &Plan, Type &CanonicalIVTy) {
+ VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+ if (!LoopRegion)
+ return;
+ auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_shallow(Plan.getEntry()));
+ auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_shallow(LoopRegion->getEntry()));
+
+ // There is existing logic to sink instructions into replicate regions, and
+ // we'd be undoing that work if we went through replicate regions. Hence,
+ // don't CSE in replicate regions.
+ DenseMap<VPSingleDefRecipe *, VPSingleDefRecipe *, VPCSEDenseMapInfo> CSEMap;
+ VPTypeAnalysis TypeInfo(&CanonicalIVTy);
+ for (VPBasicBlock *VPBB :
+ concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
+ for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+ auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
+ if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
+ continue;
+ if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
+ if (TypeInfo.inferScalarType(Def) != TypeInfo.inferScalarType(V))
+ continue;
+ // Drop poison-generating flags when reusing a value.
+ if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
+ RFlags->dropPoisonGeneratingFlags();
+ Def->replaceAllUsesWith(V);
+ Def->eraseFromParent();
+ continue;
+ }
+ CSEMap[Def] = Def;
+ }
+ }
+}
+
/// Move loop-invariant recipes out of the vector loop region in \p Plan.
static void licm(VPlan &Plan) {
VPBasicBlock *Preheader = Plan.getVectorPreheader();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 5943684e17a76..9e99c781022d7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -240,6 +240,10 @@ struct VPlanTransforms {
/// removing dead edges to their successors.
static void removeBranchOnConst(VPlan &Plan);
+ /// Perform common-subexpression-elimination, which is best done after the \p
+ /// Plan is executed.
+ static void cse(VPlan &Plan, Type &CanonicalIVType);
+
/// If there's a single exit block, optimize its phi recipes that use exiting
/// IV values by feeding them precomputed end values instead, possibly taken
/// one step backwards.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 8dcd57f1b3598..f0a6540a91915 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -10,6 +10,7 @@
#define LLVM_TRANSFORMS_VECTORIZE_VPLANUTILS_H
#include "VPlan.h"
+#include "llvm/ADT/TypeSwitch.h"
namespace llvm {
class ScalarEvolution;
@@ -37,6 +38,22 @@ VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
/// SCEV expression could be constructed.
const SCEV *getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE);
+/// Get any instruction opcode data embedded in recipe \p R. Returns an optional
+/// pair, where the first element indicates whether it is an intrinsic ID.
+inline std::optional<std::pair<bool, unsigned>>
+getOpcode(const VPRecipeBase &R) {
+ return TypeSwitch<const VPRecipeBase *,
+ std::optional<std::pair<bool, unsigned>>>(&R)
+ .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
+ VPWidenSelectRecipe, VPHistogramRecipe, VPPartialReductionRecipe,
+ VPReplicateRecipe>(
+ [](auto *I) { return std::make_pair(false, I->getOpcode()); })
+ .Case<VPWidenIntrinsicRecipe>([](auto *I) {
+ return std::make_pair(true, I->getVectorIntrinsicID());
+ })
+ .Default([](auto *) { return std::nullopt; });
+}
+
/// Returns true if \p VPV is a single scalar, either because it produces the
/// same value for all lanes or only has its first lane used.
inline bool isSingleScalar(const VPValue *VPV) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index 0232d88347d0a..cefb191f74c3e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -1032,8 +1032,8 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
; DEFAULT: [[VECTOR_BODY]]:
-; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE27:.*]] ]
-; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE27]] ]
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE25:.*]] ]
+; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE25]] ]
; DEFAULT-NEXT: [[TMP15:%.*]] = load float, ptr [[SRC_1]], align 4
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <8 x float> poison, float [[TMP15]], i64 0
; DEFAULT-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT8]], <8 x float> poison, <8 x i32> zeroinitializer
@@ -1046,10 +1046,7 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0
; DEFAULT-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT10]], <8 x float> poison, <8 x i32> zeroinitializer
; DEFAULT-NEXT: [[TMP20:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[BROADCAST_SPLAT11]], <8 x float> zeroinitializer, <8 x float> [[TMP18]])
-; DEFAULT-NEXT: [[TMP21:%.*]] = load float, ptr [[SRC_3]], align 4
-; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <8 x float> poison, float [[TMP21]], i64 0
-; DEFAULT-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT12]], <8 x float> poison, <8 x i32> zeroinitializer
-; DEFAULT-NEXT: [[TMP22:%.*]] = fcmp ogt <8 x float> [[TMP20]], [[BROADCAST_SPLAT13]]
+; DEFAULT-NEXT: [[TMP22:%.*]] = fcmp ogt <8 x float> [[TMP20]], [[BROADCAST_SPLAT11]]
; DEFAULT-NEXT: [[TMP23:%.*]] = getelementptr { [4 x float] }, ptr [[DST]], <8 x i64> [[VEC_IND]]
; DEFAULT-NEXT: [[TMP24:%.*]] = extractelement <8 x i1> [[TMP22]], i32 0
; DEFAULT-NEXT: br i1 [[TMP24]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
@@ -1067,8 +1064,8 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE]]
; DEFAULT: [[PRED_STORE_CONTINUE]]:
; DEFAULT-NEXT: [[TMP31:%.*]] = extractelement <8 x i1> [[TMP22]], i32 1
-; DEFAULT-NEXT: br i1 [[TMP31]], label %[[PRED_STORE_IF14:.*]], label %[[PRED_STORE_CONTINUE15:.*]]
-; DEFAULT: [[PRED_STORE_IF14]]:
+; DEFAULT-NEXT: br i1 [[TMP31]], label %[[PRED_STORE_IF12:.*]], label %[[PRED_STORE_CONTINUE13:.*]]
+; DEFAULT: [[PRED_STORE_IF12]]:
; DEFAULT-NEXT: [[TMP32:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 1
; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP32]], align 4
; DEFAULT-NEXT: [[TMP33:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 1
@@ -1079,11 +1076,11 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP36]], align 4
; DEFAULT-NEXT: [[TMP37:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 1
; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP37]], align 4
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE15]]
-; DEFAULT: [[PRED_STORE_CONTINUE15]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE13]]
+; DEFAULT: [[PRED_STORE_CONTINUE13]]:
; DEFAULT-NEXT: [[TMP38:%.*]] = extractelement <8 x i1> [[TMP22]], i32 2
-; DEFAULT-NEXT: br i1 [[TMP38]], label %[[PRED_STORE_IF16:.*]], label %[[PRED_STORE_CONTINUE17:.*]]
-; DEFAULT: [[PRED_STORE_IF16]]:
+; DEFAULT-NEXT: br i1 [[TMP38]], label %[[PRED_STORE_IF14:.*]], label %[[PRED_STORE_CONTINUE15:.*]]
+; DEFAULT: [[PRED_STORE_IF14]]:
; DEFAULT-NEXT: [[TMP39:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 2
; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP39]], align 4
; DEFAULT-NEXT: [[TMP40:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 2
@@ -1094,11 +1091,11 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP43]], align 4
; DEFAULT-NEXT: [[TMP44:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 2
; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP44]], align 4
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE17]]
-; DEFAULT: [[PRED_STORE_CONTINUE17]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE15]]
+; DEFAULT: [[PRED_STORE_CONTINUE15]]:
; DEFAULT-NEXT: [[TMP45:%.*]] = extractelement <8 x i1> [[TMP22]], i32 3
-; DEFAULT-NEXT: br i1 [[TMP45]], label %[[PRED_STORE_IF18:.*]], label %[[PRED_STORE_CONTINUE19:.*]]
-; DEFAULT: [[PRED_STORE_IF18]]:
+; DEFAULT-NEXT: br i1 [[TMP45]], label %[[PRED_STORE_IF16:.*]], label %[[PRED_STORE_CONTINUE17:.*]]
+; DEFAULT: [[PRED_STORE_IF16]]:
; DEFAULT-NEXT: [[TMP46:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 3
; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP46]], align 4
; DEFAULT-NEXT: [[TMP47:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 3
@@ -1109,11 +1106,11 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP50]], align 4
; DEFAULT-NEXT: [[TMP51:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 3
; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP51]], align 4
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE19]]
-; DEFAULT: [[PRED_STORE_CONTINUE19]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE17]]
+; DEFAULT: [[PRED_STORE_CONTINUE17]]:
; DEFAULT-NEXT: [[TMP52:%.*]] = extractelement <8 x i1> [[TMP22]], i32 4
-; DEFAULT-NEXT: br i1 [[TMP52]], label %[[PRED_STORE_IF20:.*]], label %[[PRED_STORE_CONTINUE21:.*]]
-; DEFAULT: [[PRED_STORE_IF20]]:
+; DEFAULT-NEXT: br i1 [[TMP52]], label %[[PRED_STORE_IF18:.*]], label %[[PRED_STORE_CONTINUE19:.*]]
+; DEFAULT: [[PRED_STORE_IF18]]:
; DEFAULT-NEXT: [[TMP53:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 4
; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP53]], align 4
; DEFAULT-NEXT: [[TMP54:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 4
@@ -1124,11 +1121,11 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP57]], align 4
; DEFAULT-NEXT: [[TMP58:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 4
; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP58]], align 4
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE21]]
-; DEFAULT: [[PRED_STORE_CONTINUE21]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE19]]
+; DEFAULT: [[PRED_STORE_CONTINUE19]]:
; DEFAULT-NEXT: [[TMP59:%.*]] = extractelement <8 x i1> [[TMP22]], i32 5
-; DEFAULT-NEXT: br i1 [[TMP59]], label %[[PRED_STORE_IF22:.*]], label %[[PRED_STORE_CONTINUE23:.*]]
-; DEFAULT: [[PRED_STORE_IF22]]:
+; DEFAULT-NEXT: br i1 [[TMP59]], label %[[PRED_STORE_IF20:.*]], label %[[PRED_STORE_CONTINUE21:.*]]
+; DEFAULT: [[PRED_STORE_IF20]]:
; DEFAULT-NEXT: [[TMP60:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 5
; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP60]], align 4
; DEFAULT-NEXT: [[TMP61:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 5
@@ -1139,11 +1136,11 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP64]], align 4
; DEFAULT-NEXT: [[TMP65:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 5
; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP65]], align 4
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE23]]
-; DEFAULT: [[PRED_STORE_CONTINUE23]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE21]]
+; DEFAULT: [[PRED_STORE_CONTINUE21]]:
; DEFAULT-NEXT: [[TMP66:%.*]] = extractelement <8 x i1> [[TMP22]], i32 6
-; DEFAULT-NEXT: br i1 [[TMP66]], label %[[PRED_STORE_IF24:.*]], label %[[PRED_STORE_CONTINUE25:.*]]
-; DEFAULT: [[PRED_STORE_IF24]]:
+; DEFAULT-NEXT: br i1 [[TMP66]], label %[[PRED_STORE_IF22:.*]], label %[[PRED_STORE_CONTINUE23:.*]]
+; DEFAULT: [[PRED_STORE_IF22]]:
; DEFAULT-NEXT: [[TMP67:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 6
; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP67]], align 4
; DEFAULT-NEXT: [[TMP68:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 6
@@ -1154,11 +1151,11 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP71]], align 4
; DEFAULT-NEXT: [[TMP72:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 6
; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP72]], align 4
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE25]]
-; DEFAULT: [[PRED_STORE_CONTINUE25]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE23]]
+; DEFAULT: [[PRED_STORE_CONTINUE23]]:
; DEFAULT-NEXT: [[TMP73:%.*]] = extractelement <8 x i1> [[TMP22]], i32 7
-; DEFAULT-NEXT: br i1 [[TMP73]], label %[[PRED_STORE_IF26:.*]], label %[[PRED_STORE_CONTINUE27]]
-; DEFAULT: [[PRED_STORE_IF26]]:
+; DEFAULT-NEXT: br i1 [[TMP73]], label %[[PRED_STORE_IF24:.*]], label %[[PRED_STORE_CONTINUE25]]
+; DEFAULT: [[PRED_STORE_IF24]]:
; DEFAULT-NEXT: [[TMP74:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 7
; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP74]], align 4
; DEFAULT-NEXT: [[TMP75:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 7
@@ -1169,8 +1166,8 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP78]], align 4
; DEFAULT-NEXT: [[TMP79:%.*]] = extractelement <8 x ptr> [[TMP23]], i32 7
; DEFAULT-NEXT: store float 0.000000e+00, ptr [[TMP79]], align 4
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE27]]
-; DEFAULT: [[PRED_STORE_CONTINUE27]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE25]]
+; DEFAULT: [[PRED_STORE_CONTINUE25]]:
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
; DEFAULT-NEXT: [[TMP80:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1251,9 +1248,9 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 0, i64 [[TMP0]])
; PRED-NEXT: br label %[[VECTOR_BODY:.*]]
; PRED: [[VECTOR_BODY]]:
-; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE27:.*]] ]
-; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE27]] ]
-; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE27]] ]
+; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE25:.*]] ]
+; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE25]] ]
+; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE25]] ]
; PRED-NEXT: [[TMP18:%.*]] = load float, ptr [[SRC_1]], align 4
; PRED-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <8 x float> poison, float [[TMP18]], i64 0
; PRED-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT8]], <8 x float> poison, <8 x i32> zeroinitializer
@@ -1266,10 +1263,7 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
; PRED-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <8 x float> poison, float [[TMP22]], i64 0
; PRED-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT10]], <8 x float> poison, <8 x i32> zeroinitializer
; PRED-NEXT: [[TMP23:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[BROADCAST_SPLAT11]], <8 x float> zeroinitializer, <8 x float> [[TMP21]])
-; PRED-NEXT: [[TMP24:%.*]] = load float, ptr [[SRC_3]], align 4
-; PRED-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <8 x float> poison, float [[TMP24]], i64 0
-; PRED-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT12]], <8 x float> poison, <8 x i32> zeroinitializer
-; PRED-NEXT: [[...
[truncated]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
Current status: managed to limit the cse to some recipes. All tests have been updated, and there is no crash. This patch is now ready to review. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks like there are a number of rename-only changes, could you go over the test diffs and remove those to reduce the diff? I highlighted some but didn't go through the while diff
llvm/test/Transforms/LoopVectorize/AArch64/sve-vscale-based-trip-counts.ll
Outdated
Show resolved
Hide resolved
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks like there are a number of rename-only changes, could you go over the test diffs and remove those to reduce the diff? I highlighted some but didn't go through the while diff
Looks like there are more left….
So if you look at the latest commit, we no longer remove duplicate bc.resume PHIs: not sure which recipe this is, and where the missing handling is. |
Introduce a simple and limited common-subexpression-elimination pass at the VPlan-level, running late during the execution of the VPlan. The long-term vision is to get rid of the legacy non-VPlan-based cse routine in LV, but this patch doesn't yet fully subsume it.