Skip to content

Commit 70925b7

Browse files
committed
[VPlan] More CSE progress; a DenseMap crash is observed
1 parent 194367e commit 70925b7

15 files changed

+188
-208
lines changed

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -897,6 +897,11 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
897897
return R && classof(R);
898898
}
899899

900+
static inline bool classof(const VPSingleDefRecipe *U) {
901+
auto *R = dyn_cast<VPRecipeBase>(U);
902+
return R && classof(R);
903+
}
904+
900905
void execute(VPTransformState &State) override = 0;
901906
};
902907

@@ -3569,6 +3574,12 @@ struct CastInfo<VPPhiAccessors, VPRecipeBase *>
35693574
template <>
35703575
struct CastInfo<VPPhiAccessors, const VPRecipeBase *>
35713576
: CastInfoVPPhiAccessors<const VPRecipeBase *> {};
3577+
template <>
3578+
struct CastInfo<VPPhiAccessors, VPSingleDefRecipe *>
3579+
: CastInfoVPPhiAccessors<VPRecipeBase *> {};
3580+
template <>
3581+
struct CastInfo<VPPhiAccessors, const VPSingleDefRecipe *>
3582+
: CastInfoVPPhiAccessors<const VPRecipeBase *> {};
35723583

35733584
/// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
35743585
/// holds a sequence of zero or more VPRecipe's each representing a sequence of

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 28 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -1756,55 +1756,47 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
17561756
/// Hash the underlying data of a VPSingleDefRecipe pointer, instead of hashing
17571757
/// the pointer itself.
17581758
namespace {
1759-
static bool isIdentical(const VPValue *L, const VPValue *R) {
1760-
if (L->getVPValueID() != R->getVPValueID() ||
1761-
L->hasDefiningRecipe() != R->hasDefiningRecipe() ||
1762-
vputils::isSingleScalar(L) != vputils::isSingleScalar(R))
1763-
return false;
1764-
if (L->hasDefiningRecipe()) {
1765-
const VPRecipeBase *DefL = L->getDefiningRecipe();
1766-
const VPRecipeBase *DefR = R->getDefiningRecipe();
1767-
return vputils::getOpcode(*DefL) == vputils::getOpcode(*DefR) &&
1768-
equal(DefL->operands(), DefR->operands());
1769-
}
1770-
return L == R;
1771-
}
1772-
1773-
static hash_code hash_value(const VPValue &V) {
1774-
if (V.hasDefiningRecipe()) {
1775-
const VPRecipeBase *Def = V.getDefiningRecipe();
1776-
return hash_combine(V.getVPValueID(), vputils::getOpcode(*Def),
1777-
vputils::isSingleScalar(&V),
1759+
struct CSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
1760+
static unsigned getHashValue(const VPSingleDefRecipe *Def) {
1761+
return hash_combine(Def->getVPDefID(), vputils::getOpcode(*Def),
1762+
vputils::isSingleScalar(Def),
17781763
hash_combine_range(Def->operands()));
17791764
}
1780-
return hash_value(&V);
1781-
}
17821765

1783-
struct CSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
1784-
static unsigned getHashValue(const VPSingleDefRecipe *R) {
1785-
return hash_value(*R);
1786-
}
1766+
static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
1767+
if (L == getEmptyKey() || R == getEmptyKey() || L == getTombstoneKey() ||
1768+
R == getTombstoneKey())
1769+
return L == R;
1770+
1771+
if (L == R)
1772+
return true;
17871773

1788-
static bool isEqual(const VPSingleDefRecipe *LHS,
1789-
const VPSingleDefRecipe *RHS) {
1790-
if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
1791-
LHS == getTombstoneKey() || RHS == getTombstoneKey())
1792-
return LHS == RHS;
1793-
return isIdentical(LHS, RHS);
1774+
return L->getVPDefID() == R->getVPDefID() &&
1775+
vputils::getOpcode(*L) == vputils::getOpcode(*R) &&
1776+
vputils::isSingleScalar(L) == vputils::isSingleScalar(R) &&
1777+
equal(L->operands(), R->operands());
17941778
}
17951779
};
17961780
} // end anonymous namespace
17971781

17981782
/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
17991783
/// Plan.
18001784
void VPlanTransforms::cse(VPlan &Plan, Type &CanonicalIVTy) {
1801-
DenseMap<VPSingleDefRecipe *, VPSingleDefRecipe *, CSEDenseMapInfo> CSEMap;
1802-
VPTypeAnalysis TypeInfo(&CanonicalIVTy);
1785+
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1786+
if (!LoopRegion)
1787+
return;
1788+
auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
1789+
vp_depth_first_shallow(Plan.getEntry()));
1790+
auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
1791+
vp_depth_first_shallow(LoopRegion->getEntry()));
1792+
18031793
// There is existing logic to sink instructions into replicate regions, and
18041794
// we'd be undoing that work if we went through replicate regions. Hence,
18051795
// don't CSE in replicate regions.
1806-
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1807-
vp_depth_first_shallow(Plan.getEntry()))) {
1796+
DenseMap<VPSingleDefRecipe *, VPSingleDefRecipe *, CSEDenseMapInfo> CSEMap;
1797+
VPTypeAnalysis TypeInfo(&CanonicalIVTy);
1798+
for (VPBasicBlock *VPBB :
1799+
concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
18081800
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
18091801
auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
18101802
if (!Def)
@@ -1813,7 +1805,7 @@ void VPlanTransforms::cse(VPlan &Plan, Type &CanonicalIVTy) {
18131805
if (TypeInfo.inferScalarType(Def) != TypeInfo.inferScalarType(V))
18141806
continue;
18151807
// Drop poison-generating flags when reusing a value.
1816-
if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(&R))
1808+
if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
18171809
RFlags->dropPoisonGeneratingFlags();
18181810
Def->replaceAllUsesWith(V);
18191811
Def->eraseFromParent();

llvm/lib/Transforms/Vectorize/VPlanUtils.h

Lines changed: 11 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -38,12 +38,19 @@ VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
3838
/// SCEV expression could be constructed.
3939
const SCEV *getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE);
4040

41-
/// Get any instruction opcode data embedded in recipe \p R.
42-
inline std::optional<unsigned> getOpcode(const VPRecipeBase &R) {
43-
return TypeSwitch<const VPRecipeBase *, std::optional<unsigned>>(&R)
41+
/// Get any instruction opcode data embedded in recipe \p R. Returns an optional
42+
/// pair, where the first element indicates whether it is an intrinsic ID.
43+
inline std::optional<std::pair<bool, unsigned>>
44+
getOpcode(const VPRecipeBase &R) {
45+
return TypeSwitch<const VPRecipeBase *,
46+
std::optional<std::pair<bool, unsigned>>>(&R)
4447
.Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
4548
VPWidenSelectRecipe, VPHistogramRecipe, VPPartialReductionRecipe,
46-
VPReplicateRecipe>([](auto *I) { return I->getOpcode(); })
49+
VPReplicateRecipe>(
50+
[](auto *I) { return std::make_pair(false, I->getOpcode()); })
51+
.Case<VPWidenIntrinsicRecipe>([](auto *I) {
52+
return std::make_pair(true, I->getVectorIntrinsicID());
53+
})
4754
.Default([](auto *) { return std::nullopt; });
4855
}
4956

llvm/test/Transforms/LoopVectorize/AArch64/sve-vscale-based-trip-counts.ll

Lines changed: 12 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -84,8 +84,11 @@ define void @vscale_mul_8(ptr noalias noundef readonly captures(none) %a, ptr n
8484
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP22]], align 4
8585
; CHECK-NEXT: [[TMP17:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
8686
; CHECK-NEXT: [[TMP14:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
87+
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
88+
; CHECK-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4
89+
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[TMP16]]
8790
; CHECK-NEXT: store <vscale x 4 x float> [[TMP17]], ptr [[B]], align 4
88-
; CHECK-NEXT: store <vscale x 4 x float> [[TMP14]], ptr [[TMP22]], align 4
91+
; CHECK-NEXT: store <vscale x 4 x float> [[TMP14]], ptr [[TMP18]], align 4
8992
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[MUL1]], [[N_VEC]]
9093
; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY:.*]]
9194
; CHECK: [[FOR_COND_CLEANUP]]:
@@ -222,16 +225,13 @@ define void @vscale_mul_31(ptr noalias noundef readonly captures(none) %a, ptr n
222225
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDEX]]
223226
; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
224227
; CHECK-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 4
225-
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[TMP15]]
228+
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr float, ptr [[TMP12]], i64 [[TMP15]]
226229
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
227230
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP16]], align 4
228231
; CHECK-NEXT: [[TMP17:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
229232
; CHECK-NEXT: [[TMP18:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
230-
; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
231-
; CHECK-NEXT: [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 4
232-
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[TMP20]]
233233
; CHECK-NEXT: store <vscale x 4 x float> [[TMP17]], ptr [[TMP12]], align 4
234-
; CHECK-NEXT: store <vscale x 4 x float> [[TMP18]], ptr [[TMP21]], align 4
234+
; CHECK-NEXT: store <vscale x 4 x float> [[TMP18]], ptr [[TMP16]], align 4
235235
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
236236
; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
237237
; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -305,16 +305,13 @@ define void @vscale_mul_64(ptr noalias noundef readonly captures(none) %a, ptr n
305305
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDEX]]
306306
; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
307307
; CHECK-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 4
308-
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[TMP15]]
308+
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr float, ptr [[TMP12]], i64 [[TMP15]]
309309
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
310310
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP16]], align 4
311311
; CHECK-NEXT: [[TMP17:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
312312
; CHECK-NEXT: [[TMP18:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
313-
; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
314-
; CHECK-NEXT: [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 4
315-
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[TMP20]]
316313
; CHECK-NEXT: store <vscale x 4 x float> [[TMP17]], ptr [[TMP12]], align 4
317-
; CHECK-NEXT: store <vscale x 4 x float> [[TMP18]], ptr [[TMP21]], align 4
314+
; CHECK-NEXT: store <vscale x 4 x float> [[TMP18]], ptr [[TMP16]], align 4
318315
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
319316
; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
320317
; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -390,16 +387,13 @@ define void @trip_count_with_overflow(ptr noalias noundef readonly captures(none
390387
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDEX]]
391388
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
392389
; CHECK-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4
393-
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[TMP16]]
390+
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr float, ptr [[TMP13]], i64 [[TMP16]]
394391
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP13]], align 4
395392
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP17]], align 4
396393
; CHECK-NEXT: [[TMP18:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
397394
; CHECK-NEXT: [[TMP19:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
398-
; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
399-
; CHECK-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 4
400-
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[TMP21]]
401395
; CHECK-NEXT: store <vscale x 4 x float> [[TMP18]], ptr [[TMP13]], align 4
402-
; CHECK-NEXT: store <vscale x 4 x float> [[TMP19]], ptr [[TMP22]], align 4
396+
; CHECK-NEXT: store <vscale x 4 x float> [[TMP19]], ptr [[TMP17]], align 4
403397
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
404398
; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
405399
; CHECK-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -471,16 +465,13 @@ define void @trip_count_too_big_for_element_count(ptr noalias noundef readonly c
471465
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDEX]]
472466
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
473467
; CHECK-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4
474-
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[TMP16]]
468+
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr float, ptr [[TMP13]], i64 [[TMP16]]
475469
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP13]], align 4
476470
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP17]], align 4
477471
; CHECK-NEXT: [[TMP18:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
478472
; CHECK-NEXT: [[TMP19:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
479-
; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
480-
; CHECK-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 4
481-
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[TMP21]]
482473
; CHECK-NEXT: store <vscale x 4 x float> [[TMP18]], ptr [[TMP13]], align 4
483-
; CHECK-NEXT: store <vscale x 4 x float> [[TMP19]], ptr [[TMP22]], align 4
474+
; CHECK-NEXT: store <vscale x 4 x float> [[TMP19]], ptr [[TMP17]], align 4
484475
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
485476
; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
486477
; CHECK-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]

llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -148,8 +148,7 @@ define void @widen_2ptrs_phi_unrolled(ptr noalias nocapture %dst, ptr noalias no
148148
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
149149
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 2
150150
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]]
151-
; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = shl i64 [[INDEX]], 2
152-
; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX4]]
151+
; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]]
153152
; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
154153
; CHECK-NEXT: [[DOTIDX:%.*]] = shl nuw nsw i64 [[TMP7]], 4
155154
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[DOTIDX]]

llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-derived-ivs.ll

Lines changed: 4 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -165,8 +165,7 @@ define void @derived_pointer_ivs(ptr noalias %a, ptr noalias %b, ptr %end) {
165165
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
166166
; VF2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16
167167
; VF2-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]]
168-
; VF2-NEXT: [[OFFSET_IDX6:%.*]] = mul i64 [[INDEX]], 16
169-
; VF2-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX6]]
168+
; VF2-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX]]
170169
; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[NEXT_GEP]], align 8
171170
; VF2-NEXT: store <2 x double> [[WIDE_LOAD]], ptr [[NEXT_GEP7]], align 8
172171
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
@@ -216,10 +215,8 @@ define void @derived_pointer_ivs(ptr noalias %a, ptr noalias %b, ptr %end) {
216215
; VF2IC2-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 16
217216
; VF2IC2-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]]
218217
; VF2IC2-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP13]]
219-
; VF2IC2-NEXT: [[OFFSET_IDX7:%.*]] = mul i64 [[INDEX]], 16
220-
; VF2IC2-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX7]], 16
221-
; VF2IC2-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX7]]
222-
; VF2IC2-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]]
218+
; VF2IC2-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX]]
219+
; VF2IC2-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]]
223220
; VF2IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[NEXT_GEP]], align 8
224221
; VF2IC2-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x double>, ptr [[NEXT_GEP6]], align 8
225222
; VF2IC2-NEXT: store <2 x double> [[WIDE_LOAD]], ptr [[NEXT_GEP8]], align 8
@@ -269,8 +266,7 @@ define void @derived_pointer_ivs(ptr noalias %a, ptr noalias %b, ptr %end) {
269266
; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
270267
; VF4-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16
271268
; VF4-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]]
272-
; VF4-NEXT: [[OFFSET_IDX6:%.*]] = mul i64 [[INDEX]], 16
273-
; VF4-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX6]]
269+
; VF4-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX]]
274270
; VF4-NEXT: [[WIDE_VEC:%.*]] = load <8 x double>, ptr [[NEXT_GEP]], align 8
275271
; VF4-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x double> [[WIDE_VEC]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
276272
; VF4-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <8 x double> [[WIDE_VEC]], <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>

llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -972,8 +972,7 @@ define void @test_2xi64_sub_of_wide_loads(ptr noalias %data, ptr noalias %A, ptr
972972
; VF4-NEXT: [[TMP5:%.*]] = sub <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
973973
; VF4-NEXT: [[TMP6:%.*]] = shl nsw i64 [[TMP0]], 1
974974
; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
975-
; VF4-NEXT: [[TMP8:%.*]] = sub <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
976-
; VF4-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
975+
; VF4-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
977976
; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
978977
; VF4-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 8
979978
; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4

llvm/test/Transforms/LoopVectorize/dead_instructions.ll

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -148,13 +148,13 @@ define void @dead_load_and_vector_pointer(ptr %a, ptr %b) {
148148
; CHECK: [[VECTOR_BODY]]:
149149
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
150150
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]]
151-
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 2
151+
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[TMP2]], i32 2
152152
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 8, !alias.scope [[META6:![0-9]+]], !noalias [[META9:![0-9]+]]
153-
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP5]], align 8, !alias.scope [[META6]], !noalias [[META9]]
153+
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8, !alias.scope [[META6]], !noalias [[META9]]
154154
; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i32> [[WIDE_LOAD]], splat (i32 1)
155155
; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i32> [[WIDE_LOAD2]], splat (i32 1)
156156
; CHECK-NEXT: store <2 x i32> [[TMP6]], ptr [[TMP2]], align 4, !alias.scope [[META6]], !noalias [[META9]]
157-
; CHECK-NEXT: store <2 x i32> [[TMP7]], ptr [[TMP5]], align 4, !alias.scope [[META6]], !noalias [[META9]]
157+
; CHECK-NEXT: store <2 x i32> [[TMP7]], ptr [[TMP1]], align 4, !alias.scope [[META6]], !noalias [[META9]]
158158
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
159159
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
160160
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]

0 commit comments

Comments (0)