From e73271c71e321bc69e7a85a334077922759d7132 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs@arm.com>
Date: Wed, 9 Jul 2025 11:43:12 +0100
Subject: [PATCH 1/6] [LV] Remove common extends and selects in CSE

This PR extends the LoopVectorizer's common sub-expression elimination
(CSE) to remove identical extend and select instructions. Originally the
work only included extends but that uncovered some duplicate selects in
reduction-inloop.ll.
---
 llvm/lib/IR/Instruction.cpp                   |   2 +
 .../Transforms/Utils/FunctionComparator.cpp   |   4 +
 .../Transforms/Vectorize/LoopVectorize.cpp    |   3 +-
 .../AArch64/induction-costs-sve.ll            |  19 +-
 ...outer_loop_test1_no_explicit_vect_width.ll |   3 +-
 .../LoopVectorize/ARM/mve-reductions.ll       |   3 +-
 .../LoopVectorize/PowerPC/vectorize-bswap.ll  |   4 +-
 .../RISCV/blocks-with-dead-instructions.ll    |  35 +-
 .../LoopVectorize/RISCV/dead-ops-cost.ll      |  15 +-
 .../RISCV/evl-compatible-loops.ll             |   8 +-
 .../LoopVectorize/RISCV/induction-costs.ll    |   5 +-
 .../LoopVectorize/RISCV/mask-index-type.ll    |   5 +-
 .../RISCV/masked_gather_scatter.ll            |  10 +-
 ...ruction-or-drop-poison-generating-flags.ll |   5 +-
 .../RISCV/riscv-vector-reverse-output.ll      | 653 ++++++++++++++++++
 .../RISCV/riscv-vector-reverse.ll             |  40 +-
 .../LoopVectorize/RISCV/strided-accesses.ll   |  24 +-
 .../RISCV/tail-folding-cast-intrinsics.ll     |   5 +-
 .../RISCV/tail-folding-cond-reduction.ll      |  10 +-
 .../tail-folding-fixed-order-recurrence.ll    |   5 +-
 .../RISCV/tail-folding-gather-scatter.ll      |   5 +-
 .../RISCV/tail-folding-interleave.ll          |  15 +-
 .../RISCV/tail-folding-reduction-cost.ll      |   1 +
 .../RISCV/tail-folding-reverse-load-store.ll  |  47 +-
 .../RISCV/tail-folding-uniform-store.ll       |   9 +-
 .../LoopVectorize/RISCV/uniform-load-store.ll |  30 +-
 ...outer_loop_test1_no_explicit_vect_width.ll |   6 +-
 llvm/test/Transforms/LoopVectorize/cse.ll     | 309 +++++++++
 .../LoopVectorize/first-order-recurrence.ll   |  14 +-
 .../LoopVectorize/float-induction.ll          |   3 +-
 .../LoopVectorize/outer_loop_test1.ll         |   3 +-
 .../LoopVectorize/reduction-inloop.ll         |   9 +-
 32 files changed, 1104 insertions(+), 205 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/cse.ll
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 5e87b5ff941ad..94dfed4df72fe 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -935,6 +935,8 @@ bool Instruction::hasSameSpecialState(const Instruction *I2,
   if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I1))
     return GEP->getSourceElementType() ==
            cast<GetElementPtrInst>(I2)->getSourceElementType();
+  if (const CastInst *Cast = dyn_cast<CastInst>(I1))
+    return Cast->getDestTy() == cast<CastInst>(I2)->getDestTy();
 
   return true;
 }
diff --git a/llvm/lib/Transforms/Utils/FunctionComparator.cpp b/llvm/lib/Transforms/Utils/FunctionComparator.cpp
index 6d4026e8209de..8d1f7c9b3a2ea 100644
--- a/llvm/lib/Transforms/Utils/FunctionComparator.cpp
+++ b/llvm/lib/Transforms/Utils/FunctionComparator.cpp
@@ -804,6 +804,10 @@ int FunctionComparator::cmpOperations(const Instruction *L,
         return Res;
     }
   }
+  if (const CastInst *Cast = dyn_cast<CastInst>(L)) {
+    const CastInst *CastR = cast<CastInst>(R);
+    return cmpTypes(Cast->getDestTy(), CastR->getDestTy());
+  }
   return 0;
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 675a230bd2c94..34cb28bf48a85 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2511,7 +2511,8 @@ namespace {
 struct CSEDenseMapInfo {
   static bool canHandle(const Instruction *I) {
     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
-           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
+           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I) ||
+           isa<CastInst>(I) || isa<SelectInst>(I);
   }
 
   static inline Instruction *getEmptyKey() {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index 137e07336fd50..392e6efd31376 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -44,10 +44,8 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; DEFAULT-NEXT:    [[TMP27:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i16>
 ; DEFAULT-NEXT:    [[TMP28:%.*]] = mul <vscale x 8 x i16> [[TMP26]], [[TMP13]]
 ; DEFAULT-NEXT:    [[TMP29:%.*]] = mul <vscale x 8 x i16> [[TMP27]], [[TMP13]]
-; DEFAULT-NEXT:    [[TMP30:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i16>
-; DEFAULT-NEXT:    [[TMP31:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i16>
-; DEFAULT-NEXT:    [[TMP32:%.*]] = or <vscale x 8 x i16> [[TMP28]], [[TMP30]]
-; DEFAULT-NEXT:    [[TMP33:%.*]] = or <vscale x 8 x i16> [[TMP29]], [[TMP31]]
+; DEFAULT-NEXT:    [[TMP32:%.*]] = or <vscale x 8 x i16> [[TMP28]], [[TMP26]]
+; DEFAULT-NEXT:    [[TMP33:%.*]] = or <vscale x 8 x i16> [[TMP29]], [[TMP27]]
 ; DEFAULT-NEXT:    [[TMP34:%.*]] = lshr <vscale x 8 x i16> [[TMP32]], splat (i16 1)
 ; DEFAULT-NEXT:    [[TMP35:%.*]] = lshr <vscale x 8 x i16> [[TMP33]], splat (i16 1)
 ; DEFAULT-NEXT:    [[TMP36:%.*]] = trunc <vscale x 8 x i16> [[TMP34]] to <vscale x 8 x i8>
@@ -118,8 +116,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP18]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
 ; PRED-NEXT:    [[TMP17:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16>
 ; PRED-NEXT:    [[TMP22:%.*]] = mul <vscale x 16 x i16> [[TMP17]], [[TMP16]]
-; PRED-NEXT:    [[TMP24:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16>
-; PRED-NEXT:    [[TMP20:%.*]] = or <vscale x 16 x i16> [[TMP22]], [[TMP24]]
+; PRED-NEXT:    [[TMP20:%.*]] = or <vscale x 16 x i16> [[TMP22]], [[TMP17]]
 ; PRED-NEXT:    [[TMP21:%.*]] = lshr <vscale x 16 x i16> [[TMP20]], splat (i16 1)
 ; PRED-NEXT:    [[TMP23:%.*]] = trunc <vscale x 16 x i16> [[TMP21]] to <vscale x 16 x i8>
 ; PRED-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
@@ -370,9 +367,8 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 {
 ; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; DEFAULT-NEXT:    [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
 ; DEFAULT-NEXT:    [[TMP15:%.*]] = add i32 [[OFFSET_IDX]], 1
-; DEFAULT-NEXT:    [[TMP16:%.*]] = trunc i64 [[INDEX]] to i32
-; DEFAULT-NEXT:    [[TMP18:%.*]] = add i32 [[TMP16]], 1
-; DEFAULT-NEXT:    [[TMP19:%.*]] = mul i32 [[MUL]], [[TMP16]]
+; DEFAULT-NEXT:    [[TMP18:%.*]] = add i32 [[OFFSET_IDX]], 1
+; DEFAULT-NEXT:    [[TMP19:%.*]] = mul i32 [[MUL]], [[OFFSET_IDX]]
 ; DEFAULT-NEXT:    [[TMP20:%.*]] = mul i32 [[MUL]], [[TMP18]]
 ; DEFAULT-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP19]] to i64
 ; DEFAULT-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP20]] to i64
@@ -562,9 +558,8 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 {
 ; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; DEFAULT-NEXT:    [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
 ; DEFAULT-NEXT:    [[TMP14:%.*]] = add i32 [[OFFSET_IDX]], 1
-; DEFAULT-NEXT:    [[TMP15:%.*]] = trunc i64 [[INDEX]] to i32
-; DEFAULT-NEXT:    [[TMP17:%.*]] = add i32 [[TMP15]], 1
-; DEFAULT-NEXT:    [[TMP18:%.*]] = mul i32 [[ADD]], [[TMP15]]
+; DEFAULT-NEXT:    [[TMP17:%.*]] = add i32 [[OFFSET_IDX]], 1
+; DEFAULT-NEXT:    [[TMP18:%.*]] = mul i32 [[ADD]], [[OFFSET_IDX]]
 ; DEFAULT-NEXT:    [[TMP19:%.*]] = mul i32 [[ADD]], [[TMP17]]
 ; DEFAULT-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP18]] to i64
 ; DEFAULT-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP19]] to i64
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll
index 5e99425c1482c..ae28eb4c3a49d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll
@@ -39,8 +39,7 @@ define void @foo_i32(i32 %n) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, <4 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i32>
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP1]], <4 x ptr> [[TMP0]], i32 4, <4 x i1> splat (i1 true))
-; CHECK-NEXT:    [[TMP8:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[TMP8]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    br label %[[FOR_BODY31:.*]]
 ; CHECK:       [[FOR_BODY31]]:
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP4:%.*]], %[[FOR_BODY31]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
index 658b9a4569191..1540baab53719 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
@@ -1679,8 +1679,7 @@ define i64 @test_std_q31(ptr %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext <4 x i32> [[TMP1]] to <4 x i64>
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]])
 ; CHECK-NEXT:    [[TMP4]] = add i64 [[VEC_PHI]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = sext <4 x i32> [[TMP1]] to <4 x i64>
-; CHECK-NEXT:    [[TMP6:%.*]] = mul nsw <4 x i64> [[TMP5]], [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nsw <4 x i64> [[TMP2]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP6]])
 ; CHECK-NEXT:    [[TMP8]] = add i64 [[VEC_PHI1]], [[TMP7]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll
index 2c85b75dda018..7f0a487fca945 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll
@@ -19,9 +19,7 @@ define dso_local void @test(ptr %Arr, i32 signext %Len) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[ARR:%.*]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP5:%.*]] = sext i32 [[INDEX]] to i64
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP5]]
-; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[TMP6]], align 4
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[TMP3]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
index 757c77fef98c8..2a5ec3de6d391 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
@@ -28,9 +28,8 @@ define void @block_with_dead_inst_1(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 8 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    call void @llvm.vp.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> align 2 [[TMP20]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP12]])
-; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP12]] to i64
-; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
-; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]]
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP17]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[TMP2]]
 ; CHECK-NEXT:    br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -104,9 +103,8 @@ define void @block_with_dead_inst_2(ptr %src) #0 {
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 8 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    call void @llvm.vp.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> align 2 [[TMP10]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP9]])
-; CHECK-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP9]] to i64
-; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[EVL_BASED_IV]]
-; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 333
 ; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -180,9 +178,8 @@ define void @multiple_blocks_with_dead_insts_3(ptr %src) #0 {
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 8 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    call void @llvm.vp.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> align 2 [[TMP10]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP9]])
-; CHECK-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP9]] to i64
-; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[EVL_BASED_IV]]
-; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 333
 ; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
@@ -266,9 +263,8 @@ define void @multiple_blocks_with_dead_insts_4(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 8 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    call void @llvm.vp.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> align 2 [[TMP20]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP12]])
-; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP12]] to i64
-; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
-; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]]
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP17]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[TMP2]]
 ; CHECK-NEXT:    br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
@@ -354,9 +350,8 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_5(ptr %src) #0 {
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 8 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    call void @llvm.vp.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> align 2 [[TMP10]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP9]])
-; CHECK-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP9]] to i64
-; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[EVL_BASED_IV]]
-; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 333
 ; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
@@ -466,9 +461,8 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 %
 ; CHECK-NEXT:    [[TMP23:%.*]] = select <vscale x 8 x i1> [[TMP18]], <vscale x 8 x i1> [[BROADCAST_SPLAT]], <vscale x 8 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP24:%.*]] = or <vscale x 8 x i1> [[TMP22]], [[TMP23]]
 ; CHECK-NEXT:    call void @llvm.vp.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> align 2 [[TMP20]], <vscale x 8 x i1> [[TMP24]], i32 [[TMP27]])
-; CHECK-NEXT:    [[TMP25:%.*]] = zext i32 [[TMP27]] to i64
-; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP25]], [[EVL_BASED_IV]]
-; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP25]]
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP12]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[TMP2]]
 ; CHECK-NEXT:    br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
@@ -737,9 +731,8 @@ define void @dead_load_in_block(ptr %dst, ptr %src, i8 %N, i64 %x) #0 {
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i32, ptr [[DST]], <vscale x 4 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x ptr> align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP18]]), !alias.scope [[META19:![0-9]+]], !noalias [[META22:![0-9]+]]
-; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP18]] to i64
-; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP22]], [[EVL_BASED_IV]]
-; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP22]]
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP17]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP17]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[TMP3]]
 ; CHECK-NEXT:    br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
index 29d901f084bdb..60f86b37ab8cf 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -37,9 +37,8 @@ define void @dead_load(ptr %p, i16 %start) {
 ; CHECK-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT1]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i16, ptr [[P]], <vscale x 8 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    call void @llvm.vp.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> align 2 [[TMP21]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP16]])
-; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP16]] to i64
-; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP22]], [[EVL_BASED_IV]]
-; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP22]]
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP19]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT2]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[TMP5]]
 ; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -332,9 +331,8 @@ define void @test_phi_in_latch_redundant(ptr %dst, i32 %a) {
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[TMP19]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i32, ptr [[DST]], <vscale x 4 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x ptr> align 4 [[TMP16]], <vscale x 4 x i1> [[TMP15]], i32 [[TMP8]])
-; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP8]] to i64
-; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP17]], [[EVL_BASED_IV]]
-; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP17]]
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP5]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 37
 ; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
@@ -419,9 +417,8 @@ define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %s
 ; CHECK-NEXT:    [[TMP18:%.*]] = zext <vscale x 4 x i8> [[TMP17]] to <vscale x 4 x i32>
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr [[DST]], <vscale x 4 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP18]], <vscale x 4 x ptr> align 4 [[TMP19]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
-; CHECK-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP10]] to i64
-; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]]
-; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP20]]
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[TMP2]]
 ; CHECK-NEXT:    br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll b/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll
index be6f32a6f4ea2..27722aa49378c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll
@@ -24,9 +24,8 @@ define void @test_wide_integer_induction(ptr noalias %a, i64 %N) {
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[EVL_BASED_IV]]
 ; CHECK-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VEC_IND]], ptr align 8 [[TMP14]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
-; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP11]] to i64
-; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
-; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]]
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP12]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -80,8 +79,7 @@ define void @test_wide_ptr_induction(ptr noalias %a, ptr noalias %b, i64 %N) {
 ; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
 ; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[EVL_BASED_IV]]
 ; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP7]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 8, [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 8, [[TMP9]]
 ; CHECK-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
index 18dc20495117e..f45e7c31ffc4f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
@@ -70,9 +70,8 @@ define void @skip_free_iv_truncate(i16 %x, ptr %A) #0 {
 ; CHECK-NEXT:    [[DOTSPLAT25:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT24]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP59:%.*]] = getelementptr i16, ptr [[A]], <vscale x 8 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    call void @llvm.vp.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> align 2 [[TMP59]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP27]]), !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
-; CHECK-NEXT:    [[TMP47:%.*]] = zext i32 [[TMP27]] to i64
-; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP47]], [[EVL_BASED_IV]]
-; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP47]]
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP28]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP28]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT25]]
 ; CHECK-NEXT:    [[TMP48:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[TMP5]]
 ; CHECK-NEXT:    br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll
index a46d877825f87..1fa0d08b1dcf6 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll
@@ -39,9 +39,8 @@ define void @test(ptr noalias nocapture %a, ptr noalias nocapture %b, i32 %v) {
 ; VLENUNK-NEXT:    [[TMP17:%.*]] = add <vscale x 4 x i32> [[PREDPHI]], [[BROADCAST_SPLAT]]
 ; VLENUNK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
 ; VLENUNK-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP17]], ptr align 4 [[TMP18]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
-; VLENUNK-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP7]] to i64
-; VLENUNK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[TMP19]], [[INDEX]]
-; VLENUNK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]]
+; VLENUNK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[TMP15]], [[INDEX]]
+; VLENUNK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP15]]
 ; VLENUNK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; VLENUNK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; VLENUNK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
index 83f23a48285f5..a86179384f443 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
@@ -53,9 +53,8 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV32-NEXT:    [[TMP18:%.*]] = fadd <vscale x 2 x double> [[WIDE_MASKED_GATHER6]], [[TMP17]]
 ; RV32-NEXT:    [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 2 x i64> [[VEC_IND]]
 ; RV32-NEXT:    call void @llvm.vp.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP18]], <vscale x 2 x ptr> align 8 [[TMP19]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]]
-; RV32-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP10]] to i64
-; RV32-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[TMP20]], [[INDEX]]
-; RV32-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP20]]
+; RV32-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[TMP8]], [[INDEX]]
+; RV32-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
 ; RV32-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; RV32-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 625
 ; RV32-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -125,9 +124,8 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV64-NEXT:    [[TMP18:%.*]] = fadd <vscale x 2 x double> [[WIDE_MASKED_GATHER6]], [[TMP17]]
 ; RV64-NEXT:    [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 2 x i64> [[VEC_IND]]
 ; RV64-NEXT:    call void @llvm.vp.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP18]], <vscale x 2 x ptr> align 8 [[TMP19]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]]
-; RV64-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP10]] to i64
-; RV64-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[TMP20]], [[INDEX]]
-; RV64-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP20]]
+; RV64-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[TMP8]], [[INDEX]]
+; RV64-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
 ; RV64-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; RV64-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 625
 ; RV64-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll
index 37a0a8b4d7c87..2317cf76600fd 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll
@@ -49,9 +49,8 @@ define void @pr87378_vpinstruction_or_drop_poison_generating_flags(ptr %arg, i64
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select i1 [[TMP23]], i64 poison, i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i16, ptr [[ARG]], i64 [[PREDPHI]]
 ; CHECK-NEXT:    call void @llvm.vp.store.nxv8i16.p0(<vscale x 8 x i16> zeroinitializer, ptr align 2 [[TMP24]], <vscale x 8 x i1> [[TMP22]], i32 [[TMP25]])
-; CHECK-NEXT:    [[TMP26:%.*]] = zext i32 [[TMP25]] to i64
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[TMP26]], [[INDEX]]
-; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP26]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[TMP8]], [[INDEX]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1001
 ; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
new file mode 100644
index 0000000000000..1d7c80b3d722f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
@@ -0,0 +1,653 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+;; This is the loop in c++ being vectorize in this file with
+;; vector.reverse
+;;  #pragma clang loop vectorize_width(4, scalable)
+;;  for (int i = N-1; i >= 0; --i)
+;;    a[i] = b[i] + 1.0;
+
+; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v \
+; RUN: -riscv-v-vector-bits-min=128 -S < %s \
+; RUN: | FileCheck --check-prefix=RV64 %s
+
+; RUN: opt -passes=loop-vectorize -mtriple=riscv32 -mattr=+v \
+; RUN: -riscv-v-vector-bits-min=128 -S < %s \
+; RUN: | FileCheck --check-prefix=RV32 %s
+
+; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v \
+; RUN: -riscv-v-vector-bits-min=128 -force-vector-interleave=2 -S < %s \
+; RUN: | FileCheck --check-prefix=RV64-UF2 %s
+
+define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
+; RV64-LABEL: define void @vector_reverse_i32(
+; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; RV64-NEXT:  [[ENTRY:.*:]]
+; RV64-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV64:       [[VECTOR_PH]]:
+; RV64-NEXT:    br label %[[VECTOR_BODY:.*]]
+; RV64:       [[VECTOR_BODY]]:
+; RV64-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-NEXT:    [[AVL:%.*]] = phi i64 [ 1023, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; RV64-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; RV64-NEXT:    [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
+; RV64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]]
+; RV64-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP0]] to i64
+; RV64-NEXT:    [[TMP10:%.*]] = mul i64 0, [[TMP5]]
+; RV64-NEXT:    [[TMP22:%.*]] = sub i64 [[TMP5]], 1
+; RV64-NEXT:    [[TMP11:%.*]] = mul i64 -1, [[TMP22]]
+; RV64-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[TMP9]], i64 [[TMP10]]
+; RV64-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP7]], i64 [[TMP11]]
+; RV64-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; RV64-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; RV64-NEXT:    [[TMP14:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
+; RV64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
+; RV64-NEXT:    [[TMP16:%.*]] = mul i64 0, [[TMP5]]
+; RV64-NEXT:    [[TMP23:%.*]] = sub i64 [[TMP5]], 1
+; RV64-NEXT:    [[TMP17:%.*]] = mul i64 -1, [[TMP23]]
+; RV64-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[TMP15]], i64 [[TMP16]]
+; RV64-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr [[TMP18]], i64 [[TMP17]]
+; RV64-NEXT:    [[VP_REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; RV64-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE1]], ptr align 4 [[TMP19]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; RV64-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP5]], [[INDEX]]
+; RV64-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
+; RV64-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1023
+; RV64-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; RV64:       [[MIDDLE_BLOCK]]:
+; RV64-NEXT:    br label %[[EXIT:.*]]
+; RV64:       [[SCALAR_PH]]:
+; RV64-NEXT:    br label %[[FOR_BODY:.*]]
+; RV64:       [[FOR_BODY]]:
+; RV64-NEXT:    [[DEC_IV:%.*]] = phi i64 [ 1023, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT:    [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
+; RV64-NEXT:    [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]]
+; RV64-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4
+; RV64-NEXT:    [[ADD:%.*]] = add i32 [[TMP21]], 1
+; RV64-NEXT:    [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]]
+; RV64-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4
+; RV64-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
+; RV64-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP4:![0-9]+]]
+; RV64:       [[EXIT]]:
+; RV64-NEXT:    ret void
+;
+; RV32-LABEL: define void @vector_reverse_i32(
+; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; RV32-NEXT:  [[ENTRY:.*:]]
+; RV32-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV32:       [[VECTOR_PH]]:
+; RV32-NEXT:    br label %[[VECTOR_BODY:.*]]
+; RV32:       [[VECTOR_BODY]]:
+; RV32-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV32-NEXT:    [[AVL:%.*]] = phi i64 [ 1023, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV32-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; RV32-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; RV32-NEXT:    [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
+; RV32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]]
+; RV32-NEXT:    [[TMP11:%.*]] = mul i32 0, [[TMP10]]
+; RV32-NEXT:    [[TMP24:%.*]] = sub i32 [[TMP10]], 1
+; RV32-NEXT:    [[TMP12:%.*]] = mul i32 -1, [[TMP24]]
+; RV32-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[TMP9]], i32 [[TMP11]]
+; RV32-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 [[TMP12]]
+; RV32-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP7]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; RV32-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; RV32-NEXT:    [[TMP15:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
+; RV32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
+; RV32-NEXT:    [[TMP18:%.*]] = mul i32 0, [[TMP10]]
+; RV32-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP10]], 1
+; RV32-NEXT:    [[TMP19:%.*]] = mul i32 -1, [[TMP25]]
+; RV32-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[TMP16]], i32 [[TMP18]]
+; RV32-NEXT:    [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 [[TMP19]]
+; RV32-NEXT:    [[VP_REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; RV32-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE1]], ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; RV32-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP10]] to i64
+; RV32-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP17]], [[INDEX]]
+; RV32-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP17]]
+; RV32-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1023
+; RV32-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; RV32:       [[MIDDLE_BLOCK]]:
+; RV32-NEXT:    br label %[[EXIT:.*]]
+; RV32:       [[SCALAR_PH]]:
+; RV32-NEXT:    br label %[[FOR_BODY:.*]]
+; RV32:       [[FOR_BODY]]:
+; RV32-NEXT:    [[DEC_IV:%.*]] = phi i64 [ 1023, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV32-NEXT:    [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
+; RV32-NEXT:    [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]]
+; RV32-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4
+; RV32-NEXT:    [[ADD:%.*]] = add i32 [[TMP23]], 1
+; RV32-NEXT:    [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]]
+; RV32-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4
+; RV32-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
+; RV32-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP4:![0-9]+]]
+; RV32:       [[EXIT]]:
+; RV32-NEXT:    ret void
+;
+; RV64-UF2-LABEL: define void @vector_reverse_i32(
+; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; RV64-UF2-NEXT:  [[ENTRY:.*]]:
+; RV64-UF2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
+; RV64-UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]]
+; RV64-UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV64-UF2:       [[VECTOR_PH]]:
+; RV64-UF2-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
+; RV64-UF2-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; RV64-UF2-NEXT:    [[N_VEC:%.*]] = urem i64 1023, [[TMP6]]
+; RV64-UF2-NEXT:    [[TMP7:%.*]] = sub i64 1023, [[N_VEC]]
+; RV64-UF2-NEXT:    [[TMP8:%.*]] = sub i64 1023, [[TMP7]]
+; RV64-UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; RV64-UF2:       [[VECTOR_BODY]]:
+; RV64-UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-UF2-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; RV64-UF2-NEXT:    [[TMP9:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
+; RV64-UF2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP9]]
+; RV64-UF2-NEXT:    [[TMP11:%.*]] = mul i64 0, [[TMP5]]
+; RV64-UF2-NEXT:    [[TMP32:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT:    [[TMP12:%.*]] = mul i64 -1, [[TMP32]]
+; RV64-UF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP11]]
+; RV64-UF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP12]]
+; RV64-UF2-NEXT:    [[TMP15:%.*]] = mul i64 -1, [[TMP5]]
+; RV64-UF2-NEXT:    [[TMP33:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT:    [[TMP16:%.*]] = mul i64 -1, [[TMP33]]
+; RV64-UF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP15]]
+; RV64-UF2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP16]]
+; RV64-UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; RV64-UF2-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
+; RV64-UF2-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP18]], align 4
+; RV64-UF2-NEXT:    [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD1]])
+; RV64-UF2-NEXT:    [[TMP19:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
+; RV64-UF2-NEXT:    [[TMP20:%.*]] = add <vscale x 4 x i32> [[REVERSE2]], splat (i32 1)
+; RV64-UF2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]]
+; RV64-UF2-NEXT:    [[TMP22:%.*]] = mul i64 0, [[TMP5]]
+; RV64-UF2-NEXT:    [[TMP34:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT:    [[TMP23:%.*]] = mul i64 -1, [[TMP34]]
+; RV64-UF2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP22]]
+; RV64-UF2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]]
+; RV64-UF2-NEXT:    [[TMP26:%.*]] = mul i64 -1, [[TMP5]]
+; RV64-UF2-NEXT:    [[TMP35:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT:    [[TMP27:%.*]] = mul i64 -1, [[TMP35]]
+; RV64-UF2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP26]]
+; RV64-UF2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i64 [[TMP27]]
+; RV64-UF2-NEXT:    [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP19]])
+; RV64-UF2-NEXT:    store <vscale x 4 x i32> [[REVERSE3]], ptr [[TMP25]], align 4
+; RV64-UF2-NEXT:    [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP20]])
+; RV64-UF2-NEXT:    store <vscale x 4 x i32> [[REVERSE4]], ptr [[TMP29]], align 4
+; RV64-UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; RV64-UF2-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP7]]
+; RV64-UF2-NEXT:    br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; RV64-UF2:       [[MIDDLE_BLOCK]]:
+; RV64-UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1023, [[TMP7]]
+; RV64-UF2-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; RV64-UF2:       [[SCALAR_PH]]:
+; RV64-UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
+; RV64-UF2-NEXT:    br label %[[FOR_BODY:.*]]
+; RV64-UF2:       [[FOR_BODY]]:
+; RV64-UF2-NEXT:    [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV64-UF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
+; RV64-UF2-NEXT:    [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]]
+; RV64-UF2-NEXT:    [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4
+; RV64-UF2-NEXT:    [[ADD:%.*]] = add i32 [[TMP31]], 1
+; RV64-UF2-NEXT:    [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]]
+; RV64-UF2-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4
+; RV64-UF2-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
+; RV64-UF2-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; RV64-UF2:       [[EXIT]]:
+; RV64-UF2-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %dec.iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.body ]
+  %iv.next = add nsw i64 %dec.iv, -1
+  %arrayidx.b = getelementptr inbounds i32, ptr %B, i64 %iv.next
+  %0 = load i32, ptr %arrayidx.b, align 4
+  %add = add i32 %0, 1
+  %arrayidx.a = getelementptr inbounds i32, ptr %A, i64 %iv.next
+  store i32 %add, ptr %arrayidx.a, align 4
+  %cmp = icmp ugt i64 %dec.iv, 1
+  br i1 %cmp, label %for.body, label %exit, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
+; RV64-LABEL: define void @vector_reverse_f32(
+; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; RV64-NEXT:  [[ENTRY:.*:]]
+; RV64-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV64:       [[VECTOR_PH]]:
+; RV64-NEXT:    br label %[[VECTOR_BODY:.*]]
+; RV64:       [[VECTOR_BODY]]:
+; RV64-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-NEXT:    [[AVL:%.*]] = phi i64 [ 1023, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; RV64-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; RV64-NEXT:    [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
+; RV64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]]
+; RV64-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP0]] to i64
+; RV64-NEXT:    [[TMP10:%.*]] = mul i64 0, [[TMP5]]
+; RV64-NEXT:    [[TMP22:%.*]] = sub i64 [[TMP5]], 1
+; RV64-NEXT:    [[TMP11:%.*]] = mul i64 -1, [[TMP22]]
+; RV64-NEXT:    [[TMP7:%.*]] = getelementptr float, ptr [[TMP9]], i64 [[TMP10]]
+; RV64-NEXT:    [[TMP12:%.*]] = getelementptr float, ptr [[TMP7]], i64 [[TMP11]]
+; RV64-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; RV64-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; RV64-NEXT:    [[TMP14:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
+; RV64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
+; RV64-NEXT:    [[TMP16:%.*]] = mul i64 0, [[TMP5]]
+; RV64-NEXT:    [[TMP23:%.*]] = sub i64 [[TMP5]], 1
+; RV64-NEXT:    [[TMP17:%.*]] = mul i64 -1, [[TMP23]]
+; RV64-NEXT:    [[TMP18:%.*]] = getelementptr float, ptr [[TMP15]], i64 [[TMP16]]
+; RV64-NEXT:    [[TMP19:%.*]] = getelementptr float, ptr [[TMP18]], i64 [[TMP17]]
+; RV64-NEXT:    [[VP_REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; RV64-NEXT:    call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[VP_REVERSE1]], ptr align 4 [[TMP19]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; RV64-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP5]], [[INDEX]]
+; RV64-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
+; RV64-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1023
+; RV64-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; RV64:       [[MIDDLE_BLOCK]]:
+; RV64-NEXT:    br label %[[EXIT:.*]]
+; RV64:       [[SCALAR_PH]]:
+; RV64-NEXT:    br label %[[FOR_BODY:.*]]
+; RV64:       [[FOR_BODY]]:
+; RV64-NEXT:    [[DEC_IV:%.*]] = phi i64 [ 1023, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT:    [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
+; RV64-NEXT:    [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]]
+; RV64-NEXT:    [[TMP21:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4
+; RV64-NEXT:    [[FADD:%.*]] = fadd float [[TMP21]], 1.000000e+00
+; RV64-NEXT:    [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]]
+; RV64-NEXT:    store float [[FADD]], ptr [[ARRAYIDX_A]], align 4
+; RV64-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
+; RV64-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP6:![0-9]+]]
+; RV64:       [[EXIT]]:
+; RV64-NEXT:    ret void
+;
+; RV32-LABEL: define void @vector_reverse_f32(
+; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; RV32-NEXT:  [[ENTRY:.*:]]
+; RV32-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV32:       [[VECTOR_PH]]:
+; RV32-NEXT:    br label %[[VECTOR_BODY:.*]]
+; RV32:       [[VECTOR_BODY]]:
+; RV32-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV32-NEXT:    [[AVL:%.*]] = phi i64 [ 1023, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV32-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; RV32-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; RV32-NEXT:    [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
+; RV32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]]
+; RV32-NEXT:    [[TMP11:%.*]] = mul i32 0, [[TMP10]]
+; RV32-NEXT:    [[TMP24:%.*]] = sub i32 [[TMP10]], 1
+; RV32-NEXT:    [[TMP12:%.*]] = mul i32 -1, [[TMP24]]
+; RV32-NEXT:    [[TMP6:%.*]] = getelementptr float, ptr [[TMP9]], i32 [[TMP11]]
+; RV32-NEXT:    [[TMP7:%.*]] = getelementptr float, ptr [[TMP6]], i32 [[TMP12]]
+; RV32-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP7]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; RV32-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; RV32-NEXT:    [[TMP15:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
+; RV32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
+; RV32-NEXT:    [[TMP18:%.*]] = mul i32 0, [[TMP10]]
+; RV32-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP10]], 1
+; RV32-NEXT:    [[TMP19:%.*]] = mul i32 -1, [[TMP25]]
+; RV32-NEXT:    [[TMP13:%.*]] = getelementptr float, ptr [[TMP16]], i32 [[TMP18]]
+; RV32-NEXT:    [[TMP14:%.*]] = getelementptr float, ptr [[TMP13]], i32 [[TMP19]]
+; RV32-NEXT:    [[VP_REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; RV32-NEXT:    call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[VP_REVERSE1]], ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; RV32-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP10]] to i64
+; RV32-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP17]], [[INDEX]]
+; RV32-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP17]]
+; RV32-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1023
+; RV32-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; RV32:       [[MIDDLE_BLOCK]]:
+; RV32-NEXT:    br label %[[EXIT:.*]]
+; RV32:       [[SCALAR_PH]]:
+; RV32-NEXT:    br label %[[FOR_BODY:.*]]
+; RV32:       [[FOR_BODY]]:
+; RV32-NEXT:    [[DEC_IV:%.*]] = phi i64 [ 1023, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV32-NEXT:    [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
+; RV32-NEXT:    [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]]
+; RV32-NEXT:    [[TMP23:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4
+; RV32-NEXT:    [[FADD:%.*]] = fadd float [[TMP23]], 1.000000e+00
+; RV32-NEXT:    [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]]
+; RV32-NEXT:    store float [[FADD]], ptr [[ARRAYIDX_A]], align 4
+; RV32-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
+; RV32-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP6:![0-9]+]]
+; RV32:       [[EXIT]]:
+; RV32-NEXT:    ret void
+;
+; RV64-UF2-LABEL: define void @vector_reverse_f32(
+; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; RV64-UF2-NEXT:  [[ENTRY:.*]]:
+; RV64-UF2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
+; RV64-UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]]
+; RV64-UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV64-UF2:       [[VECTOR_PH]]:
+; RV64-UF2-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
+; RV64-UF2-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; RV64-UF2-NEXT:    [[N_VEC:%.*]] = urem i64 1023, [[TMP6]]
+; RV64-UF2-NEXT:    [[TMP7:%.*]] = sub i64 1023, [[N_VEC]]
+; RV64-UF2-NEXT:    [[TMP8:%.*]] = sub i64 1023, [[TMP7]]
+; RV64-UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; RV64-UF2:       [[VECTOR_BODY]]:
+; RV64-UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-UF2-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; RV64-UF2-NEXT:    [[TMP9:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
+; RV64-UF2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]]
+; RV64-UF2-NEXT:    [[TMP11:%.*]] = mul i64 0, [[TMP5]]
+; RV64-UF2-NEXT:    [[TMP32:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT:    [[TMP12:%.*]] = mul i64 -1, [[TMP32]]
+; RV64-UF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP11]]
+; RV64-UF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[TMP12]]
+; RV64-UF2-NEXT:    [[TMP15:%.*]] = mul i64 -1, [[TMP5]]
+; RV64-UF2-NEXT:    [[TMP33:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT:    [[TMP16:%.*]] = mul i64 -1, [[TMP33]]
+; RV64-UF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP15]]
+; RV64-UF2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP16]]
+; RV64-UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
+; RV64-UF2-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
+; RV64-UF2-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP18]], align 4
+; RV64-UF2-NEXT:    [[REVERSE2:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD1]])
+; RV64-UF2-NEXT:    [[TMP19:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
+; RV64-UF2-NEXT:    [[TMP20:%.*]] = fadd <vscale x 4 x float> [[REVERSE2]], splat (float 1.000000e+00)
+; RV64-UF2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]]
+; RV64-UF2-NEXT:    [[TMP22:%.*]] = mul i64 0, [[TMP5]]
+; RV64-UF2-NEXT:    [[TMP34:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT:    [[TMP23:%.*]] = mul i64 -1, [[TMP34]]
+; RV64-UF2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP22]]
+; RV64-UF2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i64 [[TMP23]]
+; RV64-UF2-NEXT:    [[TMP26:%.*]] = mul i64 -1, [[TMP5]]
+; RV64-UF2-NEXT:    [[TMP35:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT:    [[TMP27:%.*]] = mul i64 -1, [[TMP35]]
+; RV64-UF2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP26]]
+; RV64-UF2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[TMP27]]
+; RV64-UF2-NEXT:    [[REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP19]])
+; RV64-UF2-NEXT:    store <vscale x 4 x float> [[REVERSE3]], ptr [[TMP25]], align 4
+; RV64-UF2-NEXT:    [[REVERSE4:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP20]])
+; RV64-UF2-NEXT:    store <vscale x 4 x float> [[REVERSE4]], ptr [[TMP29]], align 4
+; RV64-UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; RV64-UF2-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP7]]
+; RV64-UF2-NEXT:    br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; RV64-UF2:       [[MIDDLE_BLOCK]]:
+; RV64-UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1023, [[TMP7]]
+; RV64-UF2-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; RV64-UF2:       [[SCALAR_PH]]:
+; RV64-UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
+; RV64-UF2-NEXT:    br label %[[FOR_BODY:.*]]
+; RV64-UF2:       [[FOR_BODY]]:
+; RV64-UF2-NEXT:    [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV64-UF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
+; RV64-UF2-NEXT:    [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]]
+; RV64-UF2-NEXT:    [[TMP31:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4
+; RV64-UF2-NEXT:    [[FADD:%.*]] = fadd float [[TMP31]], 1.000000e+00
+; RV64-UF2-NEXT:    [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]]
+; RV64-UF2-NEXT:    store float [[FADD]], ptr [[ARRAYIDX_A]], align 4
+; RV64-UF2-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
+; RV64-UF2-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
+; RV64-UF2:       [[EXIT]]:
+; RV64-UF2-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %dec.iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.body ]
+  %iv.next = add nsw i64 %dec.iv, -1
+  %arrayidx.b = getelementptr inbounds float, ptr %B, i64 %iv.next
+  %0 = load float, ptr %arrayidx.b, align 4
+  %fadd = fadd float %0, 1.000000e+00
+  %arrayidx.a = getelementptr inbounds float, ptr %A, i64 %iv.next
+  store float %fadd, ptr %arrayidx.a, align 4
+  %cmp = icmp ugt i64 %dec.iv, 1
+  br i1 %cmp, label %for.body, label %exit, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+define void @vector_reverse_irregular_type(ptr noalias %A, ptr noalias %B) {
+; RV64-LABEL: define void @vector_reverse_irregular_type(
+; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; RV64-NEXT:  [[ENTRY:.*]]:
+; RV64-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV64:       [[VECTOR_PH]]:
+; RV64-NEXT:    br label %[[VECTOR_BODY:.*]]
+; RV64:       [[VECTOR_BODY]]:
+; RV64-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; RV64-NEXT:    [[DEC_IV:%.*]] = add i64 [[OFFSET_IDX]], 0
+; RV64-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -1
+; RV64-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -2
+; RV64-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], -3
+; RV64-NEXT:    [[IV_NEXT:%.*]] = add nsw i64 [[DEC_IV]], -1
+; RV64-NEXT:    [[TMP5:%.*]] = add nsw i64 [[TMP1]], -1
+; RV64-NEXT:    [[TMP6:%.*]] = add nsw i64 [[TMP2]], -1
+; RV64-NEXT:    [[TMP7:%.*]] = add nsw i64 [[TMP3]], -1
+; RV64-NEXT:    [[ARRAYIDX_B:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[IV_NEXT]]
+; RV64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP5]]
+; RV64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP6]]
+; RV64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP7]]
+; RV64-NEXT:    [[TMP0:%.*]] = load i7, ptr [[ARRAYIDX_B]], align 1
+; RV64-NEXT:    [[TMP13:%.*]] = load i7, ptr [[TMP9]], align 1
+; RV64-NEXT:    [[TMP14:%.*]] = load i7, ptr [[TMP10]], align 1
+; RV64-NEXT:    [[TMP15:%.*]] = load i7, ptr [[TMP11]], align 1
+; RV64-NEXT:    [[TMP16:%.*]] = insertelement <4 x i7> poison, i7 [[TMP0]], i32 0
+; RV64-NEXT:    [[TMP17:%.*]] = insertelement <4 x i7> [[TMP16]], i7 [[TMP13]], i32 1
+; RV64-NEXT:    [[TMP18:%.*]] = insertelement <4 x i7> [[TMP17]], i7 [[TMP14]], i32 2
+; RV64-NEXT:    [[TMP19:%.*]] = insertelement <4 x i7> [[TMP18]], i7 [[TMP15]], i32 3
+; RV64-NEXT:    [[TMP20:%.*]] = add <4 x i7> [[TMP19]], splat (i7 1)
+; RV64-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT]]
+; RV64-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP5]]
+; RV64-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP6]]
+; RV64-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP7]]
+; RV64-NEXT:    [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0
+; RV64-NEXT:    store i7 [[TMP25]], ptr [[TMP21]], align 1
+; RV64-NEXT:    [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1
+; RV64-NEXT:    store i7 [[TMP26]], ptr [[TMP22]], align 1
+; RV64-NEXT:    [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2
+; RV64-NEXT:    store i7 [[TMP27]], ptr [[TMP23]], align 1
+; RV64-NEXT:    [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3
+; RV64-NEXT:    store i7 [[TMP28]], ptr [[TMP24]], align 1
+; RV64-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; RV64-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020
+; RV64-NEXT:    br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; RV64:       [[MIDDLE_BLOCK]]:
+; RV64-NEXT:    br label %[[SCALAR_PH]]
+; RV64:       [[SCALAR_PH]]:
+; RV64-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
+; RV64-NEXT:    br label %[[FOR_BODY:.*]]
+; RV64:       [[FOR_BODY]]:
+; RV64-NEXT:    [[DEC_IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT:    [[IV_NEXT1]] = add nsw i64 [[DEC_IV1]], -1
+; RV64-NEXT:    [[ARRAYIDX_B1:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[IV_NEXT1]]
+; RV64-NEXT:    [[TMP30:%.*]] = load i7, ptr [[ARRAYIDX_B1]], align 1
+; RV64-NEXT:    [[ADD:%.*]] = add i7 [[TMP30]], 1
+; RV64-NEXT:    [[ARRAYIDX_A:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT1]]
+; RV64-NEXT:    store i7 [[ADD]], ptr [[ARRAYIDX_A]], align 1
+; RV64-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[DEC_IV1]], 1
+; RV64-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT:.*]], !llvm.loop [[LOOP8:![0-9]+]]
+; RV64:       [[EXIT]]:
+; RV64-NEXT:    ret void
+;
+; RV32-LABEL: define void @vector_reverse_irregular_type(
+; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; RV32-NEXT:  [[ENTRY:.*]]:
+; RV32-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV32:       [[VECTOR_PH]]:
+; RV32-NEXT:    br label %[[VECTOR_BODY:.*]]
+; RV32:       [[VECTOR_BODY]]:
+; RV32-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV32-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; RV32-NEXT:    [[DEC_IV:%.*]] = add i64 [[OFFSET_IDX]], 0
+; RV32-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -1
+; RV32-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -2
+; RV32-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], -3
+; RV32-NEXT:    [[IV_NEXT:%.*]] = add nsw i64 [[DEC_IV]], -1
+; RV32-NEXT:    [[TMP5:%.*]] = add nsw i64 [[TMP1]], -1
+; RV32-NEXT:    [[TMP6:%.*]] = add nsw i64 [[TMP2]], -1
+; RV32-NEXT:    [[TMP7:%.*]] = add nsw i64 [[TMP3]], -1
+; RV32-NEXT:    [[ARRAYIDX_B:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[IV_NEXT]]
+; RV32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP5]]
+; RV32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP6]]
+; RV32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP7]]
+; RV32-NEXT:    [[TMP0:%.*]] = load i7, ptr [[ARRAYIDX_B]], align 1
+; RV32-NEXT:    [[TMP13:%.*]] = load i7, ptr [[TMP9]], align 1
+; RV32-NEXT:    [[TMP14:%.*]] = load i7, ptr [[TMP10]], align 1
+; RV32-NEXT:    [[TMP15:%.*]] = load i7, ptr [[TMP11]], align 1
+; RV32-NEXT:    [[TMP16:%.*]] = insertelement <4 x i7> poison, i7 [[TMP0]], i32 0
+; RV32-NEXT:    [[TMP17:%.*]] = insertelement <4 x i7> [[TMP16]], i7 [[TMP13]], i32 1
+; RV32-NEXT:    [[TMP18:%.*]] = insertelement <4 x i7> [[TMP17]], i7 [[TMP14]], i32 2
+; RV32-NEXT:    [[TMP19:%.*]] = insertelement <4 x i7> [[TMP18]], i7 [[TMP15]], i32 3
+; RV32-NEXT:    [[TMP20:%.*]] = add <4 x i7> [[TMP19]], splat (i7 1)
+; RV32-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT]]
+; RV32-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP5]]
+; RV32-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP6]]
+; RV32-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP7]]
+; RV32-NEXT:    [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0
+; RV32-NEXT:    store i7 [[TMP25]], ptr [[TMP21]], align 1
+; RV32-NEXT:    [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1
+; RV32-NEXT:    store i7 [[TMP26]], ptr [[TMP22]], align 1
+; RV32-NEXT:    [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2
+; RV32-NEXT:    store i7 [[TMP27]], ptr [[TMP23]], align 1
+; RV32-NEXT:    [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3
+; RV32-NEXT:    store i7 [[TMP28]], ptr [[TMP24]], align 1
+; RV32-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; RV32-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020
+; RV32-NEXT:    br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; RV32:       [[MIDDLE_BLOCK]]:
+; RV32-NEXT:    br label %[[SCALAR_PH]]
+; RV32:       [[SCALAR_PH]]:
+; RV32-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
+; RV32-NEXT:    br label %[[FOR_BODY:.*]]
+; RV32:       [[FOR_BODY]]:
+; RV32-NEXT:    [[DEC_IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], %[[FOR_BODY]] ]
+; RV32-NEXT:    [[IV_NEXT1]] = add nsw i64 [[DEC_IV1]], -1
+; RV32-NEXT:    [[ARRAYIDX_B1:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[IV_NEXT1]]
+; RV32-NEXT:    [[TMP30:%.*]] = load i7, ptr [[ARRAYIDX_B1]], align 1
+; RV32-NEXT:    [[ADD:%.*]] = add i7 [[TMP30]], 1
+; RV32-NEXT:    [[ARRAYIDX_A:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT1]]
+; RV32-NEXT:    store i7 [[ADD]], ptr [[ARRAYIDX_A]], align 1
+; RV32-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[DEC_IV1]], 1
+; RV32-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT:.*]], !llvm.loop [[LOOP8:![0-9]+]]
+; RV32:       [[EXIT]]:
+; RV32-NEXT:    ret void
+;
+; RV64-UF2-LABEL: define void @vector_reverse_irregular_type(
+; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; RV64-UF2-NEXT:  [[ENTRY:.*]]:
+; RV64-UF2-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV64-UF2:       [[VECTOR_PH]]:
+; RV64-UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; RV64-UF2:       [[VECTOR_BODY]]:
+; RV64-UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-UF2-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; RV64-UF2-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 0
+; RV64-UF2-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], -1
+; RV64-UF2-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], -2
+; RV64-UF2-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], -3
+; RV64-UF2-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], -4
+; RV64-UF2-NEXT:    [[TMP42:%.*]] = add i64 [[OFFSET_IDX]], -5
+; RV64-UF2-NEXT:    [[TMP43:%.*]] = add i64 [[OFFSET_IDX]], -6
+; RV64-UF2-NEXT:    [[TMP50:%.*]] = add i64 [[OFFSET_IDX]], -7
+; RV64-UF2-NEXT:    [[TMP1:%.*]] = add nsw i64 [[TMP16]], -1
+; RV64-UF2-NEXT:    [[TMP2:%.*]] = add nsw i64 [[TMP0]], -1
+; RV64-UF2-NEXT:    [[TMP51:%.*]] = add nsw i64 [[TMP17]], -1
+; RV64-UF2-NEXT:    [[TMP11:%.*]] = add nsw i64 [[TMP24]], -1
+; RV64-UF2-NEXT:    [[TMP59:%.*]] = add nsw i64 [[TMP25]], -1
+; RV64-UF2-NEXT:    [[TMP13:%.*]] = add nsw i64 [[TMP42]], -1
+; RV64-UF2-NEXT:    [[TMP14:%.*]] = add nsw i64 [[TMP43]], -1
+; RV64-UF2-NEXT:    [[TMP15:%.*]] = add nsw i64 [[TMP50]], -1
+; RV64-UF2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP1]]
+; RV64-UF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP2]]
+; RV64-UF2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP51]]
+; RV64-UF2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP11]]
+; RV64-UF2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP59]]
+; RV64-UF2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP13]]
+; RV64-UF2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP14]]
+; RV64-UF2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP15]]
+; RV64-UF2-NEXT:    [[TMP5:%.*]] = load i7, ptr [[TMP3]], align 1
+; RV64-UF2-NEXT:    [[TMP6:%.*]] = load i7, ptr [[TMP4]], align 1
+; RV64-UF2-NEXT:    [[TMP26:%.*]] = load i7, ptr [[TMP18]], align 1
+; RV64-UF2-NEXT:    [[TMP27:%.*]] = load i7, ptr [[TMP19]], align 1
+; RV64-UF2-NEXT:    [[TMP28:%.*]] = insertelement <4 x i7> poison, i7 [[TMP5]], i32 0
+; RV64-UF2-NEXT:    [[TMP29:%.*]] = insertelement <4 x i7> [[TMP28]], i7 [[TMP6]], i32 1
+; RV64-UF2-NEXT:    [[TMP30:%.*]] = insertelement <4 x i7> [[TMP29]], i7 [[TMP26]], i32 2
+; RV64-UF2-NEXT:    [[TMP31:%.*]] = insertelement <4 x i7> [[TMP30]], i7 [[TMP27]], i32 3
+; RV64-UF2-NEXT:    [[TMP32:%.*]] = load i7, ptr [[TMP20]], align 1
+; RV64-UF2-NEXT:    [[TMP33:%.*]] = load i7, ptr [[TMP21]], align 1
+; RV64-UF2-NEXT:    [[TMP34:%.*]] = load i7, ptr [[TMP22]], align 1
+; RV64-UF2-NEXT:    [[TMP35:%.*]] = load i7, ptr [[TMP23]], align 1
+; RV64-UF2-NEXT:    [[TMP36:%.*]] = insertelement <4 x i7> poison, i7 [[TMP32]], i32 0
+; RV64-UF2-NEXT:    [[TMP37:%.*]] = insertelement <4 x i7> [[TMP36]], i7 [[TMP33]], i32 1
+; RV64-UF2-NEXT:    [[TMP38:%.*]] = insertelement <4 x i7> [[TMP37]], i7 [[TMP34]], i32 2
+; RV64-UF2-NEXT:    [[TMP39:%.*]] = insertelement <4 x i7> [[TMP38]], i7 [[TMP35]], i32 3
+; RV64-UF2-NEXT:    [[TMP40:%.*]] = add <4 x i7> [[TMP31]], splat (i7 1)
+; RV64-UF2-NEXT:    [[TMP41:%.*]] = add <4 x i7> [[TMP39]], splat (i7 1)
+; RV64-UF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP1]]
+; RV64-UF2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP2]]
+; RV64-UF2-NEXT:    [[TMP44:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP51]]
+; RV64-UF2-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP11]]
+; RV64-UF2-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP59]]
+; RV64-UF2-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP13]]
+; RV64-UF2-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP14]]
+; RV64-UF2-NEXT:    [[TMP49:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP15]]
+; RV64-UF2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i7> [[TMP40]], i32 0
+; RV64-UF2-NEXT:    store i7 [[TMP7]], ptr [[TMP9]], align 1
+; RV64-UF2-NEXT:    [[TMP8:%.*]] = extractelement <4 x i7> [[TMP40]], i32 1
+; RV64-UF2-NEXT:    store i7 [[TMP8]], ptr [[TMP10]], align 1
+; RV64-UF2-NEXT:    [[TMP52:%.*]] = extractelement <4 x i7> [[TMP40]], i32 2
+; RV64-UF2-NEXT:    store i7 [[TMP52]], ptr [[TMP44]], align 1
+; RV64-UF2-NEXT:    [[TMP53:%.*]] = extractelement <4 x i7> [[TMP40]], i32 3
+; RV64-UF2-NEXT:    store i7 [[TMP53]], ptr [[TMP45]], align 1
+; RV64-UF2-NEXT:    [[TMP54:%.*]] = extractelement <4 x i7> [[TMP41]], i32 0
+; RV64-UF2-NEXT:    store i7 [[TMP54]], ptr [[TMP46]], align 1
+; RV64-UF2-NEXT:    [[TMP55:%.*]] = extractelement <4 x i7> [[TMP41]], i32 1
+; RV64-UF2-NEXT:    store i7 [[TMP55]], ptr [[TMP47]], align 1
+; RV64-UF2-NEXT:    [[TMP56:%.*]] = extractelement <4 x i7> [[TMP41]], i32 2
+; RV64-UF2-NEXT:    store i7 [[TMP56]], ptr [[TMP48]], align 1
+; RV64-UF2-NEXT:    [[TMP57:%.*]] = extractelement <4 x i7> [[TMP41]], i32 3
+; RV64-UF2-NEXT:    store i7 [[TMP57]], ptr [[TMP49]], align 1
+; RV64-UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; RV64-UF2-NEXT:    [[TMP58:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1016
+; RV64-UF2-NEXT:    br i1 [[TMP58]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; RV64-UF2:       [[MIDDLE_BLOCK]]:
+; RV64-UF2-NEXT:    br label %[[SCALAR_PH]]
+; RV64-UF2:       [[SCALAR_PH]]:
+; RV64-UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 7, %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
+; RV64-UF2-NEXT:    br label %[[FOR_BODY:.*]]
+; RV64-UF2:       [[FOR_BODY]]:
+; RV64-UF2-NEXT:    [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV64-UF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
+; RV64-UF2-NEXT:    [[ARRAYIDX_B:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[IV_NEXT]]
+; RV64-UF2-NEXT:    [[TMP12:%.*]] = load i7, ptr [[ARRAYIDX_B]], align 1
+; RV64-UF2-NEXT:    [[ADD:%.*]] = add i7 [[TMP12]], 1
+; RV64-UF2-NEXT:    [[ARRAYIDX_A:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT]]
+; RV64-UF2-NEXT:    store i7 [[ADD]], ptr [[ARRAYIDX_A]], align 1
+; RV64-UF2-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
+; RV64-UF2-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT:.*]], !llvm.loop [[LOOP7:![0-9]+]]
+; RV64-UF2:       [[EXIT]]:
+; RV64-UF2-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %dec.iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.body ]
+  %iv.next = add nsw i64 %dec.iv, -1
+  %arrayidx.b = getelementptr inbounds i7, ptr %B, i64 %iv.next
+  %0 = load i7, ptr %arrayidx.b, align 1
+  %add = add i7 %0, 1
+  %arrayidx.a = getelementptr inbounds i7, ptr %A, i64 %iv.next
+  store i7 %add, ptr %arrayidx.a, align 1
+  %cmp = icmp ugt i64 %dec.iv, 1
+  br i1 %cmp, label %for.body, label %exit, !llvm.loop !4
+
+exit:
+  ret void
+}
+
+!0 = distinct !{!0, !1, !2, !3}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
+!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
+!4 = distinct !{!4, !1, !3}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index e49a71eb5b930..be69b714b5919 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -39,17 +39,15 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
 ; RV64-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP19]])
 ; RV64-NEXT:    [[TMP14:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
 ; RV64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP7]]
-; RV64-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP19]] to i64
-; RV64-NEXT:    [[TMP25:%.*]] = mul i64 0, [[TMP16]]
-; RV64-NEXT:    [[TMP17:%.*]] = sub i64 [[TMP16]], 1
+; RV64-NEXT:    [[TMP25:%.*]] = mul i64 0, [[TMP24]]
+; RV64-NEXT:    [[TMP17:%.*]] = sub i64 [[TMP24]], 1
 ; RV64-NEXT:    [[TMP18:%.*]] = mul i64 -1, [[TMP17]]
 ; RV64-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[TMP15]], i64 [[TMP25]]
 ; RV64-NEXT:    [[TMP21:%.*]] = getelementptr i32, ptr [[TMP20]], i64 [[TMP18]]
 ; RV64-NEXT:    [[VP_REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP19]])
 ; RV64-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE1]], ptr align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP19]])
-; RV64-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP19]] to i64
-; RV64-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP22]], [[INDEX]]
-; RV64-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP22]]
+; RV64-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP24]], [[INDEX]]
+; RV64-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP24]]
 ; RV64-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1023
 ; RV64-NEXT:    br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; RV64:       [[MIDDLE_BLOCK]]:
@@ -230,17 +228,15 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; RV64-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
 ; RV64-NEXT:    [[TMP29:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
 ; RV64-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP22]]
-; RV64-NEXT:    [[TMP39:%.*]] = zext i32 [[TMP20]] to i64
-; RV64-NEXT:    [[TMP31:%.*]] = mul i64 0, [[TMP39]]
-; RV64-NEXT:    [[TMP32:%.*]] = sub i64 [[TMP39]], 1
+; RV64-NEXT:    [[TMP31:%.*]] = mul i64 0, [[TMP24]]
+; RV64-NEXT:    [[TMP32:%.*]] = sub i64 [[TMP24]], 1
 ; RV64-NEXT:    [[TMP33:%.*]] = mul i64 -1, [[TMP32]]
 ; RV64-NEXT:    [[TMP34:%.*]] = getelementptr i32, ptr [[TMP30]], i64 [[TMP31]]
 ; RV64-NEXT:    [[TMP35:%.*]] = getelementptr i32, ptr [[TMP34]], i64 [[TMP33]]
 ; RV64-NEXT:    [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[TMP29]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
 ; RV64-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP35]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
-; RV64-NEXT:    [[TMP36:%.*]] = zext i32 [[TMP20]] to i64
-; RV64-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP36]], [[INDEX]]
-; RV64-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP36]]
+; RV64-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP24]], [[INDEX]]
+; RV64-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP24]]
 ; RV64-NEXT:    [[TMP37:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[TMP0]]
 ; RV64-NEXT:    br i1 [[TMP37]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; RV64:       [[MIDDLE_BLOCK]]:
@@ -491,17 +487,15 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; RV64-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
 ; RV64-NEXT:    [[TMP29:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
 ; RV64-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP22]]
-; RV64-NEXT:    [[TMP39:%.*]] = zext i32 [[TMP20]] to i64
-; RV64-NEXT:    [[TMP31:%.*]] = mul i64 0, [[TMP39]]
-; RV64-NEXT:    [[TMP32:%.*]] = sub i64 [[TMP39]], 1
+; RV64-NEXT:    [[TMP31:%.*]] = mul i64 0, [[TMP24]]
+; RV64-NEXT:    [[TMP32:%.*]] = sub i64 [[TMP24]], 1
 ; RV64-NEXT:    [[TMP33:%.*]] = mul i64 -1, [[TMP32]]
 ; RV64-NEXT:    [[TMP34:%.*]] = getelementptr float, ptr [[TMP30]], i64 [[TMP31]]
 ; RV64-NEXT:    [[TMP35:%.*]] = getelementptr float, ptr [[TMP34]], i64 [[TMP33]]
 ; RV64-NEXT:    [[VP_REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[TMP29]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
 ; RV64-NEXT:    call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[VP_REVERSE3]], ptr align 4 [[TMP35]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP20]])
-; RV64-NEXT:    [[TMP36:%.*]] = zext i32 [[TMP20]] to i64
-; RV64-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP36]], [[INDEX]]
-; RV64-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP36]]
+; RV64-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP24]], [[INDEX]]
+; RV64-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP24]]
 ; RV64-NEXT:    [[TMP37:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[TMP0]]
 ; RV64-NEXT:    br i1 [[TMP37]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; RV64:       [[MIDDLE_BLOCK]]:
@@ -724,17 +718,15 @@ define void @vector_reverse_f32_simplify(ptr noalias %A, ptr noalias %B) {
 ; RV64-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP19]])
 ; RV64-NEXT:    [[TMP14:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
 ; RV64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
-; RV64-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP19]] to i64
-; RV64-NEXT:    [[TMP25:%.*]] = mul i64 0, [[TMP16]]
-; RV64-NEXT:    [[TMP17:%.*]] = sub i64 [[TMP16]], 1
+; RV64-NEXT:    [[TMP25:%.*]] = mul i64 0, [[TMP24]]
+; RV64-NEXT:    [[TMP17:%.*]] = sub i64 [[TMP24]], 1
 ; RV64-NEXT:    [[TMP18:%.*]] = mul i64 -1, [[TMP17]]
 ; RV64-NEXT:    [[TMP20:%.*]] = getelementptr float, ptr [[TMP15]], i64 [[TMP25]]
 ; RV64-NEXT:    [[TMP21:%.*]] = getelementptr float, ptr [[TMP20]], i64 [[TMP18]]
 ; RV64-NEXT:    [[VP_REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP19]])
 ; RV64-NEXT:    call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[VP_REVERSE1]], ptr align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP19]])
-; RV64-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP19]] to i64
-; RV64-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP22]], [[INDEX]]
-; RV64-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP22]]
+; RV64-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP24]], [[INDEX]]
+; RV64-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP24]]
 ; RV64-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1023
 ; RV64-NEXT:    br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; RV64:       [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
index 47979d358690a..58e09c2285307 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
@@ -25,9 +25,8 @@ define void @single_constant_stride_int_scaled(ptr %p) {
 ; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
 ; CHECK-NEXT:    [[TMP16:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1)
 ; CHECK-NEXT:    call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP16]], <vscale x 4 x ptr> align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
-; CHECK-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP11]] to i64
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[TMP13]], [[INDEX]]
-; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[TMP12]], [[INDEX]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -88,9 +87,8 @@ define void @single_constant_stride_int_iv(ptr %p) {
 ; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
 ; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1)
 ; CHECK-NEXT:    call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP13]], <vscale x 4 x ptr> align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
-; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP7]] to i64
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[TMP14]], [[INDEX]]
-; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -152,8 +150,7 @@ define void @single_constant_stride_ptr_iv(ptr %p) {
 ; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP11]] to i64
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[TMP9]], [[INDEX]]
 ; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP11]] to i64
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 8, [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 8, [[TMP9]]
 ; CHECK-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP12]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
@@ -503,9 +500,8 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
 ; STRIDED-NEXT:    [[TMP20:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1)
 ; STRIDED-NEXT:    [[TMP21:%.*]] = getelementptr i32, ptr [[P2]], <vscale x 4 x i64> [[TMP18]]
 ; STRIDED-NEXT:    call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x ptr> align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]]), !alias.scope [[META12:![0-9]+]], !noalias [[META9]]
-; STRIDED-NEXT:    [[TMP46:%.*]] = zext i32 [[TMP43]] to i64
-; STRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[TMP46]], [[INDEX]]
-; STRIDED-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP46]]
+; STRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[TMP44]], [[INDEX]]
+; STRIDED-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP44]]
 ; STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; STRIDED-NEXT:    [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; STRIDED-NEXT:    br i1 [[TMP41]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
@@ -690,11 +686,9 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) {
 ; STRIDED-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP14]] to i64
 ; STRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[TMP16]], [[INDEX]]
 ; STRIDED-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]]
-; STRIDED-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP14]] to i64
-; STRIDED-NEXT:    [[TMP25:%.*]] = mul i64 [[STRIDE]], [[TMP20]]
+; STRIDED-NEXT:    [[TMP25:%.*]] = mul i64 [[STRIDE]], [[TMP16]]
 ; STRIDED-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP25]]
-; STRIDED-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP14]] to i64
-; STRIDED-NEXT:    [[TMP17:%.*]] = mul i64 [[STRIDE]], [[TMP22]]
+; STRIDED-NEXT:    [[TMP17:%.*]] = mul i64 [[STRIDE]], [[TMP16]]
 ; STRIDED-NEXT:    [[PTR_IND12]] = getelementptr i8, ptr [[POINTER_PHI11]], i64 [[TMP17]]
 ; STRIDED-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; STRIDED-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll
index 7ea462eed42db..f2d296e2685c7 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll
@@ -1209,9 +1209,8 @@ define void @vp_ptrtoint(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:    [[TMP15:%.*]] = ptrtoint <vscale x 2 x ptr> [[TMP14]] to <vscale x 2 x i64>
 ; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP15]], ptr align 8 [[TMP16]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
-; IF-EVL-NEXT:    [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
-; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP18]]
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP12]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]]
 ; IF-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; IF-EVL-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
 ; IF-EVL-NEXT:    br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll
index fe3a723a9bbea..53c00dc1c7446 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll
@@ -703,12 +703,12 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP23:%.*]] = select <vscale x 4 x i1> [[TMP18]], <vscale x 4 x i1> [[TMP21]], <vscale x 4 x i1> zeroinitializer
 ; IF-EVL-OUTLOOP-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP23]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP22]]
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP24]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP14]])
-; IF-EVL-OUTLOOP-NEXT:    [[TMP25:%.*]] = zext i32 [[TMP14]] to i64
-; IF-EVL-OUTLOOP-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP25]], [[IV]]
-; IF-EVL-OUTLOOP-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP25]]
+; IF-EVL-OUTLOOP-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP14]] to i64
+; IF-EVL-OUTLOOP-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[IV]]
+; IF-EVL-OUTLOOP-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]]
 ; IF-EVL-OUTLOOP-NEXT:    [[VEC_IND_NEXT7]] = add <vscale x 4 x i32> [[VEC_IND2]], [[BROADCAST_SPLAT2]]
-; IF-EVL-OUTLOOP-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
-; IF-EVL-OUTLOOP-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; IF-EVL-OUTLOOP-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; IF-EVL-OUTLOOP-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; IF-EVL-OUTLOOP:       middle.block:
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP24]])
 ; IF-EVL-OUTLOOP-NEXT:    br label [[FOR_END:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll
index bf54f669b65f9..ba2fa17267fd5 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll
@@ -609,9 +609,8 @@ define void @first_order_recurrence_indvar(ptr noalias %A, i64 %TC) {
 ; IF-EVL-NEXT:    [[TMP15:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.splice.nxv2i64(<vscale x 2 x i64> [[VECTOR_RECUR]], <vscale x 2 x i64> [[TMP20]], i32 -1, <vscale x 2 x i1> splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP11]])
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i64, ptr [[A]], i64 [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP15]], ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
-; IF-EVL-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP11]] to i64
-; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP21]]
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP7]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]]
 ; IF-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; IF-EVL-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[TC]]
 ; IF-EVL-NEXT:    br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll
index 6f723c268914c..653549dfb7fa8 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll
@@ -30,9 +30,8 @@ define void @gather_scatter(ptr noalias %in, ptr noalias %out, ptr noalias %inde
 ; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 2 x float> @llvm.vp.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP15]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
 ; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]]
 ; IF-EVL-NEXT:    call void @llvm.vp.scatter.nxv2f32.nxv2p0(<vscale x 2 x float> [[WIDE_MASKED_GATHER2]], <vscale x 2 x ptr> align 4 [[TMP16]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
-; IF-EVL-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP11]] to i64
-; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP17]]
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP12]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]]
 ; IF-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; IF-EVL-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
 ; IF-EVL-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
index 332c16e8eb656..e45c98093871b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
@@ -159,9 +159,8 @@ define i32 @load_factor_4_with_gap(i64 %n, ptr noalias %a) {
 ; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP4]])
 ; IF-EVL-NEXT:    [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP9]], [[WIDE_MASKED_GATHER2]]
 ; IF-EVL-NEXT:    [[TMP12]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP11]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP4]])
-; IF-EVL-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP4]] to i64
-; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP13]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]]
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP5]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
 ; IF-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; IF-EVL-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
 ; IF-EVL-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -435,9 +434,8 @@ define i32 @load_factor_4_with_tail_gap(i64 %n, ptr noalias %a) {
 ; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP4]])
 ; IF-EVL-NEXT:    [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP9]], [[WIDE_MASKED_GATHER2]]
 ; IF-EVL-NEXT:    [[TMP12]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP11]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP4]])
-; IF-EVL-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP4]] to i64
-; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP13]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]]
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP5]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
 ; IF-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; IF-EVL-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
 ; IF-EVL-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
@@ -710,9 +708,8 @@ define i32 @load_factor_4_reverse(i64 %n, ptr noalias %a) {
 ; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP6]])
 ; IF-EVL-NEXT:    [[TMP15:%.*]] = add <vscale x 4 x i32> [[TMP13]], [[WIDE_MASKED_GATHER5]]
 ; IF-EVL-NEXT:    [[TMP16]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP15]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP6]])
-; IF-EVL-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP6]] to i64
-; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP17]]
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP7]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]]
 ; IF-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; IF-EVL-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[TMP1]]
 ; IF-EVL-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reduction-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reduction-cost.ll
index 10d83f4ad125e..34fd04f01ad61 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reduction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reduction-cost.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; REQUIRES: asserts
 ; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize --disable-output \
 ; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll
index c804fd9ceb7a9..e0e3d4b174a66 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll
@@ -29,19 +29,17 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
 ; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]]
-; IF-EVL-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT:    [[TMP14:%.*]] = mul i64 0, [[TMP19]]
-; IF-EVL-NEXT:    [[TMP23:%.*]] = sub i64 [[TMP19]], 1
+; IF-EVL-NEXT:    [[TMP14:%.*]] = mul i64 0, [[TMP18]]
+; IF-EVL-NEXT:    [[TMP23:%.*]] = sub i64 [[TMP18]], 1
 ; IF-EVL-NEXT:    [[TMP15:%.*]] = mul i64 -1, [[TMP23]]
 ; IF-EVL-NEXT:    [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]]
 ; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]]
 ; IF-EVL-NEXT:    [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP20]]
-; IF-EVL-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1024
-; IF-EVL-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP18]]
+; IF-EVL-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1024
+; IF-EVL-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; IF-EVL:       middle.block:
 ; IF-EVL-NEXT:    br label [[LOOPEND:%.*]]
 ; IF-EVL:       scalar.ph:
@@ -164,20 +162,18 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
 ; IF-EVL-NEXT:    [[VP_OP_LOAD4:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD4]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
-; IF-EVL-NEXT:    [[TMP27:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT:    [[TMP22:%.*]] = mul i64 0, [[TMP27]]
-; IF-EVL-NEXT:    [[TMP30:%.*]] = sub i64 [[TMP27]], 1
+; IF-EVL-NEXT:    [[TMP22:%.*]] = mul i64 0, [[TMP26]]
+; IF-EVL-NEXT:    [[TMP30:%.*]] = sub i64 [[TMP26]], 1
 ; IF-EVL-NEXT:    [[TMP23:%.*]] = mul i64 -1, [[TMP30]]
 ; IF-EVL-NEXT:    [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP22]]
 ; IF-EVL-NEXT:    [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]]
 ; IF-EVL-NEXT:    [[VP_REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[VP_REVERSE_MASK6:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE5]], ptr align 4 [[TMP25]], <vscale x 4 x i1> [[VP_REVERSE_MASK6]], i32 [[TMP5]])
-; IF-EVL-NEXT:    [[TMP28:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP28]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP28]]
-; IF-EVL-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1024
-; IF-EVL-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP26]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP26]]
+; IF-EVL-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1024
+; IF-EVL-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; IF-EVL:       middle.block:
 ; IF-EVL-NEXT:    br label [[LOOPEND:%.*]]
 ; IF-EVL:       scalar.ph:
@@ -328,28 +324,25 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
 ; IF-EVL-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[B:%.*]], <vscale x 16 x i8> [[VP_REVERSE]]
 ; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP14]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
 ; IF-EVL-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[OFFSET_IDX]]
-; IF-EVL-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP6]] to i64
-; IF-EVL-NEXT:    [[TMP17:%.*]] = mul i64 0, [[TMP16]]
-; IF-EVL-NEXT:    [[TMP30:%.*]] = sub i64 [[TMP16]], 1
+; IF-EVL-NEXT:    [[TMP17:%.*]] = mul i64 0, [[TMP9]]
+; IF-EVL-NEXT:    [[TMP30:%.*]] = sub i64 [[TMP9]], 1
 ; IF-EVL-NEXT:    [[TMP18:%.*]] = mul i64 -1, [[TMP30]]
 ; IF-EVL-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[TMP15]], i64 [[TMP17]]
 ; IF-EVL-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP18]]
 ; IF-EVL-NEXT:    [[VP_REVERSE1:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
 ; IF-EVL-NEXT:    call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_REVERSE1]], ptr align 1 [[TMP20]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
 ; IF-EVL-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[OFFSET_IDX]]
-; IF-EVL-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP6]] to i64
-; IF-EVL-NEXT:    [[TMP23:%.*]] = mul i64 0, [[TMP22]]
-; IF-EVL-NEXT:    [[TMP31:%.*]] = sub i64 [[TMP22]], 1
+; IF-EVL-NEXT:    [[TMP23:%.*]] = mul i64 0, [[TMP9]]
+; IF-EVL-NEXT:    [[TMP31:%.*]] = sub i64 [[TMP9]], 1
 ; IF-EVL-NEXT:    [[TMP24:%.*]] = mul i64 -1, [[TMP31]]
 ; IF-EVL-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[TMP21]], i64 [[TMP23]]
 ; IF-EVL-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[TMP24]]
 ; IF-EVL-NEXT:    [[VP_REVERSE2:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
 ; IF-EVL-NEXT:    call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_REVERSE2]], ptr align 1 [[TMP26]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT:    [[TMP27:%.*]] = zext i32 [[TMP6]] to i64
-; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP27]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP27]]
-; IF-EVL-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1025
-; IF-EVL-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP9]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]]
+; IF-EVL-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1025
+; IF-EVL-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; IF-EVL:       middle.block:
 ; IF-EVL-NEXT:    br label [[EXIT:%.*]]
 ; IF-EVL:       scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-uniform-store.ll
index 55c4c4a411837..51512dd3fb30e 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-uniform-store.ll
@@ -31,11 +31,10 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i64, ptr [[TMP18]], i64 [[TMP17]]
 ; CHECK-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
 ; CHECK-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_REVERSE]], ptr align 8 [[TMP19]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
-; CHECK-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP11]] to i64
-; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]]
-; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP20]]
-; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[TMP0]]
-; CHECK-NEXT:    br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP15]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[TMP0]]
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br label %[[EXIT:.*]]
 ; CHECK:       [[SCALAR_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index e3a93cbf450ae..0ecba49b421ae 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -292,9 +292,8 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; SCALABLE-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i64> [[WIDE_MASKED_GATHER]], <vscale x 4 x i64> zeroinitializer
 ; SCALABLE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
 ; SCALABLE-NEXT:    call void @llvm.vp.store.nxv4i64.p0(<vscale x 4 x i64> [[PREDPHI]], ptr align 8 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP17]])
-; SCALABLE-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP17]] to i64
-; SCALABLE-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP15]], [[INDEX]]
-; SCALABLE-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP15]]
+; SCALABLE-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[INDEX]]
+; SCALABLE-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
 ; SCALABLE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; SCALABLE-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1025
 ; SCALABLE-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
@@ -396,9 +395,8 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; TF-SCALABLE-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i64> [[WIDE_MASKED_GATHER]], <vscale x 4 x i64> zeroinitializer
 ; TF-SCALABLE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
 ; TF-SCALABLE-NEXT:    call void @llvm.vp.store.nxv4i64.p0(<vscale x 4 x i64> [[PREDPHI]], ptr align 8 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
-; TF-SCALABLE-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP7]] to i64
-; TF-SCALABLE-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP15]], [[INDEX]]
-; TF-SCALABLE-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP15]]
+; TF-SCALABLE-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]]
+; TF-SCALABLE-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
 ; TF-SCALABLE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; TF-SCALABLE-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1025
 ; TF-SCALABLE-NEXT:    br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
@@ -713,9 +711,8 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
 ; SCALABLE-NEXT:    call void @llvm.vp.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT1]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
 ; SCALABLE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
 ; SCALABLE-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP16]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
-; SCALABLE-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP7]] to i64
-; SCALABLE-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[TMP10]]
-; SCALABLE-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
+; SCALABLE-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[TMP10]]
+; SCALABLE-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
 ; SCALABLE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; SCALABLE-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1025
 ; SCALABLE-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
@@ -796,9 +793,8 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
 ; TF-SCALABLE-NEXT:    call void @llvm.vp.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
 ; TF-SCALABLE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
 ; TF-SCALABLE-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT3]], ptr align 8 [[TMP10]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
-; TF-SCALABLE-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP9]] to i64
-; TF-SCALABLE-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]]
-; TF-SCALABLE-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
+; TF-SCALABLE-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP13]], [[INDEX]]
+; TF-SCALABLE-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]]
 ; TF-SCALABLE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; TF-SCALABLE-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1025
 ; TF-SCALABLE-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
@@ -859,9 +855,8 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
 ; SCALABLE-NEXT:    call void @llvm.vp.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[BROADCAST_SPLAT1]], <vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT2]], <vscale x 2 x i1> [[TMP10]], i32 [[TMP7]])
 ; SCALABLE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
 ; SCALABLE-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT1]], ptr align 8 [[TMP12]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
-; SCALABLE-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP7]] to i64
-; SCALABLE-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP15]], [[INDEX]]
-; SCALABLE-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP15]]
+; SCALABLE-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP14]], [[INDEX]]
+; SCALABLE-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]]
 ; SCALABLE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; SCALABLE-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1025
 ; SCALABLE-NEXT:    br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
@@ -957,9 +952,8 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
 ; TF-SCALABLE-NEXT:    call void @llvm.vp.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[BROADCAST_SPLAT1]], <vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT2]], <vscale x 2 x i1> [[TMP10]], i32 [[TMP9]])
 ; TF-SCALABLE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
 ; TF-SCALABLE-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT1]], ptr align 8 [[TMP12]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
-; TF-SCALABLE-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP9]] to i64
-; TF-SCALABLE-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP14]], [[INDEX]]
-; TF-SCALABLE-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]]
+; TF-SCALABLE-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]]
+; TF-SCALABLE-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
 ; TF-SCALABLE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; TF-SCALABLE-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1025
 ; TF-SCALABLE-NEXT:    br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll
index 1225e0fc583e1..ec9bdbd337187 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll
@@ -38,8 +38,7 @@ define void @foo(i32 %n) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, <4 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i32>
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP1]], <4 x ptr> [[TMP0]], i32 4, <4 x i1> splat (i1 true))
-; CHECK-NEXT:    [[TMP8:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[TMP8]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    br label %[[FOR_BODY31:.*]]
 ; CHECK:       [[FOR_BODY31]]:
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP4:%.*]], %[[FOR_BODY31]] ]
@@ -71,8 +70,7 @@ define void @foo(i32 %n) {
 ; AVX-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, <8 x i64> [[VEC_IND]]
 ; AVX-NEXT:    [[TMP1:%.*]] = trunc <8 x i64> [[VEC_IND]] to <8 x i32>
 ; AVX-NEXT:    call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[TMP1]], <8 x ptr> [[TMP0]], i32 4, <8 x i1> splat (i1 true))
-; AVX-NEXT:    [[TMP7:%.*]] = trunc <8 x i64> [[VEC_IND]] to <8 x i32>
-; AVX-NEXT:    [[TMP2:%.*]] = add nsw <8 x i32> [[TMP7]], [[BROADCAST_SPLAT]]
+; AVX-NEXT:    [[TMP2:%.*]] = add nsw <8 x i32> [[TMP1]], [[BROADCAST_SPLAT]]
 ; AVX-NEXT:    br label %[[FOR_BODY31:.*]]
 ; AVX:       [[FOR_BODY31]]:
 ; AVX-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP4:%.*]], %[[FOR_BODY31]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/cse.ll b/llvm/test/Transforms/LoopVectorize/cse.ll
new file mode 100644
index 0000000000000..616321edda102
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/cse.ll
@@ -0,0 +1,309 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 5
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-epilogue-vectorization=false -S < %s | FileCheck %s
+
+define i32 @common_sext(ptr %a, ptr %b, ptr %c, i32 %N) {
+; CHECK-LABEL: define i32 @common_sext(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEXT:    [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = sext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = sext <4 x i8> [[WIDE_LOAD1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = sext <4 x i8> [[WIDE_LOAD2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <4 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add <4 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i32> [[TMP11]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP11]], i32 3
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+;
+entry:
+  %cmp28.not = icmp ult i32 %N, 2
+  %div27 = lshr i32 %N, 1
+  %wide.trip.count = zext nneg i32 %div27 to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %res = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %iv
+  %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %iv
+  %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %iv
+  %a.val = load i8, ptr %a.ptr, align 1
+  %b.val = load i8, ptr %b.ptr, align 1
+  %c.val = load i8, ptr %c.ptr, align 1
+  %a.ext = sext i8 %a.val to i32
+  %b.ext = sext i8 %b.val to i32
+  %b.ext2 = sext i8 %b.val to i32
+  %c.ext = sext i8 %c.val to i32
+  %add = add nsw i32 %a.ext, %b.ext
+  %add2 = add nsw i32 %c.ext, %b.ext2
+  %add3 = add i32 %add, %add2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond, label %exit, label %for.body
+
+exit:                                 ; preds = %exit.loopexit, %entry
+  %res.0.lcssa = phi i32 [ %add3, %for.body ]
+  ret i32 %res.0.lcssa
+}
+
+define i32 @common_zext(ptr %a, ptr %b, ptr %c, i32 %N) {
+; CHECK-LABEL: define i32 @common_zext(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEXT:    [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[WIDE_LOAD1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP13:%.*]] = zext <4 x i8> [[WIDE_LOAD2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[TMP13]], [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+;
+entry:
+  %cmp28.not = icmp ult i32 %N, 2
+  %div27 = lshr i32 %N, 1
+  %wide.trip.count = zext nneg i32 %div27 to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %res = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %iv
+  %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %iv
+  %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %iv
+  %a.val = load i8, ptr %a.ptr, align 1
+  %b.val = load i8, ptr %b.ptr, align 1
+  %c.val = load i8, ptr %c.ptr, align 1
+  %a.ext = zext i8 %a.val to i32
+  %b.ext = zext i8 %b.val to i32
+  %b.ext2 = zext i8 %b.val to i32
+  %c.ext = zext i8 %c.val to i32
+  %add = add nsw i32 %a.ext, %b.ext
+  %add2 = add nsw i32 %c.ext, %b.ext2
+  %add3 = add i32 %add, %add2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond, label %exit, label %for.body
+
+exit:                                 ; preds = %exit.loopexit, %entry
+  %res.0.lcssa = phi i32 [ %add3, %for.body ]
+  ret i32 %res.0.lcssa
+}
+
+define i32 @common_sext_different_types(ptr %a, ptr %b, ptr %c, i32 %N) {
+; CHECK-LABEL: define i32 @common_sext_different_types(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEXT:    [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i16, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = load i16, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = load i16, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP15:%.*]] = load i16, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i16> poison, i16 [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i16> [[TMP16]], i16 [[TMP13]], i32 1
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i16> [[TMP17]], i16 [[TMP14]], i32 2
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i16> [[TMP18]], i16 [[TMP15]], i32 3
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP21:%.*]] = sext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
+; CHECK-NEXT:    [[TMP22:%.*]] = sext <4 x i8> [[WIDE_LOAD1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP23:%.*]] = sext <4 x i16> [[TMP19]] to <4 x i32>
+; CHECK-NEXT:    [[TMP24:%.*]] = sext <4 x i8> [[WIDE_LOAD2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[TMP21]], [[TMP22]]
+; CHECK-NEXT:    [[TMP26:%.*]] = add nsw <4 x i32> [[TMP24]], [[TMP23]]
+; CHECK-NEXT:    [[TMP27:%.*]] = add <4 x i32> [[TMP25]], [[TMP26]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <4 x i32> [[TMP27]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP27]], i32 3
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+;
+entry:
+  %cmp28.not = icmp ult i32 %N, 2
+  %div27 = lshr i32 %N, 1
+  %wide.trip.count = zext nneg i32 %div27 to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %res = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %iv
+  %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %iv
+  %b.ptr2 = getelementptr inbounds nuw i16, ptr %b, i64 %iv
+  %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %iv
+  %a.val = load i8, ptr %a.ptr, align 1
+  %b.val = load i8, ptr %b.ptr, align 1
+  %b.val2 = load i16, ptr %b.ptr, align 1
+  %c.val = load i8, ptr %c.ptr, align 1
+  %a.ext = sext i8 %a.val to i32
+  %b.ext = sext i8 %b.val to i32
+  %b.ext2 = sext i16 %b.val2 to i32
+  %c.ext = sext i8 %c.val to i32
+  %add = add nsw i32 %a.ext, %b.ext
+  %add2 = add nsw i32 %c.ext, %b.ext2
+  %add3 = add i32 %add, %add2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond, label %exit, label %for.body
+
+exit:                                 ; preds = %exit.loopexit, %entry
+  %res.0.lcssa = phi i32 [ %add3, %for.body ]
+  ret i32 %res.0.lcssa
+}
+define i32 @common_zext_different_types(ptr %a, ptr %b, ptr %c, i32 %N) {
+; CHECK-LABEL: define i32 @common_zext_different_types(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEXT:    [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i16, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = load i16, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = load i16, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP15:%.*]] = load i16, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i16> poison, i16 [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i16> [[TMP16]], i16 [[TMP13]], i32 1
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i16> [[TMP17]], i16 [[TMP14]], i32 2
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i16> [[TMP18]], i16 [[TMP15]], i32 3
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP21:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
+; CHECK-NEXT:    [[TMP22:%.*]] = zext <4 x i8> [[WIDE_LOAD1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP23:%.*]] = zext <4 x i16> [[TMP19]] to <4 x i32>
+; CHECK-NEXT:    [[TMP24:%.*]] = zext <4 x i8> [[WIDE_LOAD2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[TMP21]], [[TMP22]]
+; CHECK-NEXT:    [[TMP26:%.*]] = add nsw <4 x i32> [[TMP24]], [[TMP23]]
+; CHECK-NEXT:    [[TMP27:%.*]] = add <4 x i32> [[TMP25]], [[TMP26]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <4 x i32> [[TMP27]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP27]], i32 3
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+;
+entry:
+  %cmp28.not = icmp ult i32 %N, 2
+  %div27 = lshr i32 %N, 1
+  %wide.trip.count = zext nneg i32 %div27 to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %res = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %iv
+  %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %iv
+  %b.ptr2 = getelementptr inbounds nuw i16, ptr %b, i64 %iv
+  %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %iv
+  %a.val = load i8, ptr %a.ptr, align 1
+  %b.val = load i8, ptr %b.ptr, align 1
+  %b.val2 = load i16, ptr %b.ptr, align 1
+  %c.val = load i8, ptr %c.ptr, align 1
+  %a.ext = zext i8 %a.val to i32
+  %b.ext = zext i8 %b.val to i32
+  %b.ext2 = zext i16 %b.val2 to i32
+  %c.ext = zext i8 %c.val to i32
+  %add = add nsw i32 %a.ext, %b.ext
+  %add2 = add nsw i32 %c.ext, %b.ext2
+  %add3 = add i32 %add, %add2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond, label %exit, label %for.body
+
+exit:                                 ; preds = %exit.loopexit, %entry
+  %res.0.lcssa = phi i32 [ %add3, %for.body ]
+  ret i32 %res.0.lcssa
+}
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index 10cbf66c783db..d4506d1205c95 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -563,13 +563,12 @@ define void @recurrence_3(ptr readonly noalias %a, ptr noalias %b, i32 %n, float
 ; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP4]]
 ; UNROLL-NO-VF-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 2
 ; UNROLL-NO-VF-NEXT:    [[TMP8]] = load i16, ptr [[TMP6]], align 2
-; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = sitofp i16 [[TMP7]] to double
+; UNROLL-NO-VF-NEXT:    [[TMP12:%.*]] = sitofp i16 [[TMP7]] to double
 ; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = sitofp i16 [[TMP8]] to double
 ; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = sitofp i16 [[VECTOR_RECUR]] to double
-; UNROLL-NO-VF-NEXT:    [[TMP12:%.*]] = sitofp i16 [[TMP7]] to double
 ; UNROLL-NO-VF-NEXT:    [[TMP13:%.*]] = fmul fast double [[TMP11]], [[CONV1]]
 ; UNROLL-NO-VF-NEXT:    [[TMP14:%.*]] = fmul fast double [[TMP12]], [[CONV1]]
-; UNROLL-NO-VF-NEXT:    [[TMP15:%.*]] = fsub fast double [[TMP9]], [[TMP13]]
+; UNROLL-NO-VF-NEXT:    [[TMP15:%.*]] = fsub fast double [[TMP12]], [[TMP13]]
 ; UNROLL-NO-VF-NEXT:    [[TMP16:%.*]] = fsub fast double [[TMP10]], [[TMP14]]
 ; UNROLL-NO-VF-NEXT:    [[TMP17:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[OFFSET_IDX]]
 ; UNROLL-NO-VF-NEXT:    [[TMP18:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP4]]
@@ -1792,11 +1791,10 @@ define void @sink_after(ptr noalias %a, ptr noalias %b, i64 %n) {
 ; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 2
 ; UNROLL-NO-VF-NEXT:    [[TMP7]] = load i16, ptr [[TMP5]], align 2
 ; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = sext i16 [[VECTOR_RECUR]] to i32
-; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = sext i16 [[TMP6]] to i32
 ; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = sext i16 [[TMP6]] to i32
 ; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = sext i16 [[TMP7]] to i32
 ; UNROLL-NO-VF-NEXT:    [[TMP12:%.*]] = mul nsw i32 [[TMP10]], [[TMP8]]
-; UNROLL-NO-VF-NEXT:    [[TMP13:%.*]] = mul nsw i32 [[TMP11]], [[TMP9]]
+; UNROLL-NO-VF-NEXT:    [[TMP13:%.*]] = mul nsw i32 [[TMP11]], [[TMP10]]
 ; UNROLL-NO-VF-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
 ; UNROLL-NO-VF-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]]
 ; UNROLL-NO-VF-NEXT:    store i32 [[TMP12]], ptr [[TMP14]], align 4
@@ -2026,11 +2024,10 @@ define void @PR34711(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) {
 ; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 2
 ; UNROLL-NO-VF-NEXT:    [[TMP7]] = load i16, ptr [[TMP5]], align 2
 ; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = sext i16 [[VECTOR_RECUR]] to i32
-; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = sext i16 [[TMP6]] to i32
 ; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = sext i16 [[TMP6]] to i32
 ; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = sext i16 [[TMP7]] to i32
 ; UNROLL-NO-VF-NEXT:    [[TMP12:%.*]] = mul nsw i32 [[TMP10]], [[TMP8]]
-; UNROLL-NO-VF-NEXT:    [[TMP13:%.*]] = mul nsw i32 [[TMP11]], [[TMP9]]
+; UNROLL-NO-VF-NEXT:    [[TMP13:%.*]] = mul nsw i32 [[TMP11]], [[TMP10]]
 ; UNROLL-NO-VF-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
 ; UNROLL-NO-VF-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]]
 ; UNROLL-NO-VF-NEXT:    store i32 [[TMP12]], ptr [[TMP14]], align 4
@@ -2244,9 +2241,8 @@ define void @sink_after_with_multiple_users(ptr noalias %a, ptr noalias %b, i64
 ; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = sext i16 [[TMP6]] to i32
 ; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = add nsw i32 [[TMP8]], 2
 ; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = add nsw i32 [[TMP9]], 2
-; UNROLL-NO-VF-NEXT:    [[TMP12:%.*]] = sext i16 [[TMP6]] to i32
 ; UNROLL-NO-VF-NEXT:    [[TMP13:%.*]] = sext i16 [[TMP7]] to i32
-; UNROLL-NO-VF-NEXT:    [[TMP14:%.*]] = mul nsw i32 [[TMP10]], [[TMP12]]
+; UNROLL-NO-VF-NEXT:    [[TMP14:%.*]] = mul nsw i32 [[TMP10]], [[TMP9]]
 ; UNROLL-NO-VF-NEXT:    [[TMP15:%.*]] = mul nsw i32 [[TMP11]], [[TMP13]]
 ; UNROLL-NO-VF-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
 ; UNROLL-NO-VF-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]]
diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll
index 8a3cad0681013..5fe426efb5e3d 100644
--- a/llvm/test/Transforms/LoopVectorize/float-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll
@@ -932,8 +932,7 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca
 ; VEC1_INTERL2-NEXT:    [[TMP4:%.*]] = or disjoint i64 [[INDEX]], 1
 ; VEC1_INTERL2-NEXT:    [[DOTCAST5:%.*]] = sitofp i64 [[INDEX]] to float
 ; VEC1_INTERL2-NEXT:    [[TMP5:%.*]] = fmul fast float [[DOTCAST5]], -5.000000e-01
-; VEC1_INTERL2-NEXT:    [[DOTCAST6:%.*]] = sitofp i64 [[INDEX]] to float
-; VEC1_INTERL2-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP0]], [[DOTCAST6]]
+; VEC1_INTERL2-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP0]], [[DOTCAST5]]
 ; VEC1_INTERL2-NEXT:    [[OFFSET_IDX7:%.*]] = fadd fast float [[INIT]], [[TMP6]]
 ; VEC1_INTERL2-NEXT:    [[TMP7:%.*]] = fadd fast float [[OFFSET_IDX7]], [[TMP0]]
 ; VEC1_INTERL2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll
index 6bc2f389d65ce..4244ef690ac0c 100644
--- a/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll
+++ b/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll
@@ -36,8 +36,7 @@ define void @foo(i32 %n) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, <4 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i32>
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP1]], <4 x ptr> [[TMP0]], i32 4, <4 x i1> splat (i1 true))
-; CHECK-NEXT:    [[TMP8:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[TMP8]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    br label %[[FOR_BODY31:.*]]
 ; CHECK:       [[FOR_BODY31]]:
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP4:%.*]], %[[FOR_BODY31]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
index e7f73a82292f0..3bd7d8fdd5cdc 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
@@ -2195,8 +2195,7 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read
 ; CHECK-NEXT:    [[TMP5:%.*]] = zext nneg <4 x i8> [[TMP4]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
 ; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[VEC_PHI]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = zext nneg <4 x i8> [[TMP4]] to <4 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]])
+; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
 ; CHECK-NEXT:    [[TMP11]] = add i32 [[TMP8]], [[TMP10]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -2261,11 +2260,9 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read
 ; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = zext nneg <4 x i8> [[TMP8]] to <4 x i32>
 ; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]])
 ; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = add i32 [[VEC_PHI1]], [[TMP13]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = zext nneg <4 x i8> [[TMP7]] to <4 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]])
 ; CHECK-INTERLEAVED-NEXT:    [[TMP17]] = add i32 [[TMP11]], [[TMP16]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = zext nneg <4 x i8> [[TMP8]] to <4 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP18]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]])
 ; CHECK-INTERLEAVED-NEXT:    [[TMP20]] = add i32 [[TMP14]], [[TMP19]]
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]

From ee0a425812730d737226800029f5ceaed0763215 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs@arm.com>
Date: Thu, 10 Jul 2025 13:34:28 +0100
Subject: [PATCH 2/6] Remove FunctionComparator changes and add MergeFunc test

---
 llvm/lib/IR/Instruction.cpp                   |   2 -
 .../Transforms/Utils/FunctionComparator.cpp   |   4 -
 llvm/test/Transforms/LoopVectorize/cse.ll     | 159 +++++++++++++++++-
 llvm/test/Transforms/MergeFunc/merge-ext.ll   |  59 +++++++
 4 files changed, 214 insertions(+), 10 deletions(-)
 create mode 100644 llvm/test/Transforms/MergeFunc/merge-ext.ll

diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 94dfed4df72fe..5e87b5ff941ad 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -935,8 +935,6 @@ bool Instruction::hasSameSpecialState(const Instruction *I2,
   if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I1))
     return GEP->getSourceElementType() ==
            cast<GetElementPtrInst>(I2)->getSourceElementType();
-  if (const CastInst *Cast = dyn_cast<CastInst>(I1))
-    return Cast->getDestTy() == cast<CastInst>(I2)->getDestTy();
 
   return true;
 }
diff --git a/llvm/lib/Transforms/Utils/FunctionComparator.cpp b/llvm/lib/Transforms/Utils/FunctionComparator.cpp
index 8d1f7c9b3a2ea..6d4026e8209de 100644
--- a/llvm/lib/Transforms/Utils/FunctionComparator.cpp
+++ b/llvm/lib/Transforms/Utils/FunctionComparator.cpp
@@ -804,10 +804,6 @@ int FunctionComparator::cmpOperations(const Instruction *L,
         return Res;
     }
   }
-  if (const CastInst *Cast = dyn_cast<CastInst>(L)) {
-    const CastInst *CastR = cast<CastInst>(R);
-    return cmpTypes(Cast->getDestTy(), CastR->getDestTy());
-  }
   return 0;
 }
 
diff --git a/llvm/test/Transforms/LoopVectorize/cse.ll b/llvm/test/Transforms/LoopVectorize/cse.ll
index 616321edda102..71325de1ca275 100644
--- a/llvm/test/Transforms/LoopVectorize/cse.ll
+++ b/llvm/test/Transforms/LoopVectorize/cse.ll
@@ -137,8 +137,8 @@ exit:                                 ; preds = %exit.loopexit, %entry
   ret i32 %res.0.lcssa
 }
 
-define i32 @common_sext_different_types(ptr %a, ptr %b, ptr %c, i32 %N) {
-; CHECK-LABEL: define i32 @common_sext_different_types(
+define i32 @common_sext_different_src_types(ptr %a, ptr %b, ptr %c, i32 %N) {
+; CHECK-LABEL: define i32 @common_sext_different_src_types(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
@@ -222,8 +222,9 @@ exit:                                 ; preds = %exit.loopexit, %entry
   %res.0.lcssa = phi i32 [ %add3, %for.body ]
   ret i32 %res.0.lcssa
 }
-define i32 @common_zext_different_types(ptr %a, ptr %b, ptr %c, i32 %N) {
-; CHECK-LABEL: define i32 @common_zext_different_types(
+
+define i32 @common_zext_different_src_types(ptr %a, ptr %b, ptr %c, i32 %N) {
+; CHECK-LABEL: define i32 @common_zext_different_src_types(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
@@ -307,3 +308,153 @@ exit:                                 ; preds = %exit.loopexit, %entry
   %res.0.lcssa = phi i32 [ %add3, %for.body ]
   ret i32 %res.0.lcssa
 }
+
+define i32 @common_sext_different_dest_types(ptr %a, ptr %b, ptr %c, i32 %N) {
+; CHECK-LABEL: define i32 @common_sext_different_dest_types(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEXT:    [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = sext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = sext <4 x i8> [[WIDE_LOAD1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = sext <4 x i8> [[WIDE_LOAD1]] to <4 x i16>
+; CHECK-NEXT:    [[TMP9:%.*]] = sext <4 x i8> [[WIDE_LOAD2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <4 x i16> [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    [[TMP12:%.*]] = sext <4 x i16> [[TMP11]] to <4 x i32>
+; CHECK-NEXT:    [[TMP13:%.*]] = add <4 x i32> [[TMP10]], [[TMP12]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[TMP13]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP13]], i32 3
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+;
+entry:
+  %cmp28.not = icmp ult i32 %N, 2
+  %div27 = lshr i32 %N, 1
+  %wide.trip.count = zext nneg i32 %div27 to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %res = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %iv
+  %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %iv
+  %b.ptr2 = getelementptr inbounds nuw i16, ptr %b, i64 %iv
+  %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %iv
+  %a.val = load i8, ptr %a.ptr, align 1
+  %b.val = load i8, ptr %b.ptr, align 1
+  %c.val = load i8, ptr %c.ptr, align 1
+  %a.ext = sext i8 %a.val to i32
+  %b.ext = sext i8 %b.val to i32
+  %b.ext2 = sext i8 %b.val to i16
+  %c.ext = sext i8 %c.val to i16
+  %add = add nsw i32 %a.ext, %b.ext
+  %add2 = add nsw i16 %c.ext, %b.ext2
+  %add2.ext = sext i16 %add2 to i32
+  %add3 = add i32 %add, %add2.ext
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond, label %exit, label %for.body
+
+exit:                                 ; preds = %exit.loopexit, %entry
+  %res.0.lcssa = phi i32 [ %add3, %for.body ]
+  ret i32 %res.0.lcssa
+}
+
+define i32 @common_zext_different_dest_types(ptr %a, ptr %b, ptr %c, i32 %N) {
+; CHECK-LABEL: define i32 @common_zext_different_dest_types(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEXT:    [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = zext <4 x i8> [[WIDE_LOAD1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = zext <4 x i8> [[WIDE_LOAD1]] to <4 x i16>
+; CHECK-NEXT:    [[TMP9:%.*]] = zext <4 x i8> [[WIDE_LOAD2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <4 x i16> [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    [[TMP12:%.*]] = zext <4 x i16> [[TMP11]] to <4 x i32>
+; CHECK-NEXT:    [[TMP13:%.*]] = add <4 x i32> [[TMP10]], [[TMP12]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[TMP13]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP13]], i32 3
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+;
+entry:
+  %cmp28.not = icmp ult i32 %N, 2
+  %div27 = lshr i32 %N, 1
+  %wide.trip.count = zext nneg i32 %div27 to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %res = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %iv
+  %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %iv
+  %b.ptr2 = getelementptr inbounds nuw i16, ptr %b, i64 %iv
+  %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %iv
+  %a.val = load i8, ptr %a.ptr, align 1
+  %b.val = load i8, ptr %b.ptr, align 1
+  %c.val = load i8, ptr %c.ptr, align 1
+  %a.ext = zext i8 %a.val to i32
+  %b.ext = zext i8 %b.val to i32
+  %b.ext2 = zext i8 %b.val to i16
+  %c.ext = zext i8 %c.val to i16
+  %add = add nsw i32 %a.ext, %b.ext
+  %add2 = add nsw i16 %c.ext, %b.ext2
+  %add2.ext = zext i16 %add2 to i32
+  %add3 = add i32 %add, %add2.ext
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond, label %exit, label %for.body
+
+exit:                                 ; preds = %exit.loopexit, %entry
+  %res.0.lcssa = phi i32 [ %add3, %for.body ]
+  ret i32 %res.0.lcssa
+}
diff --git a/llvm/test/Transforms/MergeFunc/merge-ext.ll b/llvm/test/Transforms/MergeFunc/merge-ext.ll
new file mode 100644
index 0000000000000..e4da2dc1282b7
--- /dev/null
+++ b/llvm/test/Transforms/MergeFunc/merge-ext.ll
@@ -0,0 +1,59 @@
+; RUN: opt -S -passes=mergefunc < %s | FileCheck %s
+
+@llvm.compiler.used = appending global [2 x ptr] [ptr @zext_a, ptr @sext_a], section "llvm.metadata"
+
+define internal i32 @sext_a(i16 %a) unnamed_addr {
+  %b = sext i16 %a to i32
+  ret i32 %b
+}
+
+define i32 @sext_b(i16 %a) unnamed_addr {
+  %b = sext i16 %a to i32
+  ret i32 %b
+}
+
+define i32 @sext_c(i32 %a) unnamed_addr {
+  %b = tail call i32 @sext_a(i32 %a)
+  ret i32 %b
+}
+
+define internal i32 @zext_a(i16 %a) unnamed_addr {
+  %b = zext i16 %a to i32
+  ret i32 %b
+}
+
+define i32 @zext_b(i16 %a) unnamed_addr {
+  %b = zext i16 %a to i32
+  ret i32 %b
+}
+
+define i32 @zext_c(i32 %a) unnamed_addr {
+  %b = tail call i32 @zext_a(i32 %a)
+  ret i32 %b
+}
+
+; CHECK-LABEL: @llvm.compiler.used = appending global [2 x ptr] [ptr @zext_a, ptr @sext_a], section "llvm.metadata"
+
+; CHECK-LABEL: define i32 @sext_b(i16 %a) unnamed_addr
+; CHECK-NEXT:    sext
+; CHECK-NEXT:    ret
+
+; CHECK-LABEL: define i32 @sext_c(i32 %a) unnamed_addr
+; CHECK-NEXT:    tail call i32 @sext_b(i32 %a)
+; CHECK-NEXT:    ret
+
+; CHECK-LABEL: define i32 @zext_b(i16 %a) unnamed_addr
+; CHECK-NEXT:    zext
+; CHECK-NEXT:    ret
+
+; CHECK-LABEL: define i32 @zext_c(i32 %a) unnamed_addr
+; CHECK-NEXT:    tail call i32 @zext_b(i32 %a)
+; CHECK-NEXT:    ret
+
+; CHECK-LABEL: define internal i32 @sext_a(i16 %0) unnamed_addr
+; CHECK-NEXT:    tail call i32 @sext_b(i16 %0)
+; CHECK-NEXT:    ret
+
+; CHECK-LABEL: define internal i32 @zext_a(i16 %0) unnamed_addr
+; CHECK-NEXT:    tail call i32 @zext_b(i16 %0)
+; CHECK-NEXT:    ret

From c2de7f0200c5ecca92c5f26997d5c23fc03e06b4 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs@arm.com>
Date: Tue, 22 Jul 2025 16:50:37 +0100
Subject: [PATCH 3/6] Update comment

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 34cb28bf48a85..f23ecc2311cf5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2539,7 +2539,7 @@ struct CSEDenseMapInfo {
 
 } // end anonymous namespace
 
-///Perform cse of induction variable instructions.
+/// Perform cse of induction variable instructions, casts and selects.
 static void cse(BasicBlock *BB) {
   // Perform simple cse.
   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;

From 95cbc8a865241a4993fc9a75096adadc74c87cf0 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs@arm.com>
Date: Tue, 5 Aug 2025 15:30:51 +0100
Subject: [PATCH 4/6] Remove merge-ext.ll test

---
 llvm/test/Transforms/MergeFunc/merge-ext.ll | 59 ---------------------
 1 file changed, 59 deletions(-)
 delete mode 100644 llvm/test/Transforms/MergeFunc/merge-ext.ll

diff --git a/llvm/test/Transforms/MergeFunc/merge-ext.ll b/llvm/test/Transforms/MergeFunc/merge-ext.ll
deleted file mode 100644
index e4da2dc1282b7..0000000000000
--- a/llvm/test/Transforms/MergeFunc/merge-ext.ll
+++ /dev/null
@@ -1,59 +0,0 @@
-; RUN: opt -S -passes=mergefunc < %s | FileCheck %s
-
-@llvm.compiler.used = appending global [2 x ptr] [ptr @zext_a, ptr @sext_a], section "llvm.metadata"
-
-define internal i32 @sext_a(i16 %a) unnamed_addr {
-  %b = sext i16 %a to i32
-  ret i32 %b
-}
-
-define i32 @sext_b(i16 %a) unnamed_addr {
-  %b = sext i16 %a to i32
-  ret i32 %b
-}
-
-define i32 @sext_c(i32 %a) unnamed_addr {
-  %b = tail call i32 @sext_a(i32 %a)
-  ret i32 %b
-}
-
-define internal i32 @zext_a(i16 %a) unnamed_addr {
-  %b = zext i16 %a to i32
-  ret i32 %b
-}
-
-define i32 @zext_b(i16 %a) unnamed_addr {
-  %b = zext i16 %a to i32
-  ret i32 %b
-}
-
-define i32 @zext_c(i32 %a) unnamed_addr {
-  %b = tail call i32 @zext_a(i32 %a)
-  ret i32 %b
-}
-
-; CHECK-LABEL: @llvm.compiler.used = appending global [2 x ptr] [ptr @zext_a, ptr @sext_a], section "llvm.metadata"
-
-; CHECK-LABEL: define i32 @sext_b(i16 %a) unnamed_addr
-; CHECK-NEXT:    sext
-; CHECK-NEXT:    ret
-
-; CHECK-LABEL: define i32 @sext_c(i32 %a) unnamed_addr
-; CHECK-NEXT:    tail call i32 @sext_b(i32 %a)
-; CHECK-NEXT:    ret
-
-; CHECK-LABEL: define i32 @zext_b(i16 %a) unnamed_addr
-; CHECK-NEXT:    zext
-; CHECK-NEXT:    ret
-
-; CHECK-LABEL: define i32 @zext_c(i32 %a) unnamed_addr
-; CHECK-NEXT:    tail call i32 @zext_b(i32 %a)
-; CHECK-NEXT:    ret
-
-; CHECK-LABEL: define internal i32 @sext_a(i16 %0) unnamed_addr
-; CHECK-NEXT:    tail call i32 @sext_b(i16 %0)
-; CHECK-NEXT:    ret
-
-; CHECK-LABEL: define internal i32 @zext_a(i16 %0) unnamed_addr
-; CHECK-NEXT:    tail call i32 @zext_b(i16 %0)
-; CHECK-NEXT:    ret

From e51726baf2405eda9348261c1116fcea6be778a4 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs@arm.com>
Date: Tue, 5 Aug 2025 16:05:12 +0100
Subject: [PATCH 5/6] Rebase

---
 .../RISCV/riscv-vector-reverse-output.ll      | 653 ------------------
 llvm/test/Transforms/LoopVectorize/cse.ll     |  18 +-
 2 files changed, 6 insertions(+), 665 deletions(-)
 delete mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll

diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
deleted file mode 100644
index 1d7c80b3d722f..0000000000000
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
+++ /dev/null
@@ -1,653 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
-;; This is the loop in c++ being vectorize in this file with
-;; vector.reverse
-;;  #pragma clang loop vectorize_width(4, scalable)
-;;  for (int i = N-1; i >= 0; --i)
-;;    a[i] = b[i] + 1.0;
-
-; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v \
-; RUN: -riscv-v-vector-bits-min=128 -S < %s \
-; RUN: | FileCheck --check-prefix=RV64 %s
-
-; RUN: opt -passes=loop-vectorize -mtriple=riscv32 -mattr=+v \
-; RUN: -riscv-v-vector-bits-min=128 -S < %s \
-; RUN: | FileCheck --check-prefix=RV32 %s
-
-; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v \
-; RUN: -riscv-v-vector-bits-min=128 -force-vector-interleave=2 -S < %s \
-; RUN: | FileCheck --check-prefix=RV64-UF2 %s
-
-define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
-; RV64-LABEL: define void @vector_reverse_i32(
-; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; RV64-NEXT:  [[ENTRY:.*:]]
-; RV64-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; RV64:       [[VECTOR_PH]]:
-; RV64-NEXT:    br label %[[VECTOR_BODY:.*]]
-; RV64:       [[VECTOR_BODY]]:
-; RV64-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV64-NEXT:    [[AVL:%.*]] = phi i64 [ 1023, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV64-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; RV64-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; RV64-NEXT:    [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
-; RV64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]]
-; RV64-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP0]] to i64
-; RV64-NEXT:    [[TMP10:%.*]] = mul i64 0, [[TMP5]]
-; RV64-NEXT:    [[TMP22:%.*]] = sub i64 [[TMP5]], 1
-; RV64-NEXT:    [[TMP11:%.*]] = mul i64 -1, [[TMP22]]
-; RV64-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[TMP9]], i64 [[TMP10]]
-; RV64-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP7]], i64 [[TMP11]]
-; RV64-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
-; RV64-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
-; RV64-NEXT:    [[TMP14:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
-; RV64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
-; RV64-NEXT:    [[TMP16:%.*]] = mul i64 0, [[TMP5]]
-; RV64-NEXT:    [[TMP23:%.*]] = sub i64 [[TMP5]], 1
-; RV64-NEXT:    [[TMP17:%.*]] = mul i64 -1, [[TMP23]]
-; RV64-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[TMP15]], i64 [[TMP16]]
-; RV64-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr [[TMP18]], i64 [[TMP17]]
-; RV64-NEXT:    [[VP_REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
-; RV64-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE1]], ptr align 4 [[TMP19]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
-; RV64-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP5]], [[INDEX]]
-; RV64-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
-; RV64-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1023
-; RV64-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; RV64:       [[MIDDLE_BLOCK]]:
-; RV64-NEXT:    br label %[[EXIT:.*]]
-; RV64:       [[SCALAR_PH]]:
-; RV64-NEXT:    br label %[[FOR_BODY:.*]]
-; RV64:       [[FOR_BODY]]:
-; RV64-NEXT:    [[DEC_IV:%.*]] = phi i64 [ 1023, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV64-NEXT:    [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
-; RV64-NEXT:    [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]]
-; RV64-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4
-; RV64-NEXT:    [[ADD:%.*]] = add i32 [[TMP21]], 1
-; RV64-NEXT:    [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]]
-; RV64-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4
-; RV64-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
-; RV64-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP4:![0-9]+]]
-; RV64:       [[EXIT]]:
-; RV64-NEXT:    ret void
-;
-; RV32-LABEL: define void @vector_reverse_i32(
-; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; RV32-NEXT:  [[ENTRY:.*:]]
-; RV32-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; RV32:       [[VECTOR_PH]]:
-; RV32-NEXT:    br label %[[VECTOR_BODY:.*]]
-; RV32:       [[VECTOR_BODY]]:
-; RV32-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV32-NEXT:    [[AVL:%.*]] = phi i64 [ 1023, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV32-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; RV32-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; RV32-NEXT:    [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
-; RV32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]]
-; RV32-NEXT:    [[TMP11:%.*]] = mul i32 0, [[TMP10]]
-; RV32-NEXT:    [[TMP24:%.*]] = sub i32 [[TMP10]], 1
-; RV32-NEXT:    [[TMP12:%.*]] = mul i32 -1, [[TMP24]]
-; RV32-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[TMP9]], i32 [[TMP11]]
-; RV32-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 [[TMP12]]
-; RV32-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP7]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
-; RV32-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
-; RV32-NEXT:    [[TMP15:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
-; RV32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
-; RV32-NEXT:    [[TMP18:%.*]] = mul i32 0, [[TMP10]]
-; RV32-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP10]], 1
-; RV32-NEXT:    [[TMP19:%.*]] = mul i32 -1, [[TMP25]]
-; RV32-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[TMP16]], i32 [[TMP18]]
-; RV32-NEXT:    [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 [[TMP19]]
-; RV32-NEXT:    [[VP_REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
-; RV32-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE1]], ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
-; RV32-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP10]] to i64
-; RV32-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP17]], [[INDEX]]
-; RV32-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP17]]
-; RV32-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1023
-; RV32-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; RV32:       [[MIDDLE_BLOCK]]:
-; RV32-NEXT:    br label %[[EXIT:.*]]
-; RV32:       [[SCALAR_PH]]:
-; RV32-NEXT:    br label %[[FOR_BODY:.*]]
-; RV32:       [[FOR_BODY]]:
-; RV32-NEXT:    [[DEC_IV:%.*]] = phi i64 [ 1023, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV32-NEXT:    [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
-; RV32-NEXT:    [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]]
-; RV32-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4
-; RV32-NEXT:    [[ADD:%.*]] = add i32 [[TMP23]], 1
-; RV32-NEXT:    [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]]
-; RV32-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4
-; RV32-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
-; RV32-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP4:![0-9]+]]
-; RV32:       [[EXIT]]:
-; RV32-NEXT:    ret void
-;
-; RV64-UF2-LABEL: define void @vector_reverse_i32(
-; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; RV64-UF2-NEXT:  [[ENTRY:.*]]:
-; RV64-UF2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
-; RV64-UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]]
-; RV64-UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; RV64-UF2:       [[VECTOR_PH]]:
-; RV64-UF2-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
-; RV64-UF2-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
-; RV64-UF2-NEXT:    [[N_VEC:%.*]] = urem i64 1023, [[TMP6]]
-; RV64-UF2-NEXT:    [[TMP7:%.*]] = sub i64 1023, [[N_VEC]]
-; RV64-UF2-NEXT:    [[TMP8:%.*]] = sub i64 1023, [[TMP7]]
-; RV64-UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
-; RV64-UF2:       [[VECTOR_BODY]]:
-; RV64-UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV64-UF2-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; RV64-UF2-NEXT:    [[TMP9:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
-; RV64-UF2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP9]]
-; RV64-UF2-NEXT:    [[TMP11:%.*]] = mul i64 0, [[TMP5]]
-; RV64-UF2-NEXT:    [[TMP32:%.*]] = sub i64 [[TMP5]], 1
-; RV64-UF2-NEXT:    [[TMP12:%.*]] = mul i64 -1, [[TMP32]]
-; RV64-UF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP11]]
-; RV64-UF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP12]]
-; RV64-UF2-NEXT:    [[TMP15:%.*]] = mul i64 -1, [[TMP5]]
-; RV64-UF2-NEXT:    [[TMP33:%.*]] = sub i64 [[TMP5]], 1
-; RV64-UF2-NEXT:    [[TMP16:%.*]] = mul i64 -1, [[TMP33]]
-; RV64-UF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP15]]
-; RV64-UF2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP16]]
-; RV64-UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
-; RV64-UF2-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
-; RV64-UF2-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP18]], align 4
-; RV64-UF2-NEXT:    [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD1]])
-; RV64-UF2-NEXT:    [[TMP19:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
-; RV64-UF2-NEXT:    [[TMP20:%.*]] = add <vscale x 4 x i32> [[REVERSE2]], splat (i32 1)
-; RV64-UF2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]]
-; RV64-UF2-NEXT:    [[TMP22:%.*]] = mul i64 0, [[TMP5]]
-; RV64-UF2-NEXT:    [[TMP34:%.*]] = sub i64 [[TMP5]], 1
-; RV64-UF2-NEXT:    [[TMP23:%.*]] = mul i64 -1, [[TMP34]]
-; RV64-UF2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP22]]
-; RV64-UF2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]]
-; RV64-UF2-NEXT:    [[TMP26:%.*]] = mul i64 -1, [[TMP5]]
-; RV64-UF2-NEXT:    [[TMP35:%.*]] = sub i64 [[TMP5]], 1
-; RV64-UF2-NEXT:    [[TMP27:%.*]] = mul i64 -1, [[TMP35]]
-; RV64-UF2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP26]]
-; RV64-UF2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i64 [[TMP27]]
-; RV64-UF2-NEXT:    [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP19]])
-; RV64-UF2-NEXT:    store <vscale x 4 x i32> [[REVERSE3]], ptr [[TMP25]], align 4
-; RV64-UF2-NEXT:    [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP20]])
-; RV64-UF2-NEXT:    store <vscale x 4 x i32> [[REVERSE4]], ptr [[TMP29]], align 4
-; RV64-UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; RV64-UF2-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP7]]
-; RV64-UF2-NEXT:    br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; RV64-UF2:       [[MIDDLE_BLOCK]]:
-; RV64-UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1023, [[TMP7]]
-; RV64-UF2-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; RV64-UF2:       [[SCALAR_PH]]:
-; RV64-UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
-; RV64-UF2-NEXT:    br label %[[FOR_BODY:.*]]
-; RV64-UF2:       [[FOR_BODY]]:
-; RV64-UF2-NEXT:    [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV64-UF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
-; RV64-UF2-NEXT:    [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]]
-; RV64-UF2-NEXT:    [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4
-; RV64-UF2-NEXT:    [[ADD:%.*]] = add i32 [[TMP31]], 1
-; RV64-UF2-NEXT:    [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]]
-; RV64-UF2-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4
-; RV64-UF2-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
-; RV64-UF2-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
-; RV64-UF2:       [[EXIT]]:
-; RV64-UF2-NEXT:    ret void
-;
-entry:
-  br label %for.body
-
-for.body:
-  %dec.iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.body ]
-  %iv.next = add nsw i64 %dec.iv, -1
-  %arrayidx.b = getelementptr inbounds i32, ptr %B, i64 %iv.next
-  %0 = load i32, ptr %arrayidx.b, align 4
-  %add = add i32 %0, 1
-  %arrayidx.a = getelementptr inbounds i32, ptr %A, i64 %iv.next
-  store i32 %add, ptr %arrayidx.a, align 4
-  %cmp = icmp ugt i64 %dec.iv, 1
-  br i1 %cmp, label %for.body, label %exit, !llvm.loop !0
-
-exit:
-  ret void
-}
-
-define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
-; RV64-LABEL: define void @vector_reverse_f32(
-; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
-; RV64-NEXT:  [[ENTRY:.*:]]
-; RV64-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; RV64:       [[VECTOR_PH]]:
-; RV64-NEXT:    br label %[[VECTOR_BODY:.*]]
-; RV64:       [[VECTOR_BODY]]:
-; RV64-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV64-NEXT:    [[AVL:%.*]] = phi i64 [ 1023, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV64-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; RV64-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; RV64-NEXT:    [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
-; RV64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]]
-; RV64-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP0]] to i64
-; RV64-NEXT:    [[TMP10:%.*]] = mul i64 0, [[TMP5]]
-; RV64-NEXT:    [[TMP22:%.*]] = sub i64 [[TMP5]], 1
-; RV64-NEXT:    [[TMP11:%.*]] = mul i64 -1, [[TMP22]]
-; RV64-NEXT:    [[TMP7:%.*]] = getelementptr float, ptr [[TMP9]], i64 [[TMP10]]
-; RV64-NEXT:    [[TMP12:%.*]] = getelementptr float, ptr [[TMP7]], i64 [[TMP11]]
-; RV64-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
-; RV64-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
-; RV64-NEXT:    [[TMP14:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
-; RV64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
-; RV64-NEXT:    [[TMP16:%.*]] = mul i64 0, [[TMP5]]
-; RV64-NEXT:    [[TMP23:%.*]] = sub i64 [[TMP5]], 1
-; RV64-NEXT:    [[TMP17:%.*]] = mul i64 -1, [[TMP23]]
-; RV64-NEXT:    [[TMP18:%.*]] = getelementptr float, ptr [[TMP15]], i64 [[TMP16]]
-; RV64-NEXT:    [[TMP19:%.*]] = getelementptr float, ptr [[TMP18]], i64 [[TMP17]]
-; RV64-NEXT:    [[VP_REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
-; RV64-NEXT:    call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[VP_REVERSE1]], ptr align 4 [[TMP19]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
-; RV64-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP5]], [[INDEX]]
-; RV64-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
-; RV64-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1023
-; RV64-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; RV64:       [[MIDDLE_BLOCK]]:
-; RV64-NEXT:    br label %[[EXIT:.*]]
-; RV64:       [[SCALAR_PH]]:
-; RV64-NEXT:    br label %[[FOR_BODY:.*]]
-; RV64:       [[FOR_BODY]]:
-; RV64-NEXT:    [[DEC_IV:%.*]] = phi i64 [ 1023, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV64-NEXT:    [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
-; RV64-NEXT:    [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]]
-; RV64-NEXT:    [[TMP21:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4
-; RV64-NEXT:    [[FADD:%.*]] = fadd float [[TMP21]], 1.000000e+00
-; RV64-NEXT:    [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]]
-; RV64-NEXT:    store float [[FADD]], ptr [[ARRAYIDX_A]], align 4
-; RV64-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
-; RV64-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP6:![0-9]+]]
-; RV64:       [[EXIT]]:
-; RV64-NEXT:    ret void
-;
-; RV32-LABEL: define void @vector_reverse_f32(
-; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
-; RV32-NEXT:  [[ENTRY:.*:]]
-; RV32-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; RV32:       [[VECTOR_PH]]:
-; RV32-NEXT:    br label %[[VECTOR_BODY:.*]]
-; RV32:       [[VECTOR_BODY]]:
-; RV32-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV32-NEXT:    [[AVL:%.*]] = phi i64 [ 1023, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV32-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; RV32-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; RV32-NEXT:    [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
-; RV32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]]
-; RV32-NEXT:    [[TMP11:%.*]] = mul i32 0, [[TMP10]]
-; RV32-NEXT:    [[TMP24:%.*]] = sub i32 [[TMP10]], 1
-; RV32-NEXT:    [[TMP12:%.*]] = mul i32 -1, [[TMP24]]
-; RV32-NEXT:    [[TMP6:%.*]] = getelementptr float, ptr [[TMP9]], i32 [[TMP11]]
-; RV32-NEXT:    [[TMP7:%.*]] = getelementptr float, ptr [[TMP6]], i32 [[TMP12]]
-; RV32-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP7]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
-; RV32-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
-; RV32-NEXT:    [[TMP15:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
-; RV32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
-; RV32-NEXT:    [[TMP18:%.*]] = mul i32 0, [[TMP10]]
-; RV32-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP10]], 1
-; RV32-NEXT:    [[TMP19:%.*]] = mul i32 -1, [[TMP25]]
-; RV32-NEXT:    [[TMP13:%.*]] = getelementptr float, ptr [[TMP16]], i32 [[TMP18]]
-; RV32-NEXT:    [[TMP14:%.*]] = getelementptr float, ptr [[TMP13]], i32 [[TMP19]]
-; RV32-NEXT:    [[VP_REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
-; RV32-NEXT:    call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[VP_REVERSE1]], ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
-; RV32-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP10]] to i64
-; RV32-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP17]], [[INDEX]]
-; RV32-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP17]]
-; RV32-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1023
-; RV32-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; RV32:       [[MIDDLE_BLOCK]]:
-; RV32-NEXT:    br label %[[EXIT:.*]]
-; RV32:       [[SCALAR_PH]]:
-; RV32-NEXT:    br label %[[FOR_BODY:.*]]
-; RV32:       [[FOR_BODY]]:
-; RV32-NEXT:    [[DEC_IV:%.*]] = phi i64 [ 1023, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV32-NEXT:    [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
-; RV32-NEXT:    [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]]
-; RV32-NEXT:    [[TMP23:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4
-; RV32-NEXT:    [[FADD:%.*]] = fadd float [[TMP23]], 1.000000e+00
-; RV32-NEXT:    [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]]
-; RV32-NEXT:    store float [[FADD]], ptr [[ARRAYIDX_A]], align 4
-; RV32-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
-; RV32-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP6:![0-9]+]]
-; RV32:       [[EXIT]]:
-; RV32-NEXT:    ret void
-;
-; RV64-UF2-LABEL: define void @vector_reverse_f32(
-; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
-; RV64-UF2-NEXT:  [[ENTRY:.*]]:
-; RV64-UF2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
-; RV64-UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]]
-; RV64-UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; RV64-UF2:       [[VECTOR_PH]]:
-; RV64-UF2-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
-; RV64-UF2-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
-; RV64-UF2-NEXT:    [[N_VEC:%.*]] = urem i64 1023, [[TMP6]]
-; RV64-UF2-NEXT:    [[TMP7:%.*]] = sub i64 1023, [[N_VEC]]
-; RV64-UF2-NEXT:    [[TMP8:%.*]] = sub i64 1023, [[TMP7]]
-; RV64-UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
-; RV64-UF2:       [[VECTOR_BODY]]:
-; RV64-UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV64-UF2-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; RV64-UF2-NEXT:    [[TMP9:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
-; RV64-UF2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]]
-; RV64-UF2-NEXT:    [[TMP11:%.*]] = mul i64 0, [[TMP5]]
-; RV64-UF2-NEXT:    [[TMP32:%.*]] = sub i64 [[TMP5]], 1
-; RV64-UF2-NEXT:    [[TMP12:%.*]] = mul i64 -1, [[TMP32]]
-; RV64-UF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP11]]
-; RV64-UF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[TMP12]]
-; RV64-UF2-NEXT:    [[TMP15:%.*]] = mul i64 -1, [[TMP5]]
-; RV64-UF2-NEXT:    [[TMP33:%.*]] = sub i64 [[TMP5]], 1
-; RV64-UF2-NEXT:    [[TMP16:%.*]] = mul i64 -1, [[TMP33]]
-; RV64-UF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP15]]
-; RV64-UF2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP16]]
-; RV64-UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
-; RV64-UF2-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
-; RV64-UF2-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP18]], align 4
-; RV64-UF2-NEXT:    [[REVERSE2:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD1]])
-; RV64-UF2-NEXT:    [[TMP19:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
-; RV64-UF2-NEXT:    [[TMP20:%.*]] = fadd <vscale x 4 x float> [[REVERSE2]], splat (float 1.000000e+00)
-; RV64-UF2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]]
-; RV64-UF2-NEXT:    [[TMP22:%.*]] = mul i64 0, [[TMP5]]
-; RV64-UF2-NEXT:    [[TMP34:%.*]] = sub i64 [[TMP5]], 1
-; RV64-UF2-NEXT:    [[TMP23:%.*]] = mul i64 -1, [[TMP34]]
-; RV64-UF2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP22]]
-; RV64-UF2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i64 [[TMP23]]
-; RV64-UF2-NEXT:    [[TMP26:%.*]] = mul i64 -1, [[TMP5]]
-; RV64-UF2-NEXT:    [[TMP35:%.*]] = sub i64 [[TMP5]], 1
-; RV64-UF2-NEXT:    [[TMP27:%.*]] = mul i64 -1, [[TMP35]]
-; RV64-UF2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP26]]
-; RV64-UF2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[TMP27]]
-; RV64-UF2-NEXT:    [[REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP19]])
-; RV64-UF2-NEXT:    store <vscale x 4 x float> [[REVERSE3]], ptr [[TMP25]], align 4
-; RV64-UF2-NEXT:    [[REVERSE4:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP20]])
-; RV64-UF2-NEXT:    store <vscale x 4 x float> [[REVERSE4]], ptr [[TMP29]], align 4
-; RV64-UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; RV64-UF2-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP7]]
-; RV64-UF2-NEXT:    br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; RV64-UF2:       [[MIDDLE_BLOCK]]:
-; RV64-UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1023, [[TMP7]]
-; RV64-UF2-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; RV64-UF2:       [[SCALAR_PH]]:
-; RV64-UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
-; RV64-UF2-NEXT:    br label %[[FOR_BODY:.*]]
-; RV64-UF2:       [[FOR_BODY]]:
-; RV64-UF2-NEXT:    [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV64-UF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
-; RV64-UF2-NEXT:    [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]]
-; RV64-UF2-NEXT:    [[TMP31:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4
-; RV64-UF2-NEXT:    [[FADD:%.*]] = fadd float [[TMP31]], 1.000000e+00
-; RV64-UF2-NEXT:    [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]]
-; RV64-UF2-NEXT:    store float [[FADD]], ptr [[ARRAYIDX_A]], align 4
-; RV64-UF2-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
-; RV64-UF2-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
-; RV64-UF2:       [[EXIT]]:
-; RV64-UF2-NEXT:    ret void
-;
-entry:
-  br label %for.body
-
-for.body:
-  %dec.iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.body ]
-  %iv.next = add nsw i64 %dec.iv, -1
-  %arrayidx.b = getelementptr inbounds float, ptr %B, i64 %iv.next
-  %0 = load float, ptr %arrayidx.b, align 4
-  %fadd = fadd float %0, 1.000000e+00
-  %arrayidx.a = getelementptr inbounds float, ptr %A, i64 %iv.next
-  store float %fadd, ptr %arrayidx.a, align 4
-  %cmp = icmp ugt i64 %dec.iv, 1
-  br i1 %cmp, label %for.body, label %exit, !llvm.loop !0
-
-exit:
-  ret void
-}
-
-define void @vector_reverse_irregular_type(ptr noalias %A, ptr noalias %B) {
-; RV64-LABEL: define void @vector_reverse_irregular_type(
-; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
-; RV64-NEXT:  [[ENTRY:.*]]:
-; RV64-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; RV64:       [[VECTOR_PH]]:
-; RV64-NEXT:    br label %[[VECTOR_BODY:.*]]
-; RV64:       [[VECTOR_BODY]]:
-; RV64-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV64-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; RV64-NEXT:    [[DEC_IV:%.*]] = add i64 [[OFFSET_IDX]], 0
-; RV64-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -1
-; RV64-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -2
-; RV64-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], -3
-; RV64-NEXT:    [[IV_NEXT:%.*]] = add nsw i64 [[DEC_IV]], -1
-; RV64-NEXT:    [[TMP5:%.*]] = add nsw i64 [[TMP1]], -1
-; RV64-NEXT:    [[TMP6:%.*]] = add nsw i64 [[TMP2]], -1
-; RV64-NEXT:    [[TMP7:%.*]] = add nsw i64 [[TMP3]], -1
-; RV64-NEXT:    [[ARRAYIDX_B:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[IV_NEXT]]
-; RV64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP5]]
-; RV64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP6]]
-; RV64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP7]]
-; RV64-NEXT:    [[TMP0:%.*]] = load i7, ptr [[ARRAYIDX_B]], align 1
-; RV64-NEXT:    [[TMP13:%.*]] = load i7, ptr [[TMP9]], align 1
-; RV64-NEXT:    [[TMP14:%.*]] = load i7, ptr [[TMP10]], align 1
-; RV64-NEXT:    [[TMP15:%.*]] = load i7, ptr [[TMP11]], align 1
-; RV64-NEXT:    [[TMP16:%.*]] = insertelement <4 x i7> poison, i7 [[TMP0]], i32 0
-; RV64-NEXT:    [[TMP17:%.*]] = insertelement <4 x i7> [[TMP16]], i7 [[TMP13]], i32 1
-; RV64-NEXT:    [[TMP18:%.*]] = insertelement <4 x i7> [[TMP17]], i7 [[TMP14]], i32 2
-; RV64-NEXT:    [[TMP19:%.*]] = insertelement <4 x i7> [[TMP18]], i7 [[TMP15]], i32 3
-; RV64-NEXT:    [[TMP20:%.*]] = add <4 x i7> [[TMP19]], splat (i7 1)
-; RV64-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT]]
-; RV64-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP5]]
-; RV64-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP6]]
-; RV64-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP7]]
-; RV64-NEXT:    [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0
-; RV64-NEXT:    store i7 [[TMP25]], ptr [[TMP21]], align 1
-; RV64-NEXT:    [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1
-; RV64-NEXT:    store i7 [[TMP26]], ptr [[TMP22]], align 1
-; RV64-NEXT:    [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2
-; RV64-NEXT:    store i7 [[TMP27]], ptr [[TMP23]], align 1
-; RV64-NEXT:    [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3
-; RV64-NEXT:    store i7 [[TMP28]], ptr [[TMP24]], align 1
-; RV64-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; RV64-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020
-; RV64-NEXT:    br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; RV64:       [[MIDDLE_BLOCK]]:
-; RV64-NEXT:    br label %[[SCALAR_PH]]
-; RV64:       [[SCALAR_PH]]:
-; RV64-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
-; RV64-NEXT:    br label %[[FOR_BODY:.*]]
-; RV64:       [[FOR_BODY]]:
-; RV64-NEXT:    [[DEC_IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], %[[FOR_BODY]] ]
-; RV64-NEXT:    [[IV_NEXT1]] = add nsw i64 [[DEC_IV1]], -1
-; RV64-NEXT:    [[ARRAYIDX_B1:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[IV_NEXT1]]
-; RV64-NEXT:    [[TMP30:%.*]] = load i7, ptr [[ARRAYIDX_B1]], align 1
-; RV64-NEXT:    [[ADD:%.*]] = add i7 [[TMP30]], 1
-; RV64-NEXT:    [[ARRAYIDX_A:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT1]]
-; RV64-NEXT:    store i7 [[ADD]], ptr [[ARRAYIDX_A]], align 1
-; RV64-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[DEC_IV1]], 1
-; RV64-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT:.*]], !llvm.loop [[LOOP8:![0-9]+]]
-; RV64:       [[EXIT]]:
-; RV64-NEXT:    ret void
-;
-; RV32-LABEL: define void @vector_reverse_irregular_type(
-; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
-; RV32-NEXT:  [[ENTRY:.*]]:
-; RV32-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; RV32:       [[VECTOR_PH]]:
-; RV32-NEXT:    br label %[[VECTOR_BODY:.*]]
-; RV32:       [[VECTOR_BODY]]:
-; RV32-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV32-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; RV32-NEXT:    [[DEC_IV:%.*]] = add i64 [[OFFSET_IDX]], 0
-; RV32-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -1
-; RV32-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -2
-; RV32-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], -3
-; RV32-NEXT:    [[IV_NEXT:%.*]] = add nsw i64 [[DEC_IV]], -1
-; RV32-NEXT:    [[TMP5:%.*]] = add nsw i64 [[TMP1]], -1
-; RV32-NEXT:    [[TMP6:%.*]] = add nsw i64 [[TMP2]], -1
-; RV32-NEXT:    [[TMP7:%.*]] = add nsw i64 [[TMP3]], -1
-; RV32-NEXT:    [[ARRAYIDX_B:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[IV_NEXT]]
-; RV32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP5]]
-; RV32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP6]]
-; RV32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP7]]
-; RV32-NEXT:    [[TMP0:%.*]] = load i7, ptr [[ARRAYIDX_B]], align 1
-; RV32-NEXT:    [[TMP13:%.*]] = load i7, ptr [[TMP9]], align 1
-; RV32-NEXT:    [[TMP14:%.*]] = load i7, ptr [[TMP10]], align 1
-; RV32-NEXT:    [[TMP15:%.*]] = load i7, ptr [[TMP11]], align 1
-; RV32-NEXT:    [[TMP16:%.*]] = insertelement <4 x i7> poison, i7 [[TMP0]], i32 0
-; RV32-NEXT:    [[TMP17:%.*]] = insertelement <4 x i7> [[TMP16]], i7 [[TMP13]], i32 1
-; RV32-NEXT:    [[TMP18:%.*]] = insertelement <4 x i7> [[TMP17]], i7 [[TMP14]], i32 2
-; RV32-NEXT:    [[TMP19:%.*]] = insertelement <4 x i7> [[TMP18]], i7 [[TMP15]], i32 3
-; RV32-NEXT:    [[TMP20:%.*]] = add <4 x i7> [[TMP19]], splat (i7 1)
-; RV32-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT]]
-; RV32-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP5]]
-; RV32-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP6]]
-; RV32-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP7]]
-; RV32-NEXT:    [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0
-; RV32-NEXT:    store i7 [[TMP25]], ptr [[TMP21]], align 1
-; RV32-NEXT:    [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1
-; RV32-NEXT:    store i7 [[TMP26]], ptr [[TMP22]], align 1
-; RV32-NEXT:    [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2
-; RV32-NEXT:    store i7 [[TMP27]], ptr [[TMP23]], align 1
-; RV32-NEXT:    [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3
-; RV32-NEXT:    store i7 [[TMP28]], ptr [[TMP24]], align 1
-; RV32-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; RV32-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020
-; RV32-NEXT:    br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; RV32:       [[MIDDLE_BLOCK]]:
-; RV32-NEXT:    br label %[[SCALAR_PH]]
-; RV32:       [[SCALAR_PH]]:
-; RV32-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
-; RV32-NEXT:    br label %[[FOR_BODY:.*]]
-; RV32:       [[FOR_BODY]]:
-; RV32-NEXT:    [[DEC_IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], %[[FOR_BODY]] ]
-; RV32-NEXT:    [[IV_NEXT1]] = add nsw i64 [[DEC_IV1]], -1
-; RV32-NEXT:    [[ARRAYIDX_B1:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[IV_NEXT1]]
-; RV32-NEXT:    [[TMP30:%.*]] = load i7, ptr [[ARRAYIDX_B1]], align 1
-; RV32-NEXT:    [[ADD:%.*]] = add i7 [[TMP30]], 1
-; RV32-NEXT:    [[ARRAYIDX_A:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT1]]
-; RV32-NEXT:    store i7 [[ADD]], ptr [[ARRAYIDX_A]], align 1
-; RV32-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[DEC_IV1]], 1
-; RV32-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT:.*]], !llvm.loop [[LOOP8:![0-9]+]]
-; RV32:       [[EXIT]]:
-; RV32-NEXT:    ret void
-;
-; RV64-UF2-LABEL: define void @vector_reverse_irregular_type(
-; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
-; RV64-UF2-NEXT:  [[ENTRY:.*]]:
-; RV64-UF2-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; RV64-UF2:       [[VECTOR_PH]]:
-; RV64-UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
-; RV64-UF2:       [[VECTOR_BODY]]:
-; RV64-UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV64-UF2-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; RV64-UF2-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 0
-; RV64-UF2-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], -1
-; RV64-UF2-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], -2
-; RV64-UF2-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], -3
-; RV64-UF2-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], -4
-; RV64-UF2-NEXT:    [[TMP42:%.*]] = add i64 [[OFFSET_IDX]], -5
-; RV64-UF2-NEXT:    [[TMP43:%.*]] = add i64 [[OFFSET_IDX]], -6
-; RV64-UF2-NEXT:    [[TMP50:%.*]] = add i64 [[OFFSET_IDX]], -7
-; RV64-UF2-NEXT:    [[TMP1:%.*]] = add nsw i64 [[TMP16]], -1
-; RV64-UF2-NEXT:    [[TMP2:%.*]] = add nsw i64 [[TMP0]], -1
-; RV64-UF2-NEXT:    [[TMP51:%.*]] = add nsw i64 [[TMP17]], -1
-; RV64-UF2-NEXT:    [[TMP11:%.*]] = add nsw i64 [[TMP24]], -1
-; RV64-UF2-NEXT:    [[TMP59:%.*]] = add nsw i64 [[TMP25]], -1
-; RV64-UF2-NEXT:    [[TMP13:%.*]] = add nsw i64 [[TMP42]], -1
-; RV64-UF2-NEXT:    [[TMP14:%.*]] = add nsw i64 [[TMP43]], -1
-; RV64-UF2-NEXT:    [[TMP15:%.*]] = add nsw i64 [[TMP50]], -1
-; RV64-UF2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP1]]
-; RV64-UF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP2]]
-; RV64-UF2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP51]]
-; RV64-UF2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP11]]
-; RV64-UF2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP59]]
-; RV64-UF2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP13]]
-; RV64-UF2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP14]]
-; RV64-UF2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP15]]
-; RV64-UF2-NEXT:    [[TMP5:%.*]] = load i7, ptr [[TMP3]], align 1
-; RV64-UF2-NEXT:    [[TMP6:%.*]] = load i7, ptr [[TMP4]], align 1
-; RV64-UF2-NEXT:    [[TMP26:%.*]] = load i7, ptr [[TMP18]], align 1
-; RV64-UF2-NEXT:    [[TMP27:%.*]] = load i7, ptr [[TMP19]], align 1
-; RV64-UF2-NEXT:    [[TMP28:%.*]] = insertelement <4 x i7> poison, i7 [[TMP5]], i32 0
-; RV64-UF2-NEXT:    [[TMP29:%.*]] = insertelement <4 x i7> [[TMP28]], i7 [[TMP6]], i32 1
-; RV64-UF2-NEXT:    [[TMP30:%.*]] = insertelement <4 x i7> [[TMP29]], i7 [[TMP26]], i32 2
-; RV64-UF2-NEXT:    [[TMP31:%.*]] = insertelement <4 x i7> [[TMP30]], i7 [[TMP27]], i32 3
-; RV64-UF2-NEXT:    [[TMP32:%.*]] = load i7, ptr [[TMP20]], align 1
-; RV64-UF2-NEXT:    [[TMP33:%.*]] = load i7, ptr [[TMP21]], align 1
-; RV64-UF2-NEXT:    [[TMP34:%.*]] = load i7, ptr [[TMP22]], align 1
-; RV64-UF2-NEXT:    [[TMP35:%.*]] = load i7, ptr [[TMP23]], align 1
-; RV64-UF2-NEXT:    [[TMP36:%.*]] = insertelement <4 x i7> poison, i7 [[TMP32]], i32 0
-; RV64-UF2-NEXT:    [[TMP37:%.*]] = insertelement <4 x i7> [[TMP36]], i7 [[TMP33]], i32 1
-; RV64-UF2-NEXT:    [[TMP38:%.*]] = insertelement <4 x i7> [[TMP37]], i7 [[TMP34]], i32 2
-; RV64-UF2-NEXT:    [[TMP39:%.*]] = insertelement <4 x i7> [[TMP38]], i7 [[TMP35]], i32 3
-; RV64-UF2-NEXT:    [[TMP40:%.*]] = add <4 x i7> [[TMP31]], splat (i7 1)
-; RV64-UF2-NEXT:    [[TMP41:%.*]] = add <4 x i7> [[TMP39]], splat (i7 1)
-; RV64-UF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP1]]
-; RV64-UF2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP2]]
-; RV64-UF2-NEXT:    [[TMP44:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP51]]
-; RV64-UF2-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP11]]
-; RV64-UF2-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP59]]
-; RV64-UF2-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP13]]
-; RV64-UF2-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP14]]
-; RV64-UF2-NEXT:    [[TMP49:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP15]]
-; RV64-UF2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i7> [[TMP40]], i32 0
-; RV64-UF2-NEXT:    store i7 [[TMP7]], ptr [[TMP9]], align 1
-; RV64-UF2-NEXT:    [[TMP8:%.*]] = extractelement <4 x i7> [[TMP40]], i32 1
-; RV64-UF2-NEXT:    store i7 [[TMP8]], ptr [[TMP10]], align 1
-; RV64-UF2-NEXT:    [[TMP52:%.*]] = extractelement <4 x i7> [[TMP40]], i32 2
-; RV64-UF2-NEXT:    store i7 [[TMP52]], ptr [[TMP44]], align 1
-; RV64-UF2-NEXT:    [[TMP53:%.*]] = extractelement <4 x i7> [[TMP40]], i32 3
-; RV64-UF2-NEXT:    store i7 [[TMP53]], ptr [[TMP45]], align 1
-; RV64-UF2-NEXT:    [[TMP54:%.*]] = extractelement <4 x i7> [[TMP41]], i32 0
-; RV64-UF2-NEXT:    store i7 [[TMP54]], ptr [[TMP46]], align 1
-; RV64-UF2-NEXT:    [[TMP55:%.*]] = extractelement <4 x i7> [[TMP41]], i32 1
-; RV64-UF2-NEXT:    store i7 [[TMP55]], ptr [[TMP47]], align 1
-; RV64-UF2-NEXT:    [[TMP56:%.*]] = extractelement <4 x i7> [[TMP41]], i32 2
-; RV64-UF2-NEXT:    store i7 [[TMP56]], ptr [[TMP48]], align 1
-; RV64-UF2-NEXT:    [[TMP57:%.*]] = extractelement <4 x i7> [[TMP41]], i32 3
-; RV64-UF2-NEXT:    store i7 [[TMP57]], ptr [[TMP49]], align 1
-; RV64-UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; RV64-UF2-NEXT:    [[TMP58:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1016
-; RV64-UF2-NEXT:    br i1 [[TMP58]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; RV64-UF2:       [[MIDDLE_BLOCK]]:
-; RV64-UF2-NEXT:    br label %[[SCALAR_PH]]
-; RV64-UF2:       [[SCALAR_PH]]:
-; RV64-UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 7, %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
-; RV64-UF2-NEXT:    br label %[[FOR_BODY:.*]]
-; RV64-UF2:       [[FOR_BODY]]:
-; RV64-UF2-NEXT:    [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV64-UF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
-; RV64-UF2-NEXT:    [[ARRAYIDX_B:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[IV_NEXT]]
-; RV64-UF2-NEXT:    [[TMP12:%.*]] = load i7, ptr [[ARRAYIDX_B]], align 1
-; RV64-UF2-NEXT:    [[ADD:%.*]] = add i7 [[TMP12]], 1
-; RV64-UF2-NEXT:    [[ARRAYIDX_A:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT]]
-; RV64-UF2-NEXT:    store i7 [[ADD]], ptr [[ARRAYIDX_A]], align 1
-; RV64-UF2-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
-; RV64-UF2-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT:.*]], !llvm.loop [[LOOP7:![0-9]+]]
-; RV64-UF2:       [[EXIT]]:
-; RV64-UF2-NEXT:    ret void
-;
-entry:
-  br label %for.body
-
-for.body:
-  %dec.iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.body ]
-  %iv.next = add nsw i64 %dec.iv, -1
-  %arrayidx.b = getelementptr inbounds i7, ptr %B, i64 %iv.next
-  %0 = load i7, ptr %arrayidx.b, align 1
-  %add = add i7 %0, 1
-  %arrayidx.a = getelementptr inbounds i7, ptr %A, i64 %iv.next
-  store i7 %add, ptr %arrayidx.a, align 1
-  %cmp = icmp ugt i64 %dec.iv, 1
-  br i1 %cmp, label %for.body, label %exit, !llvm.loop !4
-
-exit:
-  ret void
-}
-
-!0 = distinct !{!0, !1, !2, !3}
-!1 = !{!"llvm.loop.vectorize.width", i32 4}
-!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
-!3 = !{!"llvm.loop.vectorize.enable", i1 true}
-!4 = distinct !{!4, !1, !3}
diff --git a/llvm/test/Transforms/LoopVectorize/cse.ll b/llvm/test/Transforms/LoopVectorize/cse.ll
index 71325de1ca275..37b9c094820e4 100644
--- a/llvm/test/Transforms/LoopVectorize/cse.ll
+++ b/llvm/test/Transforms/LoopVectorize/cse.ll
@@ -327,12 +327,9 @@ define i32 @common_sext_different_dest_types(ptr %a, ptr %b, ptr %c, i32 %N) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = sext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP7:%.*]] = sext <4 x i8> [[WIDE_LOAD1]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP8:%.*]] = sext <4 x i8> [[WIDE_LOAD1]] to <4 x i16>
@@ -402,12 +399,9 @@ define i32 @common_zext_different_dest_types(ptr %a, ptr %b, ptr %c, i32 %N) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP7:%.*]] = zext <4 x i8> [[WIDE_LOAD1]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP8:%.*]] = zext <4 x i8> [[WIDE_LOAD1]] to <4 x i16>

From 02919ed05d38f2a64734444f5b0d3de6ac6eedbd Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs@arm.com>
Date: Tue, 19 Aug 2025 11:25:00 +0100
Subject: [PATCH 6/6] Fix powerpc test

---
 llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll
index 7f0a487fca945..db1f2c71e0f77 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll
@@ -19,7 +19,7 @@ define dso_local void @test(ptr %Arr, i32 signext %Len) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[ARR:%.*]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[WIDE_LOAD]])
-; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[TMP3]], align 4
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]