From c7220147cee093b95e138c377b4da1c2d724e485 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Thu, 31 Jul 2025 15:04:32 -0700 Subject: [PATCH 1/9] Pre-commit test --- .../rvv/fixed-vectors-interleaved-access.ll | 680 +++++++++++++++++- 1 file changed, 644 insertions(+), 36 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 6eb0b693b5546..2df26b2f78d5b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -332,6 +332,174 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_poison_shufflemask(ptr ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 } +define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_skip_fields(ptr %ptr) { + ; mask = 1111, skip the last field. +; RV32-LABEL: vpload_factor3_skip_fields: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 1755 +; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: li a1, 73 +; RV32-NEXT: vmv.v.i v10, 8 +; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; RV32-NEXT: vle32.v v12, (a0), v0.t +; RV32-NEXT: li a0, 36 +; RV32-NEXT: vmv.s.x v11, a1 +; RV32-NEXT: lui a1, %hi(.LCPI17_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI17_0) +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vcompress.vm v8, v12, v11 +; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV32-NEXT: vslidedown.vi v16, v12, 8 +; RV32-NEXT: vmv1r.v v0, v10 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV32-NEXT: vmv.v.i v0, 2 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vslidedown.vi v14, v12, 1 +; RV32-NEXT: vslidedown.vi v14, v12, 3, v0.t +; RV32-NEXT: vle16.v v9, (a1) +; RV32-NEXT: vmv1r.v v0, v10 +; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV32-NEXT: vmv.s.x v0, a0 +; RV32-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV32-NEXT: vrgatherei16.vv v10, v12, v9 +; RV32-NEXT: vmv1r.v v9, v14 +; RV32-NEXT: ret +; +; RV64-LABEL: vpload_factor3_skip_fields: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 1755 +; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: li a1, 73 +; RV64-NEXT: vmv.v.i v10, 8 +; RV64-NEXT: vmv.s.x v11, a1 +; RV64-NEXT: li a1, 36 +; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; RV64-NEXT: vle32.v v12, (a0), v0.t +; RV64-NEXT: li a0, 3 +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: addi a0, a0, 5 +; RV64-NEXT: slli a0, a0, 16 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vcompress.vm v8, v12, v11 +; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV64-NEXT: vslidedown.vi v16, v12, 8 +; RV64-NEXT: vmv1r.v v0, v10 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV64-NEXT: vmv.v.i v0, 2 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vslidedown.vi v14, v12, 1 +; RV64-NEXT: vslidedown.vi v14, v12, 3, v0.t +; RV64-NEXT: vmv1r.v v0, v10 +; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: addi a0, a0, 2 +; RV64-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vrgatherei16.vv v10, v12, v9 +; RV64-NEXT: vmv1r.v v9, v14 +; RV64-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> , 
i32 12) + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + +define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_mask_skip_fields(ptr %ptr) { + ; mask = 0101, skip the last field. +; RV32-LABEL: vpload_factor3_mask_skip_fields: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 1560 +; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: li a1, 73 +; RV32-NEXT: vmv.v.i v10, 8 +; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; RV32-NEXT: vle32.v v12, (a0), v0.t +; RV32-NEXT: li a0, 36 +; RV32-NEXT: vmv.s.x v11, a1 +; RV32-NEXT: lui a1, %hi(.LCPI18_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI18_0) +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vcompress.vm v8, v12, v11 +; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV32-NEXT: vslidedown.vi v16, v12, 8 +; RV32-NEXT: vmv1r.v v0, v10 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV32-NEXT: vmv.v.i v0, 2 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vslidedown.vi v14, v12, 1 +; RV32-NEXT: vslidedown.vi v14, v12, 3, v0.t +; RV32-NEXT: vle16.v v9, (a1) +; RV32-NEXT: vmv1r.v v0, v10 +; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV32-NEXT: vmv.s.x v0, a0 +; RV32-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV32-NEXT: vrgatherei16.vv v10, v12, v9 +; RV32-NEXT: vmv1r.v v9, v14 +; RV32-NEXT: ret +; +; RV64-LABEL: vpload_factor3_mask_skip_fields: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 1560 +; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: li a1, 73 +; RV64-NEXT: vmv.v.i v10, 8 +; RV64-NEXT: vmv.s.x v11, a1 +; RV64-NEXT: li a1, 36 +; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; RV64-NEXT: vle32.v v12, (a0), v0.t +; RV64-NEXT: li a0, 3 +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: addi a0, a0, 5 +; RV64-NEXT: slli a0, a0, 16 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vcompress.vm v8, v12, v11 +; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV64-NEXT: vslidedown.vi v16, v12, 8 +; RV64-NEXT: vmv1r.v v0, v10 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV64-NEXT: vmv.v.i v0, 2 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vslidedown.vi v14, v12, 1 +; RV64-NEXT: vslidedown.vi v14, v12, 3, v0.t +; RV64-NEXT: vmv1r.v v0, v10 +; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: addi a0, a0, 2 +; RV64-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vrgatherei16.vv v10, v12, v9 +; RV64-NEXT: vmv1r.v v9, v14 +; RV64-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> , i32 12) + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> 
%interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor4(ptr %ptr) { ; CHECK-LABEL: vpload_factor4: ; CHECK: # %bb.0: @@ -479,8 +647,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: li a2, 32 ; RV32-NEXT: lui a3, 12 ; RV32-NEXT: lui a6, 12291 -; RV32-NEXT: lui a7, %hi(.LCPI23_0) -; RV32-NEXT: addi a7, a7, %lo(.LCPI23_0) +; RV32-NEXT: lui a7, %hi(.LCPI25_0) +; RV32-NEXT: addi a7, a7, %lo(.LCPI25_0) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v24, (a5) ; RV32-NEXT: vmv.s.x v0, a3 @@ -565,12 +733,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill ; RV32-NEXT: lui a7, 49164 -; RV32-NEXT: lui a1, %hi(.LCPI23_1) -; RV32-NEXT: addi a1, a1, %lo(.LCPI23_1) +; RV32-NEXT: lui a1, %hi(.LCPI25_1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI25_1) ; RV32-NEXT: lui t2, 3 ; RV32-NEXT: lui t1, 196656 -; RV32-NEXT: lui a4, %hi(.LCPI23_3) -; RV32-NEXT: addi a4, a4, %lo(.LCPI23_3) +; RV32-NEXT: lui a4, %hi(.LCPI25_3) +; RV32-NEXT: addi a4, a4, %lo(.LCPI25_3) ; RV32-NEXT: lui t0, 786624 ; RV32-NEXT: li a5, 48 ; RV32-NEXT: lui a6, 768 @@ -749,8 +917,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v24, v8, v2 -; RV32-NEXT: lui a1, %hi(.LCPI23_2) -; RV32-NEXT: addi a1, a1, %lo(.LCPI23_2) +; RV32-NEXT: lui a1, %hi(.LCPI25_2) +; RV32-NEXT: addi a1, a1, %lo(.LCPI25_2) ; RV32-NEXT: lui a3, 3073 ; RV32-NEXT: addi a3, a3, -1024 ; RV32-NEXT: vmv.s.x v0, a3 @@ -814,16 +982,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vrgatherei16.vv v28, v8, v3 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v28, v24 -; RV32-NEXT: lui a1, %hi(.LCPI23_4) -; RV32-NEXT: addi a1, a1, %lo(.LCPI23_4) -; RV32-NEXT: lui a2, %hi(.LCPI23_5) -; RV32-NEXT: addi a2, a2, %lo(.LCPI23_5) +; RV32-NEXT: lui a1, %hi(.LCPI25_4) +; RV32-NEXT: addi a1, a1, %lo(.LCPI25_4) +; RV32-NEXT: lui a2, %hi(.LCPI25_5) +; RV32-NEXT: addi a2, a2, %lo(.LCPI25_5) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v24, (a2) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v8, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI23_7) -; RV32-NEXT: addi a1, a1, %lo(.LCPI23_7) +; RV32-NEXT: lui a1, %hi(.LCPI25_7) +; RV32-NEXT: addi a1, a1, %lo(.LCPI25_7) ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle16.v v10, (a1) ; RV32-NEXT: csrr a1, vlenb @@ -851,14 +1019,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v16, v0, v10 -; RV32-NEXT: lui a1, %hi(.LCPI23_6) -; RV32-NEXT: addi a1, a1, %lo(.LCPI23_6) -; RV32-NEXT: lui a2, %hi(.LCPI23_8) -; RV32-NEXT: addi a2, a2, %lo(.LCPI23_8) +; RV32-NEXT: lui a1, %hi(.LCPI25_6) +; RV32-NEXT: addi a1, a1, %lo(.LCPI25_6) +; RV32-NEXT: lui a2, %hi(.LCPI25_8) 
+; RV32-NEXT: addi a2, a2, %lo(.LCPI25_8) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v4, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI23_9) -; RV32-NEXT: addi a1, a1, %lo(.LCPI23_9) +; RV32-NEXT: lui a1, %hi(.LCPI25_9) +; RV32-NEXT: addi a1, a1, %lo(.LCPI25_9) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v6, (a1) ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma @@ -945,8 +1113,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: li a4, 128 ; RV64-NEXT: lui a1, 1 ; RV64-NEXT: vle64.v v8, (a3) -; RV64-NEXT: lui a3, %hi(.LCPI23_0) -; RV64-NEXT: addi a3, a3, %lo(.LCPI23_0) +; RV64-NEXT: lui a3, %hi(.LCPI25_0) +; RV64-NEXT: addi a3, a3, %lo(.LCPI25_0) ; RV64-NEXT: vmv.s.x v0, a4 ; RV64-NEXT: csrr a4, vlenb ; RV64-NEXT: li a5, 61 @@ -1134,8 +1302,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vslideup.vi v12, v16, 1, v0.t -; RV64-NEXT: lui a2, %hi(.LCPI23_1) -; RV64-NEXT: addi a2, a2, %lo(.LCPI23_1) +; RV64-NEXT: lui a2, %hi(.LCPI25_1) +; RV64-NEXT: addi a2, a2, %lo(.LCPI25_1) ; RV64-NEXT: li a3, 192 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v6, (a2) @@ -1169,8 +1337,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vrgatherei16.vv v24, v16, v6 ; RV64-NEXT: addi a2, sp, 16 ; RV64-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill -; RV64-NEXT: lui a2, %hi(.LCPI23_2) -; RV64-NEXT: addi a2, a2, %lo(.LCPI23_2) +; RV64-NEXT: lui a2, %hi(.LCPI25_2) +; RV64-NEXT: addi a2, a2, %lo(.LCPI25_2) ; RV64-NEXT: li a3, 1040 ; RV64-NEXT: vmv.s.x v0, a3 ; RV64-NEXT: addi a1, a1, -2016 @@ -1254,12 +1422,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill -; RV64-NEXT: lui a1, %hi(.LCPI23_3) -; RV64-NEXT: addi a1, a1, %lo(.LCPI23_3) +; RV64-NEXT: lui a1, %hi(.LCPI25_3) +; RV64-NEXT: addi a1, a1, %lo(.LCPI25_3) ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v20, (a1) -; RV64-NEXT: lui a1, %hi(.LCPI23_4) -; RV64-NEXT: addi a1, a1, %lo(.LCPI23_4) +; RV64-NEXT: lui a1, %hi(.LCPI25_4) +; RV64-NEXT: addi a1, a1, %lo(.LCPI25_4) ; RV64-NEXT: vle16.v v8, (a1) ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 77 @@ -1310,8 +1478,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl2r.v v8, (a1) # vscale x 16-byte Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vrgatherei16.vv v0, v16, v8 -; RV64-NEXT: lui a1, %hi(.LCPI23_5) -; RV64-NEXT: addi a1, a1, %lo(.LCPI23_5) +; RV64-NEXT: lui a1, %hi(.LCPI25_5) +; RV64-NEXT: addi a1, a1, %lo(.LCPI25_5) ; RV64-NEXT: vle16.v v20, (a1) ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 61 @@ -1928,8 +2096,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) { ; RV32-NEXT: vle32.v v12, (a0), v0.t ; RV32-NEXT: li a0, 36 ; RV32-NEXT: vmv.s.x v20, a1 -; RV32-NEXT: lui a1, %hi(.LCPI59_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI59_0) +; RV32-NEXT: lui a1, %hi(.LCPI61_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI61_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v21, (a1) ; RV32-NEXT: vcompress.vm v8, v12, v11 @@ -2004,8 +2172,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) { ; 
RV32-NEXT: vmv.s.x v10, a0 ; RV32-NEXT: li a0, 146 ; RV32-NEXT: vmv.s.x v11, a0 -; RV32-NEXT: lui a0, %hi(.LCPI60_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI60_0) +; RV32-NEXT: lui a0, %hi(.LCPI62_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI62_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v20, (a0) ; RV32-NEXT: li a0, 36 @@ -2094,3 +2262,443 @@ define void @maskedstore_factor2(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1) { tail call void @llvm.masked.store(<8 x i32> %interleaved.vec, ptr %ptr, i32 4, <8 x i1> splat (i1 true)) ret void } + +define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_mask(ptr %ptr) { +; CHECK-LABEL: maskedload_factor3_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v0, 5 +; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t +; CHECK-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %ptr, i32 4, <12 x i1> , <12 x i32> poison) + ; mask = 1010 + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + +define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_skip_field(ptr %ptr) { +; RV32-LABEL: maskedload_factor3_skip_field: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 1755 +; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: li a1, 73 +; RV32-NEXT: vmv.v.i v10, 8 +; RV32-NEXT: vmv.s.x v11, a1 +; RV32-NEXT: li a1, 146 +; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; RV32-NEXT: vle32.v v12, (a0), v0.t +; RV32-NEXT: li a0, 36 +; RV32-NEXT: vmv.s.x v20, a1 +; RV32-NEXT: lui a1, %hi(.LCPI66_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI66_0) +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vle16.v v21, (a1) +; RV32-NEXT: vcompress.vm v8, v12, v11 +; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV32-NEXT: vslidedown.vi v16, v12, 8 +; RV32-NEXT: vmv1r.v v0, v10 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV32-NEXT: vcompress.vm v14, v12, v20 +; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV32-NEXT: vmv.s.x v0, a0 +; RV32-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV32-NEXT: vrgatherei16.vv v10, v12, v21 +; RV32-NEXT: vmv1r.v v9, v14 +; RV32-NEXT: ret +; +; RV64-LABEL: maskedload_factor3_skip_field: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 1755 +; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: li a1, 73 +; RV64-NEXT: vmv.v.i v10, 8 +; RV64-NEXT: vmv.s.x v11, a1 +; RV64-NEXT: li a1, 146 +; RV64-NEXT: vmv.s.x v20, a1 +; RV64-NEXT: li a1, 36 +; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; RV64-NEXT: vle32.v v12, (a0), v0.t +; RV64-NEXT: li a0, 3 +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: addi a0, a0, 5 +; RV64-NEXT: slli a0, a0, 16 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vcompress.vm v8, v12, v11 +; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV64-NEXT: vslidedown.vi v16, v12, 8 +; RV64-NEXT: vmv1r.v v0, v10 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV64-NEXT: vcompress.vm v14, v12, v20 +; RV64-NEXT: vrgather.vi 
v14, v16, 2, v0.t +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: addi a0, a0, 2 +; RV64-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vrgatherei16.vv v10, v12, v9 +; RV64-NEXT: vmv1r.v v9, v14 +; RV64-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %ptr, i32 4, <12 x i1> , <12 x i32> poison) + ; mask = 1111, skip last field + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + +define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_mask_skip_field(ptr %ptr) { +; RV32-LABEL: maskedload_factor3_mask_skip_field: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 195 +; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: li a1, 73 +; RV32-NEXT: vmv.v.i v10, 8 +; RV32-NEXT: vmv.s.x v11, a1 +; RV32-NEXT: li a1, 146 +; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; RV32-NEXT: vle32.v v12, (a0), v0.t +; RV32-NEXT: li a0, 36 +; RV32-NEXT: vmv.s.x v20, a1 +; RV32-NEXT: lui a1, %hi(.LCPI67_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI67_0) +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vle16.v v21, (a1) +; RV32-NEXT: vcompress.vm v8, v12, v11 +; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV32-NEXT: vslidedown.vi v16, v12, 8 +; RV32-NEXT: vmv1r.v v0, v10 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV32-NEXT: vcompress.vm v14, v12, v20 +; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV32-NEXT: vmv.s.x v0, a0 +; RV32-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV32-NEXT: vrgatherei16.vv v10, v12, v21 +; RV32-NEXT: vmv1r.v v9, v14 +; RV32-NEXT: ret +; +; RV64-LABEL: maskedload_factor3_mask_skip_field: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 195 +; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: li a1, 73 +; RV64-NEXT: vmv.v.i v10, 8 +; RV64-NEXT: vmv.s.x v11, a1 +; RV64-NEXT: li a1, 146 +; RV64-NEXT: vmv.s.x v20, a1 +; RV64-NEXT: li a1, 36 +; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; RV64-NEXT: vle32.v v12, (a0), v0.t +; RV64-NEXT: li a0, 3 +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: addi a0, a0, 5 +; RV64-NEXT: slli a0, a0, 16 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vcompress.vm v8, v12, v11 +; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV64-NEXT: vslidedown.vi v16, v12, 8 +; RV64-NEXT: vmv1r.v v0, v10 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV64-NEXT: vcompress.vm v14, v12, v20 +; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: addi a0, a0, 2 +; RV64-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vrgatherei16.vv v10, v12, v9 +; RV64-NEXT: vmv1r.v v9, v14 +; RV64-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %ptr, i32 4, <12 x i1> , <12 x i32> poison) + ; mask = 1010, 
skip the last field + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + +; We can only skip the last field for now. +define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_invalid_skip_field(ptr %ptr) { +; RV32-LABEL: maskedload_factor3_invalid_skip_field: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 73 +; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV32-NEXT: vmv.s.x v11, a1 +; RV32-NEXT: lui a1, 1 +; RV32-NEXT: vmv.v.i v10, 8 +; RV32-NEXT: addi a1, a1, -1171 +; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: li a1, 146 +; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; RV32-NEXT: vle32.v v12, (a0), v0.t +; RV32-NEXT: li a0, 36 +; RV32-NEXT: vmv.s.x v20, a1 +; RV32-NEXT: lui a1, %hi(.LCPI68_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI68_0) +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vle16.v v21, (a1) +; RV32-NEXT: vcompress.vm v8, v12, v11 +; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV32-NEXT: vslidedown.vi v16, v12, 8 +; RV32-NEXT: vmv1r.v v0, v10 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV32-NEXT: vcompress.vm v14, v12, v20 +; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV32-NEXT: vmv.s.x v0, a0 +; RV32-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV32-NEXT: vrgatherei16.vv v10, v12, v21 +; RV32-NEXT: vmv1r.v v9, v14 +; RV32-NEXT: ret +; +; RV64-LABEL: maskedload_factor3_invalid_skip_field: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 73 +; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64-NEXT: vmv.s.x v11, a1 +; RV64-NEXT: li a1, 146 +; RV64-NEXT: vmv.s.x v20, a1 +; RV64-NEXT: lui a1, 1 +; RV64-NEXT: vmv.v.i v10, 8 +; RV64-NEXT: addi a1, a1, -1171 +; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: li a1, 36 +; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; RV64-NEXT: vle32.v v12, (a0), v0.t +; RV64-NEXT: li a0, 3 +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: addi a0, a0, 5 +; RV64-NEXT: slli a0, a0, 16 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vcompress.vm v8, v12, v11 +; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV64-NEXT: vslidedown.vi v16, v12, 8 +; RV64-NEXT: vmv1r.v v0, v10 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV64-NEXT: vcompress.vm v14, v12, v20 +; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: addi a0, a0, 2 +; RV64-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vrgatherei16.vv v10, v12, v9 +; RV64-NEXT: vmv1r.v v9, v14 +; RV64-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %ptr, i32 4, <12 x i1> , <12 x i32> poison) + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x 
i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + +define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor5_skip_fields(ptr %ptr) { + ; mask = 1111, skip the last two fields. +; RV32-LABEL: maskedload_factor5_skip_fields: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -256 +; RV32-NEXT: .cfi_def_cfa_offset 256 +; RV32-NEXT: sw ra, 252(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 248(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: .cfi_offset s0, -8 +; RV32-NEXT: addi s0, sp, 256 +; RV32-NEXT: .cfi_def_cfa s0, 0 +; RV32-NEXT: andi sp, sp, -128 +; RV32-NEXT: lui a1, 58 +; RV32-NEXT: addi a1, a1, -793 +; RV32-NEXT: vsetivli zero, 20, e32, m8, ta, ma +; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: li a1, 33 +; RV32-NEXT: vle32.v v16, (a0), v0.t +; RV32-NEXT: li a0, 32 +; RV32-NEXT: mv a2, sp +; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV32-NEXT: vslidedown.vi v8, v16, 8 +; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV32-NEXT: vslidedown.vi v12, v16, 6 +; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV32-NEXT: vslidedown.vi v13, v16, 1 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vse32.v v16, (a2) +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 +; RV32-NEXT: vslidedown.vi v10, v16, 7 +; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV32-NEXT: vslidedown.vi v11, v16, 2 +; RV32-NEXT: vslidedown.vi v18, v16, 3 +; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV32-NEXT: vslidedown.vi v14, v16, 4 +; RV32-NEXT: vmv.x.s a0, v12 +; RV32-NEXT: vmv.x.s a1, v13 +; RV32-NEXT: vmv.x.s a2, v11 +; RV32-NEXT: vmv.x.s a3, v18 +; RV32-NEXT: vmv.x.s a4, v14 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v11, a1 +; RV32-NEXT: vmv.v.x v12, a2 +; RV32-NEXT: vmv.v.x v13, a3 +; RV32-NEXT: vmv.v.x v14, a4 +; RV32-NEXT: lw a1, 32(sp) +; RV32-NEXT: lw a2, 36(sp) +; RV32-NEXT: lw a3, 44(sp) +; RV32-NEXT: lw a4, 48(sp) +; RV32-NEXT: vslide1down.vx v11, v11, a0 +; RV32-NEXT: vmv.x.s a0, v10 +; RV32-NEXT: vslide1down.vx v10, v12, a0 +; RV32-NEXT: vslide1down.vx v11, v11, a3 +; RV32-NEXT: vslide1down.vx v10, v10, a4 +; RV32-NEXT: vslide1down.vx v12, v13, a1 +; RV32-NEXT: lw a0, 64(sp) +; RV32-NEXT: lw a1, 52(sp) +; RV32-NEXT: lw a3, 56(sp) +; RV32-NEXT: lw a4, 68(sp) +; RV32-NEXT: vslide1down.vx v14, v14, a2 +; RV32-NEXT: vslide1down.vx v13, v11, a0 +; RV32-NEXT: vmv.v.i v0, 10 +; RV32-NEXT: vslide1down.vx v10, v10, a4 +; RV32-NEXT: vslide1down.vx v11, v12, a1 +; RV32-NEXT: lw a0, 72(sp) +; RV32-NEXT: lw a1, 76(sp) +; RV32-NEXT: vslide1down.vx v12, v14, a3 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vslidedown.vi v8, v8, 4, v0.t +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vslide1down.vx v11, v11, a0 +; RV32-NEXT: vslide1down.vx v12, v12, a1 +; RV32-NEXT: vmv1r.v v9, v13 +; RV32-NEXT: addi sp, s0, -256 +; RV32-NEXT: .cfi_def_cfa sp, 256 +; RV32-NEXT: lw ra, 252(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 248(sp) # 4-byte Folded Reload +; RV32-NEXT: .cfi_restore ra +; RV32-NEXT: .cfi_restore s0 +; RV32-NEXT: addi sp, sp, 256 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: maskedload_factor5_skip_fields: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -256 +; RV64-NEXT: 
.cfi_def_cfa_offset 256 +; RV64-NEXT: sd ra, 248(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: .cfi_offset s0, -16 +; RV64-NEXT: addi s0, sp, 256 +; RV64-NEXT: .cfi_def_cfa s0, 0 +; RV64-NEXT: andi sp, sp, -128 +; RV64-NEXT: lui a1, 58 +; RV64-NEXT: addi a1, a1, -793 +; RV64-NEXT: vsetivli zero, 20, e32, m8, ta, ma +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: li a1, 33 +; RV64-NEXT: vle32.v v16, (a0), v0.t +; RV64-NEXT: li a0, 32 +; RV64-NEXT: mv a2, sp +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV64-NEXT: vslidedown.vi v8, v16, 8 +; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64-NEXT: vslidedown.vi v12, v16, 6 +; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64-NEXT: vslidedown.vi v13, v16, 1 +; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV64-NEXT: vse32.v v16, (a2) +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vmerge.vvm v8, v8, v16, v0 +; RV64-NEXT: vslidedown.vi v10, v16, 7 +; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64-NEXT: vslidedown.vi v11, v16, 2 +; RV64-NEXT: vslidedown.vi v18, v16, 3 +; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64-NEXT: vslidedown.vi v14, v16, 4 +; RV64-NEXT: vmv.x.s a0, v12 +; RV64-NEXT: vmv.x.s a1, v13 +; RV64-NEXT: vmv.x.s a2, v11 +; RV64-NEXT: vmv.x.s a3, v18 +; RV64-NEXT: vmv.x.s a4, v14 +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vmv.v.x v11, a1 +; RV64-NEXT: vmv.v.x v12, a2 +; RV64-NEXT: vmv.v.x v13, a3 +; RV64-NEXT: vmv.v.x v14, a4 +; RV64-NEXT: lw a1, 32(sp) +; RV64-NEXT: lw a2, 36(sp) +; RV64-NEXT: lw a3, 44(sp) +; RV64-NEXT: lw a4, 48(sp) +; RV64-NEXT: vslide1down.vx v11, v11, a0 +; RV64-NEXT: vmv.x.s a0, v10 +; RV64-NEXT: vslide1down.vx v10, v12, a0 +; RV64-NEXT: vslide1down.vx v11, v11, a3 +; RV64-NEXT: vslide1down.vx v10, v10, a4 +; RV64-NEXT: vslide1down.vx v12, v13, a1 +; RV64-NEXT: lw a0, 64(sp) +; RV64-NEXT: lw a1, 52(sp) +; RV64-NEXT: lw a3, 56(sp) +; RV64-NEXT: lw a4, 68(sp) +; RV64-NEXT: vslide1down.vx v14, v14, a2 +; RV64-NEXT: vslide1down.vx v13, v11, a0 +; RV64-NEXT: vmv.v.i v0, 10 +; RV64-NEXT: vslide1down.vx v10, v10, a4 +; RV64-NEXT: vslide1down.vx v11, v12, a1 +; RV64-NEXT: lw a0, 72(sp) +; RV64-NEXT: lw a1, 76(sp) +; RV64-NEXT: vslide1down.vx v12, v14, a3 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vslidedown.vi v8, v8, 4, v0.t +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vslide1down.vx v11, v11, a0 +; RV64-NEXT: vslide1down.vx v12, v12, a1 +; RV64-NEXT: vmv1r.v v9, v13 +; RV64-NEXT: addi sp, s0, -256 +; RV64-NEXT: .cfi_def_cfa sp, 256 +; RV64-NEXT: ld ra, 248(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 240(sp) # 8-byte Folded Reload +; RV64-NEXT: .cfi_restore ra +; RV64-NEXT: .cfi_restore s0 +; RV64-NEXT: addi sp, sp, 256 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret + %interleaved.vec = tail call <20 x i32> @llvm.masked.load(ptr %ptr, i32 4, <20 x i1> , <20 x i32> poison) + %v0 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v1 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v2 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v3 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v4 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, 
<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+  %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2
+  %res3 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res2, <4 x i32> %v3, 3
+  %res4 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res3, <4 x i32> %v4, 4
+  ret {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res4
+}
+

From 9d6ef18b57878c74ce673875683ade639f1eb14e Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu
Date: Thu, 31 Jul 2025 15:56:04 -0700
Subject: [PATCH 2/9] [IA][RISCV] Recognize deinterleaved loads that could lower to strided segmented loads

---
 llvm/include/llvm/CodeGen/TargetLowering.h    |   6 +-
 llvm/lib/CodeGen/InterleavedAccessPass.cpp    |  81 ++-
 .../Target/AArch64/AArch64ISelLowering.cpp    |   2 +-
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |   4 +-
 llvm/lib/Target/ARM/ARMISelLowering.cpp       |   2 +-
 llvm/lib/Target/ARM/ARMISelLowering.h         |   4 +-
 llvm/lib/Target/RISCV/RISCVISelLowering.h     |   4 +-
 .../Target/RISCV/RISCVInterleavedAccess.cpp   |  41 +-
 llvm/lib/Target/X86/X86ISelLowering.h         |   4 +-
 llvm/lib/Target/X86/X86InterleavedAccess.cpp  |   2 +-
 .../rvv/fixed-vectors-interleaved-access.ll   | 475 ++----------------
 11 files changed, 139 insertions(+), 486 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index cbdc1b6031680..3239b35031e36 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3209,10 +3209,12 @@ class LLVM_ABI TargetLoweringBase {
   /// \p Shuffles is the shufflevector list to DE-interleave the loaded vector.
   /// \p Indices is the corresponding indices for each shufflevector.
   /// \p Factor is the interleave factor.
+  /// \p MaskFactor is the effective interleave factor once the mask is taken
+  /// into account; it can be smaller than \p Factor.
   virtual bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
                                     ArrayRef Shuffles,
-                                    ArrayRef Indices,
-                                    unsigned Factor) const {
+                                    ArrayRef Indices, unsigned Factor,
+                                    unsigned MaskFactor) const {
     return false;
   }
 
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 5e508989ef2da..e6c4de23c055e 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -268,13 +268,19 @@ static Value *getMaskOperand(IntrinsicInst *II) {
   }
 }
 
-// Return the corresponded deinterleaved mask, or nullptr if there is no valid
-// mask.
-static Value *getMask(Value *WideMask, unsigned Factor,
-                      ElementCount LeafValueEC);
-
-static Value *getMask(Value *WideMask, unsigned Factor,
-                      VectorType *LeafValueTy) {
+// Return a pair of:
+// (1) The corresponding deinterleaved mask, or nullptr if there is no valid
+// mask.
+// (2) The effective factor: some masks completely skip certain fields, and
+// this element holds the factor after such contraction has been taken into
+// account. Note that we currently only support skipping trailing fields, so
+// with a "nominal" factor of 5 you can skip fields 3 and 4, but you cannot
+// skip only fields 1 and 2.
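+//
+// For example (an illustrative case only): given Factor = 3 and the fixed
+// mask <1,1,0, 1,1,0, 1,1,0, 1,1,0>, every lane of field 2 is masked off, so
+// this returns the per-lane mask <1,1,1,1> together with an effective factor
+// of 2.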
+static std::pair getMask(Value *WideMask, unsigned Factor, + ElementCount LeafValueEC); + +static std::pair getMask(Value *WideMask, unsigned Factor, + VectorType *LeafValueTy) { return getMask(WideMask, Factor, LeafValueTy->getElementCount()); } @@ -379,22 +385,25 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load); Value *Mask = nullptr; + unsigned MaskFactor = Factor; if (LI) { LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n"); } else { // Check mask operand. Handle both all-true/false and interleaved mask. - Mask = getMask(getMaskOperand(II), Factor, VecTy); + std::tie(Mask, MaskFactor) = getMask(getMaskOperand(II), Factor, VecTy); if (!Mask) return false; LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load or masked.load: " << *Load << "\n"); + LLVM_DEBUG(dbgs() << "IA: With nominal factor " << Factor + << " and mask factor " << MaskFactor << "\n"); } // Try to create target specific intrinsics to replace the load and // shuffles. if (!TLI->lowerInterleavedLoad(cast(Load), Mask, Shuffles, - Indices, Factor)) + Indices, Factor, MaskFactor)) // If Extracts is not empty, tryReplaceExtracts made changes earlier. return !Extracts.empty() || BinOpShuffleChanged; @@ -536,8 +545,8 @@ bool InterleavedAccessImpl::lowerInterleavedStore( } else { // Check mask operand. Handle both all-true/false and interleaved mask. unsigned LaneMaskLen = NumStoredElements / Factor; - Mask = getMask(getMaskOperand(II), Factor, - ElementCount::getFixed(LaneMaskLen)); + std::tie(Mask, std::ignore) = getMask(getMaskOperand(II), Factor, + ElementCount::getFixed(LaneMaskLen)); if (!Mask) return false; @@ -556,34 +565,57 @@ bool InterleavedAccessImpl::lowerInterleavedStore( return true; } -static Value *getMask(Value *WideMask, unsigned Factor, - ElementCount LeafValueEC) { +static std::pair getMask(Value *WideMask, unsigned Factor, + ElementCount LeafValueEC) { if (auto *IMI = dyn_cast(WideMask)) { if (unsigned F = getInterleaveIntrinsicFactor(IMI->getIntrinsicID()); F && F == Factor && llvm::all_equal(IMI->args())) { - return IMI->getArgOperand(0); + return {IMI->getArgOperand(0), Factor}; } } if (auto *ConstMask = dyn_cast(WideMask)) { if (auto *Splat = ConstMask->getSplatValue()) // All-ones or all-zeros mask. - return ConstantVector::getSplat(LeafValueEC, Splat); + return {ConstantVector::getSplat(LeafValueEC, Splat), Factor}; if (LeafValueEC.isFixed()) { unsigned LeafMaskLen = LeafValueEC.getFixedValue(); + // First, check if the mask completely skips some of the factors / fields. + APInt FactorMask(Factor, 0); + FactorMask.setAllBits(); + for (unsigned F = 0U; F < Factor; ++F) { + unsigned Idx; + for (Idx = 0U; Idx < LeafMaskLen; ++Idx) { + Constant *C = ConstMask->getAggregateElement(F + Idx * Factor); + if (!C->isZeroValue()) + break; + } + // All mask bits on this field are zero, skipping it. + if (Idx >= LeafMaskLen) + FactorMask.clearBit(F); + } + // We currently only support skipping "trailing" factors / fields. So + // given the original factor being 4, we can skip fields 2 and 3, but we + // cannot only skip fields 1 and 2. If FactorMask does not match such + // pattern, reset it. + if (!FactorMask.isMask()) + FactorMask.setAllBits(); + SmallVector LeafMask(LeafMaskLen, nullptr); // If this is a fixed-length constant mask, each lane / leaf has to // use the same mask. This is done by checking if every group with Factor // number of elements in the interleaved mask has homogeneous values. 
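+      // For instance (illustrative examples only): with Factor = 2 the wide
+      // mask <1,1, 0,0, 1,1> yields the per-lane mask <1,0,1>, whereas a mask
+      // like <1,0, 1,1, 1,1> has no single per-lane mask and we give up.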
for (unsigned Idx = 0U; Idx < LeafMaskLen * Factor; ++Idx) { + if (!FactorMask[Idx % Factor]) + continue; Constant *C = ConstMask->getAggregateElement(Idx); if (LeafMask[Idx / Factor] && LeafMask[Idx / Factor] != C) - return nullptr; + return {nullptr, Factor}; LeafMask[Idx / Factor] = C; } - return ConstantVector::get(LeafMask); + return {ConstantVector::get(LeafMask), FactorMask.popcount()}; } } @@ -603,12 +635,13 @@ static Value *getMask(Value *WideMask, unsigned Factor, auto *LeafMaskTy = VectorType::get(Type::getInt1Ty(SVI->getContext()), LeafValueEC); IRBuilder<> Builder(SVI); - return Builder.CreateExtractVector(LeafMaskTy, SVI->getOperand(0), - uint64_t(0)); + return {Builder.CreateExtractVector(LeafMaskTy, SVI->getOperand(0), + uint64_t(0)), + Factor}; } } - return nullptr; + return {nullptr, Factor}; } bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( @@ -639,7 +672,8 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( return false; // Check mask operand. Handle both all-true/false and interleaved mask. - Mask = getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI)); + std::tie(Mask, std::ignore) = + getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI)); if (!Mask) return false; @@ -680,8 +714,9 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( II->getIntrinsicID() != Intrinsic::vp_store) return false; // Check mask operand. Handle both all-true/false and interleaved mask. - Mask = getMask(getMaskOperand(II), Factor, - cast(InterleaveValues[0]->getType())); + std::tie(Mask, std::ignore) = + getMask(getMaskOperand(II), Factor, + cast(InterleaveValues[0]->getType())); if (!Mask) return false; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 2b6ea86ee1af5..e681d846f9e1c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -17254,7 +17254,7 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor, /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool AArch64TargetLowering::lowerInterleavedLoad( Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, unsigned Factor) const { + ArrayRef Indices, unsigned Factor, unsigned MaskFactor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); assert(!Shuffles.empty() && "Empty shufflevector input"); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index ea63edd86210e..d0d6512d39015 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -220,8 +220,8 @@ class AArch64TargetLowering : public TargetLowering { bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, - unsigned Factor) const override; + ArrayRef Indices, unsigned Factor, + unsigned MaskFactor) const override; bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor) const override; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 936625606e315..a5750def66b7d 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21599,7 +21599,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const { /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 bool ARMTargetLowering::lowerInterleavedLoad( 
Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, unsigned Factor) const { + ArrayRef Indices, unsigned Factor, unsigned MaskFactor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); assert(!Shuffles.empty() && "Empty shufflevector input"); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 825145d813fb1..670bbb62fe0f6 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -683,8 +683,8 @@ class VectorType; bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, - unsigned Factor) const override; + ArrayRef Indices, unsigned Factor, + unsigned MaskFactor) const override; bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor) const override; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index fa50e2105a708..4155f613f7f04 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -431,8 +431,8 @@ class RISCVTargetLowering : public TargetLowering { bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, - unsigned Factor) const override; + ArrayRef Indices, unsigned Factor, + unsigned MaskFactor) const override; bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index 726920e4015cf..d4e6351ea6a51 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -63,6 +63,12 @@ static const Intrinsic::ID FixedVlsegIntrIds[] = { Intrinsic::riscv_seg6_load_mask, Intrinsic::riscv_seg7_load_mask, Intrinsic::riscv_seg8_load_mask}; +static const Intrinsic::ID FixedVlssegIntrIds[] = { + Intrinsic::riscv_sseg2_load_mask, Intrinsic::riscv_sseg3_load_mask, + Intrinsic::riscv_sseg4_load_mask, Intrinsic::riscv_sseg5_load_mask, + Intrinsic::riscv_sseg6_load_mask, Intrinsic::riscv_sseg7_load_mask, + Intrinsic::riscv_sseg8_load_mask}; + static const Intrinsic::ID ScalableVlsegIntrIds[] = { Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask, Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask, @@ -197,9 +203,13 @@ static bool getMemOperands(unsigned Factor, VectorType *VTy, Type *XLenTy, /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool RISCVTargetLowering::lowerInterleavedLoad( Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, unsigned Factor) const { + ArrayRef Indices, unsigned Factor, unsigned MaskFactor) const { assert(Indices.size() == Shuffles.size()); + assert(MaskFactor <= Factor); + // TODO: Lower to strided load when MaskFactor = 1. 
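+  // (When MaskFactor is 1, only the first field is still read, so a plain
+  // strided load such as vlse32.v would suffice rather than a segmented one;
+  // for now we simply bail out.)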
+ if (MaskFactor < 2) + return false; IRBuilder<> Builder(Load); const DataLayout &DL = Load->getDataLayout(); @@ -208,20 +218,37 @@ bool RISCVTargetLowering::lowerInterleavedLoad( Value *Ptr, *VL; Align Alignment; - if (!getMemOperands(Factor, VTy, XLenTy, Load, Ptr, Mask, VL, Alignment)) + if (!getMemOperands(MaskFactor, VTy, XLenTy, Load, Ptr, Mask, VL, Alignment)) return false; Type *PtrTy = Ptr->getType(); unsigned AS = PtrTy->getPointerAddressSpace(); - if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL)) + if (!isLegalInterleavedAccessType(VTy, MaskFactor, Alignment, AS, DL)) return false; - CallInst *VlsegN = Builder.CreateIntrinsic( - FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL}); + CallInst *SegLoad = nullptr; + if (MaskFactor < Factor) { + // Lower to strided segmented load. + unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType()); + Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); + SegLoad = Builder.CreateIntrinsic(FixedVlssegIntrIds[MaskFactor - 2], + {VTy, PtrTy, XLenTy, XLenTy}, + {Ptr, Stride, Mask, VL}); + } else { + // Lower to normal segmented load. + SegLoad = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2], + {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL}); + } for (unsigned i = 0; i < Shuffles.size(); i++) { - Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]); - Shuffles[i]->replaceAllUsesWith(SubVec); + unsigned FactorIdx = Indices[i]; + if (FactorIdx >= MaskFactor) { + // Replace masked-off factors (that are still extracted) with poison. + Shuffles[i]->replaceAllUsesWith(PoisonValue::get(VTy)); + } else { + Value *SubVec = Builder.CreateExtractValue(SegLoad, FactorIdx); + Shuffles[i]->replaceAllUsesWith(SubVec); + } } return true; diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 547b2210fdbf0..242d24b5faf60 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1663,8 +1663,8 @@ namespace llvm { /// instructions/intrinsics. bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, - unsigned Factor) const override; + ArrayRef Indices, unsigned Factor, + unsigned MaskFactor) const override; /// Lower interleaved store(s) into target specific /// instructions/intrinsics. diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp index 636b072837441..6929c869b1a31 100644 --- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp +++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -802,7 +802,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { // Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX. 
bool X86TargetLowering::lowerInterleavedLoad( Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, unsigned Factor) const { + ArrayRef Indices, unsigned Factor, unsigned MaskFactor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); assert(!Shuffles.empty() && "Empty shufflevector input"); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 2df26b2f78d5b..497b39fb6f044 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -334,78 +334,12 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_poison_shufflemask(ptr define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_skip_fields(ptr %ptr) { ; mask = 1111, skip the last field. -; RV32-LABEL: vpload_factor3_skip_fields: -; RV32: # %bb.0: -; RV32-NEXT: li a1, 1755 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV32-NEXT: vmv.s.x v0, a1 -; RV32-NEXT: li a1, 73 -; RV32-NEXT: vmv.v.i v10, 8 -; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma -; RV32-NEXT: vle32.v v12, (a0), v0.t -; RV32-NEXT: li a0, 36 -; RV32-NEXT: vmv.s.x v11, a1 -; RV32-NEXT: lui a1, %hi(.LCPI17_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI17_0) -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vcompress.vm v8, v12, v11 -; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; RV32-NEXT: vslidedown.vi v16, v12, 8 -; RV32-NEXT: vmv1r.v v0, v10 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t -; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; RV32-NEXT: vmv.v.i v0, 2 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vslidedown.vi v14, v12, 1 -; RV32-NEXT: vslidedown.vi v14, v12, 3, v0.t -; RV32-NEXT: vle16.v v9, (a1) -; RV32-NEXT: vmv1r.v v0, v10 -; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t -; RV32-NEXT: vmv.s.x v0, a0 -; RV32-NEXT: vmerge.vvm v12, v16, v12, v0 -; RV32-NEXT: vrgatherei16.vv v10, v12, v9 -; RV32-NEXT: vmv1r.v v9, v14 -; RV32-NEXT: ret -; -; RV64-LABEL: vpload_factor3_skip_fields: -; RV64: # %bb.0: -; RV64-NEXT: li a1, 1755 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: li a1, 73 -; RV64-NEXT: vmv.v.i v10, 8 -; RV64-NEXT: vmv.s.x v11, a1 -; RV64-NEXT: li a1, 36 -; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma -; RV64-NEXT: vle32.v v12, (a0), v0.t -; RV64-NEXT: li a0, 3 -; RV64-NEXT: slli a0, a0, 32 -; RV64-NEXT: addi a0, a0, 5 -; RV64-NEXT: slli a0, a0, 16 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vcompress.vm v8, v12, v11 -; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; RV64-NEXT: vslidedown.vi v16, v12, 8 -; RV64-NEXT: vmv1r.v v0, v10 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t -; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; RV64-NEXT: vmv.v.i v0, 2 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64-NEXT: vslidedown.vi v14, v12, 1 -; RV64-NEXT: vslidedown.vi v14, v12, 3, v0.t -; RV64-NEXT: vmv1r.v v0, v10 -; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t -; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: vmerge.vvm v12, v16, v12, v0 -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vmv.v.x v9, a0 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vrgatherei16.vv v10, v12, v9 -; RV64-NEXT: vmv1r.v v9, v14 -; RV64-NEXT: ret +; CHECK-LABEL: vpload_factor3_skip_fields: +; CHECK: # %bb.0: 
+; CHECK-NEXT: li a1, 12 +; CHECK-NEXT: vsetivli zero, 6, e32, m1, ta, ma +; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1 +; CHECK-NEXT: ret %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> , i32 12) %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> @@ -418,78 +352,13 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_skip_fields(ptr %ptr) { define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_mask_skip_fields(ptr %ptr) { ; mask = 0101, skip the last field. -; RV32-LABEL: vpload_factor3_mask_skip_fields: -; RV32: # %bb.0: -; RV32-NEXT: li a1, 1560 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV32-NEXT: vmv.s.x v0, a1 -; RV32-NEXT: li a1, 73 -; RV32-NEXT: vmv.v.i v10, 8 -; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma -; RV32-NEXT: vle32.v v12, (a0), v0.t -; RV32-NEXT: li a0, 36 -; RV32-NEXT: vmv.s.x v11, a1 -; RV32-NEXT: lui a1, %hi(.LCPI18_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI18_0) -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vcompress.vm v8, v12, v11 -; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; RV32-NEXT: vslidedown.vi v16, v12, 8 -; RV32-NEXT: vmv1r.v v0, v10 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t -; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; RV32-NEXT: vmv.v.i v0, 2 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vslidedown.vi v14, v12, 1 -; RV32-NEXT: vslidedown.vi v14, v12, 3, v0.t -; RV32-NEXT: vle16.v v9, (a1) -; RV32-NEXT: vmv1r.v v0, v10 -; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t -; RV32-NEXT: vmv.s.x v0, a0 -; RV32-NEXT: vmerge.vvm v12, v16, v12, v0 -; RV32-NEXT: vrgatherei16.vv v10, v12, v9 -; RV32-NEXT: vmv1r.v v9, v14 -; RV32-NEXT: ret -; -; RV64-LABEL: vpload_factor3_mask_skip_fields: -; RV64: # %bb.0: -; RV64-NEXT: li a1, 1560 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: li a1, 73 -; RV64-NEXT: vmv.v.i v10, 8 -; RV64-NEXT: vmv.s.x v11, a1 -; RV64-NEXT: li a1, 36 -; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma -; RV64-NEXT: vle32.v v12, (a0), v0.t -; RV64-NEXT: li a0, 3 -; RV64-NEXT: slli a0, a0, 32 -; RV64-NEXT: addi a0, a0, 5 -; RV64-NEXT: slli a0, a0, 16 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vcompress.vm v8, v12, v11 -; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; RV64-NEXT: vslidedown.vi v16, v12, 8 -; RV64-NEXT: vmv1r.v v0, v10 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t -; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; RV64-NEXT: vmv.v.i v0, 2 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64-NEXT: vslidedown.vi v14, v12, 1 -; RV64-NEXT: vslidedown.vi v14, v12, 3, v0.t -; RV64-NEXT: vmv1r.v v0, v10 -; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t -; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: vmerge.vvm v12, v16, v12, v0 -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vmv.v.x v9, a0 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vrgatherei16.vv v10, v12, v9 -; RV64-NEXT: vmv1r.v v9, v14 -; RV64-NEXT: ret +; CHECK-LABEL: vpload_factor3_mask_skip_fields: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v0, 10 +; CHECK-NEXT: li a1, 12 +; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> , i32 12) %v0 = shufflevector 
<12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> @@ -2282,72 +2151,12 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_mask(ptr %ptr) { } define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_skip_field(ptr %ptr) { -; RV32-LABEL: maskedload_factor3_skip_field: -; RV32: # %bb.0: -; RV32-NEXT: li a1, 1755 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV32-NEXT: vmv.s.x v0, a1 -; RV32-NEXT: li a1, 73 -; RV32-NEXT: vmv.v.i v10, 8 -; RV32-NEXT: vmv.s.x v11, a1 -; RV32-NEXT: li a1, 146 -; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma -; RV32-NEXT: vle32.v v12, (a0), v0.t -; RV32-NEXT: li a0, 36 -; RV32-NEXT: vmv.s.x v20, a1 -; RV32-NEXT: lui a1, %hi(.LCPI66_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI66_0) -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vle16.v v21, (a1) -; RV32-NEXT: vcompress.vm v8, v12, v11 -; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; RV32-NEXT: vslidedown.vi v16, v12, 8 -; RV32-NEXT: vmv1r.v v0, v10 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t -; RV32-NEXT: vcompress.vm v14, v12, v20 -; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t -; RV32-NEXT: vmv.s.x v0, a0 -; RV32-NEXT: vmerge.vvm v12, v16, v12, v0 -; RV32-NEXT: vrgatherei16.vv v10, v12, v21 -; RV32-NEXT: vmv1r.v v9, v14 -; RV32-NEXT: ret -; -; RV64-LABEL: maskedload_factor3_skip_field: -; RV64: # %bb.0: -; RV64-NEXT: li a1, 1755 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: li a1, 73 -; RV64-NEXT: vmv.v.i v10, 8 -; RV64-NEXT: vmv.s.x v11, a1 -; RV64-NEXT: li a1, 146 -; RV64-NEXT: vmv.s.x v20, a1 -; RV64-NEXT: li a1, 36 -; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma -; RV64-NEXT: vle32.v v12, (a0), v0.t -; RV64-NEXT: li a0, 3 -; RV64-NEXT: slli a0, a0, 32 -; RV64-NEXT: addi a0, a0, 5 -; RV64-NEXT: slli a0, a0, 16 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vcompress.vm v8, v12, v11 -; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; RV64-NEXT: vslidedown.vi v16, v12, 8 -; RV64-NEXT: vmv1r.v v0, v10 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t -; RV64-NEXT: vcompress.vm v14, v12, v20 -; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t -; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: vmerge.vvm v12, v16, v12, v0 -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vmv.v.x v9, a0 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vrgatherei16.vv v10, v12, v9 -; RV64-NEXT: vmv1r.v v9, v14 -; RV64-NEXT: ret +; CHECK-LABEL: maskedload_factor3_skip_field: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 12 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1 +; CHECK-NEXT: ret %interleaved.vec = tail call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %ptr, i32 4, <12 x i1> , <12 x i32> poison) ; mask = 1111, skip last field %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> @@ -2360,72 +2169,13 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_skip_field(ptr %ptr } define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_mask_skip_field(ptr %ptr) { -; RV32-LABEL: maskedload_factor3_mask_skip_field: -; RV32: # %bb.0: -; RV32-NEXT: li a1, 195 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV32-NEXT: vmv.s.x v0, a1 -; RV32-NEXT: li a1, 73 -; RV32-NEXT: vmv.v.i v10, 8 -; RV32-NEXT: vmv.s.x v11, a1 -; RV32-NEXT: li a1, 146 -; RV32-NEXT: vsetivli 
zero, 12, e32, m4, ta, ma -; RV32-NEXT: vle32.v v12, (a0), v0.t -; RV32-NEXT: li a0, 36 -; RV32-NEXT: vmv.s.x v20, a1 -; RV32-NEXT: lui a1, %hi(.LCPI67_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI67_0) -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vle16.v v21, (a1) -; RV32-NEXT: vcompress.vm v8, v12, v11 -; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; RV32-NEXT: vslidedown.vi v16, v12, 8 -; RV32-NEXT: vmv1r.v v0, v10 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t -; RV32-NEXT: vcompress.vm v14, v12, v20 -; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t -; RV32-NEXT: vmv.s.x v0, a0 -; RV32-NEXT: vmerge.vvm v12, v16, v12, v0 -; RV32-NEXT: vrgatherei16.vv v10, v12, v21 -; RV32-NEXT: vmv1r.v v9, v14 -; RV32-NEXT: ret -; -; RV64-LABEL: maskedload_factor3_mask_skip_field: -; RV64: # %bb.0: -; RV64-NEXT: li a1, 195 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: li a1, 73 -; RV64-NEXT: vmv.v.i v10, 8 -; RV64-NEXT: vmv.s.x v11, a1 -; RV64-NEXT: li a1, 146 -; RV64-NEXT: vmv.s.x v20, a1 -; RV64-NEXT: li a1, 36 -; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma -; RV64-NEXT: vle32.v v12, (a0), v0.t -; RV64-NEXT: li a0, 3 -; RV64-NEXT: slli a0, a0, 32 -; RV64-NEXT: addi a0, a0, 5 -; RV64-NEXT: slli a0, a0, 16 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vcompress.vm v8, v12, v11 -; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; RV64-NEXT: vslidedown.vi v16, v12, 8 -; RV64-NEXT: vmv1r.v v0, v10 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t -; RV64-NEXT: vcompress.vm v14, v12, v20 -; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t -; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: vmerge.vvm v12, v16, v12, v0 -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vmv.v.x v9, a0 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vrgatherei16.vv v10, v12, v9 -; RV64-NEXT: vmv1r.v v9, v14 -; RV64-NEXT: ret +; CHECK-LABEL: maskedload_factor3_mask_skip_field: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v0, 5 +; CHECK-NEXT: li a1, 12 +; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %interleaved.vec = tail call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %ptr, i32 4, <12 x i1> , <12 x i32> poison) ; mask = 1010, skip the last field %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> @@ -2521,173 +2271,12 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_invalid_skip_field( define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor5_skip_fields(ptr %ptr) { ; mask = 1111, skip the last two fields. 
-; RV32-LABEL: maskedload_factor5_skip_fields: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -256 -; RV32-NEXT: .cfi_def_cfa_offset 256 -; RV32-NEXT: sw ra, 252(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 248(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 256 -; RV32-NEXT: .cfi_def_cfa s0, 0 -; RV32-NEXT: andi sp, sp, -128 -; RV32-NEXT: lui a1, 58 -; RV32-NEXT: addi a1, a1, -793 -; RV32-NEXT: vsetivli zero, 20, e32, m8, ta, ma -; RV32-NEXT: vmv.s.x v0, a1 -; RV32-NEXT: li a1, 33 -; RV32-NEXT: vle32.v v16, (a0), v0.t -; RV32-NEXT: li a0, 32 -; RV32-NEXT: mv a2, sp -; RV32-NEXT: vmv.s.x v0, a1 -; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; RV32-NEXT: vslidedown.vi v8, v16, 8 -; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32-NEXT: vslidedown.vi v12, v16, 6 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: vslidedown.vi v13, v16, 1 -; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vse32.v v16, (a2) -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 -; RV32-NEXT: vslidedown.vi v10, v16, 7 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: vslidedown.vi v11, v16, 2 -; RV32-NEXT: vslidedown.vi v18, v16, 3 -; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32-NEXT: vslidedown.vi v14, v16, 4 -; RV32-NEXT: vmv.x.s a0, v12 -; RV32-NEXT: vmv.x.s a1, v13 -; RV32-NEXT: vmv.x.s a2, v11 -; RV32-NEXT: vmv.x.s a3, v18 -; RV32-NEXT: vmv.x.s a4, v14 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v11, a1 -; RV32-NEXT: vmv.v.x v12, a2 -; RV32-NEXT: vmv.v.x v13, a3 -; RV32-NEXT: vmv.v.x v14, a4 -; RV32-NEXT: lw a1, 32(sp) -; RV32-NEXT: lw a2, 36(sp) -; RV32-NEXT: lw a3, 44(sp) -; RV32-NEXT: lw a4, 48(sp) -; RV32-NEXT: vslide1down.vx v11, v11, a0 -; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: vslide1down.vx v10, v12, a0 -; RV32-NEXT: vslide1down.vx v11, v11, a3 -; RV32-NEXT: vslide1down.vx v10, v10, a4 -; RV32-NEXT: vslide1down.vx v12, v13, a1 -; RV32-NEXT: lw a0, 64(sp) -; RV32-NEXT: lw a1, 52(sp) -; RV32-NEXT: lw a3, 56(sp) -; RV32-NEXT: lw a4, 68(sp) -; RV32-NEXT: vslide1down.vx v14, v14, a2 -; RV32-NEXT: vslide1down.vx v13, v11, a0 -; RV32-NEXT: vmv.v.i v0, 10 -; RV32-NEXT: vslide1down.vx v10, v10, a4 -; RV32-NEXT: vslide1down.vx v11, v12, a1 -; RV32-NEXT: lw a0, 72(sp) -; RV32-NEXT: lw a1, 76(sp) -; RV32-NEXT: vslide1down.vx v12, v14, a3 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vslidedown.vi v8, v8, 4, v0.t -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vslide1down.vx v11, v11, a0 -; RV32-NEXT: vslide1down.vx v12, v12, a1 -; RV32-NEXT: vmv1r.v v9, v13 -; RV32-NEXT: addi sp, s0, -256 -; RV32-NEXT: .cfi_def_cfa sp, 256 -; RV32-NEXT: lw ra, 252(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s0, 248(sp) # 4-byte Folded Reload -; RV32-NEXT: .cfi_restore ra -; RV32-NEXT: .cfi_restore s0 -; RV32-NEXT: addi sp, sp, 256 -; RV32-NEXT: .cfi_def_cfa_offset 0 -; RV32-NEXT: ret -; -; RV64-LABEL: maskedload_factor5_skip_fields: -; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -256 -; RV64-NEXT: .cfi_def_cfa_offset 256 -; RV64-NEXT: sd ra, 248(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 240(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 256 -; RV64-NEXT: .cfi_def_cfa s0, 0 -; RV64-NEXT: andi sp, sp, -128 -; RV64-NEXT: lui a1, 58 -; RV64-NEXT: addi a1, a1, -793 -; RV64-NEXT: vsetivli zero, 20, e32, m8, ta, ma -; RV64-NEXT: vmv.s.x v0, a1 -; 
RV64-NEXT: li a1, 33 -; RV64-NEXT: vle32.v v16, (a0), v0.t -; RV64-NEXT: li a0, 32 -; RV64-NEXT: mv a2, sp -; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; RV64-NEXT: vslidedown.vi v8, v16, 8 -; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64-NEXT: vslidedown.vi v12, v16, 6 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-NEXT: vslidedown.vi v13, v16, 1 -; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV64-NEXT: vse32.v v16, (a2) -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vmerge.vvm v8, v8, v16, v0 -; RV64-NEXT: vslidedown.vi v10, v16, 7 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-NEXT: vslidedown.vi v11, v16, 2 -; RV64-NEXT: vslidedown.vi v18, v16, 3 -; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64-NEXT: vslidedown.vi v14, v16, 4 -; RV64-NEXT: vmv.x.s a0, v12 -; RV64-NEXT: vmv.x.s a1, v13 -; RV64-NEXT: vmv.x.s a2, v11 -; RV64-NEXT: vmv.x.s a3, v18 -; RV64-NEXT: vmv.x.s a4, v14 -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.v.x v11, a1 -; RV64-NEXT: vmv.v.x v12, a2 -; RV64-NEXT: vmv.v.x v13, a3 -; RV64-NEXT: vmv.v.x v14, a4 -; RV64-NEXT: lw a1, 32(sp) -; RV64-NEXT: lw a2, 36(sp) -; RV64-NEXT: lw a3, 44(sp) -; RV64-NEXT: lw a4, 48(sp) -; RV64-NEXT: vslide1down.vx v11, v11, a0 -; RV64-NEXT: vmv.x.s a0, v10 -; RV64-NEXT: vslide1down.vx v10, v12, a0 -; RV64-NEXT: vslide1down.vx v11, v11, a3 -; RV64-NEXT: vslide1down.vx v10, v10, a4 -; RV64-NEXT: vslide1down.vx v12, v13, a1 -; RV64-NEXT: lw a0, 64(sp) -; RV64-NEXT: lw a1, 52(sp) -; RV64-NEXT: lw a3, 56(sp) -; RV64-NEXT: lw a4, 68(sp) -; RV64-NEXT: vslide1down.vx v14, v14, a2 -; RV64-NEXT: vslide1down.vx v13, v11, a0 -; RV64-NEXT: vmv.v.i v0, 10 -; RV64-NEXT: vslide1down.vx v10, v10, a4 -; RV64-NEXT: vslide1down.vx v11, v12, a1 -; RV64-NEXT: lw a0, 72(sp) -; RV64-NEXT: lw a1, 76(sp) -; RV64-NEXT: vslide1down.vx v12, v14, a3 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64-NEXT: vslidedown.vi v8, v8, 4, v0.t -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vslide1down.vx v11, v11, a0 -; RV64-NEXT: vslide1down.vx v12, v12, a1 -; RV64-NEXT: vmv1r.v v9, v13 -; RV64-NEXT: addi sp, s0, -256 -; RV64-NEXT: .cfi_def_cfa sp, 256 -; RV64-NEXT: ld ra, 248(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s0, 240(sp) # 8-byte Folded Reload -; RV64-NEXT: .cfi_restore ra -; RV64-NEXT: .cfi_restore s0 -; RV64-NEXT: addi sp, sp, 256 -; RV64-NEXT: .cfi_def_cfa_offset 0 -; RV64-NEXT: ret +; CHECK-LABEL: maskedload_factor5_skip_fields: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 20 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlsseg3e32.v v8, (a0), a1 +; CHECK-NEXT: ret %interleaved.vec = tail call <20 x i32> @llvm.masked.load(ptr %ptr, i32 4, <20 x i1> , <20 x i32> poison) %v0 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> %v1 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> From 95f772e818601ddf5f54e76ec518715f3929eeb2 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Mon, 4 Aug 2025 11:16:21 -0700 Subject: [PATCH 3/9] fixup! 
Clean up the tests --- .../rvv/fixed-vectors-interleaved-access.ll | 48 ++++++++----------- 1 file changed, 21 insertions(+), 27 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 497b39fb6f044..a61a1b7cf9703 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -332,7 +332,7 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_poison_shufflemask(ptr ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 } -define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_skip_fields(ptr %ptr) { +define {<4 x i32>, <4 x i32>} @vpload_factor3_skip_fields(ptr %ptr) { ; mask = 1111, skip the last field. ; CHECK-LABEL: vpload_factor3_skip_fields: ; CHECK: # %bb.0: @@ -344,13 +344,12 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_skip_fields(ptr %ptr) { %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> - %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 - %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 - %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 - ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 + %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + ret {<4 x i32>, <4 x i32>} %res1 } -define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_mask_skip_fields(ptr %ptr) { +define {<4 x i32>, <4 x i32>} @vpload_factor3_mask_skip_fields(ptr %ptr) { ; mask = 0101, skip the last field. 
; CHECK-LABEL: vpload_factor3_mask_skip_fields: ; CHECK: # %bb.0: @@ -363,10 +362,9 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_mask_skip_fields(ptr %p %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> - %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 - %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 - %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 - ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 + %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + ret {<4 x i32>, <4 x i32>} %res1 } define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor4(ptr %ptr) { @@ -2150,7 +2148,7 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_mask(ptr %ptr) { ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 } -define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_skip_field(ptr %ptr) { +define {<4 x i32>, <4 x i32>} @maskedload_factor3_skip_field(ptr %ptr) { ; CHECK-LABEL: maskedload_factor3_skip_field: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 12 @@ -2162,13 +2160,12 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_skip_field(ptr %ptr %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> - %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 - %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 - %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 - ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 + %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + ret {<4 x i32>, <4 x i32>} %res1 } -define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_mask_skip_field(ptr %ptr) { +define {<4 x i32>, <4 x i32>} @maskedload_factor3_mask_skip_field(ptr %ptr) { ; CHECK-LABEL: maskedload_factor3_mask_skip_field: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma @@ -2181,10 +2178,9 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_mask_skip_field(ptr %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> - %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 - %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 - %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 - ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 + %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + ret {<4 x i32>, <4 x i32>} %res1 } ; We can only skip the last field for now. 
@@ -2269,7 +2265,7 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_invalid_skip_field( ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 } -define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor5_skip_fields(ptr %ptr) { +define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor5_skip_fields(ptr %ptr) { ; mask = 1111, skip the last two fields. ; CHECK-LABEL: maskedload_factor5_skip_fields: ; CHECK: # %bb.0: @@ -2283,11 +2279,9 @@ define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @maskedload_facto %v2 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> %v3 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> %v4 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> - %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 - %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 - %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 - %res3 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res2, <4 x i32> %v3, 3 - %res4 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res3, <4 x i32> %v4, 4 - ret {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res4 + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 } From 7bb4ec398211736b42315945976ac5eba32ac71b Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Tue, 5 Aug 2025 14:34:12 -0700 Subject: [PATCH 4/9] fixup! Recognizing masks assembled by AND --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 89 +++++++++----- .../rvv/fixed-vectors-interleaved-access.ll | 114 ++++++++++++------ 2 files changed, 135 insertions(+), 68 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index e6c4de23c055e..81efb7b335dbc 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -385,25 +385,25 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load); Value *Mask = nullptr; - unsigned MaskFactor = Factor; + unsigned GapMaskFactor = Factor; if (LI) { LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n"); } else { // Check mask operand. Handle both all-true/false and interleaved mask. - std::tie(Mask, MaskFactor) = getMask(getMaskOperand(II), Factor, VecTy); + std::tie(Mask, GapMaskFactor) = getMask(getMaskOperand(II), Factor, VecTy); if (!Mask) return false; LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load or masked.load: " << *Load << "\n"); LLVM_DEBUG(dbgs() << "IA: With nominal factor " << Factor - << " and mask factor " << MaskFactor << "\n"); + << " and mask factor " << GapMaskFactor << "\n"); } // Try to create target specific intrinsics to replace the load and // shuffles. if (!TLI->lowerInterleavedLoad(cast(Load), Mask, Shuffles, - Indices, Factor, MaskFactor)) + Indices, Factor, GapMaskFactor)) // If Extracts is not empty, tryReplaceExtracts made changes earlier. 
return !Extracts.empty() || BinOpShuffleChanged; @@ -540,15 +540,20 @@ bool InterleavedAccessImpl::lowerInterleavedStore( "number of stored element should be a multiple of Factor"); Value *Mask = nullptr; + unsigned GapMaskFactor = Factor; if (SI) { LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n"); } else { // Check mask operand. Handle both all-true/false and interleaved mask. unsigned LaneMaskLen = NumStoredElements / Factor; - std::tie(Mask, std::ignore) = getMask(getMaskOperand(II), Factor, - ElementCount::getFixed(LaneMaskLen)); + std::tie(Mask, GapMaskFactor) = getMask( + getMaskOperand(II), Factor, ElementCount::getFixed(LaneMaskLen)); if (!Mask) return false; + // We shouldn't transform stores even it has a gap mask. And since we might + // already change the IR, we're returning true here. + if (GapMaskFactor != Factor) + return true; LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store or masked.store: " << *Store << "\n"); @@ -565,8 +570,40 @@ bool InterleavedAccessImpl::lowerInterleavedStore( return true; } +// A wide mask <1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0> could be used to skip the +// last field in a factor-of-three interleaved store or deinterleaved load (in +// which case LeafMaskLen is 4). Such (wide) mask is also known as gap mask. +// This helper function tries to detect this pattern and return the actual +// factor we're accessing, which is 2 in this example. +static unsigned getGapMaskFactor(const Constant &MaskConst, unsigned Factor, + unsigned LeafMaskLen) { + APInt FactorMask(Factor, 0); + FactorMask.setAllBits(); + for (unsigned F = 0U; F < Factor; ++F) { + unsigned Idx; + for (Idx = 0U; Idx < LeafMaskLen; ++Idx) { + Constant *C = MaskConst.getAggregateElement(F + Idx * Factor); + if (!C->isZeroValue()) + break; + } + // All mask bits on this field are zero, skipping it. + if (Idx >= LeafMaskLen) + FactorMask.clearBit(F); + } + // We currently only allow gaps in the "trailing" factors / fields. So + // given the original factor being 4, we can skip fields 2 and 3, but we + // cannot only skip fields 1 and 2. If FactorMask does not match such + // pattern, reset it. + if (!FactorMask.isMask()) + FactorMask.setAllBits(); + + return FactorMask.popcount(); +} + static std::pair getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC) { + using namespace PatternMatch; + if (auto *IMI = dyn_cast(WideMask)) { if (unsigned F = getInterleaveIntrinsicFactor(IMI->getIntrinsicID()); F && F == Factor && llvm::all_equal(IMI->args())) { @@ -574,6 +611,18 @@ static std::pair getMask(Value *WideMask, unsigned Factor, } } + // Try to match `and , `. The WideMask here is + // expected to be a fixed vector and gap mask should be a constant mask. + Value *AndMaskLHS; + Constant *AndMaskRHS; + if (match(WideMask, m_c_And(m_Value(AndMaskLHS), m_Constant(AndMaskRHS))) && + LeafValueEC.isFixed()) { + assert(!isa(AndMaskLHS) && + "expect constants to be folded already"); + return {getMask(AndMaskLHS, Factor, LeafValueEC).first, + getGapMaskFactor(*AndMaskRHS, Factor, LeafValueEC.getFixedValue())}; + } + if (auto *ConstMask = dyn_cast(WideMask)) { if (auto *Splat = ConstMask->getSplatValue()) // All-ones or all-zeros mask. @@ -581,33 +630,17 @@ static std::pair getMask(Value *WideMask, unsigned Factor, if (LeafValueEC.isFixed()) { unsigned LeafMaskLen = LeafValueEC.getFixedValue(); - // First, check if the mask completely skips some of the factors / fields. 
- APInt FactorMask(Factor, 0); - FactorMask.setAllBits(); - for (unsigned F = 0U; F < Factor; ++F) { - unsigned Idx; - for (Idx = 0U; Idx < LeafMaskLen; ++Idx) { - Constant *C = ConstMask->getAggregateElement(F + Idx * Factor); - if (!C->isZeroValue()) - break; - } - // All mask bits on this field are zero, skipping it. - if (Idx >= LeafMaskLen) - FactorMask.clearBit(F); - } - // We currently only support skipping "trailing" factors / fields. So - // given the original factor being 4, we can skip fields 2 and 3, but we - // cannot only skip fields 1 and 2. If FactorMask does not match such - // pattern, reset it. - if (!FactorMask.isMask()) - FactorMask.setAllBits(); + // First, check if we use a gap mask to skip some of the factors / fields. + const unsigned GapMaskFactor = + getGapMaskFactor(*ConstMask, Factor, LeafMaskLen); + assert(GapMaskFactor <= Factor); SmallVector LeafMask(LeafMaskLen, nullptr); // If this is a fixed-length constant mask, each lane / leaf has to // use the same mask. This is done by checking if every group with Factor // number of elements in the interleaved mask has homogeneous values. for (unsigned Idx = 0U; Idx < LeafMaskLen * Factor; ++Idx) { - if (!FactorMask[Idx % Factor]) + if (Idx % Factor >= GapMaskFactor) continue; Constant *C = ConstMask->getAggregateElement(Idx); if (LeafMask[Idx / Factor] && LeafMask[Idx / Factor] != C) @@ -615,7 +648,7 @@ static std::pair getMask(Value *WideMask, unsigned Factor, LeafMask[Idx / Factor] = C; } - return {ConstantVector::get(LeafMask), FactorMask.popcount()}; + return {ConstantVector::get(LeafMask), GapMaskFactor}; } } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index a61a1b7cf9703..2c738e5aeb55b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -367,6 +367,24 @@ define {<4 x i32>, <4 x i32>} @vpload_factor3_mask_skip_fields(ptr %ptr) { ret {<4 x i32>, <4 x i32>} %res1 } +define {<4 x i32>, <4 x i32>} @vpload_factor3_combined_mask_skip_field(ptr %ptr, <4 x i1> %mask) { +; CHECK-LABEL: vpload_factor3_combined_mask_skip_field: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 12 +; CHECK-NEXT: vsetivli zero, 6, e32, m1, ta, ma +; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret + %interleaved.mask = shufflevector <4 x i1> %mask, <4 x i1> poison, <12 x i32> + %combined = and <12 x i1> %interleaved.mask, + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> %combined, i32 12) + ; mask = %mask, skip the last field + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + ret {<4 x i32>, <4 x i32>} %res1 +} + define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor4(ptr %ptr) { ; CHECK-LABEL: vpload_factor4: ; CHECK: # %bb.0: @@ -514,8 +532,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: li a2, 32 ; RV32-NEXT: lui a3, 12 ; RV32-NEXT: lui a6, 12291 -; RV32-NEXT: lui a7, %hi(.LCPI25_0) -; RV32-NEXT: addi a7, a7, %lo(.LCPI25_0) +; RV32-NEXT: lui a7, %hi(.LCPI26_0) +; RV32-NEXT: addi a7, a7, %lo(.LCPI26_0) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v24, (a5) ; RV32-NEXT: 
vmv.s.x v0, a3 @@ -600,12 +618,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill ; RV32-NEXT: lui a7, 49164 -; RV32-NEXT: lui a1, %hi(.LCPI25_1) -; RV32-NEXT: addi a1, a1, %lo(.LCPI25_1) +; RV32-NEXT: lui a1, %hi(.LCPI26_1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI26_1) ; RV32-NEXT: lui t2, 3 ; RV32-NEXT: lui t1, 196656 -; RV32-NEXT: lui a4, %hi(.LCPI25_3) -; RV32-NEXT: addi a4, a4, %lo(.LCPI25_3) +; RV32-NEXT: lui a4, %hi(.LCPI26_3) +; RV32-NEXT: addi a4, a4, %lo(.LCPI26_3) ; RV32-NEXT: lui t0, 786624 ; RV32-NEXT: li a5, 48 ; RV32-NEXT: lui a6, 768 @@ -784,8 +802,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v24, v8, v2 -; RV32-NEXT: lui a1, %hi(.LCPI25_2) -; RV32-NEXT: addi a1, a1, %lo(.LCPI25_2) +; RV32-NEXT: lui a1, %hi(.LCPI26_2) +; RV32-NEXT: addi a1, a1, %lo(.LCPI26_2) ; RV32-NEXT: lui a3, 3073 ; RV32-NEXT: addi a3, a3, -1024 ; RV32-NEXT: vmv.s.x v0, a3 @@ -849,16 +867,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vrgatherei16.vv v28, v8, v3 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v28, v24 -; RV32-NEXT: lui a1, %hi(.LCPI25_4) -; RV32-NEXT: addi a1, a1, %lo(.LCPI25_4) -; RV32-NEXT: lui a2, %hi(.LCPI25_5) -; RV32-NEXT: addi a2, a2, %lo(.LCPI25_5) +; RV32-NEXT: lui a1, %hi(.LCPI26_4) +; RV32-NEXT: addi a1, a1, %lo(.LCPI26_4) +; RV32-NEXT: lui a2, %hi(.LCPI26_5) +; RV32-NEXT: addi a2, a2, %lo(.LCPI26_5) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v24, (a2) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v8, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI25_7) -; RV32-NEXT: addi a1, a1, %lo(.LCPI25_7) +; RV32-NEXT: lui a1, %hi(.LCPI26_7) +; RV32-NEXT: addi a1, a1, %lo(.LCPI26_7) ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle16.v v10, (a1) ; RV32-NEXT: csrr a1, vlenb @@ -886,14 +904,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v16, v0, v10 -; RV32-NEXT: lui a1, %hi(.LCPI25_6) -; RV32-NEXT: addi a1, a1, %lo(.LCPI25_6) -; RV32-NEXT: lui a2, %hi(.LCPI25_8) -; RV32-NEXT: addi a2, a2, %lo(.LCPI25_8) +; RV32-NEXT: lui a1, %hi(.LCPI26_6) +; RV32-NEXT: addi a1, a1, %lo(.LCPI26_6) +; RV32-NEXT: lui a2, %hi(.LCPI26_8) +; RV32-NEXT: addi a2, a2, %lo(.LCPI26_8) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v4, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI25_9) -; RV32-NEXT: addi a1, a1, %lo(.LCPI25_9) +; RV32-NEXT: lui a1, %hi(.LCPI26_9) +; RV32-NEXT: addi a1, a1, %lo(.LCPI26_9) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v6, (a1) ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma @@ -980,8 +998,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: li a4, 128 ; RV64-NEXT: lui a1, 1 ; RV64-NEXT: vle64.v v8, (a3) -; RV64-NEXT: lui a3, %hi(.LCPI25_0) -; RV64-NEXT: addi a3, a3, %lo(.LCPI25_0) +; RV64-NEXT: lui a3, %hi(.LCPI26_0) +; RV64-NEXT: addi a3, a3, %lo(.LCPI26_0) ; RV64-NEXT: vmv.s.x v0, a4 ; RV64-NEXT: csrr a4, vlenb ; RV64-NEXT: li a5, 61 @@ -1169,8 +1187,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, 
<8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vslideup.vi v12, v16, 1, v0.t -; RV64-NEXT: lui a2, %hi(.LCPI25_1) -; RV64-NEXT: addi a2, a2, %lo(.LCPI25_1) +; RV64-NEXT: lui a2, %hi(.LCPI26_1) +; RV64-NEXT: addi a2, a2, %lo(.LCPI26_1) ; RV64-NEXT: li a3, 192 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v6, (a2) @@ -1204,8 +1222,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vrgatherei16.vv v24, v16, v6 ; RV64-NEXT: addi a2, sp, 16 ; RV64-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill -; RV64-NEXT: lui a2, %hi(.LCPI25_2) -; RV64-NEXT: addi a2, a2, %lo(.LCPI25_2) +; RV64-NEXT: lui a2, %hi(.LCPI26_2) +; RV64-NEXT: addi a2, a2, %lo(.LCPI26_2) ; RV64-NEXT: li a3, 1040 ; RV64-NEXT: vmv.s.x v0, a3 ; RV64-NEXT: addi a1, a1, -2016 @@ -1289,12 +1307,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill -; RV64-NEXT: lui a1, %hi(.LCPI25_3) -; RV64-NEXT: addi a1, a1, %lo(.LCPI25_3) +; RV64-NEXT: lui a1, %hi(.LCPI26_3) +; RV64-NEXT: addi a1, a1, %lo(.LCPI26_3) ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v20, (a1) -; RV64-NEXT: lui a1, %hi(.LCPI25_4) -; RV64-NEXT: addi a1, a1, %lo(.LCPI25_4) +; RV64-NEXT: lui a1, %hi(.LCPI26_4) +; RV64-NEXT: addi a1, a1, %lo(.LCPI26_4) ; RV64-NEXT: vle16.v v8, (a1) ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 77 @@ -1345,8 +1363,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl2r.v v8, (a1) # vscale x 16-byte Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vrgatherei16.vv v0, v16, v8 -; RV64-NEXT: lui a1, %hi(.LCPI25_5) -; RV64-NEXT: addi a1, a1, %lo(.LCPI25_5) +; RV64-NEXT: lui a1, %hi(.LCPI26_5) +; RV64-NEXT: addi a1, a1, %lo(.LCPI26_5) ; RV64-NEXT: vle16.v v20, (a1) ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 61 @@ -1963,8 +1981,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) { ; RV32-NEXT: vle32.v v12, (a0), v0.t ; RV32-NEXT: li a0, 36 ; RV32-NEXT: vmv.s.x v20, a1 -; RV32-NEXT: lui a1, %hi(.LCPI61_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI61_0) +; RV32-NEXT: lui a1, %hi(.LCPI62_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI62_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v21, (a1) ; RV32-NEXT: vcompress.vm v8, v12, v11 @@ -2039,8 +2057,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) { ; RV32-NEXT: vmv.s.x v10, a0 ; RV32-NEXT: li a0, 146 ; RV32-NEXT: vmv.s.x v11, a0 -; RV32-NEXT: lui a0, %hi(.LCPI62_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI62_0) +; RV32-NEXT: lui a0, %hi(.LCPI63_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI63_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v20, (a0) ; RV32-NEXT: li a0, 36 @@ -2159,7 +2177,6 @@ define {<4 x i32>, <4 x i32>} @maskedload_factor3_skip_field(ptr %ptr) { ; mask = 1111, skip last field %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> - %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 ret {<4 x i32>, <4 x i32>} %res1 @@ -2177,7 +2194,24 @@ 
define {<4 x i32>, <4 x i32>} @maskedload_factor3_mask_skip_field(ptr %ptr) { ; mask = 1010, skip the last field %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> - %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + ret {<4 x i32>, <4 x i32>} %res1 +} + +define {<4 x i32>, <4 x i32>} @maskedload_factor3_combined_mask_skip_field(ptr %ptr, <4 x i1> %mask) { +; CHECK-LABEL: maskedload_factor3_combined_mask_skip_field: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 12 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret + %interleaved.mask = shufflevector <4 x i1> %mask, <4 x i1> poison, <12 x i32> + %combined = and <12 x i1> %interleaved.mask, + %interleaved.vec = tail call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %ptr, i32 4, <12 x i1> %combined, <12 x i32> poison) + ; mask = %mask, skip the last field + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 ret {<4 x i32>, <4 x i32>} %res1 @@ -2200,8 +2234,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_invalid_skip_field( ; RV32-NEXT: vle32.v v12, (a0), v0.t ; RV32-NEXT: li a0, 36 ; RV32-NEXT: vmv.s.x v20, a1 -; RV32-NEXT: lui a1, %hi(.LCPI68_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI68_0) +; RV32-NEXT: lui a1, %hi(.LCPI70_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI70_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v21, (a1) ; RV32-NEXT: vcompress.vm v8, v12, v11 From f5507fb761a55083daff9896d551b350d21f1280 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Tue, 5 Aug 2025 17:43:35 -0700 Subject: [PATCH 5/9] fixup! 
Reject cases where Factor != MaskFactor in other targets --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 3 +++ llvm/lib/Target/ARM/ARMISelLowering.cpp | 3 +++ llvm/lib/Target/X86/X86InterleavedAccess.cpp | 3 +++ 3 files changed, 9 insertions(+) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index e681d846f9e1c..632bb79fa02e4 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -17266,6 +17266,9 @@ bool AArch64TargetLowering::lowerInterleavedLoad( return false; assert(!Mask && "Unexpected mask on a load"); + if (Factor != MaskFactor) + return false; + const DataLayout &DL = LI->getDataLayout(); VectorType *VTy = Shuffles[0]->getType(); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index a5750def66b7d..c087e32cd4787 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21611,6 +21611,9 @@ bool ARMTargetLowering::lowerInterleavedLoad( return false; assert(!Mask && "Unexpected mask on a load"); + if (Factor != MaskFactor) + return false; + auto *VecTy = cast(Shuffles[0]->getType()); Type *EltTy = VecTy->getElementType(); diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp index 6929c869b1a31..52132a9d64b1a 100644 --- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp +++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -814,6 +814,9 @@ bool X86TargetLowering::lowerInterleavedLoad( return false; assert(!Mask && "Unexpected mask on a load"); + if (Factor != MaskFactor) + return false; + // Create an interleaved access group. IRBuilder<> Builder(LI); X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget, From 8e4b79edae79a713a6c40066c2895e8a06fbc5b9 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Wed, 6 Aug 2025 09:37:12 -0700 Subject: [PATCH 6/9] fixup! Address review comments --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 81efb7b335dbc..7c3b0db50f2ad 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -580,14 +580,16 @@ static unsigned getGapMaskFactor(const Constant &MaskConst, unsigned Factor, APInt FactorMask(Factor, 0); FactorMask.setAllBits(); for (unsigned F = 0U; F < Factor; ++F) { - unsigned Idx; - for (Idx = 0U; Idx < LeafMaskLen; ++Idx) { + bool AllZero = true; + for (unsigned Idx = 0U; Idx < LeafMaskLen; ++Idx) { Constant *C = MaskConst.getAggregateElement(F + Idx * Factor); - if (!C->isZeroValue()) + if (!C->isZeroValue()) { + AllZero = false; break; + } } // All mask bits on this field are zero, skipping it. - if (Idx >= LeafMaskLen) + if (AllZero) FactorMask.clearBit(F); } // We currently only allow gaps in the "trailing" factors / fields. So @@ -705,10 +707,12 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( return false; // Check mask operand. Handle both all-true/false and interleaved mask. 
- std::tie(Mask, std::ignore) = + unsigned GapMaskFactor; + std::tie(Mask, GapMaskFactor) = getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI)); if (!Mask) return false; + assert(GapMaskFactor == Factor); LLVM_DEBUG(dbgs() << "IA: Found a vp.load or masked.load with deinterleave" << " intrinsic " << *DI << " and factor = " @@ -747,11 +751,13 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( II->getIntrinsicID() != Intrinsic::vp_store) return false; // Check mask operand. Handle both all-true/false and interleaved mask. - std::tie(Mask, std::ignore) = + unsigned GapMaskFactor; + std::tie(Mask, GapMaskFactor) = getMask(getMaskOperand(II), Factor, cast(InterleaveValues[0]->getType())); if (!Mask) return false; + assert(GapMaskFactor == Factor); LLVM_DEBUG(dbgs() << "IA: Found a vp.store or masked.store with interleave" << " intrinsic " << *IntII << " and factor = " From 3f992d6c2051d66916930d8fd96798db680abab2 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Tue, 12 Aug 2025 11:47:23 -0700 Subject: [PATCH 7/9] fixup! Passing APInt GapMask instead of MaskFactor --- llvm/include/llvm/CodeGen/TargetLowering.h | 6 +- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 103 +++++++++--------- .../Target/AArch64/AArch64ISelLowering.cpp | 6 +- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 +- llvm/lib/Target/ARM/ARMISelLowering.cpp | 6 +- llvm/lib/Target/ARM/ARMISelLowering.h | 2 +- llvm/lib/Target/RISCV/RISCVISelLowering.h | 2 +- .../Target/RISCV/RISCVInterleavedAccess.cpp | 10 +- llvm/lib/Target/X86/X86ISelLowering.h | 2 +- llvm/lib/Target/X86/X86InterleavedAccess.cpp | 6 +- 10 files changed, 69 insertions(+), 76 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 3239b35031e36..3d7d74593533b 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3209,12 +3209,12 @@ class LLVM_ABI TargetLoweringBase { /// \p Shuffles is the shufflevector list to DE-interleave the loaded vector. /// \p Indices is the corresponding indices for each shufflevector. /// \p Factor is the interleave factor. - /// \p MaskFactor is the interleave factor that considers mask, which can - /// reduce the original factor. + /// \p GapMask is a mask in which inactive lanes represent components / fields + /// that are always skipped. virtual bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor, - unsigned MaskFactor) const { + const APInt &GapMask) const { return false; } diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 7c3b0db50f2ad..7956e02ac1fc7 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -271,16 +271,13 @@ static Value *getMaskOperand(IntrinsicInst *II) { // Return a pair of // (1) The corresponded deinterleaved mask, or nullptr if there is no valid // mask. -// (2) Some mask effectively skips a certain field, this element contains -// the factor after taking such contraction into consideration. Note that -// currently we only support skipping trailing fields. So if the "nominal" -// factor was 5, you cannot only skip field 1 and 2, but you can skip field 3 -// and 4. 
-static std::pair getMask(Value *WideMask, unsigned Factor, - ElementCount LeafValueEC); - -static std::pair getMask(Value *WideMask, unsigned Factor, - VectorType *LeafValueTy) { +// (2) Some mask effectively skips a certain field, and this element is a mask +// in which inactive lanes represent fields that are skipped (i.e. "gaps"). +static std::pair getMask(Value *WideMask, unsigned Factor, + ElementCount LeafValueEC); + +static std::pair getMask(Value *WideMask, unsigned Factor, + VectorType *LeafValueTy) { return getMask(WideMask, Factor, LeafValueTy->getElementCount()); } @@ -385,25 +382,26 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load); Value *Mask = nullptr; - unsigned GapMaskFactor = Factor; + APInt GapMask(Factor, 0); if (LI) { + GapMask.setAllBits(); LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n"); } else { // Check mask operand. Handle both all-true/false and interleaved mask. - std::tie(Mask, GapMaskFactor) = getMask(getMaskOperand(II), Factor, VecTy); + std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor, VecTy); if (!Mask) return false; LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load or masked.load: " << *Load << "\n"); LLVM_DEBUG(dbgs() << "IA: With nominal factor " << Factor - << " and mask factor " << GapMaskFactor << "\n"); + << " and actual factor " << GapMask.popcount() << "\n"); } // Try to create target specific intrinsics to replace the load and // shuffles. if (!TLI->lowerInterleavedLoad(cast(Load), Mask, Shuffles, - Indices, Factor, GapMaskFactor)) + Indices, Factor, GapMask)) // If Extracts is not empty, tryReplaceExtracts made changes earlier. return !Extracts.empty() || BinOpShuffleChanged; @@ -540,19 +538,19 @@ bool InterleavedAccessImpl::lowerInterleavedStore( "number of stored element should be a multiple of Factor"); Value *Mask = nullptr; - unsigned GapMaskFactor = Factor; if (SI) { LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n"); } else { // Check mask operand. Handle both all-true/false and interleaved mask. unsigned LaneMaskLen = NumStoredElements / Factor; - std::tie(Mask, GapMaskFactor) = getMask( - getMaskOperand(II), Factor, ElementCount::getFixed(LaneMaskLen)); + APInt GapMask(Factor, 0); + std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor, + ElementCount::getFixed(LaneMaskLen)); if (!Mask) return false; - // We shouldn't transform stores even it has a gap mask. And since we might - // already change the IR, we're returning true here. - if (GapMaskFactor != Factor) + // We haven't supported gap mask for stores. Yet it is possible that we + // already changed the IR, hence returning true here. + if (GapMask.popcount() != Factor) return true; LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store or masked.store: " @@ -575,10 +573,9 @@ bool InterleavedAccessImpl::lowerInterleavedStore( // which case LeafMaskLen is 4). Such (wide) mask is also known as gap mask. // This helper function tries to detect this pattern and return the actual // factor we're accessing, which is 2 in this example. 
-static unsigned getGapMaskFactor(const Constant &MaskConst, unsigned Factor, - unsigned LeafMaskLen) { - APInt FactorMask(Factor, 0); - FactorMask.setAllBits(); +static void getGapMask(const Constant &MaskConst, unsigned Factor, + unsigned LeafMaskLen, APInt &GapMask) { + assert(GapMask.getBitWidth() == Factor); for (unsigned F = 0U; F < Factor; ++F) { bool AllZero = true; for (unsigned Idx = 0U; Idx < LeafMaskLen; ++Idx) { @@ -590,26 +587,21 @@ static unsigned getGapMaskFactor(const Constant &MaskConst, unsigned Factor, } // All mask bits on this field are zero, skipping it. if (AllZero) - FactorMask.clearBit(F); + GapMask.clearBit(F); } - // We currently only allow gaps in the "trailing" factors / fields. So - // given the original factor being 4, we can skip fields 2 and 3, but we - // cannot only skip fields 1 and 2. If FactorMask does not match such - // pattern, reset it. - if (!FactorMask.isMask()) - FactorMask.setAllBits(); - - return FactorMask.popcount(); } -static std::pair getMask(Value *WideMask, unsigned Factor, - ElementCount LeafValueEC) { +static std::pair getMask(Value *WideMask, unsigned Factor, + ElementCount LeafValueEC) { using namespace PatternMatch; + APInt GapMask(Factor, 0); + GapMask.setAllBits(); + if (auto *IMI = dyn_cast(WideMask)) { if (unsigned F = getInterleaveIntrinsicFactor(IMI->getIntrinsicID()); F && F == Factor && llvm::all_equal(IMI->args())) { - return {IMI->getArgOperand(0), Factor}; + return {IMI->getArgOperand(0), GapMask}; } } @@ -621,36 +613,34 @@ static std::pair getMask(Value *WideMask, unsigned Factor, LeafValueEC.isFixed()) { assert(!isa(AndMaskLHS) && "expect constants to be folded already"); - return {getMask(AndMaskLHS, Factor, LeafValueEC).first, - getGapMaskFactor(*AndMaskRHS, Factor, LeafValueEC.getFixedValue())}; + getGapMask(*AndMaskRHS, Factor, LeafValueEC.getFixedValue(), GapMask); + return {getMask(AndMaskLHS, Factor, LeafValueEC).first, GapMask}; } if (auto *ConstMask = dyn_cast(WideMask)) { if (auto *Splat = ConstMask->getSplatValue()) // All-ones or all-zeros mask. - return {ConstantVector::getSplat(LeafValueEC, Splat), Factor}; + return {ConstantVector::getSplat(LeafValueEC, Splat), GapMask}; if (LeafValueEC.isFixed()) { unsigned LeafMaskLen = LeafValueEC.getFixedValue(); // First, check if we use a gap mask to skip some of the factors / fields. - const unsigned GapMaskFactor = - getGapMaskFactor(*ConstMask, Factor, LeafMaskLen); - assert(GapMaskFactor <= Factor); + getGapMask(*ConstMask, Factor, LeafMaskLen, GapMask); SmallVector LeafMask(LeafMaskLen, nullptr); // If this is a fixed-length constant mask, each lane / leaf has to // use the same mask. This is done by checking if every group with Factor // number of elements in the interleaved mask has homogeneous values. 
for (unsigned Idx = 0U; Idx < LeafMaskLen * Factor; ++Idx) { - if (Idx % Factor >= GapMaskFactor) + if (!GapMask[Idx % Factor]) continue; Constant *C = ConstMask->getAggregateElement(Idx); if (LeafMask[Idx / Factor] && LeafMask[Idx / Factor] != C) - return {nullptr, Factor}; + return {nullptr, GapMask}; LeafMask[Idx / Factor] = C; } - return {ConstantVector::get(LeafMask), GapMaskFactor}; + return {ConstantVector::get(LeafMask), GapMask}; } } @@ -672,11 +662,11 @@ static std::pair getMask(Value *WideMask, unsigned Factor, IRBuilder<> Builder(SVI); return {Builder.CreateExtractVector(LeafMaskTy, SVI->getOperand(0), uint64_t(0)), - Factor}; + GapMask}; } } - return {nullptr, Factor}; + return {nullptr, GapMask}; } bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( @@ -707,12 +697,16 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( return false; // Check mask operand. Handle both all-true/false and interleaved mask. - unsigned GapMaskFactor; - std::tie(Mask, GapMaskFactor) = + APInt GapMask(Factor, 0); + std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI)); if (!Mask) return false; - assert(GapMaskFactor == Factor); + // We haven't supported gap mask if it's deinterleaving using intrinsics. + // Yet it is possible that we already changed the IR, hence returning true + // here. + if (GapMask.popcount() != Factor) + return true; LLVM_DEBUG(dbgs() << "IA: Found a vp.load or masked.load with deinterleave" << " intrinsic " << *DI << " and factor = " @@ -751,13 +745,16 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( II->getIntrinsicID() != Intrinsic::vp_store) return false; // Check mask operand. Handle both all-true/false and interleaved mask. - unsigned GapMaskFactor; - std::tie(Mask, GapMaskFactor) = + APInt GapMask(Factor, 0); + std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor, cast(InterleaveValues[0]->getType())); if (!Mask) return false; - assert(GapMaskFactor == Factor); + // We haven't supported gap mask if it's interleaving using intrinsics. Yet + // it is possible that we already changed the IR, hence returning true here. 
+ if (GapMask.popcount() != Factor) + return true; LLVM_DEBUG(dbgs() << "IA: Found a vp.store or masked.store with interleave" << " intrinsic " << *IntII << " and factor = " diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 632bb79fa02e4..c0d9a9320a6cd 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -17254,7 +17254,7 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor, /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool AArch64TargetLowering::lowerInterleavedLoad( Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, unsigned Factor, unsigned MaskFactor) const { + ArrayRef Indices, unsigned Factor, const APInt &GapMask) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); assert(!Shuffles.empty() && "Empty shufflevector input"); @@ -17265,9 +17265,7 @@ bool AArch64TargetLowering::lowerInterleavedLoad( if (!LI) return false; assert(!Mask && "Unexpected mask on a load"); - - if (Factor != MaskFactor) - return false; + assert(GapMask.popcount() == Factor && "Unexpected factor reduction"); const DataLayout &DL = LI->getDataLayout(); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index d0d6512d39015..e7ab408aeffe2 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -221,7 +221,7 @@ class AArch64TargetLowering : public TargetLowering { bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor, - unsigned MaskFactor) const override; + const APInt &GapMask) const override; bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor) const override; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index c087e32cd4787..b00241b7aea1f 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21599,7 +21599,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const { /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 bool ARMTargetLowering::lowerInterleavedLoad( Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, unsigned Factor, unsigned MaskFactor) const { + ArrayRef Indices, unsigned Factor, const APInt &GapMask) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); assert(!Shuffles.empty() && "Empty shufflevector input"); @@ -21610,9 +21610,7 @@ bool ARMTargetLowering::lowerInterleavedLoad( if (!LI) return false; assert(!Mask && "Unexpected mask on a load"); - - if (Factor != MaskFactor) - return false; + assert(GapMask.popcount() == Factor && "Unexpected factor reduction"); auto *VecTy = cast(Shuffles[0]->getType()); Type *EltTy = VecTy->getElementType(); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 670bbb62fe0f6..d3f99d70f1f7d 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -684,7 +684,7 @@ class VectorType; bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor, - unsigned MaskFactor) const override; + const APInt &GapMask) const override; bool lowerInterleavedStore(Instruction *Store, Value *Mask, 
ShuffleVectorInst *SVI, unsigned Factor) const override; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 4155f613f7f04..f615eb7a0eb9c 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -432,7 +432,7 @@ class RISCVTargetLowering : public TargetLowering { bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor, - unsigned MaskFactor) const override; + const APInt &GapMask) const override; bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index d4e6351ea6a51..c7b96f5c3d0c8 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -203,12 +203,14 @@ static bool getMemOperands(unsigned Factor, VectorType *VTy, Type *XLenTy, /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool RISCVTargetLowering::lowerInterleavedLoad( Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, unsigned Factor, unsigned MaskFactor) const { + ArrayRef Indices, unsigned Factor, const APInt &GapMask) const { assert(Indices.size() == Shuffles.size()); - assert(MaskFactor <= Factor); + assert(GapMask.getBitWidth() == Factor); - // TODO: Lower to strided load when MaskFactor = 1. - if (MaskFactor < 2) + // We only support cases where the skipped fields are the trailing ones. + // TODO: Lower to strided load if there is only a single active field. + unsigned MaskFactor = GapMask.popcount(); + if (MaskFactor < 2 || !GapMask.isMask()) return false; IRBuilder<> Builder(Load); diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 242d24b5faf60..3dd79b3249517 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1664,7 +1664,7 @@ namespace llvm { bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor, - unsigned MaskFactor) const override; + const APInt &GapMask) const override; /// Lower interleaved store(s) into target specific /// instructions/intrinsics. diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp index 52132a9d64b1a..ee2ab9f37023c 100644 --- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp +++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -802,7 +802,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { // Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX. bool X86TargetLowering::lowerInterleavedLoad( Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, unsigned Factor, unsigned MaskFactor) const { + ArrayRef Indices, unsigned Factor, const APInt &GapMask) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); assert(!Shuffles.empty() && "Empty shufflevector input"); @@ -813,9 +813,7 @@ bool X86TargetLowering::lowerInterleavedLoad( if (!LI) return false; assert(!Mask && "Unexpected mask on a load"); - - if (Factor != MaskFactor) - return false; + assert(GapMask.popcount() == Factor && "Unexpected factor reduction"); // Create an interleaved access group. 
From 1f5ab334d9e2bbd0d9046da9bcae4b2dfec09957 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu
Date: Tue, 12 Aug 2025 13:05:00 -0700
Subject: [PATCH 8/9] fixup! Split combined mask into another patch

---
 llvm/lib/CodeGen/InterleavedAccessPass.cpp    |  14 ---
 .../rvv/fixed-vectors-interleaved-access.ll   | 112 ++++++------------
 2 files changed, 38 insertions(+), 88 deletions(-)

diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 7956e02ac1fc7..f84d83f454286 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -593,8 +593,6 @@ static void getGapMask(const Constant &MaskConst, unsigned Factor,
 
 static std::pair getMask(Value *WideMask, unsigned Factor,
                          ElementCount LeafValueEC) {
-  using namespace PatternMatch;
-
   APInt GapMask(Factor, 0);
   GapMask.setAllBits();
 
@@ -605,18 +603,6 @@ static std::pair getMask(Value *WideMask, unsigned Factor,
     }
   }
 
-  // Try to match `and , `. The WideMask here is
-  // expected to be a fixed vector and gap mask should be a constant mask.
-  Value *AndMaskLHS;
-  Constant *AndMaskRHS;
-  if (match(WideMask, m_c_And(m_Value(AndMaskLHS), m_Constant(AndMaskRHS))) &&
-      LeafValueEC.isFixed()) {
-    assert(!isa(AndMaskLHS) &&
-           "expect constants to be folded already");
-    getGapMask(*AndMaskRHS, Factor, LeafValueEC.getFixedValue(), GapMask);
-    return {getMask(AndMaskLHS, Factor, LeafValueEC).first, GapMask};
-  }
-
   if (auto *ConstMask = dyn_cast(WideMask)) {
     if (auto *Splat = ConstMask->getSplatValue())
       // All-ones or all-zeros mask.
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index 2c738e5aeb55b..7d7ef3e4e2a4b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -367,24 +367,6 @@ define {<4 x i32>, <4 x i32>} @vpload_factor3_mask_skip_fields(ptr %ptr) {
   ret {<4 x i32>, <4 x i32>} %res1
 }
 
-define {<4 x i32>, <4 x i32>} @vpload_factor3_combined_mask_skip_field(ptr %ptr, <4 x i1> %mask) {
-; CHECK-LABEL: vpload_factor3_combined_mask_skip_field:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a1, 12
-; CHECK-NEXT:    vsetivli zero, 6, e32, m1, ta, ma
-; CHECK-NEXT:    vlsseg2e32.v v8, (a0), a1, v0.t
-; CHECK-NEXT:    ret
-  %interleaved.mask = shufflevector <4 x i1> %mask, <4 x i1> poison, <12 x i32>
-  %combined = and <12 x i1> %interleaved.mask,
-  %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> %combined, i32 12)
-  ; mask = %mask, skip the last field
-  %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32>
-  %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32>
-  %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
-  %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
-  ret {<4 x i32>, <4 x i32>} %res1
-}
-
 define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor4(ptr %ptr) {
 ; CHECK-LABEL: vpload_factor4:
 ; CHECK:       # %bb.0:
@@ -532,8 +514,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    lui a3, 12
 ; RV32-NEXT:    lui a6, 12291
-; RV32-NEXT:    lui a7, %hi(.LCPI26_0)
-; RV32-NEXT:    addi a7, a7, %lo(.LCPI26_0)
+; RV32-NEXT:    lui a7, %hi(.LCPI25_0)
+; RV32-NEXT:    addi a7, a7, %lo(.LCPI25_0)
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV32-NEXT:    vle32.v v24, (a5)
 ; RV32-NEXT:    vmv.s.x v0, a3
@@ -618,12 +600,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
 ; RV32-NEXT:    lui a7, 49164
-; RV32-NEXT:    lui a1, %hi(.LCPI26_1)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI26_1)
+; RV32-NEXT:    lui a1, %hi(.LCPI25_1)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI25_1)
 ; RV32-NEXT:    lui t2, 3
 ; RV32-NEXT:    lui t1, 196656
-; RV32-NEXT:    lui a4, %hi(.LCPI26_3)
-; RV32-NEXT:    addi a4, a4, %lo(.LCPI26_3)
+; RV32-NEXT:    lui a4, %hi(.LCPI25_3)
+; RV32-NEXT:    addi a4, a4, %lo(.LCPI25_3)
 ; RV32-NEXT:    lui t0, 786624
 ; RV32-NEXT:    li a5, 48
 ; RV32-NEXT:    lui a6, 768
@@ -802,8 +784,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vrgatherei16.vv v24, v8, v2
-; RV32-NEXT:    lui a1, %hi(.LCPI26_2)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI26_2)
+; RV32-NEXT:    lui a1, %hi(.LCPI25_2)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI25_2)
 ; RV32-NEXT:    lui a3, 3073
 ; RV32-NEXT:    addi a3, a3, -1024
 ; RV32-NEXT:    vmv.s.x v0, a3
@@ -867,16 +849,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vrgatherei16.vv v28, v8, v3
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
 ; RV32-NEXT:    vmv.v.v v28, v24
-; RV32-NEXT:    lui a1, %hi(.LCPI26_4)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI26_4)
-; RV32-NEXT:    lui a2, %hi(.LCPI26_5)
-; RV32-NEXT:    addi a2, a2, %lo(.LCPI26_5)
+; RV32-NEXT:    lui a1, %hi(.LCPI25_4)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI25_4)
+; RV32-NEXT:    lui a2, %hi(.LCPI25_5)
+; RV32-NEXT:    addi a2, a2, %lo(.LCPI25_5)
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV32-NEXT:    vle16.v v24, (a2)
 ; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RV32-NEXT:    vle16.v v8, (a1)
-; RV32-NEXT:    lui a1, %hi(.LCPI26_7)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI26_7)
+; RV32-NEXT:    lui a1, %hi(.LCPI25_7)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI25_7)
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle16.v v10, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
@@ -904,14 +886,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vl8r.v v0, (a1) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vrgatherei16.vv v16, v0, v10
-; RV32-NEXT:    lui a1, %hi(.LCPI26_6)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI26_6)
-; RV32-NEXT:    lui a2, %hi(.LCPI26_8)
-; RV32-NEXT:    addi a2, a2, %lo(.LCPI26_8)
+; RV32-NEXT:    lui a1, %hi(.LCPI25_6)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI25_6)
+; RV32-NEXT:    lui a2, %hi(.LCPI25_8)
+; RV32-NEXT:    addi a2, a2, %lo(.LCPI25_8)
 ; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RV32-NEXT:    vle16.v v4, (a1)
-; RV32-NEXT:    lui a1, %hi(.LCPI26_9)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI26_9)
+; RV32-NEXT:    lui a1, %hi(.LCPI25_9)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI25_9)
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV32-NEXT:    vle16.v v6, (a1)
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
@@ -998,8 +980,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    li a4, 128
 ; RV64-NEXT:    lui a1, 1
 ; RV64-NEXT:    vle64.v v8, (a3)
-; RV64-NEXT:    lui a3, %hi(.LCPI26_0)
-; RV64-NEXT:    addi a3, a3, %lo(.LCPI26_0)
+; RV64-NEXT:    lui a3, %hi(.LCPI25_0)
+; RV64-NEXT:    addi a3, a3, %lo(.LCPI25_0)
 ; RV64-NEXT:    vmv.s.x v0, a4
 ; RV64-NEXT:    csrr a4, vlenb
 ; RV64-NEXT:    li a5, 61
@@ -1187,8 +1169,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    vslideup.vi v12, v16, 1, v0.t
-; RV64-NEXT:    lui a2, %hi(.LCPI26_1)
-; RV64-NEXT:    addi a2, a2, %lo(.LCPI26_1)
+; RV64-NEXT:    lui a2, %hi(.LCPI25_1)
+; RV64-NEXT:    addi a2, a2, %lo(.LCPI25_1)
 ; RV64-NEXT:    li a3, 192
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV64-NEXT:    vle16.v v6, (a2)
@@ -1222,8 +1204,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vrgatherei16.vv v24, v16, v6
 ; RV64-NEXT:    addi a2, sp, 16
 ; RV64-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    lui a2, %hi(.LCPI26_2)
-; RV64-NEXT:    addi a2, a2, %lo(.LCPI26_2)
+; RV64-NEXT:    lui a2, %hi(.LCPI25_2)
+; RV64-NEXT:    addi a2, a2, %lo(.LCPI25_2)
 ; RV64-NEXT:    li a3, 1040
 ; RV64-NEXT:    vmv.s.x v0, a3
 ; RV64-NEXT:    addi a1, a1, -2016
@@ -1307,12 +1289,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
-; RV64-NEXT:    lui a1, %hi(.LCPI26_3)
-; RV64-NEXT:    addi a1, a1, %lo(.LCPI26_3)
+; RV64-NEXT:    lui a1, %hi(.LCPI25_3)
+; RV64-NEXT:    addi a1, a1, %lo(.LCPI25_3)
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV64-NEXT:    vle16.v v20, (a1)
-; RV64-NEXT:    lui a1, %hi(.LCPI26_4)
-; RV64-NEXT:    addi a1, a1, %lo(.LCPI26_4)
+; RV64-NEXT:    lui a1, %hi(.LCPI25_4)
+; RV64-NEXT:    addi a1, a1, %lo(.LCPI25_4)
 ; RV64-NEXT:    vle16.v v8, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 77
@@ -1363,8 +1345,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vl2r.v v8, (a1) # vscale x 16-byte Folded Reload
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vrgatherei16.vv v0, v16, v8
-; RV64-NEXT:    lui a1, %hi(.LCPI26_5)
-; RV64-NEXT:    addi a1, a1, %lo(.LCPI26_5)
+; RV64-NEXT:    lui a1, %hi(.LCPI25_5)
+; RV64-NEXT:    addi a1, a1, %lo(.LCPI25_5)
 ; RV64-NEXT:    vle16.v v20, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 61
@@ -1981,8 +1963,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) {
 ; RV32-NEXT:    vle32.v v12, (a0), v0.t
 ; RV32-NEXT:    li a0, 36
 ; RV32-NEXT:    vmv.s.x v20, a1
-; RV32-NEXT:    lui a1, %hi(.LCPI62_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI62_0)
+; RV32-NEXT:    lui a1, %hi(.LCPI61_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI61_0)
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vle16.v v21, (a1)
 ; RV32-NEXT:    vcompress.vm v8, v12, v11
@@ -2057,8 +2039,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) {
 ; RV32-NEXT:    vmv.s.x v10, a0
 ; RV32-NEXT:    li a0, 146
 ; RV32-NEXT:    vmv.s.x v11, a0
-; RV32-NEXT:    lui a0, %hi(.LCPI63_0)
-; RV32-NEXT:    addi a0, a0, %lo(.LCPI63_0)
+; RV32-NEXT:    lui a0, %hi(.LCPI62_0)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI62_0)
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vle16.v v20, (a0)
 ; RV32-NEXT:    li a0, 36
@@ -2199,24 +2181,6 @@ define {<4 x i32>, <4 x i32>} @maskedload_factor3_mask_skip_field(ptr %ptr) {
   ret {<4 x i32>, <4 x i32>} %res1
 }
 
-define {<4 x i32>, <4 x i32>} @maskedload_factor3_combined_mask_skip_field(ptr %ptr, <4 x i1> %mask) {
-; CHECK-LABEL: maskedload_factor3_combined_mask_skip_field:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a1, 12
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vlsseg2e32.v v8, (a0), a1, v0.t
-; CHECK-NEXT:    ret
-  %interleaved.mask = shufflevector <4 x i1> %mask, <4 x i1> poison, <12 x i32>
-  %combined = and <12 x i1> %interleaved.mask,
-  %interleaved.vec = tail call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %ptr, i32 4, <12 x i1> %combined, <12 x i32> poison)
-  ; mask = %mask, skip the last field
-  %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32>
-  %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32>
-  %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
-  %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
-  ret {<4 x i32>, <4 x i32>} %res1
-}
-
 ; We can only skip the last field for now.
 define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_invalid_skip_field(ptr %ptr) {
 ; RV32-LABEL: maskedload_factor3_invalid_skip_field:
@@ -2234,8 +2198,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_invalid_skip_field(
 ; RV32-NEXT:    vle32.v v12, (a0), v0.t
 ; RV32-NEXT:    li a0, 36
 ; RV32-NEXT:    vmv.s.x v20, a1
-; RV32-NEXT:    lui a1, %hi(.LCPI70_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI70_0)
+; RV32-NEXT:    lui a1, %hi(.LCPI68_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI68_0)
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vle16.v v21, (a1)
 ; RV32-NEXT:    vcompress.vm v8, v12, v11

From 705b7a6d4e6247e044c3b9eb4f8f1f7c424d9034 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu
Date: Tue, 12 Aug 2025 13:55:35 -0700
Subject: [PATCH 9/9] fixup! Address review comments

---
 llvm/include/llvm/CodeGen/TargetLowering.h      | 4 ++--
 llvm/lib/CodeGen/InterleavedAccessPass.cpp      | 6 ++----
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 3 +--
 llvm/lib/Target/ARM/ARMISelLowering.cpp         | 3 +--
 llvm/lib/Target/X86/X86InterleavedAccess.cpp    | 3 +--
 5 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 3d7d74593533b..8b50f2d7eef05 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3209,8 +3209,8 @@ class LLVM_ABI TargetLoweringBase {
   /// \p Shuffles is the shufflevector list to DE-interleave the loaded vector.
   /// \p Indices is the corresponding indices for each shufflevector.
   /// \p Factor is the interleave factor.
-  /// \p GapMask is a mask in which inactive lanes represent components / fields
-  /// that are always skipped.
+  /// \p GapMask is a mask with zeros for components / fields that may not be
+  /// accessed.
   virtual bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
                                     ArrayRef Shuffles,
                                     ArrayRef Indices, unsigned Factor,
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index f84d83f454286..a41a44df3f847 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -382,9 +382,8 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
   replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load);
 
   Value *Mask = nullptr;
-  APInt GapMask(Factor, 0);
+  auto GapMask = APInt::getAllOnes(Factor);
   if (LI) {
-    GapMask.setAllBits();
     LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n");
   } else {
     // Check mask operand. Handle both all-true/false and interleaved mask.
@@ -593,8 +592,7 @@ static void getGapMask(const Constant &MaskConst, unsigned Factor,
 
 static std::pair getMask(Value *WideMask, unsigned Factor,
                          ElementCount LeafValueEC) {
-  APInt GapMask(Factor, 0);
-  GapMask.setAllBits();
+  auto GapMask = APInt::getAllOnes(Factor);
 
   if (auto *IMI = dyn_cast(WideMask)) {
     if (unsigned F = getInterleaveIntrinsicFactor(IMI->getIntrinsicID());
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c0d9a9320a6cd..3aa6ae40e0da4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17264,8 +17264,7 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
   auto *LI = dyn_cast(Load);
   if (!LI)
     return false;
-  assert(!Mask && "Unexpected mask on a load");
-  assert(GapMask.popcount() == Factor && "Unexpected factor reduction");
+  assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
 
   const DataLayout &DL = LI->getDataLayout();
 
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index b00241b7aea1f..7c2a228f89750 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21609,8 +21609,7 @@ bool ARMTargetLowering::lowerInterleavedLoad(
   auto *LI = dyn_cast(Load);
   if (!LI)
     return false;
-  assert(!Mask && "Unexpected mask on a load");
-  assert(GapMask.popcount() == Factor && "Unexpected factor reduction");
+  assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
 
   auto *VecTy = cast(Shuffles[0]->getType());
   Type *EltTy = VecTy->getElementType();
diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index ee2ab9f37023c..632db7e4326e2 100644
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -812,8 +812,7 @@ bool X86TargetLowering::lowerInterleavedLoad(
   auto *LI = dyn_cast(Load);
   if (!LI)
     return false;
-  assert(!Mask && "Unexpected mask on a load");
-  assert(GapMask.popcount() == Factor && "Unexpected factor reduction");
+  assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
 
   // Create an interleaved access group.
   IRBuilder<> Builder(LI);
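
Taken together, the PATCH 9/9 hunks make the pipeline start from an all-ones gap
mask (APInt::getAllOnes(Factor)) and let targets that cannot skip fields assert
that every field is still active. The sketch below is not from the patch and only
assumes LLVM's APInt API; it spells out that default and the merged target-side
check.

  #include "llvm/ADT/APInt.h"
  #include <cassert>

  int main() {
    const unsigned Factor = 4;
    // Default seeded by the pass and by getMask(): no field is skipped.
    llvm::APInt GapMask = llvm::APInt::getAllOnes(Factor);
    // The GapMask half of the merged asserts in the AArch64/ARM/X86 paths:
    // a plain (non-masked) load must still access all Factor fields.
    assert(GapMask.popcount() == Factor);
    // Clearing a bit models a skipped field and would trip that assert.
    GapMask.clearBit(Factor - 1);
    assert(GapMask.popcount() == Factor - 1);
    return 0;
  }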