From c7220147cee093b95e138c377b4da1c2d724e485 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Thu, 31 Jul 2025 15:04:32 -0700 Subject: [PATCH 1/9] Pre-commit test --- .../rvv/fixed-vectors-interleaved-access.ll | 680 +++++++++++++++++- 1 file changed, 644 insertions(+), 36 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 6eb0b693b5546..2df26b2f78d5b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -332,6 +332,174 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_poison_shufflemask(ptr ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 } +define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_skip_fields(ptr %ptr) { + ; mask = 1111, skip the last field. +; RV32-LABEL: vpload_factor3_skip_fields: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 1755 +; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: li a1, 73 +; RV32-NEXT: vmv.v.i v10, 8 +; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; RV32-NEXT: vle32.v v12, (a0), v0.t +; RV32-NEXT: li a0, 36 +; RV32-NEXT: vmv.s.x v11, a1 +; RV32-NEXT: lui a1, %hi(.LCPI17_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI17_0) +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vcompress.vm v8, v12, v11 +; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV32-NEXT: vslidedown.vi v16, v12, 8 +; RV32-NEXT: vmv1r.v v0, v10 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV32-NEXT: vmv.v.i v0, 2 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vslidedown.vi v14, v12, 1 +; RV32-NEXT: vslidedown.vi v14, v12, 3, v0.t +; RV32-NEXT: vle16.v v9, (a1) +; RV32-NEXT: vmv1r.v v0, v10 +; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV32-NEXT: vmv.s.x v0, a0 +; RV32-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV32-NEXT: vrgatherei16.vv v10, v12, v9 +; RV32-NEXT: vmv1r.v v9, v14 +; RV32-NEXT: ret +; +; RV64-LABEL: vpload_factor3_skip_fields: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 1755 +; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: li a1, 73 +; RV64-NEXT: vmv.v.i v10, 8 +; RV64-NEXT: vmv.s.x v11, a1 +; RV64-NEXT: li a1, 36 +; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; RV64-NEXT: vle32.v v12, (a0), v0.t +; RV64-NEXT: li a0, 3 +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: addi a0, a0, 5 +; RV64-NEXT: slli a0, a0, 16 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vcompress.vm v8, v12, v11 +; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV64-NEXT: vslidedown.vi v16, v12, 8 +; RV64-NEXT: vmv1r.v v0, v10 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV64-NEXT: vmv.v.i v0, 2 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vslidedown.vi v14, v12, 1 +; RV64-NEXT: vslidedown.vi v14, v12, 3, v0.t +; RV64-NEXT: vmv1r.v v0, v10 +; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: addi a0, a0, 2 +; RV64-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vrgatherei16.vv v10, v12, v9 +; RV64-NEXT: vmv1r.v v9, v14 +; RV64-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> , 
i32 12) + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + +define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_mask_skip_fields(ptr %ptr) { + ; mask = 0101, skip the last field. +; RV32-LABEL: vpload_factor3_mask_skip_fields: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 1560 +; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: li a1, 73 +; RV32-NEXT: vmv.v.i v10, 8 +; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; RV32-NEXT: vle32.v v12, (a0), v0.t +; RV32-NEXT: li a0, 36 +; RV32-NEXT: vmv.s.x v11, a1 +; RV32-NEXT: lui a1, %hi(.LCPI18_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI18_0) +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vcompress.vm v8, v12, v11 +; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV32-NEXT: vslidedown.vi v16, v12, 8 +; RV32-NEXT: vmv1r.v v0, v10 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV32-NEXT: vmv.v.i v0, 2 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vslidedown.vi v14, v12, 1 +; RV32-NEXT: vslidedown.vi v14, v12, 3, v0.t +; RV32-NEXT: vle16.v v9, (a1) +; RV32-NEXT: vmv1r.v v0, v10 +; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV32-NEXT: vmv.s.x v0, a0 +; RV32-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV32-NEXT: vrgatherei16.vv v10, v12, v9 +; RV32-NEXT: vmv1r.v v9, v14 +; RV32-NEXT: ret +; +; RV64-LABEL: vpload_factor3_mask_skip_fields: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 1560 +; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: li a1, 73 +; RV64-NEXT: vmv.v.i v10, 8 +; RV64-NEXT: vmv.s.x v11, a1 +; RV64-NEXT: li a1, 36 +; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; RV64-NEXT: vle32.v v12, (a0), v0.t +; RV64-NEXT: li a0, 3 +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: addi a0, a0, 5 +; RV64-NEXT: slli a0, a0, 16 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vcompress.vm v8, v12, v11 +; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV64-NEXT: vslidedown.vi v16, v12, 8 +; RV64-NEXT: vmv1r.v v0, v10 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV64-NEXT: vmv.v.i v0, 2 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vslidedown.vi v14, v12, 1 +; RV64-NEXT: vslidedown.vi v14, v12, 3, v0.t +; RV64-NEXT: vmv1r.v v0, v10 +; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: addi a0, a0, 2 +; RV64-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vrgatherei16.vv v10, v12, v9 +; RV64-NEXT: vmv1r.v v9, v14 +; RV64-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> , i32 12) + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> 
%interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor4(ptr %ptr) { ; CHECK-LABEL: vpload_factor4: ; CHECK: # %bb.0: @@ -479,8 +647,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: li a2, 32 ; RV32-NEXT: lui a3, 12 ; RV32-NEXT: lui a6, 12291 -; RV32-NEXT: lui a7, %hi(.LCPI23_0) -; RV32-NEXT: addi a7, a7, %lo(.LCPI23_0) +; RV32-NEXT: lui a7, %hi(.LCPI25_0) +; RV32-NEXT: addi a7, a7, %lo(.LCPI25_0) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v24, (a5) ; RV32-NEXT: vmv.s.x v0, a3 @@ -565,12 +733,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill ; RV32-NEXT: lui a7, 49164 -; RV32-NEXT: lui a1, %hi(.LCPI23_1) -; RV32-NEXT: addi a1, a1, %lo(.LCPI23_1) +; RV32-NEXT: lui a1, %hi(.LCPI25_1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI25_1) ; RV32-NEXT: lui t2, 3 ; RV32-NEXT: lui t1, 196656 -; RV32-NEXT: lui a4, %hi(.LCPI23_3) -; RV32-NEXT: addi a4, a4, %lo(.LCPI23_3) +; RV32-NEXT: lui a4, %hi(.LCPI25_3) +; RV32-NEXT: addi a4, a4, %lo(.LCPI25_3) ; RV32-NEXT: lui t0, 786624 ; RV32-NEXT: li a5, 48 ; RV32-NEXT: lui a6, 768 @@ -749,8 +917,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v24, v8, v2 -; RV32-NEXT: lui a1, %hi(.LCPI23_2) -; RV32-NEXT: addi a1, a1, %lo(.LCPI23_2) +; RV32-NEXT: lui a1, %hi(.LCPI25_2) +; RV32-NEXT: addi a1, a1, %lo(.LCPI25_2) ; RV32-NEXT: lui a3, 3073 ; RV32-NEXT: addi a3, a3, -1024 ; RV32-NEXT: vmv.s.x v0, a3 @@ -814,16 +982,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vrgatherei16.vv v28, v8, v3 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v28, v24 -; RV32-NEXT: lui a1, %hi(.LCPI23_4) -; RV32-NEXT: addi a1, a1, %lo(.LCPI23_4) -; RV32-NEXT: lui a2, %hi(.LCPI23_5) -; RV32-NEXT: addi a2, a2, %lo(.LCPI23_5) +; RV32-NEXT: lui a1, %hi(.LCPI25_4) +; RV32-NEXT: addi a1, a1, %lo(.LCPI25_4) +; RV32-NEXT: lui a2, %hi(.LCPI25_5) +; RV32-NEXT: addi a2, a2, %lo(.LCPI25_5) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v24, (a2) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v8, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI23_7) -; RV32-NEXT: addi a1, a1, %lo(.LCPI23_7) +; RV32-NEXT: lui a1, %hi(.LCPI25_7) +; RV32-NEXT: addi a1, a1, %lo(.LCPI25_7) ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle16.v v10, (a1) ; RV32-NEXT: csrr a1, vlenb @@ -851,14 +1019,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v16, v0, v10 -; RV32-NEXT: lui a1, %hi(.LCPI23_6) -; RV32-NEXT: addi a1, a1, %lo(.LCPI23_6) -; RV32-NEXT: lui a2, %hi(.LCPI23_8) -; RV32-NEXT: addi a2, a2, %lo(.LCPI23_8) +; RV32-NEXT: lui a1, %hi(.LCPI25_6) +; RV32-NEXT: addi a1, a1, %lo(.LCPI25_6) +; RV32-NEXT: lui a2, %hi(.LCPI25_8) 
+; RV32-NEXT: addi a2, a2, %lo(.LCPI25_8) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v4, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI23_9) -; RV32-NEXT: addi a1, a1, %lo(.LCPI23_9) +; RV32-NEXT: lui a1, %hi(.LCPI25_9) +; RV32-NEXT: addi a1, a1, %lo(.LCPI25_9) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v6, (a1) ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma @@ -945,8 +1113,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: li a4, 128 ; RV64-NEXT: lui a1, 1 ; RV64-NEXT: vle64.v v8, (a3) -; RV64-NEXT: lui a3, %hi(.LCPI23_0) -; RV64-NEXT: addi a3, a3, %lo(.LCPI23_0) +; RV64-NEXT: lui a3, %hi(.LCPI25_0) +; RV64-NEXT: addi a3, a3, %lo(.LCPI25_0) ; RV64-NEXT: vmv.s.x v0, a4 ; RV64-NEXT: csrr a4, vlenb ; RV64-NEXT: li a5, 61 @@ -1134,8 +1302,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vslideup.vi v12, v16, 1, v0.t -; RV64-NEXT: lui a2, %hi(.LCPI23_1) -; RV64-NEXT: addi a2, a2, %lo(.LCPI23_1) +; RV64-NEXT: lui a2, %hi(.LCPI25_1) +; RV64-NEXT: addi a2, a2, %lo(.LCPI25_1) ; RV64-NEXT: li a3, 192 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v6, (a2) @@ -1169,8 +1337,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vrgatherei16.vv v24, v16, v6 ; RV64-NEXT: addi a2, sp, 16 ; RV64-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill -; RV64-NEXT: lui a2, %hi(.LCPI23_2) -; RV64-NEXT: addi a2, a2, %lo(.LCPI23_2) +; RV64-NEXT: lui a2, %hi(.LCPI25_2) +; RV64-NEXT: addi a2, a2, %lo(.LCPI25_2) ; RV64-NEXT: li a3, 1040 ; RV64-NEXT: vmv.s.x v0, a3 ; RV64-NEXT: addi a1, a1, -2016 @@ -1254,12 +1422,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill -; RV64-NEXT: lui a1, %hi(.LCPI23_3) -; RV64-NEXT: addi a1, a1, %lo(.LCPI23_3) +; RV64-NEXT: lui a1, %hi(.LCPI25_3) +; RV64-NEXT: addi a1, a1, %lo(.LCPI25_3) ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v20, (a1) -; RV64-NEXT: lui a1, %hi(.LCPI23_4) -; RV64-NEXT: addi a1, a1, %lo(.LCPI23_4) +; RV64-NEXT: lui a1, %hi(.LCPI25_4) +; RV64-NEXT: addi a1, a1, %lo(.LCPI25_4) ; RV64-NEXT: vle16.v v8, (a1) ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 77 @@ -1310,8 +1478,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl2r.v v8, (a1) # vscale x 16-byte Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vrgatherei16.vv v0, v16, v8 -; RV64-NEXT: lui a1, %hi(.LCPI23_5) -; RV64-NEXT: addi a1, a1, %lo(.LCPI23_5) +; RV64-NEXT: lui a1, %hi(.LCPI25_5) +; RV64-NEXT: addi a1, a1, %lo(.LCPI25_5) ; RV64-NEXT: vle16.v v20, (a1) ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 61 @@ -1928,8 +2096,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) { ; RV32-NEXT: vle32.v v12, (a0), v0.t ; RV32-NEXT: li a0, 36 ; RV32-NEXT: vmv.s.x v20, a1 -; RV32-NEXT: lui a1, %hi(.LCPI59_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI59_0) +; RV32-NEXT: lui a1, %hi(.LCPI61_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI61_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v21, (a1) ; RV32-NEXT: vcompress.vm v8, v12, v11 @@ -2004,8 +2172,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) { ; 
RV32-NEXT: vmv.s.x v10, a0 ; RV32-NEXT: li a0, 146 ; RV32-NEXT: vmv.s.x v11, a0 -; RV32-NEXT: lui a0, %hi(.LCPI60_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI60_0) +; RV32-NEXT: lui a0, %hi(.LCPI62_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI62_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v20, (a0) ; RV32-NEXT: li a0, 36 @@ -2094,3 +2262,443 @@ define void @maskedstore_factor2(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1) { tail call void @llvm.masked.store(<8 x i32> %interleaved.vec, ptr %ptr, i32 4, <8 x i1> splat (i1 true)) ret void } + +define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_mask(ptr %ptr) { +; CHECK-LABEL: maskedload_factor3_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v0, 5 +; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t +; CHECK-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %ptr, i32 4, <12 x i1> , <12 x i32> poison) + ; mask = 1010 + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + +define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_skip_field(ptr %ptr) { +; RV32-LABEL: maskedload_factor3_skip_field: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 1755 +; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: li a1, 73 +; RV32-NEXT: vmv.v.i v10, 8 +; RV32-NEXT: vmv.s.x v11, a1 +; RV32-NEXT: li a1, 146 +; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; RV32-NEXT: vle32.v v12, (a0), v0.t +; RV32-NEXT: li a0, 36 +; RV32-NEXT: vmv.s.x v20, a1 +; RV32-NEXT: lui a1, %hi(.LCPI66_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI66_0) +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vle16.v v21, (a1) +; RV32-NEXT: vcompress.vm v8, v12, v11 +; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV32-NEXT: vslidedown.vi v16, v12, 8 +; RV32-NEXT: vmv1r.v v0, v10 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV32-NEXT: vcompress.vm v14, v12, v20 +; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV32-NEXT: vmv.s.x v0, a0 +; RV32-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV32-NEXT: vrgatherei16.vv v10, v12, v21 +; RV32-NEXT: vmv1r.v v9, v14 +; RV32-NEXT: ret +; +; RV64-LABEL: maskedload_factor3_skip_field: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 1755 +; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: li a1, 73 +; RV64-NEXT: vmv.v.i v10, 8 +; RV64-NEXT: vmv.s.x v11, a1 +; RV64-NEXT: li a1, 146 +; RV64-NEXT: vmv.s.x v20, a1 +; RV64-NEXT: li a1, 36 +; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; RV64-NEXT: vle32.v v12, (a0), v0.t +; RV64-NEXT: li a0, 3 +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: addi a0, a0, 5 +; RV64-NEXT: slli a0, a0, 16 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vcompress.vm v8, v12, v11 +; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV64-NEXT: vslidedown.vi v16, v12, 8 +; RV64-NEXT: vmv1r.v v0, v10 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV64-NEXT: vcompress.vm v14, v12, v20 +; RV64-NEXT: vrgather.vi 
v14, v16, 2, v0.t +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: addi a0, a0, 2 +; RV64-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vrgatherei16.vv v10, v12, v9 +; RV64-NEXT: vmv1r.v v9, v14 +; RV64-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %ptr, i32 4, <12 x i1> , <12 x i32> poison) + ; mask = 1111, skip last field + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + +define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_mask_skip_field(ptr %ptr) { +; RV32-LABEL: maskedload_factor3_mask_skip_field: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 195 +; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: li a1, 73 +; RV32-NEXT: vmv.v.i v10, 8 +; RV32-NEXT: vmv.s.x v11, a1 +; RV32-NEXT: li a1, 146 +; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; RV32-NEXT: vle32.v v12, (a0), v0.t +; RV32-NEXT: li a0, 36 +; RV32-NEXT: vmv.s.x v20, a1 +; RV32-NEXT: lui a1, %hi(.LCPI67_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI67_0) +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vle16.v v21, (a1) +; RV32-NEXT: vcompress.vm v8, v12, v11 +; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV32-NEXT: vslidedown.vi v16, v12, 8 +; RV32-NEXT: vmv1r.v v0, v10 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV32-NEXT: vcompress.vm v14, v12, v20 +; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV32-NEXT: vmv.s.x v0, a0 +; RV32-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV32-NEXT: vrgatherei16.vv v10, v12, v21 +; RV32-NEXT: vmv1r.v v9, v14 +; RV32-NEXT: ret +; +; RV64-LABEL: maskedload_factor3_mask_skip_field: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 195 +; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: li a1, 73 +; RV64-NEXT: vmv.v.i v10, 8 +; RV64-NEXT: vmv.s.x v11, a1 +; RV64-NEXT: li a1, 146 +; RV64-NEXT: vmv.s.x v20, a1 +; RV64-NEXT: li a1, 36 +; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; RV64-NEXT: vle32.v v12, (a0), v0.t +; RV64-NEXT: li a0, 3 +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: addi a0, a0, 5 +; RV64-NEXT: slli a0, a0, 16 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vcompress.vm v8, v12, v11 +; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV64-NEXT: vslidedown.vi v16, v12, 8 +; RV64-NEXT: vmv1r.v v0, v10 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV64-NEXT: vcompress.vm v14, v12, v20 +; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: addi a0, a0, 2 +; RV64-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vrgatherei16.vv v10, v12, v9 +; RV64-NEXT: vmv1r.v v9, v14 +; RV64-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %ptr, i32 4, <12 x i1> , <12 x i32> poison) + ; mask = 1010, 
skip the last field + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + +; We can only skip the last field for now. +define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_invalid_skip_field(ptr %ptr) { +; RV32-LABEL: maskedload_factor3_invalid_skip_field: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 73 +; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV32-NEXT: vmv.s.x v11, a1 +; RV32-NEXT: lui a1, 1 +; RV32-NEXT: vmv.v.i v10, 8 +; RV32-NEXT: addi a1, a1, -1171 +; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: li a1, 146 +; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; RV32-NEXT: vle32.v v12, (a0), v0.t +; RV32-NEXT: li a0, 36 +; RV32-NEXT: vmv.s.x v20, a1 +; RV32-NEXT: lui a1, %hi(.LCPI68_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI68_0) +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vle16.v v21, (a1) +; RV32-NEXT: vcompress.vm v8, v12, v11 +; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV32-NEXT: vslidedown.vi v16, v12, 8 +; RV32-NEXT: vmv1r.v v0, v10 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV32-NEXT: vcompress.vm v14, v12, v20 +; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV32-NEXT: vmv.s.x v0, a0 +; RV32-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV32-NEXT: vrgatherei16.vv v10, v12, v21 +; RV32-NEXT: vmv1r.v v9, v14 +; RV32-NEXT: ret +; +; RV64-LABEL: maskedload_factor3_invalid_skip_field: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 73 +; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64-NEXT: vmv.s.x v11, a1 +; RV64-NEXT: li a1, 146 +; RV64-NEXT: vmv.s.x v20, a1 +; RV64-NEXT: lui a1, 1 +; RV64-NEXT: vmv.v.i v10, 8 +; RV64-NEXT: addi a1, a1, -1171 +; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: li a1, 36 +; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; RV64-NEXT: vle32.v v12, (a0), v0.t +; RV64-NEXT: li a0, 3 +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: addi a0, a0, 5 +; RV64-NEXT: slli a0, a0, 16 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vcompress.vm v8, v12, v11 +; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV64-NEXT: vslidedown.vi v16, v12, 8 +; RV64-NEXT: vmv1r.v v0, v10 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV64-NEXT: vcompress.vm v14, v12, v20 +; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: addi a0, a0, 2 +; RV64-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vrgatherei16.vv v10, v12, v9 +; RV64-NEXT: vmv1r.v v9, v14 +; RV64-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %ptr, i32 4, <12 x i1> , <12 x i32> poison) + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x 
i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + +define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor5_skip_fields(ptr %ptr) { + ; mask = 1111, skip the last two fields. +; RV32-LABEL: maskedload_factor5_skip_fields: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -256 +; RV32-NEXT: .cfi_def_cfa_offset 256 +; RV32-NEXT: sw ra, 252(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 248(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: .cfi_offset s0, -8 +; RV32-NEXT: addi s0, sp, 256 +; RV32-NEXT: .cfi_def_cfa s0, 0 +; RV32-NEXT: andi sp, sp, -128 +; RV32-NEXT: lui a1, 58 +; RV32-NEXT: addi a1, a1, -793 +; RV32-NEXT: vsetivli zero, 20, e32, m8, ta, ma +; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: li a1, 33 +; RV32-NEXT: vle32.v v16, (a0), v0.t +; RV32-NEXT: li a0, 32 +; RV32-NEXT: mv a2, sp +; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV32-NEXT: vslidedown.vi v8, v16, 8 +; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV32-NEXT: vslidedown.vi v12, v16, 6 +; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV32-NEXT: vslidedown.vi v13, v16, 1 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vse32.v v16, (a2) +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 +; RV32-NEXT: vslidedown.vi v10, v16, 7 +; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV32-NEXT: vslidedown.vi v11, v16, 2 +; RV32-NEXT: vslidedown.vi v18, v16, 3 +; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV32-NEXT: vslidedown.vi v14, v16, 4 +; RV32-NEXT: vmv.x.s a0, v12 +; RV32-NEXT: vmv.x.s a1, v13 +; RV32-NEXT: vmv.x.s a2, v11 +; RV32-NEXT: vmv.x.s a3, v18 +; RV32-NEXT: vmv.x.s a4, v14 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v11, a1 +; RV32-NEXT: vmv.v.x v12, a2 +; RV32-NEXT: vmv.v.x v13, a3 +; RV32-NEXT: vmv.v.x v14, a4 +; RV32-NEXT: lw a1, 32(sp) +; RV32-NEXT: lw a2, 36(sp) +; RV32-NEXT: lw a3, 44(sp) +; RV32-NEXT: lw a4, 48(sp) +; RV32-NEXT: vslide1down.vx v11, v11, a0 +; RV32-NEXT: vmv.x.s a0, v10 +; RV32-NEXT: vslide1down.vx v10, v12, a0 +; RV32-NEXT: vslide1down.vx v11, v11, a3 +; RV32-NEXT: vslide1down.vx v10, v10, a4 +; RV32-NEXT: vslide1down.vx v12, v13, a1 +; RV32-NEXT: lw a0, 64(sp) +; RV32-NEXT: lw a1, 52(sp) +; RV32-NEXT: lw a3, 56(sp) +; RV32-NEXT: lw a4, 68(sp) +; RV32-NEXT: vslide1down.vx v14, v14, a2 +; RV32-NEXT: vslide1down.vx v13, v11, a0 +; RV32-NEXT: vmv.v.i v0, 10 +; RV32-NEXT: vslide1down.vx v10, v10, a4 +; RV32-NEXT: vslide1down.vx v11, v12, a1 +; RV32-NEXT: lw a0, 72(sp) +; RV32-NEXT: lw a1, 76(sp) +; RV32-NEXT: vslide1down.vx v12, v14, a3 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vslidedown.vi v8, v8, 4, v0.t +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vslide1down.vx v11, v11, a0 +; RV32-NEXT: vslide1down.vx v12, v12, a1 +; RV32-NEXT: vmv1r.v v9, v13 +; RV32-NEXT: addi sp, s0, -256 +; RV32-NEXT: .cfi_def_cfa sp, 256 +; RV32-NEXT: lw ra, 252(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 248(sp) # 4-byte Folded Reload +; RV32-NEXT: .cfi_restore ra +; RV32-NEXT: .cfi_restore s0 +; RV32-NEXT: addi sp, sp, 256 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: maskedload_factor5_skip_fields: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -256 +; RV64-NEXT: 
.cfi_def_cfa_offset 256 +; RV64-NEXT: sd ra, 248(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: .cfi_offset s0, -16 +; RV64-NEXT: addi s0, sp, 256 +; RV64-NEXT: .cfi_def_cfa s0, 0 +; RV64-NEXT: andi sp, sp, -128 +; RV64-NEXT: lui a1, 58 +; RV64-NEXT: addi a1, a1, -793 +; RV64-NEXT: vsetivli zero, 20, e32, m8, ta, ma +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: li a1, 33 +; RV64-NEXT: vle32.v v16, (a0), v0.t +; RV64-NEXT: li a0, 32 +; RV64-NEXT: mv a2, sp +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV64-NEXT: vslidedown.vi v8, v16, 8 +; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64-NEXT: vslidedown.vi v12, v16, 6 +; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64-NEXT: vslidedown.vi v13, v16, 1 +; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV64-NEXT: vse32.v v16, (a2) +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vmerge.vvm v8, v8, v16, v0 +; RV64-NEXT: vslidedown.vi v10, v16, 7 +; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64-NEXT: vslidedown.vi v11, v16, 2 +; RV64-NEXT: vslidedown.vi v18, v16, 3 +; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64-NEXT: vslidedown.vi v14, v16, 4 +; RV64-NEXT: vmv.x.s a0, v12 +; RV64-NEXT: vmv.x.s a1, v13 +; RV64-NEXT: vmv.x.s a2, v11 +; RV64-NEXT: vmv.x.s a3, v18 +; RV64-NEXT: vmv.x.s a4, v14 +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vmv.v.x v11, a1 +; RV64-NEXT: vmv.v.x v12, a2 +; RV64-NEXT: vmv.v.x v13, a3 +; RV64-NEXT: vmv.v.x v14, a4 +; RV64-NEXT: lw a1, 32(sp) +; RV64-NEXT: lw a2, 36(sp) +; RV64-NEXT: lw a3, 44(sp) +; RV64-NEXT: lw a4, 48(sp) +; RV64-NEXT: vslide1down.vx v11, v11, a0 +; RV64-NEXT: vmv.x.s a0, v10 +; RV64-NEXT: vslide1down.vx v10, v12, a0 +; RV64-NEXT: vslide1down.vx v11, v11, a3 +; RV64-NEXT: vslide1down.vx v10, v10, a4 +; RV64-NEXT: vslide1down.vx v12, v13, a1 +; RV64-NEXT: lw a0, 64(sp) +; RV64-NEXT: lw a1, 52(sp) +; RV64-NEXT: lw a3, 56(sp) +; RV64-NEXT: lw a4, 68(sp) +; RV64-NEXT: vslide1down.vx v14, v14, a2 +; RV64-NEXT: vslide1down.vx v13, v11, a0 +; RV64-NEXT: vmv.v.i v0, 10 +; RV64-NEXT: vslide1down.vx v10, v10, a4 +; RV64-NEXT: vslide1down.vx v11, v12, a1 +; RV64-NEXT: lw a0, 72(sp) +; RV64-NEXT: lw a1, 76(sp) +; RV64-NEXT: vslide1down.vx v12, v14, a3 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vslidedown.vi v8, v8, 4, v0.t +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vslide1down.vx v11, v11, a0 +; RV64-NEXT: vslide1down.vx v12, v12, a1 +; RV64-NEXT: vmv1r.v v9, v13 +; RV64-NEXT: addi sp, s0, -256 +; RV64-NEXT: .cfi_def_cfa sp, 256 +; RV64-NEXT: ld ra, 248(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 240(sp) # 8-byte Folded Reload +; RV64-NEXT: .cfi_restore ra +; RV64-NEXT: .cfi_restore s0 +; RV64-NEXT: addi sp, sp, 256 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret + %interleaved.vec = tail call <20 x i32> @llvm.masked.load(ptr %ptr, i32 4, <20 x i1> , <20 x i32> poison) + %v0 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v1 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v2 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v3 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v4 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, 
<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+  %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2
+  %res3 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res2, <4 x i32> %v3, 3
+  %res4 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res3, <4 x i32> %v4, 4
+  ret {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res4
+}
+

From 9d6ef18b57878c74ce673875683ade639f1eb14e Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu
Date: Thu, 31 Jul 2025 15:56:04 -0700
Subject: [PATCH 2/9] [IA][RISCV] Recognize deinterleaved loads that could lower to strided segmented loads

---
 llvm/include/llvm/CodeGen/TargetLowering.h    |   6 +-
 llvm/lib/CodeGen/InterleavedAccessPass.cpp    |  81 ++-
 .../Target/AArch64/AArch64ISelLowering.cpp    |   2 +-
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |   4 +-
 llvm/lib/Target/ARM/ARMISelLowering.cpp       |   2 +-
 llvm/lib/Target/ARM/ARMISelLowering.h         |   4 +-
 llvm/lib/Target/RISCV/RISCVISelLowering.h     |   4 +-
 .../Target/RISCV/RISCVInterleavedAccess.cpp   |  41 +-
 llvm/lib/Target/X86/X86ISelLowering.h         |   4 +-
 llvm/lib/Target/X86/X86InterleavedAccess.cpp  |   2 +-
 .../rvv/fixed-vectors-interleaved-access.ll   | 475 ++----------------
 11 files changed, 139 insertions(+), 486 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index cbdc1b6031680..3239b35031e36 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3209,10 +3209,12 @@ class LLVM_ABI TargetLoweringBase {
   /// \p Shuffles is the shufflevector list to DE-interleave the loaded vector.
   /// \p Indices is the corresponding indices for each shufflevector.
   /// \p Factor is the interleave factor.
+  /// \p MaskFactor is the effective interleave factor once the mask is taken
+  /// into account; it can be smaller than \p Factor.
   virtual bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
                                     ArrayRef Shuffles,
-                                    ArrayRef Indices,
-                                    unsigned Factor) const {
+                                    ArrayRef Indices, unsigned Factor,
+                                    unsigned MaskFactor) const {
     return false;
   }
 
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 5e508989ef2da..e6c4de23c055e 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -268,13 +268,19 @@ static Value *getMaskOperand(IntrinsicInst *II) {
   }
 }
 
-// Return the corresponded deinterleaved mask, or nullptr if there is no valid
-// mask.
-static Value *getMask(Value *WideMask, unsigned Factor,
-                      ElementCount LeafValueEC);
-
-static Value *getMask(Value *WideMask, unsigned Factor,
-                      VectorType *LeafValueTy) {
+// Return a pair of:
+// (1) The corresponding deinterleaved mask, or nullptr if there is no valid
+// mask.
+// (2) The effective factor: some masks completely skip certain fields, and
+// this element holds the factor after such contraction has been taken into
+// account. Note that we currently only support skipping trailing fields, so
+// with a "nominal" factor of 5 you can skip fields 3 and 4, but you cannot
+// skip only fields 1 and 2.
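+//
+// For example (an illustrative case only): given Factor = 3 and the fixed
+// mask <1,1,0, 1,1,0, 1,1,0, 1,1,0>, every lane of field 2 is masked off, so
+// this returns the per-lane mask <1,1,1,1> together with an effective factor
+// of 2.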
+static std::pair getMask(Value *WideMask, unsigned Factor, + ElementCount LeafValueEC); + +static std::pair getMask(Value *WideMask, unsigned Factor, + VectorType *LeafValueTy) { return getMask(WideMask, Factor, LeafValueTy->getElementCount()); } @@ -379,22 +385,25 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load); Value *Mask = nullptr; + unsigned MaskFactor = Factor; if (LI) { LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n"); } else { // Check mask operand. Handle both all-true/false and interleaved mask. - Mask = getMask(getMaskOperand(II), Factor, VecTy); + std::tie(Mask, MaskFactor) = getMask(getMaskOperand(II), Factor, VecTy); if (!Mask) return false; LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load or masked.load: " << *Load << "\n"); + LLVM_DEBUG(dbgs() << "IA: With nominal factor " << Factor + << " and mask factor " << MaskFactor << "\n"); } // Try to create target specific intrinsics to replace the load and // shuffles. if (!TLI->lowerInterleavedLoad(cast(Load), Mask, Shuffles, - Indices, Factor)) + Indices, Factor, MaskFactor)) // If Extracts is not empty, tryReplaceExtracts made changes earlier. return !Extracts.empty() || BinOpShuffleChanged; @@ -536,8 +545,8 @@ bool InterleavedAccessImpl::lowerInterleavedStore( } else { // Check mask operand. Handle both all-true/false and interleaved mask. unsigned LaneMaskLen = NumStoredElements / Factor; - Mask = getMask(getMaskOperand(II), Factor, - ElementCount::getFixed(LaneMaskLen)); + std::tie(Mask, std::ignore) = getMask(getMaskOperand(II), Factor, + ElementCount::getFixed(LaneMaskLen)); if (!Mask) return false; @@ -556,34 +565,57 @@ bool InterleavedAccessImpl::lowerInterleavedStore( return true; } -static Value *getMask(Value *WideMask, unsigned Factor, - ElementCount LeafValueEC) { +static std::pair getMask(Value *WideMask, unsigned Factor, + ElementCount LeafValueEC) { if (auto *IMI = dyn_cast(WideMask)) { if (unsigned F = getInterleaveIntrinsicFactor(IMI->getIntrinsicID()); F && F == Factor && llvm::all_equal(IMI->args())) { - return IMI->getArgOperand(0); + return {IMI->getArgOperand(0), Factor}; } } if (auto *ConstMask = dyn_cast(WideMask)) { if (auto *Splat = ConstMask->getSplatValue()) // All-ones or all-zeros mask. - return ConstantVector::getSplat(LeafValueEC, Splat); + return {ConstantVector::getSplat(LeafValueEC, Splat), Factor}; if (LeafValueEC.isFixed()) { unsigned LeafMaskLen = LeafValueEC.getFixedValue(); + // First, check if the mask completely skips some of the factors / fields. + APInt FactorMask(Factor, 0); + FactorMask.setAllBits(); + for (unsigned F = 0U; F < Factor; ++F) { + unsigned Idx; + for (Idx = 0U; Idx < LeafMaskLen; ++Idx) { + Constant *C = ConstMask->getAggregateElement(F + Idx * Factor); + if (!C->isZeroValue()) + break; + } + // All mask bits on this field are zero, skipping it. + if (Idx >= LeafMaskLen) + FactorMask.clearBit(F); + } + // We currently only support skipping "trailing" factors / fields. So + // given the original factor being 4, we can skip fields 2 and 3, but we + // cannot only skip fields 1 and 2. If FactorMask does not match such + // pattern, reset it. + if (!FactorMask.isMask()) + FactorMask.setAllBits(); + SmallVector LeafMask(LeafMaskLen, nullptr); // If this is a fixed-length constant mask, each lane / leaf has to // use the same mask. This is done by checking if every group with Factor // number of elements in the interleaved mask has homogeneous values. 
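+      // For instance (illustrative examples only): with Factor = 2 the wide
+      // mask <1,1, 0,0, 1,1> yields the per-lane mask <1,0,1>, whereas a mask
+      // like <1,0, 1,1, 1,1> has no single per-lane mask and we give up.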
for (unsigned Idx = 0U; Idx < LeafMaskLen * Factor; ++Idx) { + if (!FactorMask[Idx % Factor]) + continue; Constant *C = ConstMask->getAggregateElement(Idx); if (LeafMask[Idx / Factor] && LeafMask[Idx / Factor] != C) - return nullptr; + return {nullptr, Factor}; LeafMask[Idx / Factor] = C; } - return ConstantVector::get(LeafMask); + return {ConstantVector::get(LeafMask), FactorMask.popcount()}; } } @@ -603,12 +635,13 @@ static Value *getMask(Value *WideMask, unsigned Factor, auto *LeafMaskTy = VectorType::get(Type::getInt1Ty(SVI->getContext()), LeafValueEC); IRBuilder<> Builder(SVI); - return Builder.CreateExtractVector(LeafMaskTy, SVI->getOperand(0), - uint64_t(0)); + return {Builder.CreateExtractVector(LeafMaskTy, SVI->getOperand(0), + uint64_t(0)), + Factor}; } } - return nullptr; + return {nullptr, Factor}; } bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( @@ -639,7 +672,8 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( return false; // Check mask operand. Handle both all-true/false and interleaved mask. - Mask = getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI)); + std::tie(Mask, std::ignore) = + getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI)); if (!Mask) return false; @@ -680,8 +714,9 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( II->getIntrinsicID() != Intrinsic::vp_store) return false; // Check mask operand. Handle both all-true/false and interleaved mask. - Mask = getMask(getMaskOperand(II), Factor, - cast(InterleaveValues[0]->getType())); + std::tie(Mask, std::ignore) = + getMask(getMaskOperand(II), Factor, + cast(InterleaveValues[0]->getType())); if (!Mask) return false; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 2b6ea86ee1af5..e681d846f9e1c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -17254,7 +17254,7 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor, /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool AArch64TargetLowering::lowerInterleavedLoad( Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, unsigned Factor) const { + ArrayRef Indices, unsigned Factor, unsigned MaskFactor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); assert(!Shuffles.empty() && "Empty shufflevector input"); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index ea63edd86210e..d0d6512d39015 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -220,8 +220,8 @@ class AArch64TargetLowering : public TargetLowering { bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, - unsigned Factor) const override; + ArrayRef Indices, unsigned Factor, + unsigned MaskFactor) const override; bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor) const override; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 936625606e315..a5750def66b7d 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21599,7 +21599,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const { /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 bool ARMTargetLowering::lowerInterleavedLoad( 
Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, unsigned Factor) const { + ArrayRef Indices, unsigned Factor, unsigned MaskFactor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); assert(!Shuffles.empty() && "Empty shufflevector input"); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 825145d813fb1..670bbb62fe0f6 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -683,8 +683,8 @@ class VectorType; bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, - unsigned Factor) const override; + ArrayRef Indices, unsigned Factor, + unsigned MaskFactor) const override; bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor) const override; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index fa50e2105a708..4155f613f7f04 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -431,8 +431,8 @@ class RISCVTargetLowering : public TargetLowering { bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, - unsigned Factor) const override; + ArrayRef Indices, unsigned Factor, + unsigned MaskFactor) const override; bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index 726920e4015cf..d4e6351ea6a51 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -63,6 +63,12 @@ static const Intrinsic::ID FixedVlsegIntrIds[] = { Intrinsic::riscv_seg6_load_mask, Intrinsic::riscv_seg7_load_mask, Intrinsic::riscv_seg8_load_mask}; +static const Intrinsic::ID FixedVlssegIntrIds[] = { + Intrinsic::riscv_sseg2_load_mask, Intrinsic::riscv_sseg3_load_mask, + Intrinsic::riscv_sseg4_load_mask, Intrinsic::riscv_sseg5_load_mask, + Intrinsic::riscv_sseg6_load_mask, Intrinsic::riscv_sseg7_load_mask, + Intrinsic::riscv_sseg8_load_mask}; + static const Intrinsic::ID ScalableVlsegIntrIds[] = { Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask, Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask, @@ -197,9 +203,13 @@ static bool getMemOperands(unsigned Factor, VectorType *VTy, Type *XLenTy, /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool RISCVTargetLowering::lowerInterleavedLoad( Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, unsigned Factor) const { + ArrayRef Indices, unsigned Factor, unsigned MaskFactor) const { assert(Indices.size() == Shuffles.size()); + assert(MaskFactor <= Factor); + // TODO: Lower to strided load when MaskFactor = 1. 
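+  // (When MaskFactor is 1, only the first field is still read, so a plain
+  // strided load such as vlse32.v would suffice rather than a segmented one;
+  // for now we simply bail out.)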
+ if (MaskFactor < 2) + return false; IRBuilder<> Builder(Load); const DataLayout &DL = Load->getDataLayout(); @@ -208,20 +218,37 @@ bool RISCVTargetLowering::lowerInterleavedLoad( Value *Ptr, *VL; Align Alignment; - if (!getMemOperands(Factor, VTy, XLenTy, Load, Ptr, Mask, VL, Alignment)) + if (!getMemOperands(MaskFactor, VTy, XLenTy, Load, Ptr, Mask, VL, Alignment)) return false; Type *PtrTy = Ptr->getType(); unsigned AS = PtrTy->getPointerAddressSpace(); - if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL)) + if (!isLegalInterleavedAccessType(VTy, MaskFactor, Alignment, AS, DL)) return false; - CallInst *VlsegN = Builder.CreateIntrinsic( - FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL}); + CallInst *SegLoad = nullptr; + if (MaskFactor < Factor) { + // Lower to strided segmented load. + unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType()); + Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); + SegLoad = Builder.CreateIntrinsic(FixedVlssegIntrIds[MaskFactor - 2], + {VTy, PtrTy, XLenTy, XLenTy}, + {Ptr, Stride, Mask, VL}); + } else { + // Lower to normal segmented load. + SegLoad = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2], + {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL}); + } for (unsigned i = 0; i < Shuffles.size(); i++) { - Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]); - Shuffles[i]->replaceAllUsesWith(SubVec); + unsigned FactorIdx = Indices[i]; + if (FactorIdx >= MaskFactor) { + // Replace masked-off factors (that are still extracted) with poison. + Shuffles[i]->replaceAllUsesWith(PoisonValue::get(VTy)); + } else { + Value *SubVec = Builder.CreateExtractValue(SegLoad, FactorIdx); + Shuffles[i]->replaceAllUsesWith(SubVec); + } } return true; diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 547b2210fdbf0..242d24b5faf60 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1663,8 +1663,8 @@ namespace llvm { /// instructions/intrinsics. bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, - unsigned Factor) const override; + ArrayRef Indices, unsigned Factor, + unsigned MaskFactor) const override; /// Lower interleaved store(s) into target specific /// instructions/intrinsics. diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp index 636b072837441..6929c869b1a31 100644 --- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp +++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -802,7 +802,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { // Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX. 
bool X86TargetLowering::lowerInterleavedLoad( Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, unsigned Factor) const { + ArrayRef Indices, unsigned Factor, unsigned MaskFactor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); assert(!Shuffles.empty() && "Empty shufflevector input"); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 2df26b2f78d5b..497b39fb6f044 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -334,78 +334,12 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_poison_shufflemask(ptr define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_skip_fields(ptr %ptr) { ; mask = 1111, skip the last field. -; RV32-LABEL: vpload_factor3_skip_fields: -; RV32: # %bb.0: -; RV32-NEXT: li a1, 1755 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV32-NEXT: vmv.s.x v0, a1 -; RV32-NEXT: li a1, 73 -; RV32-NEXT: vmv.v.i v10, 8 -; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma -; RV32-NEXT: vle32.v v12, (a0), v0.t -; RV32-NEXT: li a0, 36 -; RV32-NEXT: vmv.s.x v11, a1 -; RV32-NEXT: lui a1, %hi(.LCPI17_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI17_0) -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vcompress.vm v8, v12, v11 -; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; RV32-NEXT: vslidedown.vi v16, v12, 8 -; RV32-NEXT: vmv1r.v v0, v10 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t -; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; RV32-NEXT: vmv.v.i v0, 2 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vslidedown.vi v14, v12, 1 -; RV32-NEXT: vslidedown.vi v14, v12, 3, v0.t -; RV32-NEXT: vle16.v v9, (a1) -; RV32-NEXT: vmv1r.v v0, v10 -; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t -; RV32-NEXT: vmv.s.x v0, a0 -; RV32-NEXT: vmerge.vvm v12, v16, v12, v0 -; RV32-NEXT: vrgatherei16.vv v10, v12, v9 -; RV32-NEXT: vmv1r.v v9, v14 -; RV32-NEXT: ret -; -; RV64-LABEL: vpload_factor3_skip_fields: -; RV64: # %bb.0: -; RV64-NEXT: li a1, 1755 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: li a1, 73 -; RV64-NEXT: vmv.v.i v10, 8 -; RV64-NEXT: vmv.s.x v11, a1 -; RV64-NEXT: li a1, 36 -; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma -; RV64-NEXT: vle32.v v12, (a0), v0.t -; RV64-NEXT: li a0, 3 -; RV64-NEXT: slli a0, a0, 32 -; RV64-NEXT: addi a0, a0, 5 -; RV64-NEXT: slli a0, a0, 16 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vcompress.vm v8, v12, v11 -; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; RV64-NEXT: vslidedown.vi v16, v12, 8 -; RV64-NEXT: vmv1r.v v0, v10 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t -; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; RV64-NEXT: vmv.v.i v0, 2 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64-NEXT: vslidedown.vi v14, v12, 1 -; RV64-NEXT: vslidedown.vi v14, v12, 3, v0.t -; RV64-NEXT: vmv1r.v v0, v10 -; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t -; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: vmerge.vvm v12, v16, v12, v0 -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vmv.v.x v9, a0 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vrgatherei16.vv v10, v12, v9 -; RV64-NEXT: vmv1r.v v9, v14 -; RV64-NEXT: ret +; CHECK-LABEL: vpload_factor3_skip_fields: +; CHECK: # %bb.0: 
+; CHECK-NEXT: li a1, 12 +; CHECK-NEXT: vsetivli zero, 6, e32, m1, ta, ma +; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1 +; CHECK-NEXT: ret %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> , i32 12) %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> @@ -418,78 +352,13 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_skip_fields(ptr %ptr) { define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_mask_skip_fields(ptr %ptr) { ; mask = 0101, skip the last field. -; RV32-LABEL: vpload_factor3_mask_skip_fields: -; RV32: # %bb.0: -; RV32-NEXT: li a1, 1560 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV32-NEXT: vmv.s.x v0, a1 -; RV32-NEXT: li a1, 73 -; RV32-NEXT: vmv.v.i v10, 8 -; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma -; RV32-NEXT: vle32.v v12, (a0), v0.t -; RV32-NEXT: li a0, 36 -; RV32-NEXT: vmv.s.x v11, a1 -; RV32-NEXT: lui a1, %hi(.LCPI18_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI18_0) -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vcompress.vm v8, v12, v11 -; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; RV32-NEXT: vslidedown.vi v16, v12, 8 -; RV32-NEXT: vmv1r.v v0, v10 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t -; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; RV32-NEXT: vmv.v.i v0, 2 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vslidedown.vi v14, v12, 1 -; RV32-NEXT: vslidedown.vi v14, v12, 3, v0.t -; RV32-NEXT: vle16.v v9, (a1) -; RV32-NEXT: vmv1r.v v0, v10 -; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t -; RV32-NEXT: vmv.s.x v0, a0 -; RV32-NEXT: vmerge.vvm v12, v16, v12, v0 -; RV32-NEXT: vrgatherei16.vv v10, v12, v9 -; RV32-NEXT: vmv1r.v v9, v14 -; RV32-NEXT: ret -; -; RV64-LABEL: vpload_factor3_mask_skip_fields: -; RV64: # %bb.0: -; RV64-NEXT: li a1, 1560 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: li a1, 73 -; RV64-NEXT: vmv.v.i v10, 8 -; RV64-NEXT: vmv.s.x v11, a1 -; RV64-NEXT: li a1, 36 -; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma -; RV64-NEXT: vle32.v v12, (a0), v0.t -; RV64-NEXT: li a0, 3 -; RV64-NEXT: slli a0, a0, 32 -; RV64-NEXT: addi a0, a0, 5 -; RV64-NEXT: slli a0, a0, 16 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vcompress.vm v8, v12, v11 -; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; RV64-NEXT: vslidedown.vi v16, v12, 8 -; RV64-NEXT: vmv1r.v v0, v10 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t -; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; RV64-NEXT: vmv.v.i v0, 2 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64-NEXT: vslidedown.vi v14, v12, 1 -; RV64-NEXT: vslidedown.vi v14, v12, 3, v0.t -; RV64-NEXT: vmv1r.v v0, v10 -; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t -; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: vmerge.vvm v12, v16, v12, v0 -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vmv.v.x v9, a0 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vrgatherei16.vv v10, v12, v9 -; RV64-NEXT: vmv1r.v v9, v14 -; RV64-NEXT: ret +; CHECK-LABEL: vpload_factor3_mask_skip_fields: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v0, 10 +; CHECK-NEXT: li a1, 12 +; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> , i32 12) %v0 = shufflevector 
<12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> @@ -2282,72 +2151,12 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_mask(ptr %ptr) { } define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_skip_field(ptr %ptr) { -; RV32-LABEL: maskedload_factor3_skip_field: -; RV32: # %bb.0: -; RV32-NEXT: li a1, 1755 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV32-NEXT: vmv.s.x v0, a1 -; RV32-NEXT: li a1, 73 -; RV32-NEXT: vmv.v.i v10, 8 -; RV32-NEXT: vmv.s.x v11, a1 -; RV32-NEXT: li a1, 146 -; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma -; RV32-NEXT: vle32.v v12, (a0), v0.t -; RV32-NEXT: li a0, 36 -; RV32-NEXT: vmv.s.x v20, a1 -; RV32-NEXT: lui a1, %hi(.LCPI66_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI66_0) -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vle16.v v21, (a1) -; RV32-NEXT: vcompress.vm v8, v12, v11 -; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; RV32-NEXT: vslidedown.vi v16, v12, 8 -; RV32-NEXT: vmv1r.v v0, v10 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t -; RV32-NEXT: vcompress.vm v14, v12, v20 -; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t -; RV32-NEXT: vmv.s.x v0, a0 -; RV32-NEXT: vmerge.vvm v12, v16, v12, v0 -; RV32-NEXT: vrgatherei16.vv v10, v12, v21 -; RV32-NEXT: vmv1r.v v9, v14 -; RV32-NEXT: ret -; -; RV64-LABEL: maskedload_factor3_skip_field: -; RV64: # %bb.0: -; RV64-NEXT: li a1, 1755 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: li a1, 73 -; RV64-NEXT: vmv.v.i v10, 8 -; RV64-NEXT: vmv.s.x v11, a1 -; RV64-NEXT: li a1, 146 -; RV64-NEXT: vmv.s.x v20, a1 -; RV64-NEXT: li a1, 36 -; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma -; RV64-NEXT: vle32.v v12, (a0), v0.t -; RV64-NEXT: li a0, 3 -; RV64-NEXT: slli a0, a0, 32 -; RV64-NEXT: addi a0, a0, 5 -; RV64-NEXT: slli a0, a0, 16 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vcompress.vm v8, v12, v11 -; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; RV64-NEXT: vslidedown.vi v16, v12, 8 -; RV64-NEXT: vmv1r.v v0, v10 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t -; RV64-NEXT: vcompress.vm v14, v12, v20 -; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t -; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: vmerge.vvm v12, v16, v12, v0 -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vmv.v.x v9, a0 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vrgatherei16.vv v10, v12, v9 -; RV64-NEXT: vmv1r.v v9, v14 -; RV64-NEXT: ret +; CHECK-LABEL: maskedload_factor3_skip_field: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 12 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1 +; CHECK-NEXT: ret %interleaved.vec = tail call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %ptr, i32 4, <12 x i1> , <12 x i32> poison) ; mask = 1111, skip last field %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> @@ -2360,72 +2169,13 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_skip_field(ptr %ptr } define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_mask_skip_field(ptr %ptr) { -; RV32-LABEL: maskedload_factor3_mask_skip_field: -; RV32: # %bb.0: -; RV32-NEXT: li a1, 195 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV32-NEXT: vmv.s.x v0, a1 -; RV32-NEXT: li a1, 73 -; RV32-NEXT: vmv.v.i v10, 8 -; RV32-NEXT: vmv.s.x v11, a1 -; RV32-NEXT: li a1, 146 -; RV32-NEXT: vsetivli 
zero, 12, e32, m4, ta, ma -; RV32-NEXT: vle32.v v12, (a0), v0.t -; RV32-NEXT: li a0, 36 -; RV32-NEXT: vmv.s.x v20, a1 -; RV32-NEXT: lui a1, %hi(.LCPI67_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI67_0) -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vle16.v v21, (a1) -; RV32-NEXT: vcompress.vm v8, v12, v11 -; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; RV32-NEXT: vslidedown.vi v16, v12, 8 -; RV32-NEXT: vmv1r.v v0, v10 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t -; RV32-NEXT: vcompress.vm v14, v12, v20 -; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t -; RV32-NEXT: vmv.s.x v0, a0 -; RV32-NEXT: vmerge.vvm v12, v16, v12, v0 -; RV32-NEXT: vrgatherei16.vv v10, v12, v21 -; RV32-NEXT: vmv1r.v v9, v14 -; RV32-NEXT: ret -; -; RV64-LABEL: maskedload_factor3_mask_skip_field: -; RV64: # %bb.0: -; RV64-NEXT: li a1, 195 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: li a1, 73 -; RV64-NEXT: vmv.v.i v10, 8 -; RV64-NEXT: vmv.s.x v11, a1 -; RV64-NEXT: li a1, 146 -; RV64-NEXT: vmv.s.x v20, a1 -; RV64-NEXT: li a1, 36 -; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma -; RV64-NEXT: vle32.v v12, (a0), v0.t -; RV64-NEXT: li a0, 3 -; RV64-NEXT: slli a0, a0, 32 -; RV64-NEXT: addi a0, a0, 5 -; RV64-NEXT: slli a0, a0, 16 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vcompress.vm v8, v12, v11 -; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; RV64-NEXT: vslidedown.vi v16, v12, 8 -; RV64-NEXT: vmv1r.v v0, v10 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t -; RV64-NEXT: vcompress.vm v14, v12, v20 -; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t -; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: vmerge.vvm v12, v16, v12, v0 -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vmv.v.x v9, a0 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vrgatherei16.vv v10, v12, v9 -; RV64-NEXT: vmv1r.v v9, v14 -; RV64-NEXT: ret +; CHECK-LABEL: maskedload_factor3_mask_skip_field: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v0, 5 +; CHECK-NEXT: li a1, 12 +; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %interleaved.vec = tail call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %ptr, i32 4, <12 x i1> , <12 x i32> poison) ; mask = 1010, skip the last field %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> @@ -2521,173 +2271,12 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_invalid_skip_field( define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor5_skip_fields(ptr %ptr) { ; mask = 1111, skip the last two fields. 
-; RV32-LABEL: maskedload_factor5_skip_fields: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -256 -; RV32-NEXT: .cfi_def_cfa_offset 256 -; RV32-NEXT: sw ra, 252(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 248(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 256 -; RV32-NEXT: .cfi_def_cfa s0, 0 -; RV32-NEXT: andi sp, sp, -128 -; RV32-NEXT: lui a1, 58 -; RV32-NEXT: addi a1, a1, -793 -; RV32-NEXT: vsetivli zero, 20, e32, m8, ta, ma -; RV32-NEXT: vmv.s.x v0, a1 -; RV32-NEXT: li a1, 33 -; RV32-NEXT: vle32.v v16, (a0), v0.t -; RV32-NEXT: li a0, 32 -; RV32-NEXT: mv a2, sp -; RV32-NEXT: vmv.s.x v0, a1 -; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; RV32-NEXT: vslidedown.vi v8, v16, 8 -; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32-NEXT: vslidedown.vi v12, v16, 6 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: vslidedown.vi v13, v16, 1 -; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vse32.v v16, (a2) -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 -; RV32-NEXT: vslidedown.vi v10, v16, 7 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: vslidedown.vi v11, v16, 2 -; RV32-NEXT: vslidedown.vi v18, v16, 3 -; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32-NEXT: vslidedown.vi v14, v16, 4 -; RV32-NEXT: vmv.x.s a0, v12 -; RV32-NEXT: vmv.x.s a1, v13 -; RV32-NEXT: vmv.x.s a2, v11 -; RV32-NEXT: vmv.x.s a3, v18 -; RV32-NEXT: vmv.x.s a4, v14 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v11, a1 -; RV32-NEXT: vmv.v.x v12, a2 -; RV32-NEXT: vmv.v.x v13, a3 -; RV32-NEXT: vmv.v.x v14, a4 -; RV32-NEXT: lw a1, 32(sp) -; RV32-NEXT: lw a2, 36(sp) -; RV32-NEXT: lw a3, 44(sp) -; RV32-NEXT: lw a4, 48(sp) -; RV32-NEXT: vslide1down.vx v11, v11, a0 -; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: vslide1down.vx v10, v12, a0 -; RV32-NEXT: vslide1down.vx v11, v11, a3 -; RV32-NEXT: vslide1down.vx v10, v10, a4 -; RV32-NEXT: vslide1down.vx v12, v13, a1 -; RV32-NEXT: lw a0, 64(sp) -; RV32-NEXT: lw a1, 52(sp) -; RV32-NEXT: lw a3, 56(sp) -; RV32-NEXT: lw a4, 68(sp) -; RV32-NEXT: vslide1down.vx v14, v14, a2 -; RV32-NEXT: vslide1down.vx v13, v11, a0 -; RV32-NEXT: vmv.v.i v0, 10 -; RV32-NEXT: vslide1down.vx v10, v10, a4 -; RV32-NEXT: vslide1down.vx v11, v12, a1 -; RV32-NEXT: lw a0, 72(sp) -; RV32-NEXT: lw a1, 76(sp) -; RV32-NEXT: vslide1down.vx v12, v14, a3 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vslidedown.vi v8, v8, 4, v0.t -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vslide1down.vx v11, v11, a0 -; RV32-NEXT: vslide1down.vx v12, v12, a1 -; RV32-NEXT: vmv1r.v v9, v13 -; RV32-NEXT: addi sp, s0, -256 -; RV32-NEXT: .cfi_def_cfa sp, 256 -; RV32-NEXT: lw ra, 252(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s0, 248(sp) # 4-byte Folded Reload -; RV32-NEXT: .cfi_restore ra -; RV32-NEXT: .cfi_restore s0 -; RV32-NEXT: addi sp, sp, 256 -; RV32-NEXT: .cfi_def_cfa_offset 0 -; RV32-NEXT: ret -; -; RV64-LABEL: maskedload_factor5_skip_fields: -; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -256 -; RV64-NEXT: .cfi_def_cfa_offset 256 -; RV64-NEXT: sd ra, 248(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 240(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 256 -; RV64-NEXT: .cfi_def_cfa s0, 0 -; RV64-NEXT: andi sp, sp, -128 -; RV64-NEXT: lui a1, 58 -; RV64-NEXT: addi a1, a1, -793 -; RV64-NEXT: vsetivli zero, 20, e32, m8, ta, ma -; RV64-NEXT: vmv.s.x v0, a1 -; 
RV64-NEXT: li a1, 33 -; RV64-NEXT: vle32.v v16, (a0), v0.t -; RV64-NEXT: li a0, 32 -; RV64-NEXT: mv a2, sp -; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; RV64-NEXT: vslidedown.vi v8, v16, 8 -; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64-NEXT: vslidedown.vi v12, v16, 6 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-NEXT: vslidedown.vi v13, v16, 1 -; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV64-NEXT: vse32.v v16, (a2) -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vmerge.vvm v8, v8, v16, v0 -; RV64-NEXT: vslidedown.vi v10, v16, 7 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-NEXT: vslidedown.vi v11, v16, 2 -; RV64-NEXT: vslidedown.vi v18, v16, 3 -; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64-NEXT: vslidedown.vi v14, v16, 4 -; RV64-NEXT: vmv.x.s a0, v12 -; RV64-NEXT: vmv.x.s a1, v13 -; RV64-NEXT: vmv.x.s a2, v11 -; RV64-NEXT: vmv.x.s a3, v18 -; RV64-NEXT: vmv.x.s a4, v14 -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.v.x v11, a1 -; RV64-NEXT: vmv.v.x v12, a2 -; RV64-NEXT: vmv.v.x v13, a3 -; RV64-NEXT: vmv.v.x v14, a4 -; RV64-NEXT: lw a1, 32(sp) -; RV64-NEXT: lw a2, 36(sp) -; RV64-NEXT: lw a3, 44(sp) -; RV64-NEXT: lw a4, 48(sp) -; RV64-NEXT: vslide1down.vx v11, v11, a0 -; RV64-NEXT: vmv.x.s a0, v10 -; RV64-NEXT: vslide1down.vx v10, v12, a0 -; RV64-NEXT: vslide1down.vx v11, v11, a3 -; RV64-NEXT: vslide1down.vx v10, v10, a4 -; RV64-NEXT: vslide1down.vx v12, v13, a1 -; RV64-NEXT: lw a0, 64(sp) -; RV64-NEXT: lw a1, 52(sp) -; RV64-NEXT: lw a3, 56(sp) -; RV64-NEXT: lw a4, 68(sp) -; RV64-NEXT: vslide1down.vx v14, v14, a2 -; RV64-NEXT: vslide1down.vx v13, v11, a0 -; RV64-NEXT: vmv.v.i v0, 10 -; RV64-NEXT: vslide1down.vx v10, v10, a4 -; RV64-NEXT: vslide1down.vx v11, v12, a1 -; RV64-NEXT: lw a0, 72(sp) -; RV64-NEXT: lw a1, 76(sp) -; RV64-NEXT: vslide1down.vx v12, v14, a3 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64-NEXT: vslidedown.vi v8, v8, 4, v0.t -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vslide1down.vx v11, v11, a0 -; RV64-NEXT: vslide1down.vx v12, v12, a1 -; RV64-NEXT: vmv1r.v v9, v13 -; RV64-NEXT: addi sp, s0, -256 -; RV64-NEXT: .cfi_def_cfa sp, 256 -; RV64-NEXT: ld ra, 248(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s0, 240(sp) # 8-byte Folded Reload -; RV64-NEXT: .cfi_restore ra -; RV64-NEXT: .cfi_restore s0 -; RV64-NEXT: addi sp, sp, 256 -; RV64-NEXT: .cfi_def_cfa_offset 0 -; RV64-NEXT: ret +; CHECK-LABEL: maskedload_factor5_skip_fields: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 20 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlsseg3e32.v v8, (a0), a1 +; CHECK-NEXT: ret %interleaved.vec = tail call <20 x i32> @llvm.masked.load(ptr %ptr, i32 4, <20 x i1> , <20 x i32> poison) %v0 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> %v1 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> From 95f772e818601ddf5f54e76ec518715f3929eeb2 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Mon, 4 Aug 2025 11:16:21 -0700 Subject: [PATCH 3/9] fixup! 
Clean up the tests --- .../rvv/fixed-vectors-interleaved-access.ll | 48 ++++++++----------- 1 file changed, 21 insertions(+), 27 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 497b39fb6f044..a61a1b7cf9703 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -332,7 +332,7 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_poison_shufflemask(ptr ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 } -define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_skip_fields(ptr %ptr) { +define {<4 x i32>, <4 x i32>} @vpload_factor3_skip_fields(ptr %ptr) { ; mask = 1111, skip the last field. ; CHECK-LABEL: vpload_factor3_skip_fields: ; CHECK: # %bb.0: @@ -344,13 +344,12 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_skip_fields(ptr %ptr) { %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> - %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 - %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 - %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 - ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 + %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + ret {<4 x i32>, <4 x i32>} %res1 } -define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_mask_skip_fields(ptr %ptr) { +define {<4 x i32>, <4 x i32>} @vpload_factor3_mask_skip_fields(ptr %ptr) { ; mask = 0101, skip the last field. 
; CHECK-LABEL: vpload_factor3_mask_skip_fields: ; CHECK: # %bb.0: @@ -363,10 +362,9 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_mask_skip_fields(ptr %p %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> - %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 - %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 - %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 - ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 + %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + ret {<4 x i32>, <4 x i32>} %res1 } define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor4(ptr %ptr) { @@ -2150,7 +2148,7 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_mask(ptr %ptr) { ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 } -define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_skip_field(ptr %ptr) { +define {<4 x i32>, <4 x i32>} @maskedload_factor3_skip_field(ptr %ptr) { ; CHECK-LABEL: maskedload_factor3_skip_field: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 12 @@ -2162,13 +2160,12 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_skip_field(ptr %ptr %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> - %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 - %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 - %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 - ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 + %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + ret {<4 x i32>, <4 x i32>} %res1 } -define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_mask_skip_field(ptr %ptr) { +define {<4 x i32>, <4 x i32>} @maskedload_factor3_mask_skip_field(ptr %ptr) { ; CHECK-LABEL: maskedload_factor3_mask_skip_field: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma @@ -2181,10 +2178,9 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_mask_skip_field(ptr %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> - %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 - %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 - %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 - ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 + %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + ret {<4 x i32>, <4 x i32>} %res1 } ; We can only skip the last field for now. 
@@ -2269,7 +2265,7 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_invalid_skip_field( ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 } -define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor5_skip_fields(ptr %ptr) { +define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor5_skip_fields(ptr %ptr) { ; mask = 1111, skip the last two fields. ; CHECK-LABEL: maskedload_factor5_skip_fields: ; CHECK: # %bb.0: @@ -2283,11 +2279,9 @@ define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @maskedload_facto %v2 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> %v3 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> %v4 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> - %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 - %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 - %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 - %res3 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res2, <4 x i32> %v3, 3 - %res4 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res3, <4 x i32> %v4, 4 - ret {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res4 + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 } From 7bb4ec398211736b42315945976ac5eba32ac71b Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Tue, 5 Aug 2025 14:34:12 -0700 Subject: [PATCH 4/9] fixup! Recognizing masks assembled by AND --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 89 +++++++++----- .../rvv/fixed-vectors-interleaved-access.ll | 114 ++++++++++++------ 2 files changed, 135 insertions(+), 68 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index e6c4de23c055e..81efb7b335dbc 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -385,25 +385,25 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load); Value *Mask = nullptr; - unsigned MaskFactor = Factor; + unsigned GapMaskFactor = Factor; if (LI) { LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n"); } else { // Check mask operand. Handle both all-true/false and interleaved mask. - std::tie(Mask, MaskFactor) = getMask(getMaskOperand(II), Factor, VecTy); + std::tie(Mask, GapMaskFactor) = getMask(getMaskOperand(II), Factor, VecTy); if (!Mask) return false; LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load or masked.load: " << *Load << "\n"); LLVM_DEBUG(dbgs() << "IA: With nominal factor " << Factor - << " and mask factor " << MaskFactor << "\n"); + << " and mask factor " << GapMaskFactor << "\n"); } // Try to create target specific intrinsics to replace the load and // shuffles. if (!TLI->lowerInterleavedLoad(cast(Load), Mask, Shuffles, - Indices, Factor, MaskFactor)) + Indices, Factor, GapMaskFactor)) // If Extracts is not empty, tryReplaceExtracts made changes earlier. 
return !Extracts.empty() || BinOpShuffleChanged; @@ -540,15 +540,20 @@ bool InterleavedAccessImpl::lowerInterleavedStore( "number of stored element should be a multiple of Factor"); Value *Mask = nullptr; + unsigned GapMaskFactor = Factor; if (SI) { LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n"); } else { // Check mask operand. Handle both all-true/false and interleaved mask. unsigned LaneMaskLen = NumStoredElements / Factor; - std::tie(Mask, std::ignore) = getMask(getMaskOperand(II), Factor, - ElementCount::getFixed(LaneMaskLen)); + std::tie(Mask, GapMaskFactor) = getMask( + getMaskOperand(II), Factor, ElementCount::getFixed(LaneMaskLen)); if (!Mask) return false; + // We shouldn't transform stores even it has a gap mask. And since we might + // already change the IR, we're returning true here. + if (GapMaskFactor != Factor) + return true; LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store or masked.store: " << *Store << "\n"); @@ -565,8 +570,40 @@ bool InterleavedAccessImpl::lowerInterleavedStore( return true; } +// A wide mask <1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0> could be used to skip the +// last field in a factor-of-three interleaved store or deinterleaved load (in +// which case LeafMaskLen is 4). Such (wide) mask is also known as gap mask. +// This helper function tries to detect this pattern and return the actual +// factor we're accessing, which is 2 in this example. +static unsigned getGapMaskFactor(const Constant &MaskConst, unsigned Factor, + unsigned LeafMaskLen) { + APInt FactorMask(Factor, 0); + FactorMask.setAllBits(); + for (unsigned F = 0U; F < Factor; ++F) { + unsigned Idx; + for (Idx = 0U; Idx < LeafMaskLen; ++Idx) { + Constant *C = MaskConst.getAggregateElement(F + Idx * Factor); + if (!C->isZeroValue()) + break; + } + // All mask bits on this field are zero, skipping it. + if (Idx >= LeafMaskLen) + FactorMask.clearBit(F); + } + // We currently only allow gaps in the "trailing" factors / fields. So + // given the original factor being 4, we can skip fields 2 and 3, but we + // cannot only skip fields 1 and 2. If FactorMask does not match such + // pattern, reset it. + if (!FactorMask.isMask()) + FactorMask.setAllBits(); + + return FactorMask.popcount(); +} + static std::pair getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC) { + using namespace PatternMatch; + if (auto *IMI = dyn_cast(WideMask)) { if (unsigned F = getInterleaveIntrinsicFactor(IMI->getIntrinsicID()); F && F == Factor && llvm::all_equal(IMI->args())) { @@ -574,6 +611,18 @@ static std::pair getMask(Value *WideMask, unsigned Factor, } } + // Try to match `and , `. The WideMask here is + // expected to be a fixed vector and gap mask should be a constant mask. + Value *AndMaskLHS; + Constant *AndMaskRHS; + if (match(WideMask, m_c_And(m_Value(AndMaskLHS), m_Constant(AndMaskRHS))) && + LeafValueEC.isFixed()) { + assert(!isa(AndMaskLHS) && + "expect constants to be folded already"); + return {getMask(AndMaskLHS, Factor, LeafValueEC).first, + getGapMaskFactor(*AndMaskRHS, Factor, LeafValueEC.getFixedValue())}; + } + if (auto *ConstMask = dyn_cast(WideMask)) { if (auto *Splat = ConstMask->getSplatValue()) // All-ones or all-zeros mask. @@ -581,33 +630,17 @@ static std::pair getMask(Value *WideMask, unsigned Factor, if (LeafValueEC.isFixed()) { unsigned LeafMaskLen = LeafValueEC.getFixedValue(); - // First, check if the mask completely skips some of the factors / fields. 
- APInt FactorMask(Factor, 0); - FactorMask.setAllBits(); - for (unsigned F = 0U; F < Factor; ++F) { - unsigned Idx; - for (Idx = 0U; Idx < LeafMaskLen; ++Idx) { - Constant *C = ConstMask->getAggregateElement(F + Idx * Factor); - if (!C->isZeroValue()) - break; - } - // All mask bits on this field are zero, skipping it. - if (Idx >= LeafMaskLen) - FactorMask.clearBit(F); - } - // We currently only support skipping "trailing" factors / fields. So - // given the original factor being 4, we can skip fields 2 and 3, but we - // cannot only skip fields 1 and 2. If FactorMask does not match such - // pattern, reset it. - if (!FactorMask.isMask()) - FactorMask.setAllBits(); + // First, check if we use a gap mask to skip some of the factors / fields. + const unsigned GapMaskFactor = + getGapMaskFactor(*ConstMask, Factor, LeafMaskLen); + assert(GapMaskFactor <= Factor); SmallVector LeafMask(LeafMaskLen, nullptr); // If this is a fixed-length constant mask, each lane / leaf has to // use the same mask. This is done by checking if every group with Factor // number of elements in the interleaved mask has homogeneous values. for (unsigned Idx = 0U; Idx < LeafMaskLen * Factor; ++Idx) { - if (!FactorMask[Idx % Factor]) + if (Idx % Factor >= GapMaskFactor) continue; Constant *C = ConstMask->getAggregateElement(Idx); if (LeafMask[Idx / Factor] && LeafMask[Idx / Factor] != C) @@ -615,7 +648,7 @@ static std::pair getMask(Value *WideMask, unsigned Factor, LeafMask[Idx / Factor] = C; } - return {ConstantVector::get(LeafMask), FactorMask.popcount()}; + return {ConstantVector::get(LeafMask), GapMaskFactor}; } } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index a61a1b7cf9703..2c738e5aeb55b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -367,6 +367,24 @@ define {<4 x i32>, <4 x i32>} @vpload_factor3_mask_skip_fields(ptr %ptr) { ret {<4 x i32>, <4 x i32>} %res1 } +define {<4 x i32>, <4 x i32>} @vpload_factor3_combined_mask_skip_field(ptr %ptr, <4 x i1> %mask) { +; CHECK-LABEL: vpload_factor3_combined_mask_skip_field: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 12 +; CHECK-NEXT: vsetivli zero, 6, e32, m1, ta, ma +; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret + %interleaved.mask = shufflevector <4 x i1> %mask, <4 x i1> poison, <12 x i32> + %combined = and <12 x i1> %interleaved.mask, + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> %combined, i32 12) + ; mask = %mask, skip the last field + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + ret {<4 x i32>, <4 x i32>} %res1 +} + define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor4(ptr %ptr) { ; CHECK-LABEL: vpload_factor4: ; CHECK: # %bb.0: @@ -514,8 +532,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: li a2, 32 ; RV32-NEXT: lui a3, 12 ; RV32-NEXT: lui a6, 12291 -; RV32-NEXT: lui a7, %hi(.LCPI25_0) -; RV32-NEXT: addi a7, a7, %lo(.LCPI25_0) +; RV32-NEXT: lui a7, %hi(.LCPI26_0) +; RV32-NEXT: addi a7, a7, %lo(.LCPI26_0) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v24, (a5) ; RV32-NEXT: 
vmv.s.x v0, a3 @@ -600,12 +618,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill ; RV32-NEXT: lui a7, 49164 -; RV32-NEXT: lui a1, %hi(.LCPI25_1) -; RV32-NEXT: addi a1, a1, %lo(.LCPI25_1) +; RV32-NEXT: lui a1, %hi(.LCPI26_1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI26_1) ; RV32-NEXT: lui t2, 3 ; RV32-NEXT: lui t1, 196656 -; RV32-NEXT: lui a4, %hi(.LCPI25_3) -; RV32-NEXT: addi a4, a4, %lo(.LCPI25_3) +; RV32-NEXT: lui a4, %hi(.LCPI26_3) +; RV32-NEXT: addi a4, a4, %lo(.LCPI26_3) ; RV32-NEXT: lui t0, 786624 ; RV32-NEXT: li a5, 48 ; RV32-NEXT: lui a6, 768 @@ -784,8 +802,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v24, v8, v2 -; RV32-NEXT: lui a1, %hi(.LCPI25_2) -; RV32-NEXT: addi a1, a1, %lo(.LCPI25_2) +; RV32-NEXT: lui a1, %hi(.LCPI26_2) +; RV32-NEXT: addi a1, a1, %lo(.LCPI26_2) ; RV32-NEXT: lui a3, 3073 ; RV32-NEXT: addi a3, a3, -1024 ; RV32-NEXT: vmv.s.x v0, a3 @@ -849,16 +867,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vrgatherei16.vv v28, v8, v3 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v28, v24 -; RV32-NEXT: lui a1, %hi(.LCPI25_4) -; RV32-NEXT: addi a1, a1, %lo(.LCPI25_4) -; RV32-NEXT: lui a2, %hi(.LCPI25_5) -; RV32-NEXT: addi a2, a2, %lo(.LCPI25_5) +; RV32-NEXT: lui a1, %hi(.LCPI26_4) +; RV32-NEXT: addi a1, a1, %lo(.LCPI26_4) +; RV32-NEXT: lui a2, %hi(.LCPI26_5) +; RV32-NEXT: addi a2, a2, %lo(.LCPI26_5) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v24, (a2) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v8, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI25_7) -; RV32-NEXT: addi a1, a1, %lo(.LCPI25_7) +; RV32-NEXT: lui a1, %hi(.LCPI26_7) +; RV32-NEXT: addi a1, a1, %lo(.LCPI26_7) ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle16.v v10, (a1) ; RV32-NEXT: csrr a1, vlenb @@ -886,14 +904,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v16, v0, v10 -; RV32-NEXT: lui a1, %hi(.LCPI25_6) -; RV32-NEXT: addi a1, a1, %lo(.LCPI25_6) -; RV32-NEXT: lui a2, %hi(.LCPI25_8) -; RV32-NEXT: addi a2, a2, %lo(.LCPI25_8) +; RV32-NEXT: lui a1, %hi(.LCPI26_6) +; RV32-NEXT: addi a1, a1, %lo(.LCPI26_6) +; RV32-NEXT: lui a2, %hi(.LCPI26_8) +; RV32-NEXT: addi a2, a2, %lo(.LCPI26_8) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v4, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI25_9) -; RV32-NEXT: addi a1, a1, %lo(.LCPI25_9) +; RV32-NEXT: lui a1, %hi(.LCPI26_9) +; RV32-NEXT: addi a1, a1, %lo(.LCPI26_9) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v6, (a1) ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma @@ -980,8 +998,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: li a4, 128 ; RV64-NEXT: lui a1, 1 ; RV64-NEXT: vle64.v v8, (a3) -; RV64-NEXT: lui a3, %hi(.LCPI25_0) -; RV64-NEXT: addi a3, a3, %lo(.LCPI25_0) +; RV64-NEXT: lui a3, %hi(.LCPI26_0) +; RV64-NEXT: addi a3, a3, %lo(.LCPI26_0) ; RV64-NEXT: vmv.s.x v0, a4 ; RV64-NEXT: csrr a4, vlenb ; RV64-NEXT: li a5, 61 @@ -1169,8 +1187,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, 
<8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vslideup.vi v12, v16, 1, v0.t -; RV64-NEXT: lui a2, %hi(.LCPI25_1) -; RV64-NEXT: addi a2, a2, %lo(.LCPI25_1) +; RV64-NEXT: lui a2, %hi(.LCPI26_1) +; RV64-NEXT: addi a2, a2, %lo(.LCPI26_1) ; RV64-NEXT: li a3, 192 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v6, (a2) @@ -1204,8 +1222,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vrgatherei16.vv v24, v16, v6 ; RV64-NEXT: addi a2, sp, 16 ; RV64-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill -; RV64-NEXT: lui a2, %hi(.LCPI25_2) -; RV64-NEXT: addi a2, a2, %lo(.LCPI25_2) +; RV64-NEXT: lui a2, %hi(.LCPI26_2) +; RV64-NEXT: addi a2, a2, %lo(.LCPI26_2) ; RV64-NEXT: li a3, 1040 ; RV64-NEXT: vmv.s.x v0, a3 ; RV64-NEXT: addi a1, a1, -2016 @@ -1289,12 +1307,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill -; RV64-NEXT: lui a1, %hi(.LCPI25_3) -; RV64-NEXT: addi a1, a1, %lo(.LCPI25_3) +; RV64-NEXT: lui a1, %hi(.LCPI26_3) +; RV64-NEXT: addi a1, a1, %lo(.LCPI26_3) ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v20, (a1) -; RV64-NEXT: lui a1, %hi(.LCPI25_4) -; RV64-NEXT: addi a1, a1, %lo(.LCPI25_4) +; RV64-NEXT: lui a1, %hi(.LCPI26_4) +; RV64-NEXT: addi a1, a1, %lo(.LCPI26_4) ; RV64-NEXT: vle16.v v8, (a1) ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 77 @@ -1345,8 +1363,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl2r.v v8, (a1) # vscale x 16-byte Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vrgatherei16.vv v0, v16, v8 -; RV64-NEXT: lui a1, %hi(.LCPI25_5) -; RV64-NEXT: addi a1, a1, %lo(.LCPI25_5) +; RV64-NEXT: lui a1, %hi(.LCPI26_5) +; RV64-NEXT: addi a1, a1, %lo(.LCPI26_5) ; RV64-NEXT: vle16.v v20, (a1) ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 61 @@ -1963,8 +1981,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) { ; RV32-NEXT: vle32.v v12, (a0), v0.t ; RV32-NEXT: li a0, 36 ; RV32-NEXT: vmv.s.x v20, a1 -; RV32-NEXT: lui a1, %hi(.LCPI61_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI61_0) +; RV32-NEXT: lui a1, %hi(.LCPI62_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI62_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v21, (a1) ; RV32-NEXT: vcompress.vm v8, v12, v11 @@ -2039,8 +2057,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) { ; RV32-NEXT: vmv.s.x v10, a0 ; RV32-NEXT: li a0, 146 ; RV32-NEXT: vmv.s.x v11, a0 -; RV32-NEXT: lui a0, %hi(.LCPI62_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI62_0) +; RV32-NEXT: lui a0, %hi(.LCPI63_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI63_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v20, (a0) ; RV32-NEXT: li a0, 36 @@ -2159,7 +2177,6 @@ define {<4 x i32>, <4 x i32>} @maskedload_factor3_skip_field(ptr %ptr) { ; mask = 1111, skip last field %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> - %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 ret {<4 x i32>, <4 x i32>} %res1 @@ -2177,7 +2194,24 @@ 
define {<4 x i32>, <4 x i32>} @maskedload_factor3_mask_skip_field(ptr %ptr) { ; mask = 1010, skip the last field %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> - %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + ret {<4 x i32>, <4 x i32>} %res1 +} + +define {<4 x i32>, <4 x i32>} @maskedload_factor3_combined_mask_skip_field(ptr %ptr, <4 x i1> %mask) { +; CHECK-LABEL: maskedload_factor3_combined_mask_skip_field: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 12 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret + %interleaved.mask = shufflevector <4 x i1> %mask, <4 x i1> poison, <12 x i32> + %combined = and <12 x i1> %interleaved.mask, + %interleaved.vec = tail call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %ptr, i32 4, <12 x i1> %combined, <12 x i32> poison) + ; mask = %mask, skip the last field + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 ret {<4 x i32>, <4 x i32>} %res1 @@ -2200,8 +2234,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_invalid_skip_field( ; RV32-NEXT: vle32.v v12, (a0), v0.t ; RV32-NEXT: li a0, 36 ; RV32-NEXT: vmv.s.x v20, a1 -; RV32-NEXT: lui a1, %hi(.LCPI68_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI68_0) +; RV32-NEXT: lui a1, %hi(.LCPI70_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI70_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v21, (a1) ; RV32-NEXT: vcompress.vm v8, v12, v11 From f5507fb761a55083daff9896d551b350d21f1280 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Tue, 5 Aug 2025 17:43:35 -0700 Subject: [PATCH 5/9] fixup! 
Reject cases where Factor != MaskFactor in other targets --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 3 +++ llvm/lib/Target/ARM/ARMISelLowering.cpp | 3 +++ llvm/lib/Target/X86/X86InterleavedAccess.cpp | 3 +++ 3 files changed, 9 insertions(+) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index e681d846f9e1c..632bb79fa02e4 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -17266,6 +17266,9 @@ bool AArch64TargetLowering::lowerInterleavedLoad( return false; assert(!Mask && "Unexpected mask on a load"); + if (Factor != MaskFactor) + return false; + const DataLayout &DL = LI->getDataLayout(); VectorType *VTy = Shuffles[0]->getType(); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index a5750def66b7d..c087e32cd4787 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21611,6 +21611,9 @@ bool ARMTargetLowering::lowerInterleavedLoad( return false; assert(!Mask && "Unexpected mask on a load"); + if (Factor != MaskFactor) + return false; + auto *VecTy = cast(Shuffles[0]->getType()); Type *EltTy = VecTy->getElementType(); diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp index 6929c869b1a31..52132a9d64b1a 100644 --- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp +++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -814,6 +814,9 @@ bool X86TargetLowering::lowerInterleavedLoad( return false; assert(!Mask && "Unexpected mask on a load"); + if (Factor != MaskFactor) + return false; + // Create an interleaved access group. IRBuilder<> Builder(LI); X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget, From 8e4b79edae79a713a6c40066c2895e8a06fbc5b9 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Wed, 6 Aug 2025 09:37:12 -0700 Subject: [PATCH 6/9] fixup! Address review comments --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 81efb7b335dbc..7c3b0db50f2ad 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -580,14 +580,16 @@ static unsigned getGapMaskFactor(const Constant &MaskConst, unsigned Factor, APInt FactorMask(Factor, 0); FactorMask.setAllBits(); for (unsigned F = 0U; F < Factor; ++F) { - unsigned Idx; - for (Idx = 0U; Idx < LeafMaskLen; ++Idx) { + bool AllZero = true; + for (unsigned Idx = 0U; Idx < LeafMaskLen; ++Idx) { Constant *C = MaskConst.getAggregateElement(F + Idx * Factor); - if (!C->isZeroValue()) + if (!C->isZeroValue()) { + AllZero = false; break; + } } // All mask bits on this field are zero, skipping it. - if (Idx >= LeafMaskLen) + if (AllZero) FactorMask.clearBit(F); } // We currently only allow gaps in the "trailing" factors / fields. So @@ -705,10 +707,12 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( return false; // Check mask operand. Handle both all-true/false and interleaved mask. 
- std::tie(Mask, std::ignore) = + unsigned GapMaskFactor; + std::tie(Mask, GapMaskFactor) = getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI)); if (!Mask) return false; + assert(GapMaskFactor == Factor); LLVM_DEBUG(dbgs() << "IA: Found a vp.load or masked.load with deinterleave" << " intrinsic " << *DI << " and factor = " @@ -747,11 +751,13 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( II->getIntrinsicID() != Intrinsic::vp_store) return false; // Check mask operand. Handle both all-true/false and interleaved mask. - std::tie(Mask, std::ignore) = + unsigned GapMaskFactor; + std::tie(Mask, GapMaskFactor) = getMask(getMaskOperand(II), Factor, cast(InterleaveValues[0]->getType())); if (!Mask) return false; + assert(GapMaskFactor == Factor); LLVM_DEBUG(dbgs() << "IA: Found a vp.store or masked.store with interleave" << " intrinsic " << *IntII << " and factor = " From 3f992d6c2051d66916930d8fd96798db680abab2 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Tue, 12 Aug 2025 11:47:23 -0700 Subject: [PATCH 7/9] fixup! Passing APInt GapMask instead of MaskFactor --- llvm/include/llvm/CodeGen/TargetLowering.h | 6 +- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 103 +++++++++--------- .../Target/AArch64/AArch64ISelLowering.cpp | 6 +- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 +- llvm/lib/Target/ARM/ARMISelLowering.cpp | 6 +- llvm/lib/Target/ARM/ARMISelLowering.h | 2 +- llvm/lib/Target/RISCV/RISCVISelLowering.h | 2 +- .../Target/RISCV/RISCVInterleavedAccess.cpp | 10 +- llvm/lib/Target/X86/X86ISelLowering.h | 2 +- llvm/lib/Target/X86/X86InterleavedAccess.cpp | 6 +- 10 files changed, 69 insertions(+), 76 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 3239b35031e36..3d7d74593533b 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3209,12 +3209,12 @@ class LLVM_ABI TargetLoweringBase { /// \p Shuffles is the shufflevector list to DE-interleave the loaded vector. /// \p Indices is the corresponding indices for each shufflevector. /// \p Factor is the interleave factor. - /// \p MaskFactor is the interleave factor that considers mask, which can - /// reduce the original factor. + /// \p GapMask is a mask in which inactive lanes represent components / fields + /// that are always skipped. virtual bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor, - unsigned MaskFactor) const { + const APInt &GapMask) const { return false; } diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 7c3b0db50f2ad..7956e02ac1fc7 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -271,16 +271,13 @@ static Value *getMaskOperand(IntrinsicInst *II) { // Return a pair of // (1) The corresponded deinterleaved mask, or nullptr if there is no valid // mask. -// (2) Some mask effectively skips a certain field, this element contains -// the factor after taking such contraction into consideration. Note that -// currently we only support skipping trailing fields. So if the "nominal" -// factor was 5, you cannot only skip field 1 and 2, but you can skip field 3 -// and 4. 
-static std::pair getMask(Value *WideMask, unsigned Factor, - ElementCount LeafValueEC); - -static std::pair getMask(Value *WideMask, unsigned Factor, - VectorType *LeafValueTy) { +// (2) Some mask effectively skips a certain field, and this element is a mask +// in which inactive lanes represent fields that are skipped (i.e. "gaps"). +static std::pair getMask(Value *WideMask, unsigned Factor, + ElementCount LeafValueEC); + +static std::pair getMask(Value *WideMask, unsigned Factor, + VectorType *LeafValueTy) { return getMask(WideMask, Factor, LeafValueTy->getElementCount()); } @@ -385,25 +382,26 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load); Value *Mask = nullptr; - unsigned GapMaskFactor = Factor; + APInt GapMask(Factor, 0); if (LI) { + GapMask.setAllBits(); LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n"); } else { // Check mask operand. Handle both all-true/false and interleaved mask. - std::tie(Mask, GapMaskFactor) = getMask(getMaskOperand(II), Factor, VecTy); + std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor, VecTy); if (!Mask) return false; LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load or masked.load: " << *Load << "\n"); LLVM_DEBUG(dbgs() << "IA: With nominal factor " << Factor - << " and mask factor " << GapMaskFactor << "\n"); + << " and actual factor " << GapMask.popcount() << "\n"); } // Try to create target specific intrinsics to replace the load and // shuffles. if (!TLI->lowerInterleavedLoad(cast(Load), Mask, Shuffles, - Indices, Factor, GapMaskFactor)) + Indices, Factor, GapMask)) // If Extracts is not empty, tryReplaceExtracts made changes earlier. return !Extracts.empty() || BinOpShuffleChanged; @@ -540,19 +538,19 @@ bool InterleavedAccessImpl::lowerInterleavedStore( "number of stored element should be a multiple of Factor"); Value *Mask = nullptr; - unsigned GapMaskFactor = Factor; if (SI) { LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n"); } else { // Check mask operand. Handle both all-true/false and interleaved mask. unsigned LaneMaskLen = NumStoredElements / Factor; - std::tie(Mask, GapMaskFactor) = getMask( - getMaskOperand(II), Factor, ElementCount::getFixed(LaneMaskLen)); + APInt GapMask(Factor, 0); + std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor, + ElementCount::getFixed(LaneMaskLen)); if (!Mask) return false; - // We shouldn't transform stores even it has a gap mask. And since we might - // already change the IR, we're returning true here. - if (GapMaskFactor != Factor) + // We haven't supported gap mask for stores. Yet it is possible that we + // already changed the IR, hence returning true here. + if (GapMask.popcount() != Factor) return true; LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store or masked.store: " @@ -575,10 +573,9 @@ bool InterleavedAccessImpl::lowerInterleavedStore( // which case LeafMaskLen is 4). Such (wide) mask is also known as gap mask. // This helper function tries to detect this pattern and return the actual // factor we're accessing, which is 2 in this example. 
-static unsigned getGapMaskFactor(const Constant &MaskConst, unsigned Factor, - unsigned LeafMaskLen) { - APInt FactorMask(Factor, 0); - FactorMask.setAllBits(); +static void getGapMask(const Constant &MaskConst, unsigned Factor, + unsigned LeafMaskLen, APInt &GapMask) { + assert(GapMask.getBitWidth() == Factor); for (unsigned F = 0U; F < Factor; ++F) { bool AllZero = true; for (unsigned Idx = 0U; Idx < LeafMaskLen; ++Idx) { @@ -590,26 +587,21 @@ static unsigned getGapMaskFactor(const Constant &MaskConst, unsigned Factor, } // All mask bits on this field are zero, skipping it. if (AllZero) - FactorMask.clearBit(F); + GapMask.clearBit(F); } - // We currently only allow gaps in the "trailing" factors / fields. So - // given the original factor being 4, we can skip fields 2 and 3, but we - // cannot only skip fields 1 and 2. If FactorMask does not match such - // pattern, reset it. - if (!FactorMask.isMask()) - FactorMask.setAllBits(); - - return FactorMask.popcount(); } -static std::pair getMask(Value *WideMask, unsigned Factor, - ElementCount LeafValueEC) { +static std::pair getMask(Value *WideMask, unsigned Factor, + ElementCount LeafValueEC) { using namespace PatternMatch; + APInt GapMask(Factor, 0); + GapMask.setAllBits(); + if (auto *IMI = dyn_cast(WideMask)) { if (unsigned F = getInterleaveIntrinsicFactor(IMI->getIntrinsicID()); F && F == Factor && llvm::all_equal(IMI->args())) { - return {IMI->getArgOperand(0), Factor}; + return {IMI->getArgOperand(0), GapMask}; } } @@ -621,36 +613,34 @@ static std::pair getMask(Value *WideMask, unsigned Factor, LeafValueEC.isFixed()) { assert(!isa(AndMaskLHS) && "expect constants to be folded already"); - return {getMask(AndMaskLHS, Factor, LeafValueEC).first, - getGapMaskFactor(*AndMaskRHS, Factor, LeafValueEC.getFixedValue())}; + getGapMask(*AndMaskRHS, Factor, LeafValueEC.getFixedValue(), GapMask); + return {getMask(AndMaskLHS, Factor, LeafValueEC).first, GapMask}; } if (auto *ConstMask = dyn_cast(WideMask)) { if (auto *Splat = ConstMask->getSplatValue()) // All-ones or all-zeros mask. - return {ConstantVector::getSplat(LeafValueEC, Splat), Factor}; + return {ConstantVector::getSplat(LeafValueEC, Splat), GapMask}; if (LeafValueEC.isFixed()) { unsigned LeafMaskLen = LeafValueEC.getFixedValue(); // First, check if we use a gap mask to skip some of the factors / fields. - const unsigned GapMaskFactor = - getGapMaskFactor(*ConstMask, Factor, LeafMaskLen); - assert(GapMaskFactor <= Factor); + getGapMask(*ConstMask, Factor, LeafMaskLen, GapMask); SmallVector LeafMask(LeafMaskLen, nullptr); // If this is a fixed-length constant mask, each lane / leaf has to // use the same mask. This is done by checking if every group with Factor // number of elements in the interleaved mask has homogeneous values. 
for (unsigned Idx = 0U; Idx < LeafMaskLen * Factor; ++Idx) { - if (Idx % Factor >= GapMaskFactor) + if (!GapMask[Idx % Factor]) continue; Constant *C = ConstMask->getAggregateElement(Idx); if (LeafMask[Idx / Factor] && LeafMask[Idx / Factor] != C) - return {nullptr, Factor}; + return {nullptr, GapMask}; LeafMask[Idx / Factor] = C; } - return {ConstantVector::get(LeafMask), GapMaskFactor}; + return {ConstantVector::get(LeafMask), GapMask}; } } @@ -672,11 +662,11 @@ static std::pair getMask(Value *WideMask, unsigned Factor, IRBuilder<> Builder(SVI); return {Builder.CreateExtractVector(LeafMaskTy, SVI->getOperand(0), uint64_t(0)), - Factor}; + GapMask}; } } - return {nullptr, Factor}; + return {nullptr, GapMask}; } bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( @@ -707,12 +697,16 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( return false; // Check mask operand. Handle both all-true/false and interleaved mask. - unsigned GapMaskFactor; - std::tie(Mask, GapMaskFactor) = + APInt GapMask(Factor, 0); + std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI)); if (!Mask) return false; - assert(GapMaskFactor == Factor); + // We haven't supported gap mask if it's deinterleaving using intrinsics. + // Yet it is possible that we already changed the IR, hence returning true + // here. + if (GapMask.popcount() != Factor) + return true; LLVM_DEBUG(dbgs() << "IA: Found a vp.load or masked.load with deinterleave" << " intrinsic " << *DI << " and factor = " @@ -751,13 +745,16 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( II->getIntrinsicID() != Intrinsic::vp_store) return false; // Check mask operand. Handle both all-true/false and interleaved mask. - unsigned GapMaskFactor; - std::tie(Mask, GapMaskFactor) = + APInt GapMask(Factor, 0); + std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor, cast(InterleaveValues[0]->getType())); if (!Mask) return false; - assert(GapMaskFactor == Factor); + // We haven't supported gap mask if it's interleaving using intrinsics. Yet + // it is possible that we already changed the IR, hence returning true here. 
+ if (GapMask.popcount() != Factor) + return true; LLVM_DEBUG(dbgs() << "IA: Found a vp.store or masked.store with interleave" << " intrinsic " << *IntII << " and factor = " diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 632bb79fa02e4..c0d9a9320a6cd 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -17254,7 +17254,7 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor, /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool AArch64TargetLowering::lowerInterleavedLoad( Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, unsigned Factor, unsigned MaskFactor) const { + ArrayRef Indices, unsigned Factor, const APInt &GapMask) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); assert(!Shuffles.empty() && "Empty shufflevector input"); @@ -17265,9 +17265,7 @@ bool AArch64TargetLowering::lowerInterleavedLoad( if (!LI) return false; assert(!Mask && "Unexpected mask on a load"); - - if (Factor != MaskFactor) - return false; + assert(GapMask.popcount() == Factor && "Unexpected factor reduction"); const DataLayout &DL = LI->getDataLayout(); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index d0d6512d39015..e7ab408aeffe2 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -221,7 +221,7 @@ class AArch64TargetLowering : public TargetLowering { bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor, - unsigned MaskFactor) const override; + const APInt &GapMask) const override; bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor) const override; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index c087e32cd4787..b00241b7aea1f 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21599,7 +21599,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const { /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 bool ARMTargetLowering::lowerInterleavedLoad( Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, unsigned Factor, unsigned MaskFactor) const { + ArrayRef Indices, unsigned Factor, const APInt &GapMask) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); assert(!Shuffles.empty() && "Empty shufflevector input"); @@ -21610,9 +21610,7 @@ bool ARMTargetLowering::lowerInterleavedLoad( if (!LI) return false; assert(!Mask && "Unexpected mask on a load"); - - if (Factor != MaskFactor) - return false; + assert(GapMask.popcount() == Factor && "Unexpected factor reduction"); auto *VecTy = cast(Shuffles[0]->getType()); Type *EltTy = VecTy->getElementType(); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 670bbb62fe0f6..d3f99d70f1f7d 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -684,7 +684,7 @@ class VectorType; bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor, - unsigned MaskFactor) const override; + const APInt &GapMask) const override; bool lowerInterleavedStore(Instruction *Store, Value *Mask, 
ShuffleVectorInst *SVI, unsigned Factor) const override; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 4155f613f7f04..f615eb7a0eb9c 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -432,7 +432,7 @@ class RISCVTargetLowering : public TargetLowering { bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor, - unsigned MaskFactor) const override; + const APInt &GapMask) const override; bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index d4e6351ea6a51..c7b96f5c3d0c8 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -203,12 +203,14 @@ static bool getMemOperands(unsigned Factor, VectorType *VTy, Type *XLenTy, /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool RISCVTargetLowering::lowerInterleavedLoad( Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, unsigned Factor, unsigned MaskFactor) const { + ArrayRef Indices, unsigned Factor, const APInt &GapMask) const { assert(Indices.size() == Shuffles.size()); - assert(MaskFactor <= Factor); + assert(GapMask.getBitWidth() == Factor); - // TODO: Lower to strided load when MaskFactor = 1. - if (MaskFactor < 2) + // We only support cases where the skipped fields are the trailing ones. + // TODO: Lower to strided load if there is only a single active field. + unsigned MaskFactor = GapMask.popcount(); + if (MaskFactor < 2 || !GapMask.isMask()) return false; IRBuilder<> Builder(Load); diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 242d24b5faf60..3dd79b3249517 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1664,7 +1664,7 @@ namespace llvm { bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor, - unsigned MaskFactor) const override; + const APInt &GapMask) const override; /// Lower interleaved store(s) into target specific /// instructions/intrinsics. diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp index 52132a9d64b1a..ee2ab9f37023c 100644 --- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp +++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -802,7 +802,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { // Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX. bool X86TargetLowering::lowerInterleavedLoad( Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, unsigned Factor, unsigned MaskFactor) const { + ArrayRef Indices, unsigned Factor, const APInt &GapMask) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); assert(!Shuffles.empty() && "Empty shufflevector input"); @@ -813,9 +813,7 @@ bool X86TargetLowering::lowerInterleavedLoad( if (!LI) return false; assert(!Mask && "Unexpected mask on a load"); - - if (Factor != MaskFactor) - return false; + assert(GapMask.popcount() == Factor && "Unexpected factor reduction"); // Create an interleaved access group. 
From 1f5ab334d9e2bbd0d9046da9bcae4b2dfec09957 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu
Date: Tue, 12 Aug 2025 13:05:00 -0700
Subject: [PATCH 8/9] fixup! Split combined mask into another patch

---
 llvm/lib/CodeGen/InterleavedAccessPass.cpp    |  14 ---
 .../rvv/fixed-vectors-interleaved-access.ll   | 112 ++++++------------
 2 files changed, 38 insertions(+), 88 deletions(-)

diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 7956e02ac1fc7..f84d83f454286 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -593,8 +593,6 @@ static void getGapMask(const Constant &MaskConst, unsigned Factor,
 
 static std::pair getMask(Value *WideMask, unsigned Factor,
                          ElementCount LeafValueEC) {
-  using namespace PatternMatch;
-
   APInt GapMask(Factor, 0);
   GapMask.setAllBits();
 
@@ -605,18 +603,6 @@ static std::pair getMask(Value *WideMask, unsigned Factor,
     }
   }
 
-  // Try to match `and , `. The WideMask here is
-  // expected to be a fixed vector and gap mask should be a constant mask.
-  Value *AndMaskLHS;
-  Constant *AndMaskRHS;
-  if (match(WideMask, m_c_And(m_Value(AndMaskLHS), m_Constant(AndMaskRHS))) &&
-      LeafValueEC.isFixed()) {
-    assert(!isa(AndMaskLHS) &&
-           "expect constants to be folded already");
-    getGapMask(*AndMaskRHS, Factor, LeafValueEC.getFixedValue(), GapMask);
-    return {getMask(AndMaskLHS, Factor, LeafValueEC).first, GapMask};
-  }
-
   if (auto *ConstMask = dyn_cast(WideMask)) {
     if (auto *Splat = ConstMask->getSplatValue())
       // All-ones or all-zeros mask.
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index 2c738e5aeb55b..7d7ef3e4e2a4b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -367,24 +367,6 @@ define {<4 x i32>, <4 x i32>} @vpload_factor3_mask_skip_fields(ptr %ptr) {
   ret {<4 x i32>, <4 x i32>} %res1
 }
 
-define {<4 x i32>, <4 x i32>} @vpload_factor3_combined_mask_skip_field(ptr %ptr, <4 x i1> %mask) {
-; CHECK-LABEL: vpload_factor3_combined_mask_skip_field:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a1, 12
-; CHECK-NEXT:    vsetivli zero, 6, e32, m1, ta, ma
-; CHECK-NEXT:    vlsseg2e32.v v8, (a0), a1, v0.t
-; CHECK-NEXT:    ret
-  %interleaved.mask = shufflevector <4 x i1> %mask, <4 x i1> poison, <12 x i32>
-  %combined = and <12 x i1> %interleaved.mask,
-  %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> %combined, i32 12)
-  ; mask = %mask, skip the last field
-  %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32>
-  %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32>
-  %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
-  %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
-  ret {<4 x i32>, <4 x i32>} %res1
-}
-
 define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor4(ptr %ptr) {
 ; CHECK-LABEL: vpload_factor4:
 ; CHECK:       # %bb.0:
@@ -532,8 +514,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    lui a3, 12
 ; RV32-NEXT:    lui a6, 12291
-; RV32-NEXT:    lui a7, %hi(.LCPI26_0)
-; RV32-NEXT:    addi a7, a7, %lo(.LCPI26_0)
+; RV32-NEXT:    lui a7, %hi(.LCPI25_0)
+; RV32-NEXT:    addi a7, a7, %lo(.LCPI25_0)
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV32-NEXT:    vle32.v v24, (a5)
 ; RV32-NEXT:    vmv.s.x v0, a3
@@ -618,12 +600,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
 ; RV32-NEXT:    lui a7, 49164
-; RV32-NEXT:    lui a1, %hi(.LCPI26_1)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI26_1)
+; RV32-NEXT:    lui a1, %hi(.LCPI25_1)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI25_1)
 ; RV32-NEXT:    lui t2, 3
 ; RV32-NEXT:    lui t1, 196656
-; RV32-NEXT:    lui a4, %hi(.LCPI26_3)
-; RV32-NEXT:    addi a4, a4, %lo(.LCPI26_3)
+; RV32-NEXT:    lui a4, %hi(.LCPI25_3)
+; RV32-NEXT:    addi a4, a4, %lo(.LCPI25_3)
 ; RV32-NEXT:    lui t0, 786624
 ; RV32-NEXT:    li a5, 48
 ; RV32-NEXT:    lui a6, 768
@@ -802,8 +784,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vrgatherei16.vv v24, v8, v2
-; RV32-NEXT:    lui a1, %hi(.LCPI26_2)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI26_2)
+; RV32-NEXT:    lui a1, %hi(.LCPI25_2)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI25_2)
 ; RV32-NEXT:    lui a3, 3073
 ; RV32-NEXT:    addi a3, a3, -1024
 ; RV32-NEXT:    vmv.s.x v0, a3
@@ -867,16 +849,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vrgatherei16.vv v28, v8, v3
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
 ; RV32-NEXT:    vmv.v.v v28, v24
-; RV32-NEXT:    lui a1, %hi(.LCPI26_4)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI26_4)
-; RV32-NEXT:    lui a2, %hi(.LCPI26_5)
-; RV32-NEXT:    addi a2, a2, %lo(.LCPI26_5)
+; RV32-NEXT:    lui a1, %hi(.LCPI25_4)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI25_4)
+; RV32-NEXT:    lui a2, %hi(.LCPI25_5)
+; RV32-NEXT:    addi a2, a2, %lo(.LCPI25_5)
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV32-NEXT:    vle16.v v24, (a2)
 ; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RV32-NEXT:    vle16.v v8, (a1)
-; RV32-NEXT:    lui a1, %hi(.LCPI26_7)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI26_7)
+; RV32-NEXT:    lui a1, %hi(.LCPI25_7)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI25_7)
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle16.v v10, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
@@ -904,14 +886,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vl8r.v v0, (a1) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vrgatherei16.vv v16, v0, v10
-; RV32-NEXT:    lui a1, %hi(.LCPI26_6)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI26_6)
-; RV32-NEXT:    lui a2, %hi(.LCPI26_8)
-; RV32-NEXT:    addi a2, a2, %lo(.LCPI26_8)
+; RV32-NEXT:    lui a1, %hi(.LCPI25_6)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI25_6)
+; RV32-NEXT:    lui a2, %hi(.LCPI25_8)
+; RV32-NEXT:    addi a2, a2, %lo(.LCPI25_8)
 ; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RV32-NEXT:    vle16.v v4, (a1)
-; RV32-NEXT:    lui a1, %hi(.LCPI26_9)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI26_9)
+; RV32-NEXT:    lui a1, %hi(.LCPI25_9)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI25_9)
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV32-NEXT:    vle16.v v6, (a1)
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
@@ -998,8 +980,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    li a4, 128
 ; RV64-NEXT:    lui a1, 1
 ; RV64-NEXT:    vle64.v v8, (a3)
-; RV64-NEXT:    lui a3, %hi(.LCPI26_0)
-; RV64-NEXT:    addi a3, a3, %lo(.LCPI26_0)
+; RV64-NEXT:    lui a3, %hi(.LCPI25_0)
+; RV64-NEXT:    addi a3, a3, %lo(.LCPI25_0)
 ; RV64-NEXT:    vmv.s.x v0, a4
 ; RV64-NEXT:    csrr a4, vlenb
 ; RV64-NEXT:    li a5, 61
@@ -1187,8 +1169,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    vslideup.vi v12, v16, 1, v0.t
-; RV64-NEXT:    lui a2, %hi(.LCPI26_1)
-; RV64-NEXT:    addi a2, a2, %lo(.LCPI26_1)
+; RV64-NEXT:    lui a2, %hi(.LCPI25_1)
+; RV64-NEXT:    addi a2, a2, %lo(.LCPI25_1)
 ; RV64-NEXT:    li a3, 192
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV64-NEXT:    vle16.v v6, (a2)
@@ -1222,8 +1204,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vrgatherei16.vv v24, v16, v6
 ; RV64-NEXT:    addi a2, sp, 16
 ; RV64-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    lui a2, %hi(.LCPI26_2)
-; RV64-NEXT:    addi a2, a2, %lo(.LCPI26_2)
+; RV64-NEXT:    lui a2, %hi(.LCPI25_2)
+; RV64-NEXT:    addi a2, a2, %lo(.LCPI25_2)
 ; RV64-NEXT:    li a3, 1040
 ; RV64-NEXT:    vmv.s.x v0, a3
 ; RV64-NEXT:    addi a1, a1, -2016
@@ -1307,12 +1289,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
-; RV64-NEXT:    lui a1, %hi(.LCPI26_3)
-; RV64-NEXT:    addi a1, a1, %lo(.LCPI26_3)
+; RV64-NEXT:    lui a1, %hi(.LCPI25_3)
+; RV64-NEXT:    addi a1, a1, %lo(.LCPI25_3)
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV64-NEXT:    vle16.v v20, (a1)
-; RV64-NEXT:    lui a1, %hi(.LCPI26_4)
-; RV64-NEXT:    addi a1, a1, %lo(.LCPI26_4)
+; RV64-NEXT:    lui a1, %hi(.LCPI25_4)
+; RV64-NEXT:    addi a1, a1, %lo(.LCPI25_4)
 ; RV64-NEXT:    vle16.v v8, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 77
@@ -1363,8 +1345,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vl2r.v v8, (a1) # vscale x 16-byte Folded Reload
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vrgatherei16.vv v0, v16, v8
-; RV64-NEXT:    lui a1, %hi(.LCPI26_5)
-; RV64-NEXT:    addi a1, a1, %lo(.LCPI26_5)
+; RV64-NEXT:    lui a1, %hi(.LCPI25_5)
+; RV64-NEXT:    addi a1, a1, %lo(.LCPI25_5)
 ; RV64-NEXT:    vle16.v v20, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 61
@@ -1981,8 +1963,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) {
 ; RV32-NEXT:    vle32.v v12, (a0), v0.t
 ; RV32-NEXT:    li a0, 36
 ; RV32-NEXT:    vmv.s.x v20, a1
-; RV32-NEXT:    lui a1, %hi(.LCPI62_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI62_0)
+; RV32-NEXT:    lui a1, %hi(.LCPI61_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI61_0)
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vle16.v v21, (a1)
 ; RV32-NEXT:    vcompress.vm v8, v12, v11
@@ -2057,8 +2039,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) {
 ; RV32-NEXT:    vmv.s.x v10, a0
 ; RV32-NEXT:    li a0, 146
 ; RV32-NEXT:    vmv.s.x v11, a0
-; RV32-NEXT:    lui a0, %hi(.LCPI63_0)
-; RV32-NEXT:    addi a0, a0, %lo(.LCPI63_0)
+; RV32-NEXT:    lui a0, %hi(.LCPI62_0)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI62_0)
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vle16.v v20, (a0)
 ; RV32-NEXT:    li a0, 36
@@ -2199,24 +2181,6 @@ define {<4 x i32>, <4 x i32>} @maskedload_factor3_mask_skip_field(ptr %ptr) {
   ret {<4 x i32>, <4 x i32>} %res1
 }
 
-define {<4 x i32>, <4 x i32>} @maskedload_factor3_combined_mask_skip_field(ptr %ptr, <4 x i1> %mask) {
-; CHECK-LABEL: maskedload_factor3_combined_mask_skip_field:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a1, 12
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vlsseg2e32.v v8, (a0), a1, v0.t
-; CHECK-NEXT:    ret
-  %interleaved.mask = shufflevector <4 x i1> %mask, <4 x i1> poison, <12 x i32>
-  %combined = and <12 x i1> %interleaved.mask,
-  %interleaved.vec = tail call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %ptr, i32 4, <12 x i1> %combined, <12 x i32> poison)
-  ; mask = %mask, skip the last field
-  %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32>
-  %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32>
-  %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
-  %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
-  ret {<4 x i32>, <4 x i32>} %res1
-}
-
 ; We can only skip the last field for now.
 define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_invalid_skip_field(ptr %ptr) {
 ; RV32-LABEL: maskedload_factor3_invalid_skip_field:
@@ -2234,8 +2198,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_invalid_skip_field(
 ; RV32-NEXT:    vle32.v v12, (a0), v0.t
 ; RV32-NEXT:    li a0, 36
 ; RV32-NEXT:    vmv.s.x v20, a1
-; RV32-NEXT:    lui a1, %hi(.LCPI70_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI70_0)
+; RV32-NEXT:    lui a1, %hi(.LCPI68_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI68_0)
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vle16.v v21, (a1)
 ; RV32-NEXT:    vcompress.vm v8, v12, v11

From 705b7a6d4e6247e044c3b9eb4f8f1f7c424d9034 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu
Date: Tue, 12 Aug 2025 13:55:35 -0700
Subject: [PATCH 9/9] fixup! Address review comments

---
 llvm/include/llvm/CodeGen/TargetLowering.h      | 4 ++--
 llvm/lib/CodeGen/InterleavedAccessPass.cpp      | 6 ++----
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 3 +--
 llvm/lib/Target/ARM/ARMISelLowering.cpp         | 3 +--
 llvm/lib/Target/X86/X86InterleavedAccess.cpp    | 3 +--
 5 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 3d7d74593533b..8b50f2d7eef05 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3209,8 +3209,8 @@ class LLVM_ABI TargetLoweringBase {
   /// \p Shuffles is the shufflevector list to DE-interleave the loaded vector.
   /// \p Indices is the corresponding indices for each shufflevector.
   /// \p Factor is the interleave factor.
-  /// \p GapMask is a mask in which inactive lanes represent components / fields
-  /// that are always skipped.
+  /// \p GapMask is a mask with zeros for components / fields that may not be
+  /// accessed.
   virtual bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
                                     ArrayRef Shuffles,
                                     ArrayRef Indices, unsigned Factor,
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index f84d83f454286..a41a44df3f847 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -382,9 +382,8 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
   replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load);
 
   Value *Mask = nullptr;
-  APInt GapMask(Factor, 0);
+  auto GapMask = APInt::getAllOnes(Factor);
   if (LI) {
-    GapMask.setAllBits();
     LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n");
   } else {
     // Check mask operand. Handle both all-true/false and interleaved mask.
@@ -593,8 +592,7 @@ static void getGapMask(const Constant &MaskConst, unsigned Factor,
 
 static std::pair getMask(Value *WideMask, unsigned Factor,
                          ElementCount LeafValueEC) {
-  APInt GapMask(Factor, 0);
-  GapMask.setAllBits();
+  auto GapMask = APInt::getAllOnes(Factor);
 
   if (auto *IMI = dyn_cast(WideMask)) {
     if (unsigned F = getInterleaveIntrinsicFactor(IMI->getIntrinsicID());
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c0d9a9320a6cd..3aa6ae40e0da4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17264,8 +17264,7 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
   auto *LI = dyn_cast(Load);
   if (!LI)
     return false;
-  assert(!Mask && "Unexpected mask on a load");
-  assert(GapMask.popcount() == Factor && "Unexpected factor reduction");
+  assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
 
   const DataLayout &DL = LI->getDataLayout();
 
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index b00241b7aea1f..7c2a228f89750 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21609,8 +21609,7 @@ bool ARMTargetLowering::lowerInterleavedLoad(
   auto *LI = dyn_cast(Load);
   if (!LI)
     return false;
-  assert(!Mask && "Unexpected mask on a load");
-  assert(GapMask.popcount() == Factor && "Unexpected factor reduction");
+  assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
 
   auto *VecTy = cast(Shuffles[0]->getType());
   Type *EltTy = VecTy->getElementType();
diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index ee2ab9f37023c..632db7e4326e2 100644
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -812,8 +812,7 @@ bool X86TargetLowering::lowerInterleavedLoad(
   auto *LI = dyn_cast(Load);
   if (!LI)
     return false;
-  assert(!Mask && "Unexpected mask on a load");
-  assert(GapMask.popcount() == Factor && "Unexpected factor reduction");
+  assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
 
   // Create an interleaved access group.
   IRBuilder<> Builder(LI);
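
Taken together, the PATCH 9/9 hunks make the pipeline start from an all-ones gap
mask (APInt::getAllOnes(Factor)) and let targets that cannot skip fields assert
that every field is still active. The sketch below is not from the patch and only
assumes LLVM's APInt API; it spells out that default and the merged target-side
check.

  #include "llvm/ADT/APInt.h"
  #include <cassert>

  int main() {
    const unsigned Factor = 4;
    // Default seeded by the pass and by getMask(): no field is skipped.
    llvm::APInt GapMask = llvm::APInt::getAllOnes(Factor);
    // The GapMask half of the merged asserts in the AArch64/ARM/X86 paths:
    // a plain (non-masked) load must still access all Factor fields.
    assert(GapMask.popcount() == Factor);
    // Clearing a bit models a skipped field and would trip that assert.
    GapMask.clearBit(Factor - 1);
    assert(GapMask.popcount() == Factor - 1);
    return 0;
  }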