[WebAssembly] Add interleaved memory op cost for 4 x v4i32

sparker-arm · sparker-arm · commit 5ee601ac3c6b · 2025-07-29T09:26:24.000+01:00
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -248,8 +248,9 @@ InstructionCost WebAssemblyTTIImpl::getInterleavedMemoryOpCost(
     // A stride of two is commonly supported via dedicated instructions, so it
     // should be relatively cheap for all element sizes. A stride of four is
     // more expensive as it will likely require more shuffles. Using two
-    // simd128 inputs is considered more expensive and we don't currently
-    // account for shuffling than two inputs (32 bytes).
+    // simd128 inputs is considered more expensive. We mainly model shuffles of
+    // two inputs (32 bytes), but we also model 4 x v4i32 (four inputs, 64
+    // bytes) to enable arithmetic kernels.
     static const CostTblEntry ShuffleCostTbl[] = {
         // One reg.
         {2, MVT::v2i8, 1},  // interleave 2 x 2i8 into 4i8
@@ -273,6 +274,9 @@ InstructionCost WebAssemblyTTIImpl::getInterleavedMemoryOpCost(
         {4, MVT::v8i8, 16}, // interleave 4 x 8i8 into 32i8
         {4, MVT::v4i16, 8}, // interleave 4 x 4i16 into 16i16
         {4, MVT::v2i32, 4}, // interleave 4 x 2i32 into 8i32
+
+        // Four regs.
+        {4, MVT::v4i32, 16}, // interleave 4 x 4i32 into 16i32
     };
 
     EVT ETy = TLI->getValueType(DL, SubVecTy);
diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
@@ -1061,10 +1061,10 @@ define hidden void @eight_bytes_interleave_op(ptr noalias nocapture noundef writ
 ; CHECK: LV: Found an estimated cost of 14 for VF 2 For instruction: store i32
 ; CHECK: LV: Vector loop of width 2 costs: 44.
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %10 = load i8
-; CHECK: LV: Found an estimated cost of 36 for VF 4 For instruction: %17 = load i32
-; CHECK: LV: Found an estimated cost of 36 for VF 4 For instruction: store i32
-; CHECK: LV: Vector loop of width 4 costs: 32.
-; CHECK: LV: Selecting VF: 1.
+; CHECK: LV: Found an estimated cost of 24 for VF 4 For instruction: %17 = load i32
+; CHECK: LV: Found an estimated cost of 24 for VF 4 For instruction: store i32
+; CHECK: LV: Vector loop of width 4 costs: 26.
+; CHECK: LV: Selecting VF: 4.
 define hidden void @four_bytes_into_four_ints_same_op(ptr noalias nocapture noundef %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
   br i1 %5, label %6, label %7
@@ -1123,21 +1123,20 @@ define hidden void @four_bytes_into_four_ints_same_op(ptr noalias nocapture noun
   br i1 %50, label %6, label %7
 }
 
-; TODO: Should be able to vectorize?
 ; CHECK-LABEL: four_bytes_into_four_ints_vary_op
 ; CHECK: Cost of 14 for VF 2: INTERLEAVE-GROUP with factor 4
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
-; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 24 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: LV: Scalar loop costs: 21.
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %11 = zext i8
 ; CHECK: LV: Found an estimated cost of 14 for VF 2 For instruction: store i32
 ; CHECK: LV: Vector loop of width 2 costs: 35.
 ; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:  %11 = zext i8
-; CHECK: LV: Found an estimated cost of 36 for VF 4 For instruction: store i32
-; CHECK: LV: Vector loop of width 4 costs: 23.
-; CHECK: LV: Selecting VF: 1.
+; CHECK: LV: Found an estimated cost of 24 for VF 4 For instruction: store i32
+; CHECK: LV: Vector loop of width 4 costs: 20.
+; CHECK: LV: Selecting VF: 4.
 define hidden void @four_bytes_into_four_ints_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
   br i1 %5, label %6, label %7