Skip to content

Commit 5ee601a

Browse files
committed
Added interleave shuffle cost support for 4 x v4i32
1 parent 15ab3c4 commit 5ee601a

File tree

2 files changed

+14
-11
lines changed

2 files changed

+14
-11
lines changed

llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -248,8 +248,9 @@ InstructionCost WebAssemblyTTIImpl::getInterleavedMemoryOpCost(
248248
// A stride of two is commonly supported via dedicated instructions, so it
249249
// should be relatively cheap for all element sizes. A stride of four is
250250
// more expensive as it will likely require more shuffles. Using two
251-
// simd128 inputs is considered more expensive and we don't currently
252-
// account for shuffling more than two inputs (32 bytes).
251+
// simd128 inputs is considered more expensive and we mainly account for
252+
// shuffling two inputs (32 bytes), but we do model 4 x v4i32 to enable
253+
// arithmetic kernels.
253254
static const CostTblEntry ShuffleCostTbl[] = {
254255
// One reg.
255256
{2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8
@@ -273,6 +274,9 @@ InstructionCost WebAssemblyTTIImpl::getInterleavedMemoryOpCost(
273274
{4, MVT::v8i8, 16}, // interleave 4 x 8i8 into 32i8
274275
{4, MVT::v4i16, 8}, // interleave 4 x 4i16 into 16i16
275276
{4, MVT::v2i32, 4}, // interleave 4 x 2i32 into 8i32
277+
278+
// Four regs.
279+
{4, MVT::v4i32, 16}, // interleave 4 x 4i32 into 16i32
276280
};
277281

278282
EVT ETy = TLI->getValueType(DL, SubVecTy);

llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1061,10 +1061,10 @@ define hidden void @eight_bytes_interleave_op(ptr noalias nocapture noundef writ
10611061
; CHECK: LV: Found an estimated cost of 14 for VF 2 For instruction: store i32
10621062
; CHECK: LV: Vector loop of width 2 costs: 44.
10631063
; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %10 = load i8
1064-
; CHECK: LV: Found an estimated cost of 36 for VF 4 For instruction: %17 = load i32
1065-
; CHECK: LV: Found an estimated cost of 36 for VF 4 For instruction: store i32
1066-
; CHECK: LV: Vector loop of width 4 costs: 32.
1067-
; CHECK: LV: Selecting VF: 1.
1064+
; CHECK: LV: Found an estimated cost of 24 for VF 4 For instruction: %17 = load i32
1065+
; CHECK: LV: Found an estimated cost of 24 for VF 4 For instruction: store i32
1066+
; CHECK: LV: Vector loop of width 4 costs: 26.
1067+
; CHECK: LV: Selecting VF: 4.
10681068
define hidden void @four_bytes_into_four_ints_same_op(ptr noalias nocapture noundef %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
10691069
%5 = icmp eq i32 %3, 0
10701070
br i1 %5, label %6, label %7
@@ -1123,21 +1123,20 @@ define hidden void @four_bytes_into_four_ints_same_op(ptr noalias nocapture noun
11231123
br i1 %50, label %6, label %7
11241124
}
11251125

1126-
; TODO: Should be able to vectorize?
11271126
; CHECK-LABEL: four_bytes_into_four_ints_vary_op
11281127
; CHECK: Cost of 14 for VF 2: INTERLEAVE-GROUP with factor 4
11291128
; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
1130-
; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
1129+
; CHECK: Cost of 24 for VF 4: INTERLEAVE-GROUP with factor 4
11311130
; CHECK: LV: Scalar loop costs: 21.
11321131
; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %10 = load i8
11331132
; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %11 = zext i8
11341133
; CHECK: LV: Found an estimated cost of 14 for VF 2 For instruction: store i32
11351134
; CHECK: LV: Vector loop of width 2 costs: 35.
11361135
; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %10 = load i8
11371136
; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction: %11 = zext i8
1138-
; CHECK: LV: Found an estimated cost of 36 for VF 4 For instruction: store i32
1139-
; CHECK: LV: Vector loop of width 4 costs: 23.
1140-
; CHECK: LV: Selecting VF: 1.
1137+
; CHECK: LV: Found an estimated cost of 24 for VF 4 For instruction: store i32
1138+
; CHECK: LV: Vector loop of width 4 costs: 20.
1139+
; CHECK: LV: Selecting VF: 4.
11411140
define hidden void @four_bytes_into_four_ints_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
11421141
%5 = icmp eq i32 %3, 0
11431142
br i1 %5, label %6, label %7

0 commit comments

Comments
 (0)