Skip to content

Commit abe6d36

Browse files
committed
Added support for 4xv4i32
1 parent df2a5e9 commit abe6d36

File tree

2 files changed

+14
-11
lines changed

2 files changed

+14
-11
lines changed

llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -233,8 +233,9 @@ InstructionCost WebAssemblyTTIImpl::getInterleavedMemoryOpCost(
233233
// A stride of two is commonly supported via dedicated instructions, so it
234234
// should be relatively cheap for all element sizes. A stride of four is
235235
// more expensive as it will likely require more shuffles. Using two
236-
// simd128 inputs is considered more expensive and we don't currently
237-
// account for shuffling than two inputs (32 bytes).
236+
// simd128 inputs is considered more expensive. We mainly account for
237+
// shuffling up to two inputs (32 bytes); 4 x v4i32 (four inputs, 64 bytes)
238+
// is the one larger case we model, to enable arithmetic kernels.
238239
static const CostTblEntry ShuffleCostTbl[] = {
239240
// One reg.
240241
{2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8
@@ -258,6 +259,9 @@ InstructionCost WebAssemblyTTIImpl::getInterleavedMemoryOpCost(
258259
{4, MVT::v8i8, 16}, // interleave 4 x 8i8 into 32i8
259260
{4, MVT::v4i16, 8}, // interleave 4 x 4i16 into 16i16
260261
{4, MVT::v2i32, 4}, // interleave 4 x 2i32 into 8i32
262+
263+
// Four regs.
264+
{4, MVT::v4i32, 16}, // interleave 4 x 4i32 into 16i32
261265
};
262266

263267
EVT ETy = TLI->getValueType(DL, SubVecTy);

llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll

Lines changed: 8 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1061,10 +1061,10 @@ define hidden void @eight_bytes_interleave_op(ptr noalias nocapture noundef writ
10611061
; CHECK: LV: Found an estimated cost of 14 for VF 2 For instruction: store i32
10621062
; CHECK: LV: Vector loop of width 2 costs: 44.
10631063
; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %10 = load i8
1064-
; CHECK: LV: Found an estimated cost of 36 for VF 4 For instruction: %17 = load i32
1065-
; CHECK: LV: Found an estimated cost of 36 for VF 4 For instruction: store i32
1066-
; CHECK: LV: Vector loop of width 4 costs: 32.
1067-
; CHECK: LV: Selecting VF: 1.
1064+
; CHECK: LV: Found an estimated cost of 24 for VF 4 For instruction: %17 = load i32
1065+
; CHECK: LV: Found an estimated cost of 24 for VF 4 For instruction: store i32
1066+
; CHECK: LV: Vector loop of width 4 costs: 26.
1067+
; CHECK: LV: Selecting VF: 4.
10681068
define hidden void @four_bytes_into_four_ints_same_op(ptr noalias nocapture noundef %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
10691069
%5 = icmp eq i32 %3, 0
10701070
br i1 %5, label %6, label %7
@@ -1123,21 +1123,20 @@ define hidden void @four_bytes_into_four_ints_same_op(ptr noalias nocapture noun
11231123
br i1 %50, label %6, label %7
11241124
}
11251125

1126-
; TODO: Should be able to vectorize?
11271126
; CHECK-LABEL: four_bytes_into_four_ints_vary_op
11281127
; CHECK: Cost of 14 for VF 2: INTERLEAVE-GROUP with factor 4
11291128
; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
1130-
; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
1129+
; CHECK: Cost of 24 for VF 4: INTERLEAVE-GROUP with factor 4
11311130
; CHECK: LV: Scalar loop costs: 21.
11321131
; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %10 = load i8
11331132
; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %11 = zext i8
11341133
; CHECK: LV: Found an estimated cost of 14 for VF 2 For instruction: store i32
11351134
; CHECK: LV: Vector loop of width 2 costs: 35.
11361135
; CHECK: LV: Found an estimated cost of 18 for VF 4 For instruction: %10 = load i8
11371136
; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction: %11 = zext i8
1138-
; CHECK: LV: Found an estimated cost of 36 for VF 4 For instruction: store i32
1139-
; CHECK: LV: Vector loop of width 4 costs: 23.
1140-
; CHECK: LV: Selecting VF: 1.
1137+
; CHECK: LV: Found an estimated cost of 24 for VF 4 For instruction: store i32
1138+
; CHECK: LV: Vector loop of width 4 costs: 20.
1139+
; CHECK: LV: Selecting VF: 4.
11411140
define hidden void @four_bytes_into_four_ints_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
11421141
%5 = icmp eq i32 %3, 0
11431142
br i1 %5, label %6, label %7

0 commit comments

Comments
 (0)