@@ -150,12 +150,6 @@ InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
                                   CostKind);
   }

-  int ISD = TLI->InstructionOpcodeToISD(Opcode);
-  if (ISD != ISD::LOAD) {
-    return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
-                                  CostKind);
-  }
-
   EVT VT = TLI->getValueType(DL, Ty, true);
   // Type legalization can't handle structs
   if (VT == MVT::Other)
@@ -166,22 +160,117 @@ InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
   if (!LT.first.isValid())
     return InstructionCost::getInvalid();

-  // 128-bit loads are a single instruction. 32-bit and 64-bit vector loads can
-  // be lowered to load32_zero and load64_zero respectively. Assume SIMD loads
-  // are twice as expensive as scalar.
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
   unsigned width = VT.getSizeInBits();
-  switch (width) {
-  default:
-    break;
-  case 32:
-  case 64:
-  case 128:
-    return 2;
+  if (ISD == ISD::LOAD) {
+    // 128-bit loads are a single instruction. 32-bit and 64-bit vector loads
+    // can be lowered to load32_zero and load64_zero respectively. Assume SIMD
+    // loads are twice as expensive as scalar.
+    switch (width) {
+    default:
+      break;
+    case 32:
+    case 64:
+    case 128:
+      return 2;
+    }
+  } else if (ISD == ISD::STORE) {
+    // For stores, we can use store lane operations.
+    switch (width) {
+    default:
+      break;
+    case 8:
+    case 16:
+    case 32:
+    case 64:
+    case 128:
+      return 2;
+    }
   }

   return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, CostKind);
 }

+InstructionCost WebAssemblyTTIImpl::getInterleavedMemoryOpCost(
+    unsigned Opcode, Type *Ty, unsigned Factor, ArrayRef<unsigned> Indices,
+    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
+    bool UseMaskForCond, bool UseMaskForGaps) const {
+  assert(Factor >= 2 && "Invalid interleave factor");
+
+  auto *VecTy = cast<VectorType>(Ty);
+  if (!ST->hasSIMD128() || !isa<FixedVectorType>(VecTy)) {
+    return InstructionCost::getInvalid();
+  }
+
+  if (UseMaskForCond || UseMaskForGaps)
+    return BaseT::getInterleavedMemoryOpCost(Opcode, Ty, Factor, Indices,
+                                             Alignment, AddressSpace, CostKind,
+                                             UseMaskForCond, UseMaskForGaps);
+
+  constexpr unsigned MaxInterleaveFactor = 4;
+  if (Factor <= MaxInterleaveFactor) {
+    unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
+    // Ensure the number of vector elements is greater than 1.
+    if (MinElts < 2 || MinElts % Factor != 0)
+      return InstructionCost::getInvalid();
+
+    unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
+    // Ensure the element type is legal.
+    if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
+      return InstructionCost::getInvalid();
+
+    auto *SubVecTy =
+        VectorType::get(VecTy->getElementType(),
+                        VecTy->getElementCount().divideCoefficientBy(Factor));
+    InstructionCost MemCost =
+        getMemoryOpCost(Opcode, SubVecTy, Alignment, AddressSpace, CostKind);
+
+    unsigned VecSize = DL.getTypeSizeInBits(SubVecTy);
+    unsigned MaxVecSize = 128;
+    unsigned NumAccesses =
+        std::max<unsigned>(1, (MinElts * ElSize + MaxVecSize - 1) / VecSize);
+
+    // A stride of two is commonly supported via dedicated instructions, so it
+    // should be relatively cheap for all element sizes. A stride of four is
+    // more expensive as it will likely require more shuffles. Using two
+    // simd128 inputs is considered more expensive and we don't currently
+    // account for shuffling more than two inputs (32 bytes).
+    static const CostTblEntry ShuffleCostTbl[] = {
+        // One reg.
+        {2, MVT::v2i8, 1},  // interleave 2 x 2i8 into 4i8
+        {2, MVT::v4i8, 1},  // interleave 2 x 4i8 into 8i8
+        {2, MVT::v8i8, 1},  // interleave 2 x 8i8 into 16i8
+        {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16
+        {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16
+        {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32
+
+        // Two regs.
+        {2, MVT::v16i8, 2}, // interleave 2 x 16i8 into 32i8
+        {2, MVT::v8i16, 2}, // interleave 2 x 8i16 into 16i16
+        {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32
+
+        // One reg.
+        {4, MVT::v2i8, 4},  // interleave 4 x 2i8 into 8i8
+        {4, MVT::v4i8, 4},  // interleave 4 x 4i8 into 16i8
+        {4, MVT::v2i16, 4}, // interleave 4 x 2i16 into 8i16
+
+        // Two regs.
+        {4, MVT::v8i8, 16}, // interleave 4 x 8i8 into 32i8
+        {4, MVT::v4i16, 8}, // interleave 4 x 4i16 into 16i16
+        {4, MVT::v2i32, 4}, // interleave 4 x 2i32 into 8i32
+    };
+
+    EVT ETy = TLI->getValueType(DL, SubVecTy);
+    if (const auto *Entry =
+            CostTableLookup(ShuffleCostTbl, Factor, ETy.getSimpleVT()))
+      return Entry->Cost + (NumAccesses * MemCost);
+  }
+
+  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                           Alignment, AddressSpace, CostKind,
+                                           UseMaskForCond, UseMaskForGaps);
+}
+
 InstructionCost WebAssemblyTTIImpl::getVectorInstrCost(
     unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
     const Value *Op0, const Value *Op1) const {
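
For reference, the cost returned by the new getInterleavedMemoryOpCost path is Entry->Cost + NumAccesses * MemCost. Below is a minimal standalone sketch (not part of the patch) that mirrors that formula with plain integers so the numbers can be checked by hand; the function name, parameters, and the hard-coded inputs in main are illustrative assumptions only.

// Standalone sketch of the interleaved-access cost arithmetic used above,
// with LLVM types replaced by plain integers. Illustrative only.
#include <algorithm>
#include <cassert>
#include <cstdio>

// Mirrors: Entry->Cost + (NumAccesses * MemCost) for one interleaved group.
unsigned interleavedCost(unsigned Factor, unsigned MinElts, unsigned ElSizeBits,
                         unsigned SubVecMemCost, unsigned ShuffleCost) {
  assert(Factor >= 2 && MinElts % Factor == 0 && "invalid group shape");
  const unsigned MaxVecSize = 128; // one simd128 register, in bits
  const unsigned SubVecSize = (MinElts / Factor) * ElSizeBits;
  const unsigned NumAccesses =
      std::max(1u, (MinElts * ElSizeBits + MaxVecSize - 1) / SubVecSize);
  return ShuffleCost + NumAccesses * SubVecMemCost;
}

int main() {
  // A stride-2 group of i32 accesses with VF = 4: VecTy = <8 x i32>,
  // SubVecTy = <4 x i32>. In the patch getMemoryOpCost(<4 x i32>) is 2,
  // NumAccesses is (8 * 32 + 127) / 128 = 2, and the {2, MVT::v4i32, 2}
  // table entry contributes 2, giving 2 + 2 * 2 = 6.
  std::printf("v4i32 stride-2 cost = %u\n", interleavedCost(2, 8, 32, 2, 2));
  return 0;
}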