@@ -165,12 +165,6 @@ InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
                                       CostKind);
   }
 
-  int ISD = TLI->InstructionOpcodeToISD(Opcode);
-  if (ISD != ISD::LOAD) {
-    return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
-                                  CostKind);
-  }
-
   EVT VT = TLI->getValueType(DL, Ty, true);
   // Type legalization can't handle structs
   if (VT == MVT::Other)
@@ -181,22 +175,117 @@ InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
   if (!LT.first.isValid())
     return InstructionCost::getInvalid();
 
-  // 128-bit loads are a single instruction. 32-bit and 64-bit vector loads can
-  // be lowered to load32_zero and load64_zero respectively. Assume SIMD loads
-  // are twice as expensive as scalar.
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
   unsigned width = VT.getSizeInBits();
-  switch (width) {
-  default:
-    break;
-  case 32:
-  case 64:
-  case 128:
-    return 2;
+  if (ISD == ISD::LOAD) {
+    // 128-bit loads are a single instruction. 32-bit and 64-bit vector loads
+    // can be lowered to load32_zero and load64_zero respectively. Assume SIMD
+    // loads are twice as expensive as scalar.
+    switch (width) {
+    default:
+      break;
+    case 32:
+    case 64:
+    case 128:
+      return 2;
+    }
+  } else if (ISD == ISD::STORE) {
+    // For stores, we can use store lane operations.
+    switch (width) {
+    default:
+      break;
+    case 8:
+    case 16:
+    case 32:
+    case 64:
+    case 128:
+      return 2;
+    }
   }
 
   return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, CostKind);
 }
 
+InstructionCost WebAssemblyTTIImpl::getInterleavedMemoryOpCost(
+    unsigned Opcode, Type *Ty, unsigned Factor, ArrayRef<unsigned> Indices,
+    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
+    bool UseMaskForCond, bool UseMaskForGaps) const {
+  assert(Factor >= 2 && "Invalid interleave factor");
+
+  auto *VecTy = cast<VectorType>(Ty);
+  if (!ST->hasSIMD128() || !isa<FixedVectorType>(VecTy)) {
+    return InstructionCost::getInvalid();
+  }
+
+  if (UseMaskForCond || UseMaskForGaps)
+    return BaseT::getInterleavedMemoryOpCost(Opcode, Ty, Factor, Indices,
+                                             Alignment, AddressSpace, CostKind,
+                                             UseMaskForCond, UseMaskForGaps);
+
+  constexpr unsigned MaxInterleaveFactor = 4;
+  if (Factor <= MaxInterleaveFactor) {
+    unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
+    // Ensure the number of vector elements is greater than 1.
+    if (MinElts < 2 || MinElts % Factor != 0)
+      return InstructionCost::getInvalid();
+
+    unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
+    // Ensure the element type is legal.
+    if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
+      return InstructionCost::getInvalid();
+
+    auto *SubVecTy =
+        VectorType::get(VecTy->getElementType(),
+                        VecTy->getElementCount().divideCoefficientBy(Factor));
+    InstructionCost MemCost =
+        getMemoryOpCost(Opcode, SubVecTy, Alignment, AddressSpace, CostKind);
+
+    unsigned VecSize = DL.getTypeSizeInBits(SubVecTy);
+    unsigned MaxVecSize = 128;
+    unsigned NumAccesses =
+        std::max<unsigned>(1, (MinElts * ElSize + MaxVecSize - 1) / VecSize);
+
+    // A stride of two is commonly supported via dedicated instructions, so it
+    // should be relatively cheap for all element sizes. A stride of four is
+    // more expensive as it will likely require more shuffles. Using two
+    // simd128 inputs is considered more expensive, and we don't currently
+    // account for shuffling more than two inputs (32 bytes).
+    static const CostTblEntry ShuffleCostTbl[] = {
+        // One reg.
+        {2, MVT::v2i8, 1},  // interleave 2 x 2i8 into 4i8
+        {2, MVT::v4i8, 1},  // interleave 2 x 4i8 into 8i8
+        {2, MVT::v8i8, 1},  // interleave 2 x 8i8 into 16i8
+        {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16
+        {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16
+        {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32
+
+        // Two regs.
+        {2, MVT::v16i8, 2}, // interleave 2 x 16i8 into 32i8
+        {2, MVT::v8i16, 2}, // interleave 2 x 8i16 into 16i16
+        {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32
+
+        // One reg.
+        {4, MVT::v2i8, 4},  // interleave 4 x 2i8 into 8i8
+        {4, MVT::v4i8, 4},  // interleave 4 x 4i8 into 16i8
+        {4, MVT::v2i16, 4}, // interleave 4 x 2i16 into 8i16
+
+        // Two regs.
+        {4, MVT::v8i8, 16}, // interleave 4 x 8i8 into 32i8
+        {4, MVT::v4i16, 8}, // interleave 4 x 4i16 into 16i16
+        {4, MVT::v2i32, 4}, // interleave 4 x 2i32 into 8i32
+    };
+
+    EVT ETy = TLI->getValueType(DL, SubVecTy);
+    if (const auto *Entry =
+            CostTableLookup(ShuffleCostTbl, Factor, ETy.getSimpleVT()))
+      return Entry->Cost + (NumAccesses * MemCost);
+  }
+
+  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                           Alignment, AddressSpace, CostKind,
+                                           UseMaskForCond, UseMaskForGaps);
+}
+
 InstructionCost WebAssemblyTTIImpl::getVectorInstrCost(
     unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
     const Value *Op0, const Value *Op1) const {
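
For a concrete sense of the numbers the new hook produces: a factor-2 interleaved load of <8 x i32> splits into SubVecTy = <4 x i32>, whose 128-bit SIMD load costs 2 per the load rule above; NumAccesses = max(1, (8 * 32 + 127) / 128) = 2; and the table entry {2, MVT::v4i32, 2} adds a shuffle cost of 2, for a total of 2 + 2 * 2 = 6. The snippet below is a minimal standalone sketch of that arithmetic only, not the LLVM API; the function and parameter names are illustrative.

#include <algorithm>
#include <cassert>
#include <cstdio>

// Mirrors Entry->Cost + NumAccesses * MemCost from the patch, with
// NumAccesses = max(1, (MinElts * ElSize + MaxVecSize - 1) / VecSize).
unsigned interleavedCost(unsigned MinElts, unsigned ElSizeBits, unsigned Factor,
                         unsigned SubVecMemCost, unsigned ShuffleTblCost) {
  assert(Factor >= 2 && MinElts % Factor == 0 && "invalid query");
  const unsigned MaxVecSize = 128;                       // one simd128 register
  unsigned SubVecBits = (MinElts / Factor) * ElSizeBits; // bits per sub-vector
  unsigned NumAccesses =
      std::max(1u, (MinElts * ElSizeBits + MaxVecSize - 1) / SubVecBits);
  return ShuffleTblCost + NumAccesses * SubVecMemCost;
}

int main() {
  // Factor-2 interleave of <8 x i32>: sub-vector load cost 2 and shuffle
  // table cost 2 ({2, MVT::v4i32, 2}) give 2 + 2 * 2 = 6.
  std::printf("%u\n", interleavedCost(8, 32, 2, 2, 2));
  return 0;
}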