
Commit dbd9eae

preames and topperc authored
[IA] Support vp.store in lowerInterleavedStore (#149605)
Follow up to 28417e6, and the whole line of work started with 4b81dc7.

This change merges the handling for VPStore - currently in lowerInterleavedVPStore - into the existing dedicated routine used in the shuffle lowering path. This removes the last use of the dedicated lowerInterleavedVPStore, and thus we can remove it.

This contains two functional changes. First, as in 28417e6, merging support for vp.store exposes the strided store optimization to code using vp.store. Second, the strided store case had a significant missed optimization: we were performing the strided store at the full unit-strided store type width (i.e. LMUL) rather than reducing it to match the input width. This became obvious when I tried to use the mask created by the helper routine, as it caused a type incompatibility.

Normally, I'd try not to include an optimization in an API rework, but structuring the code to both be correct for vp.store and not optimize the existing case turned out to be more involved than seemed worthwhile. I could pull this part out as a pre-change, but it's a bit awkward on its own, as it turns out to be somewhat of a half step on the possible optimization; the full optimization is complex with the old code structure.

---------

Co-authored-by: Craig Topper <[email protected]>
1 parent 5edb845 commit dbd9eae
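Condensed, the merged lowering path in InterleavedAccessPass.cpp now looks like the sketch below (taken from the diff further down; surrounding legality checks and debug output omitted). A plain store is lowered unconditionally with a null mask, while a vp.store first derives a per-segment mask from its mask parameter.

  Value *Mask = nullptr;
  if (auto *VPStore = dyn_cast<VPIntrinsic>(Store)) {
    // vp.store: reduce the full-width mask to one lane per segment.
    unsigned LaneMaskLen = NumStoredElements / Factor;
    Mask = getMask(VPStore->getMaskParam(), Factor,
                   ElementCount::getFixed(LaneMaskLen));
    if (!Mask)
      return false;
  }
  // One hook now serves both store kinds; Mask == nullptr means unconditional.
  if (!TLI->lowerInterleavedStore(Store, Mask, SVI, Factor))
    return false;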

11 files changed: +74, -170 lines

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 7 additions & 13 deletions
@@ -3219,25 +3219,19 @@ class LLVM_ABI TargetLoweringBase {
   /// Lower an interleaved store to target specific intrinsics. Return
   /// true on success.
   ///
-  /// \p SI is the vector store instruction.
+  /// \p SI is the vector store instruction. Can be either a plain store
+  /// or a vp.store.
+  /// \p Mask is a per-segment (i.e. number of lanes equal to that of one
+  /// component being interwoven) mask. Can be nullptr, in which case the
+  /// result is unconditional.
   /// \p SVI is the shufflevector to RE-interleave the stored vector.
   /// \p Factor is the interleave factor.
-  virtual bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+  virtual bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+                                     ShuffleVectorInst *SVI,
                                      unsigned Factor) const {
     return false;
   }
 
-  /// Lower an interleaved store to target specific intrinsics. Return
-  /// true on success.
-  ///
-  /// \p Store is the vp.store instruction.
-  /// \p Mask is a mask value
-  /// \p InterleaveOps is a list of values being interleaved.
-  virtual bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
-                                       ArrayRef<Value *> InterleaveOps) const {
-    return false;
-  }
-
   /// Lower a deinterleave intrinsic to a target specific load intrinsic.
   /// Return true on success. Currently only supports
   /// llvm.vector.deinterleave{2,3,5,7}
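Targets that only support plain interleaved stores are expected to adopt the widened signature with the dyn_cast-and-bail pattern used by the AArch64, ARM, and X86 changes below. A minimal sketch, assuming a hypothetical MyTargetLowering (placeholder name, not part of this patch):

bool MyTargetLowering::lowerInterleavedStore(Instruction *Store, Value *Mask,
                                             ShuffleVectorInst *SVI,
                                             unsigned Factor) const {
  // No masked segment store support: bail out on vp.store; the pass never
  // passes a mask for a plain StoreInst.
  auto *SI = dyn_cast<StoreInst>(Store);
  if (!SI)
    return false;
  assert(!Mask && "Unexpected mask on store");
  // ... existing StoreInst-based lowering continues unchanged ...
  return true;
}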

llvm/lib/CodeGen/InterleavedAccessPass.cpp

Lines changed: 9 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -507,46 +507,26 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
507507
assert(NumStoredElements % Factor == 0 &&
508508
"number of stored element should be a multiple of Factor");
509509

510+
Value *Mask = nullptr;
510511
if (auto *VPStore = dyn_cast<VPIntrinsic>(Store)) {
511512
unsigned LaneMaskLen = NumStoredElements / Factor;
512-
Value *LaneMask = getMask(VPStore->getMaskParam(), Factor,
513-
ElementCount::getFixed(LaneMaskLen));
514-
if (!LaneMask)
513+
Mask = getMask(VPStore->getMaskParam(), Factor,
514+
ElementCount::getFixed(LaneMaskLen));
515+
if (!Mask)
515516
return false;
516517

517518
LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store: " << *Store
518519
<< "\n");
519520

520-
IRBuilder<> Builder(VPStore);
521-
// We need to effectively de-interleave the shufflemask
522-
// because lowerInterleavedVPStore expects individual de-interleaved
523-
// values.
524-
SmallVector<Value *, 10> NewShuffles;
525-
SmallVector<int, 16> NewShuffleMask(LaneMaskLen);
526-
auto ShuffleMask = SVI->getShuffleMask();
527-
528-
for (unsigned i = 0; i < Factor; i++) {
529-
for (unsigned j = 0; j < LaneMaskLen; j++)
530-
NewShuffleMask[j] = ShuffleMask[i + Factor * j];
531-
532-
NewShuffles.push_back(Builder.CreateShuffleVector(
533-
SVI->getOperand(0), SVI->getOperand(1), NewShuffleMask));
534-
}
535-
536-
// Try to create target specific intrinsics to replace the vp.store and
537-
// shuffle.
538-
if (!TLI->lowerInterleavedVPStore(VPStore, LaneMask, NewShuffles))
539-
// We already created new shuffles.
540-
return true;
541521
} else {
542522
LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n");
543-
544-
// Try to create target specific intrinsics to replace the store and
545-
// shuffle.
546-
if (!TLI->lowerInterleavedStore(cast<StoreInst>(Store), SVI, Factor))
547-
return false;
548523
}
549524

525+
// Try to create target specific intrinsics to replace the store and
526+
// shuffle.
527+
if (!TLI->lowerInterleavedStore(Store, Mask, SVI, Factor))
528+
return false;
529+
550530
// Already have a new target specific interleaved store. Erase the old store.
551531
DeadInsts.insert(Store);
552532
DeadInsts.insert(SVI);

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 6 additions & 1 deletion
@@ -17343,12 +17343,17 @@ bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
 /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
 /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
 /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
-bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
+                                                  Value *LaneMask,
                                                   ShuffleVectorInst *SVI,
                                                   unsigned Factor) const {
 
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");
+  auto *SI = dyn_cast<StoreInst>(Store);
+  if (!SI)
+    return false;
+  assert(!LaneMask && "Unexpected mask on store");
 
   auto *VecTy = cast<FixedVectorType>(SVI->getType());
   assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 2 additions & 1 deletion
@@ -215,7 +215,8 @@ class AArch64TargetLowering : public TargetLowering {
                             ArrayRef<ShuffleVectorInst *> Shuffles,
                             ArrayRef<unsigned> Indices,
                             unsigned Factor) const override;
-  bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+  bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+                             ShuffleVectorInst *SVI,
                              unsigned Factor) const override;
 
   bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 6 additions & 1 deletion
@@ -21731,11 +21731,16 @@ bool ARMTargetLowering::lowerInterleavedLoad(
 /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
 /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
 /// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
-bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool ARMTargetLowering::lowerInterleavedStore(Instruction *Store,
+                                              Value *LaneMask,
                                               ShuffleVectorInst *SVI,
                                               unsigned Factor) const {
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");
+  auto *SI = dyn_cast<StoreInst>(Store);
+  if (!SI)
+    return false;
+  assert(!LaneMask && "Unexpected mask on store");
 
   auto *VecTy = cast<FixedVectorType>(SVI->getType());
   assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");

llvm/lib/Target/ARM/ARMISelLowering.h

Lines changed: 2 additions & 1 deletion
@@ -685,7 +685,8 @@ class VectorType;
                                ArrayRef<ShuffleVectorInst *> Shuffles,
                                ArrayRef<unsigned> Indices,
                                unsigned Factor) const override;
-    bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+    bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+                               ShuffleVectorInst *SVI,
                                unsigned Factor) const override;
 
     bool shouldInsertFencesForAtomic(const Instruction *I) const override;

llvm/lib/Target/RISCV/RISCVISelLowering.h

Lines changed: 2 additions & 4 deletions
@@ -434,7 +434,8 @@ class RISCVTargetLowering : public TargetLowering {
                             ArrayRef<unsigned> Indices,
                             unsigned Factor) const override;
 
-  bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+  bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+                             ShuffleVectorInst *SVI,
                              unsigned Factor) const override;
 
   bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
@@ -444,9 +445,6 @@ class RISCVTargetLowering : public TargetLowering {
                                        Instruction *Store, Value *Mask,
                                        ArrayRef<Value *> InterleaveValues) const override;
 
-  bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
-                               ArrayRef<Value *> InterleaveOps) const override;
-
   bool supportKCFIBundles() const override { return true; }
 
   SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,

llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp

Lines changed: 28 additions & 116 deletions
@@ -266,22 +266,28 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
 ///
 /// Note that the new shufflevectors will be removed and we'll only generate one
 /// vsseg3 instruction in CodeGen.
-bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store,
+                                                Value *LaneMask,
                                                 ShuffleVectorInst *SVI,
                                                 unsigned Factor) const {
-  IRBuilder<> Builder(SI);
-  const DataLayout &DL = SI->getDataLayout();
+  IRBuilder<> Builder(Store);
+  const DataLayout &DL = Store->getDataLayout();
   auto Mask = SVI->getShuffleMask();
   auto *ShuffleVTy = cast<FixedVectorType>(SVI->getType());
   // Given SVI : <n*factor x ty>, then VTy : <n x ty>
   auto *VTy = FixedVectorType::get(ShuffleVTy->getElementType(),
                                    ShuffleVTy->getNumElements() / Factor);
-  if (!isLegalInterleavedAccessType(VTy, Factor, SI->getAlign(),
-                                    SI->getPointerAddressSpace(), DL))
+  auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen());
+
+  Value *Ptr, *VL;
+  Align Alignment;
+  if (!getMemOperands(Factor, VTy, XLenTy, Store, Ptr, LaneMask, VL, Alignment))
     return false;
 
-  auto *PtrTy = SI->getPointerOperandType();
-  auto *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen());
+  Type *PtrTy = Ptr->getType();
+  unsigned AS = PtrTy->getPointerAddressSpace();
+  if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
+    return false;
 
   unsigned Index;
   // If the segment store only has one active lane (i.e. the interleave is
@@ -292,27 +298,27 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
     unsigned ScalarSizeInBytes =
         DL.getTypeStoreSize(ShuffleVTy->getElementType());
     Value *Data = SVI->getOperand(0);
-    auto *DataVTy = cast<FixedVectorType>(Data->getType());
+    Data = Builder.CreateExtractVector(VTy, Data, uint64_t(0));
     Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
     Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes);
-    Value *BasePtr = Builder.CreatePtrAdd(SI->getPointerOperand(), Offset);
-    Value *Mask = Builder.getAllOnesMask(DataVTy->getElementCount());
-    Value *VL = Builder.CreateElementCount(Builder.getInt32Ty(),
-                                           VTy->getElementCount());
-
-    CallInst *CI = Builder.CreateIntrinsic(
-        Intrinsic::experimental_vp_strided_store,
-        {Data->getType(), BasePtr->getType(), Stride->getType()},
-        {Data, BasePtr, Stride, Mask, VL});
-    Align Alignment = commonAlignment(SI->getAlign(), Index * ScalarSizeInBytes);
-    CI->addParamAttr(
-        1, Attribute::getWithAlignment(CI->getContext(), Alignment));
+    Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
+    // Note: Same VL as above, but i32 not xlen due to signature of
+    // vp.strided.store
+    VL = Builder.CreateElementCount(Builder.getInt32Ty(),
+                                    VTy->getElementCount());
 
+    CallInst *CI =
+        Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_store,
+                                {VTy, BasePtr->getType(), Stride->getType()},
+                                {Data, BasePtr, Stride, LaneMask, VL});
+    Alignment = commonAlignment(Alignment, Index * ScalarSizeInBytes);
+    CI->addParamAttr(1,
+                     Attribute::getWithAlignment(CI->getContext(), Alignment));
     return true;
   }
 
   Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
-      SI->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy});
+      Store->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy});
 
   SmallVector<Value *, 10> Ops;
   SmallVector<int, 16> NewShuffleMask;
@@ -328,13 +334,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
 
     NewShuffleMask.clear();
   }
-  // This VL should be OK (should be executable in one vsseg instruction,
-  // potentially under larger LMULs) because we checked that the fixed vector
-  // type fits in isLegalInterleavedAccessType
-  Value *VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount());
-  Value *StoreMask = Builder.getAllOnesMask(VTy->getElementCount());
-  Ops.append({SI->getPointerOperand(), StoreMask, VL});
-
+  Ops.append({Ptr, LaneMask, VL});
   Builder.CreateCall(VssegNFunc, Ops);
 
   return true;
@@ -457,91 +457,3 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
   Builder.CreateCall(VssegNFunc, Operands);
   return true;
 }
-
-/// Lower an interleaved vp.store into a vssegN intrinsic.
-///
-/// E.g. Lower an interleaved vp.store (Factor = 2):
-///
-///   %is = tail call <vscale x 64 x i8>
-///             @llvm.vector.interleave2.nxv64i8(
-///                               <vscale x 32 x i8> %load0,
-///                               <vscale x 32 x i8> %load1
-///   %wide.rvl = shl nuw nsw i32 %rvl, 1
-///   tail call void @llvm.vp.store.nxv64i8.p0(
-///                               <vscale x 64 x i8> %is, ptr %ptr,
-///                               %mask,
-///                               i32 %wide.rvl)
-///
-/// Into:
-///   call void @llvm.riscv.vsseg2.mask.nxv32i8.i64(
-///                               <vscale x 32 x i8> %load1,
-///                               <vscale x 32 x i8> %load2, ptr %ptr,
-///                               %mask,
-///                               i64 %rvl)
-bool RISCVTargetLowering::lowerInterleavedVPStore(
-    VPIntrinsic *Store, Value *Mask,
-    ArrayRef<Value *> InterleaveOperands) const {
-  assert(Mask && "Expect a valid mask");
-  assert(Store->getIntrinsicID() == Intrinsic::vp_store &&
-         "Unexpected intrinsic");
-
-  const unsigned Factor = InterleaveOperands.size();
-
-  auto *VTy = dyn_cast<VectorType>(InterleaveOperands[0]->getType());
-  if (!VTy)
-    return false;
-
-  const DataLayout &DL = Store->getDataLayout();
-  Align Alignment = Store->getParamAlign(1).value_or(
-      DL.getABITypeAlign(VTy->getElementType()));
-  if (!isLegalInterleavedAccessType(
-          VTy, Factor, Alignment,
-          Store->getArgOperand(1)->getType()->getPointerAddressSpace(), DL))
-    return false;
-
-  IRBuilder<> Builder(Store);
-  Value *WideEVL = Store->getArgOperand(3);
-  // Conservatively check if EVL is a multiple of factor, otherwise some
-  // (trailing) elements might be lost after the transformation.
-  if (!isMultipleOfN(WideEVL, Store->getDataLayout(), Factor))
-    return false;
-
-  auto *PtrTy = Store->getArgOperand(1)->getType();
-  auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen());
-  auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
-  Value *EVL =
-      Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
-
-  if (isa<FixedVectorType>(VTy)) {
-    SmallVector<Value *, 8> Operands(InterleaveOperands);
-    Operands.append({Store->getArgOperand(1), Mask, EVL});
-    Builder.CreateIntrinsic(FixedVssegIntrIds[Factor - 2],
-                            {VTy, PtrTy, XLenTy}, Operands);
-    return true;
-  }
-
-  unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType());
-  unsigned NumElts = VTy->getElementCount().getKnownMinValue();
-  Type *VecTupTy = TargetExtType::get(
-      Store->getContext(), "riscv.vector.tuple",
-      ScalableVectorType::get(Type::getInt8Ty(Store->getContext()),
-                              NumElts * SEW / 8),
-      Factor);
-
-  Function *VecInsertFunc = Intrinsic::getOrInsertDeclaration(
-      Store->getModule(), Intrinsic::riscv_tuple_insert, {VecTupTy, VTy});
-  Value *StoredVal = PoisonValue::get(VecTupTy);
-  for (unsigned i = 0; i < Factor; ++i)
-    StoredVal = Builder.CreateCall(
-        VecInsertFunc, {StoredVal, InterleaveOperands[i], Builder.getInt32(i)});
-
-  Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
-      Store->getModule(), ScalableVssegIntrIds[Factor - 2],
-      {VecTupTy, PtrTy, Mask->getType(), EVL->getType()});
-
-  Value *Operands[] = {StoredVal, Store->getArgOperand(1), Mask, EVL,
-                       ConstantInt::get(XLenTy, Log2_64(SEW))};
-
-  Builder.CreateCall(VssegNFunc, Operands);
-  return true;
-}
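The missed optimization called out in the commit message comes down to narrowing the data operand to the per-segment type VTy before emitting llvm.experimental.vp.strided.store, so the intrinsic is typed at the segment width (rather than the full shuffle width) and the per-segment LaneMask lines up with the stored value. A sketch of the key lines from the hunk above:

    // Single active segment: emit a strided store of just that segment.
    Value *Data = SVI->getOperand(0);
    // Keep only the leading VTy-width lanes of the full shuffle operand.
    Data = Builder.CreateExtractVector(VTy, Data, uint64_t(0));
    Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
    // The intrinsic is typed on VTy, so LaneMask (one bit per segment
    // element) matches the stored value's element count.
    CallInst *CI =
        Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_store,
                                {VTy, BasePtr->getType(), Stride->getType()},
                                {Data, BasePtr, Stride, LaneMask, VL});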

llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 2 additions & 1 deletion
@@ -1668,7 +1668,8 @@ namespace llvm {
 
     /// Lower interleaved store(s) into target specific
     /// instructions/intrinsics.
-    bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+    bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+                               ShuffleVectorInst *SVI,
                                unsigned Factor) const override;
 
     SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,

llvm/lib/Target/X86/X86InterleavedAccess.cpp

Lines changed: 7 additions & 1 deletion
@@ -822,7 +822,8 @@ bool X86TargetLowering::lowerInterleavedLoad(
   return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
 }
 
-bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool X86TargetLowering::lowerInterleavedStore(Instruction *Store,
+                                              Value *LaneMask,
                                               ShuffleVectorInst *SVI,
                                               unsigned Factor) const {
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
@@ -832,6 +833,11 @@ bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
              0 &&
          "Invalid interleaved store");
 
+  auto *SI = dyn_cast<StoreInst>(Store);
+  if (!SI)
+    return false;
+  assert(!LaneMask && "Unexpected mask on store");
+
   // Holds the indices of SVI that correspond to the starting index of each
   // interleaved shuffle.
   auto Mask = SVI->getShuffleMask();
