[IA] Support vp.store in lowerInterleavedStore #149605

Open

wants to merge 1 commit into base: main
20 changes: 7 additions & 13 deletions llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3219,25 +3219,19 @@ class LLVM_ABI TargetLoweringBase {
/// Lower an interleaved store to target specific intrinsics. Return
/// true on success.
///
/// \p SI is the vector store instruction.
/// \p SI is the vector store instruction. Can be either a plain store
/// or a vp.store.
/// \p Mask is a per-segment (i.e. number of lanes equal to that of one
/// component being interwoven) mask. Can be nullptr, in which case the
/// result is unconditional.
/// \p SVI is the shufflevector to RE-interleave the stored vector.
/// \p Factor is the interleave factor.
virtual bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
virtual bool lowerInterleavedStore(Instruction *Store, Value *Mask,
ShuffleVectorInst *SVI,
unsigned Factor) const {
return false;
}

/// Lower an interleaved store to target specific intrinsics. Return
/// true on success.
///
/// \p Store is the vp.store instruction.
/// \p Mask is a mask value
/// \p InterleaveOps is a list of values being interleaved.
virtual bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
ArrayRef<Value *> InterleaveOps) const {
return false;
}

/// Lower a deinterleave intrinsic to a target specific load intrinsic.
/// Return true on success. Currently only supports
/// llvm.vector.deinterleave{2,3,5,7}
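For context, the unified hook leaves masking support up to each target. Below is a minimal sketch of what an override looks like under the new signature, using a hypothetical MyTargetLowering class; the bail-out-plus-assert pattern is the one the AArch64, ARM, and X86 changes in this PR adopt.

```cpp
// Sketch only: a hypothetical target override under the new signature.
// Targets that cannot lower masked segment stores (as AArch64, ARM, and X86
// do below) reject anything that is not a plain StoreInst and expect no mask.
bool MyTargetLowering::lowerInterleavedStore(Instruction *Store, Value *Mask,
                                             ShuffleVectorInst *SVI,
                                             unsigned Factor) const {
  auto *SI = dyn_cast<StoreInst>(Store);
  if (!SI)
    return false; // a vp.store; only mask-aware targets handle this
  assert(!Mask && "plain stores are lowered unconditionally");
  // ... emit the target's segment-store intrinsic from SI, SVI, and Factor ...
  return true;
}
```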
38 changes: 9 additions & 29 deletions llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -518,46 +518,26 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
assert(NumStoredElements % Factor == 0 &&
"number of stored element should be a multiple of Factor");

Value *Mask = nullptr;
if (auto *VPStore = dyn_cast<VPIntrinsic>(Store)) {
unsigned LaneMaskLen = NumStoredElements / Factor;
Value *LaneMask = getMask(VPStore->getMaskParam(), Factor,
ElementCount::getFixed(LaneMaskLen));
if (!LaneMask)
Mask = getMask(VPStore->getMaskParam(), Factor,
ElementCount::getFixed(LaneMaskLen));
if (!Mask)
return false;

LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store: " << *Store
<< "\n");

IRBuilder<> Builder(VPStore);
// We need to effectively de-interleave the shufflemask
// because lowerInterleavedVPStore expects individual de-interleaved
// values.
SmallVector<Value *, 10> NewShuffles;
SmallVector<int, 16> NewShuffleMask(LaneMaskLen);
auto ShuffleMask = SVI->getShuffleMask();

for (unsigned i = 0; i < Factor; i++) {
for (unsigned j = 0; j < LaneMaskLen; j++)
NewShuffleMask[j] = ShuffleMask[i + Factor * j];

NewShuffles.push_back(Builder.CreateShuffleVector(
SVI->getOperand(0), SVI->getOperand(1), NewShuffleMask));
}

// Try to create target specific intrinsics to replace the vp.store and
// shuffle.
if (!TLI->lowerInterleavedVPStore(VPStore, LaneMask, NewShuffles))
// We already created new shuffles.
return true;
} else {
LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n");

// Try to create target specific intrinsics to replace the store and
// shuffle.
if (!TLI->lowerInterleavedStore(cast<StoreInst>(Store), SVI, Factor))
return false;
}

// Try to create target specific intrinsics to replace the store and
// shuffle.
if (!TLI->lowerInterleavedStore(cast<Instruction>(Store), Mask, SVI, Factor))
return false;

// Already have a new target specific interleaved store. Erase the old store.
DeadInsts.insert(Store);
DeadInsts.insert(SVI);
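The pass now extracts a per-segment mask from the vp.store's mask operand (via its getMask helper, visible above) and hands it to the single hook, instead of materializing de-interleaved shuffles for a separate vp.store entry point. As a rough illustration of the idea only, and not the pass's actual getMask (which handles more mask shapes), a constant-splat wide mask of Factor * N lanes reduces to an N-lane splat:

```cpp
// Illustrative helper, not the pass's getMask: collapse a (Factor * N)-lane
// wide mask to an N-lane per-segment mask when the wide mask is a constant
// splat. Any other shape would need dedicated pattern matching.
static Value *collapseSplatMask(Value *WideMask, ElementCount SegmentEC) {
  if (auto *C = dyn_cast<Constant>(WideMask))
    if (Constant *Splat = C->getSplatValue())
      return ConstantVector::getSplat(SegmentEC, Splat);
  return nullptr; // give up; the caller leaves the store untouched
}
```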
7 changes: 6 additions & 1 deletion llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17343,12 +17343,17 @@ bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
Value *LaneMask,
ShuffleVectorInst *SVI,
unsigned Factor) const {

assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
auto *SI = dyn_cast<StoreInst>(Store);
if (!SI)
return false;
assert(!LaneMask && "Unexpected mask on store");

auto *VecTy = cast<FixedVectorType>(SVI->getType());
assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
3 changes: 2 additions & 1 deletion llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -215,7 +215,8 @@ class AArch64TargetLowering : public TargetLowering {
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
bool lowerInterleavedStore(Instruction *Store, Value *Mask,
ShuffleVectorInst *SVI,
unsigned Factor) const override;

bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
7 changes: 6 additions & 1 deletion llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21731,11 +21731,16 @@ bool ARMTargetLowering::lowerInterleavedLoad(
/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
bool ARMTargetLowering::lowerInterleavedStore(Instruction *Store,
Value *LaneMask,
ShuffleVectorInst *SVI,
unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
auto *SI = dyn_cast<StoreInst>(Store);
if (!SI)
return false;
assert(!LaneMask && "Unexpected mask on store");

auto *VecTy = cast<FixedVectorType>(SVI->getType());
assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
3 changes: 2 additions & 1 deletion llvm/lib/Target/ARM/ARMISelLowering.h
@@ -685,7 +685,8 @@ class VectorType;
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
bool lowerInterleavedStore(Instruction *Store, Value *Mask,
ShuffleVectorInst *SVI,
unsigned Factor) const override;

bool shouldInsertFencesForAtomic(const Instruction *I) const override;
6 changes: 2 additions & 4 deletions llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -434,7 +434,8 @@ class RISCVTargetLowering : public TargetLowering {
ArrayRef<unsigned> Indices,
unsigned Factor) const override;

bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
bool lowerInterleavedStore(Instruction *Store, Value *Mask,
ShuffleVectorInst *SVI,
unsigned Factor) const override;

bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
@@ -444,9 +445,6 @@ class RISCVTargetLowering : public TargetLowering {
Instruction *Store, Value *Mask,
ArrayRef<Value *> InterleaveValues) const override;

bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
ArrayRef<Value *> InterleaveOps) const override;

bool supportKCFIBundles() const override { return true; }

SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,
135 changes: 24 additions & 111 deletions llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -234,22 +234,28 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// vsseg3 instruction in CodeGen.
bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store,
Value *LaneMask,
ShuffleVectorInst *SVI,
unsigned Factor) const {
IRBuilder<> Builder(SI);
const DataLayout &DL = SI->getDataLayout();
IRBuilder<> Builder(Store);
const DataLayout &DL = Store->getDataLayout();
auto Mask = SVI->getShuffleMask();
auto *ShuffleVTy = cast<FixedVectorType>(SVI->getType());
// Given SVI : <n*factor x ty>, then VTy : <n x ty>
auto *VTy = FixedVectorType::get(ShuffleVTy->getElementType(),
ShuffleVTy->getNumElements() / Factor);
if (!isLegalInterleavedAccessType(VTy, Factor, SI->getAlign(),
SI->getPointerAddressSpace(), DL))
auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen());

Value *Ptr, *VL;
Align Alignment;
if (!getMemOperands(Factor, VTy, XLenTy, Store, Ptr, LaneMask, VL, Alignment))
return false;

auto *PtrTy = SI->getPointerOperandType();
auto *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen());
Type *PtrTy = Ptr->getType();
unsigned AS = PtrTy->getPointerAddressSpace();
if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
return false;

unsigned Index;
// If the segment store only has one active lane (i.e. the interleave is
@@ -260,26 +266,27 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
unsigned ScalarSizeInBytes =
DL.getTypeStoreSize(ShuffleVTy->getElementType());
Value *Data = SVI->getOperand(0);
auto *DataVTy = cast<FixedVectorType>(Data->getType());
Data = Builder.CreateExtractVector(VTy, Data, uint64_t(0));
Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes);
Value *BasePtr = Builder.CreatePtrAdd(SI->getPointerOperand(), Offset);
Value *Mask = Builder.getAllOnesMask(DataVTy->getElementCount());
Value *VL = Builder.CreateElementCount(Builder.getInt32Ty(),
Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
// Note: Same VL as above, but i32 not xlen due to signature of
// vp.strided.store
VL = Builder.CreateElementCount(Builder.getInt32Ty(),
VTy->getElementCount());

CallInst *CI = Builder.CreateIntrinsic(
Intrinsic::experimental_vp_strided_store,
{Data->getType(), BasePtr->getType(), Stride->getType()},
{Data, BasePtr, Stride, Mask, VL});
CI->addParamAttr(
1, Attribute::getWithAlignment(CI->getContext(), SI->getAlign()));
{VTy, BasePtr->getType(), Stride->getType()},
{Data, BasePtr, Stride, LaneMask, VL});
CI->addParamAttr(1,
Attribute::getWithAlignment(CI->getContext(), Alignment));

return true;
}

Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
SI->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy});
Store->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy});

SmallVector<Value *, 10> Ops;
SmallVector<int, 16> NewShuffleMask;
@@ -295,13 +302,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,

NewShuffleMask.clear();
}
// This VL should be OK (should be executable in one vsseg instruction,
// potentially under larger LMULs) because we checked that the fixed vector
// type fits in isLegalInterleavedAccessType
Value *VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount());
Value *StoreMask = Builder.getAllOnesMask(VTy->getElementCount());
Ops.append({SI->getPointerOperand(), StoreMask, VL});

Ops.append({Ptr, LaneMask, VL});
Builder.CreateCall(VssegNFunc, Ops);

return true;
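The one-active-lane path above replaces the segment store with a single strided store: the stride is Factor times the element size and the base pointer is advanced by Index times the element size (the VL is rebuilt as an i32 because that is what experimental.vp.strided.store takes). A tiny standalone check of that arithmetic for the factor-4 i32 case exercised by the updated test below, with assumed values:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // Assumed values for illustration: factor-4 interleave of i32 elements,
  // with only segment 0 actually stored.
  const uint64_t ScalarSizeInBytes = 4; // store size of i32
  const uint64_t Factor = 4;
  const uint64_t Index = 0;
  const uint64_t Stride = Factor * ScalarSizeInBytes; // distance between kept lanes
  const uint64_t Offset = Index * ScalarSizeInBytes;  // where the first kept lane lives
  assert(Stride == 16 && Offset == 0); // matches the "li a1, 16" in the test diff
  return 0;
}
```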
@@ -424,91 +425,3 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
Builder.CreateCall(VssegNFunc, Operands);
return true;
}

/// Lower an interleaved vp.store into a vssegN intrinsic.
///
/// E.g. Lower an interleaved vp.store (Factor = 2):
///
/// %is = tail call <vscale x 64 x i8>
/// @llvm.vector.interleave2.nxv64i8(
/// <vscale x 32 x i8> %load0,
/// <vscale x 32 x i8> %load1
/// %wide.rvl = shl nuw nsw i32 %rvl, 1
/// tail call void @llvm.vp.store.nxv64i8.p0(
/// <vscale x 64 x i8> %is, ptr %ptr,
/// %mask,
/// i32 %wide.rvl)
///
/// Into:
/// call void @llvm.riscv.vsseg2.mask.nxv32i8.i64(
/// <vscale x 32 x i8> %load1,
/// <vscale x 32 x i8> %load2, ptr %ptr,
/// %mask,
/// i64 %rvl)
bool RISCVTargetLowering::lowerInterleavedVPStore(
VPIntrinsic *Store, Value *Mask,
ArrayRef<Value *> InterleaveOperands) const {
assert(Mask && "Expect a valid mask");
assert(Store->getIntrinsicID() == Intrinsic::vp_store &&
"Unexpected intrinsic");

const unsigned Factor = InterleaveOperands.size();

auto *VTy = dyn_cast<VectorType>(InterleaveOperands[0]->getType());
if (!VTy)
return false;

const DataLayout &DL = Store->getDataLayout();
Align Alignment = Store->getParamAlign(1).value_or(
DL.getABITypeAlign(VTy->getElementType()));
if (!isLegalInterleavedAccessType(
VTy, Factor, Alignment,
Store->getArgOperand(1)->getType()->getPointerAddressSpace(), DL))
return false;

IRBuilder<> Builder(Store);
Value *WideEVL = Store->getArgOperand(3);
// Conservatively check if EVL is a multiple of factor, otherwise some
// (trailing) elements might be lost after the transformation.
if (!isMultipleOfN(WideEVL, Store->getDataLayout(), Factor))
return false;

auto *PtrTy = Store->getArgOperand(1)->getType();
auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen());
auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
Value *EVL =
Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);

if (isa<FixedVectorType>(VTy)) {
SmallVector<Value *, 8> Operands(InterleaveOperands);
Operands.append({Store->getArgOperand(1), Mask, EVL});
Builder.CreateIntrinsic(FixedVssegIntrIds[Factor - 2],
{VTy, PtrTy, XLenTy}, Operands);
return true;
}

unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType());
unsigned NumElts = VTy->getElementCount().getKnownMinValue();
Type *VecTupTy = TargetExtType::get(
Store->getContext(), "riscv.vector.tuple",
ScalableVectorType::get(Type::getInt8Ty(Store->getContext()),
NumElts * SEW / 8),
Factor);

Function *VecInsertFunc = Intrinsic::getOrInsertDeclaration(
Store->getModule(), Intrinsic::riscv_tuple_insert, {VecTupTy, VTy});
Value *StoredVal = PoisonValue::get(VecTupTy);
for (unsigned i = 0; i < Factor; ++i)
StoredVal = Builder.CreateCall(
VecInsertFunc, {StoredVal, InterleaveOperands[i], Builder.getInt32(i)});

Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
Store->getModule(), ScalableVssegIntrIds[Factor - 2],
{VecTupTy, PtrTy, Mask->getType(), EVL->getType()});

Value *Operands[] = {StoredVal, Store->getArgOperand(1), Mask, EVL,
ConstantInt::get(XLenTy, Log2_64(SEW))};

Builder.CreateCall(VssegNFunc, Operands);
return true;
}
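With the dedicated vp.store entry point above removed, the per-segment VL for a vp.store is assumed to be derived the same way the deleted code did it (and as the getMemOperands helper used by the unified path is expected to do): divide the wide EVL by the interleave factor, then widen to XLen. A minimal sketch, reusing only builder calls that appear in the removed function:

```cpp
// Sketch, assuming WideEVL is the vp.store's EVL operand, Factor is the
// interleave factor, and XLenTy is the target's XLen integer type. The exact
// division is only valid because the caller has already checked that the
// EVL is a multiple of Factor.
Value *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
Value *SegmentEVL =
    Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
```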
3 changes: 2 additions & 1 deletion llvm/lib/Target/X86/X86ISelLowering.h
@@ -1668,7 +1668,8 @@ namespace llvm {

/// Lower interleaved store(s) into target specific
/// instructions/intrinsics.
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
bool lowerInterleavedStore(Instruction *Store, Value *Mask,
ShuffleVectorInst *SVI,
unsigned Factor) const override;

SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,
8 changes: 7 additions & 1 deletion llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -822,7 +822,8 @@ bool X86TargetLowering::lowerInterleavedLoad(
return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}

bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
bool X86TargetLowering::lowerInterleavedStore(Instruction *Store,
Value *LaneMask,
ShuffleVectorInst *SVI,
unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
@@ -832,6 +833,11 @@ bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
0 &&
"Invalid interleaved store");

auto *SI = dyn_cast<StoreInst>(Store);
if (!SI)
return false;
assert(!LaneMask && "Unexpected mask on store");

// Holds the indices of SVI that correspond to the starting index of each
// interleaved shuffle.
auto Mask = SVI->getShuffleMask();
@@ -1757,8 +1757,9 @@ define void @store_factor4_one_active(ptr %ptr, <4 x i32> %v) {
define void @vpstore_factor4_one_active(ptr %ptr, <4 x i32> %v) {
; CHECK-LABEL: vpstore_factor4_one_active:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 16
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vsseg4e32.v v8, (a0)
; CHECK-NEXT: vsse32.v v8, (a0), a1
; CHECK-NEXT: ret
%v0 = shufflevector <4 x i32> %v, <4 x i32> poison, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef>
tail call void @llvm.vp.store.v16i32.p0(<16 x i32> %v0, ptr %ptr, <16 x i1> splat (i1 true), i32 16)
@@ -1782,7 +1783,7 @@ define void @store_factor4_one_active_fullwidth(ptr %ptr, <16 x i32> %v) {
; CHECK-LABEL: store_factor4_one_active_fullwidth:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 16
; CHECK-NEXT: vsetivli zero, 4, e32, m4, ta, ma
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vsse32.v v8, (a0), a1
; CHECK-NEXT: ret
%v0 = shufflevector <16 x i32> %v, <16 x i32> poison, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef>
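The fullwidth test's LMUL drop from m4 to m1 follows from the extract added in the RISC-V lowering above: the stored data is first narrowed to the per-segment type, so the strided store only ever sees an m1-sized operand. The relevant calls, repeated here purely for illustration (VTy is the narrow segment type from the patch):

```cpp
// Narrow the wide shuffle source down to one segment's worth of elements
// before emitting the strided store.
Value *Data = SVI->getOperand(0);
Data = Builder.CreateExtractVector(VTy, Data, uint64_t(0));
```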