42 changes: 29 additions & 13 deletions llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -601,31 +601,47 @@ static Value *getMask(Value *WideMask, unsigned Factor,
bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
IntrinsicInst *DI, SmallSetVector<Instruction *, 32> &DeadInsts) {
Value *LoadedVal = DI->getOperand(0);
if (!LoadedVal->hasOneUse() || !isa<LoadInst, VPIntrinsic>(LoadedVal))
if (!LoadedVal->hasOneUse())
return false;

auto *LI = dyn_cast<LoadInst>(LoadedVal);
auto *II = dyn_cast<IntrinsicInst>(LoadedVal);
if (!LI && !II)
return false;

const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
assert(Factor && "unexpected deinterleave intrinsic");

Value *Mask = nullptr;
if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadedVal)) {
if (VPLoad->getIntrinsicID() != Intrinsic::vp_load)
if (LI) {
if (!LI->isSimple())
return false;

LLVM_DEBUG(dbgs() << "IA: Found a load with deinterleave intrinsic " << *DI
<< " and factor = " << Factor << "\n");
} else {
assert(II);

// Check mask operand. Handle both all-true/false and interleaved mask.
Value *WideMask = VPLoad->getOperand(1);
Mask = getMask(WideMask, Factor, getDeinterleavedVectorType(DI));
if (!Mask)
Value *WideMask;
switch (II->getIntrinsicID()) {
default:
return false;
case Intrinsic::vp_load:
WideMask = II->getOperand(1);
break;
case Intrinsic::masked_load:
WideMask = II->getOperand(2);
break;
}

LLVM_DEBUG(dbgs() << "IA: Found a vp.load with deinterleave intrinsic "
<< *DI << " and factor = " << Factor << "\n");
} else {
auto *LI = cast<LoadInst>(LoadedVal);
if (!LI->isSimple())
Mask = getMask(WideMask, Factor, getDeinterleavedVectorType(DI));
if (!Mask)
return false;

LLVM_DEBUG(dbgs() << "IA: Found a load with deinterleave intrinsic " << *DI
<< " and factor = " << Factor << "\n");
LLVM_DEBUG(dbgs() << "IA: Found a vp.load or masked.load with deinterleave"
<< " intrinsic " << *DI << " and factor = "
<< Factor << "\n");
}

// Try and match this with target specific intrinsics.
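For context, the shape of IR this change teaches lowerDeinterleaveIntrinsic to handle is a masked.load whose single use is a deinterleave intrinsic, alongside the vp.load case that was already supported. A minimal sketch (value names are illustrative; the operands follow the tests added in this patch):

    %vec = call <vscale x 32 x i8> @llvm.masked.load(ptr %p, i32 4, <vscale x 32 x i1> %wide.mask, <vscale x 32 x i8> poison)
    %res = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)

For vp.load the wide mask is operand 1, for masked.load it is operand 2; in both cases getMask has to reduce %wide.mask to a single per-segment mask (roughly, an all-ones splat or a mask built by the matching interleave intrinsic), otherwise the transform is skipped.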
44 changes: 30 additions & 14 deletions llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -131,24 +131,40 @@ static bool getMemOperands(unsigned Factor, VectorType *VTy, Type *XLenTy,
: Constant::getAllOnesValue(XLenTy);
return true;
}
auto *VPLdSt = cast<VPIntrinsic>(I);
assert((VPLdSt->getIntrinsicID() == Intrinsic::vp_load ||
VPLdSt->getIntrinsicID() == Intrinsic::vp_store) &&
"Unexpected intrinsic");
Ptr = VPLdSt->getMemoryPointerParam();
Alignment = VPLdSt->getPointerAlignment().value_or(
DL.getABITypeAlign(VTy->getElementType()));
if (auto *VPLdSt = dyn_cast<VPIntrinsic>(I)) {
assert((VPLdSt->getIntrinsicID() == Intrinsic::vp_load ||
VPLdSt->getIntrinsicID() == Intrinsic::vp_store) &&
"Unexpected intrinsic");
Ptr = VPLdSt->getMemoryPointerParam();
Alignment = VPLdSt->getPointerAlignment().value_or(
DL.getABITypeAlign(VTy->getElementType()));

assert(Mask && "vp.load and vp.store needs a mask!");

Value *WideEVL = VPLdSt->getVectorLengthParam();
// Conservatively check if EVL is a multiple of factor, otherwise some
// (trailing) elements might be lost after the transformation.
if (!isMultipleOfN(WideEVL, I->getDataLayout(), Factor))
return false;

assert(Mask && "vp.load and vp.store needs a mask!");
auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
return true;
}
auto *II = cast<IntrinsicInst>(I);
assert(II->getIntrinsicID() == Intrinsic::masked_load &&
"Unexpected intrinsic");
Ptr = II->getOperand(0);
Alignment = cast<ConstantInt>(II->getArgOperand(1))->getAlignValue();

Value *WideEVL = VPLdSt->getVectorLengthParam();
// Conservatively check if EVL is a multiple of factor, otherwise some
// (trailing) elements might be lost after the transformation.
if (!isMultipleOfN(WideEVL, I->getDataLayout(), Factor))
if (!isa<UndefValue>(II->getOperand(3)))

Member: just a note: we can support interleaving passthru in the future (i.e. passthru composed by interleave intrinsic), though I haven't seen a real case yet.

Collaborator Author: I'd thought so too, but on reflection, I don't think we can. The problem is that the segment load only deinterleaves the loaded elements; we'd need to somehow deinterleave the passthru elements separately, and then stick them in the right positions.

Member:

> we'd need to somehow deinterleave the passthru elements separately, and then stick them in the right positions.

right, I was thinking about using the same way we recognize the mask, namely, recognizing something like this:

%interleaved.passthru = llvm.vector.interleave2(<4 x i32> %seg0, <4 x i32> %seg1)
masked.load(ptr %p, ..., <8 x i32> %interleaved.passthru)

Collaborator Author: Oh, yeah, that would work. It's just extremely restrictive. I haven't seen a case like that yet, have you?

Member:

> Oh, yeah, that would work. It's just extremely restrictive. I haven't seen a case like that yet, have you?

no, I haven't. IIRC SLP doesn't really use passthru in its masked.load either.

return false;

auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
assert(Mask && "masked.load needs a mask!");

VL = isa<FixedVectorType>(VTy)
? Builder.CreateElementCount(XLenTy, VTy->getElementCount())
: Constant::getAllOnesValue(XLenTy);
return true;
}

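To summarize the new masked.load branch in getMemOperands (a sketch, with operand positions taken from the diff above): the pointer is operand 0, the alignment is the constant operand 1, the interleaved mask is operand 2, and the passthru operand 3 has to be undef/poison or the transform bails out. Unlike the vp.load path, the VL is not derived from an EVL operand; it is the element count for fixed-length vectors and an all-ones value (i.e. VLMAX) for scalable ones. The factor-4 test in the updated test file below exercises exactly this shape:

    %interleaved.mask = call <vscale x 32 x i1> @llvm.vector.interleave4.nxv32i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask)
    %vec = call <vscale x 32 x i8> @llvm.masked.load(ptr %p, i32 4, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i8> poison)

Here i32 4 becomes the alignment, %interleaved.mask is peeled back to the per-segment %mask by getMask, and a non-poison passthru (see the review thread above) would make getMemOperands return false.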
71 changes: 3 additions & 68 deletions llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
@@ -542,10 +542,8 @@ define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x
define {<vscale x 16 x i8>, <vscale x 16 x i8>} @masked_load_factor2(ptr %p) {
; CHECK-LABEL: masked_load_factor2:
; CHECK: # %bb.0:
; CHECK-NEXT: vl4r.v v12, (a0)
; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v12, 0
; CHECK-NEXT: vnsrl.wi v10, v12, 8
; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma
; CHECK-NEXT: vlseg2e8.v v8, (a0)

Member: non-blocking: I'm wondering, for HW without an optimized (NF=2) segmented load, whether the VNSRL lowering would be better.

Collaborator Author: I'd had the exact same question. It's somewhere on my list of things to maybe come back to. :)

; CHECK-NEXT: ret
%vec = call <vscale x 32 x i8> @llvm.masked.load(ptr %p, i32 4, <vscale x 32 x i1> splat (i1 true), <vscale x 32 x i8> poison)
%deinterleaved.results = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
@@ -555,23 +553,8 @@ define {<vscale x 16 x i8>, <vscale x 16 x i8>} @masked_load_factor2(ptr %p) {
define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @masked_loat_factor4(ptr %p) {
; CHECK-LABEL: masked_loat_factor4:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 2
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
; CHECK-NEXT: vl4r.v v8, (a0)
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs4r.v v8, (a0)
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vlseg4e8.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 2
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%vec = call <vscale x 32 x i8> @llvm.masked.load(ptr %p, i32 4, <vscale x 32 x i1> splat (i1 true), <vscale x 32 x i8> poison)
%deinterleaved.results = call {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave4.nxv32i8(<vscale x 32 x i8> %vec)
@@ -581,56 +564,8 @@ define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i
define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @masked_loat_factor4_mask(ptr %p, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: masked_loat_factor4_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vmv.v.i v8, 0
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
; CHECK-NEXT: add a3, a1, a2
; CHECK-NEXT: vmv.v.v v9, v8
; CHECK-NEXT: srli a4, a2, 2
; CHECK-NEXT: vmv.v.v v10, v8
; CHECK-NEXT: srli a5, a2, 3
; CHECK-NEXT: vmv.v.v v11, v8
; CHECK-NEXT: vsseg4e8.v v8, (a1)
; CHECK-NEXT: vl1r.v v8, (a1)
; CHECK-NEXT: add a1, a4, a5
; CHECK-NEXT: vl1r.v v9, (a3)
; CHECK-NEXT: add a3, a3, a2
; CHECK-NEXT: add a2, a3, a2
; CHECK-NEXT: vl1r.v v10, (a3)
; CHECK-NEXT: vl1r.v v11, (a2)
; CHECK-NEXT: vmsne.vi v9, v9, 0
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: vmsne.vi v8, v10, 0
; CHECK-NEXT: vmsne.vi v10, v11, 0
; CHECK-NEXT: vsetvli zero, a4, e8, mf2, tu, ma
; CHECK-NEXT: vslideup.vx v0, v9, a5
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, ma
; CHECK-NEXT: vslideup.vx v0, v8, a4
; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslideup.vx v0, v10, a1
; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
; CHECK-NEXT: vle8.v v8, (a0), v0.t
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 2
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs4r.v v8, (a0)
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vlseg4e8.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: vlseg4e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
%interleaved.mask = tail call <vscale x 32 x i1> @llvm.vector.interleave4.nxv32i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask)
%vec = call <vscale x 32 x i8> @llvm.masked.load(ptr %p, i32 4, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i8> poison)