[RISCV][IA] Support masked.load for deinterleaveN matching #149556

Open · wants to merge 1 commit into main
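For context, here is a minimal sketch of the IR shape this patch teaches the interleaved-access pass to recognize (adapted from the masked_loat_factor4_mask test below; value names are illustrative): a masked.load whose mask is an interleaving of a narrower per-segment mask, feeding a deinterleaveN intrinsic. With this patch, RISC-V can lower the pair to a single masked segment load (vlseg4e8.v ..., v0.t) instead of a wide load plus a stack round-trip to split the segments.

  %interleaved.mask = call <vscale x 32 x i1> @llvm.vector.interleave4.nxv32i1(<vscale x 8 x i1> %m, <vscale x 8 x i1> %m, <vscale x 8 x i1> %m, <vscale x 8 x i1> %m)
  ; only matched when the wide load has a single use and the passthru is poison/undef
  %wide = call <vscale x 32 x i8> @llvm.masked.load(ptr %p, i32 4, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i8> poison)
  %parts = call {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave4.nxv32i8(<vscale x 32 x i8> %wide)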
42 changes: 29 additions & 13 deletions llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -601,31 +601,47 @@ static Value *getMask(Value *WideMask, unsigned Factor,
 bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
     IntrinsicInst *DI, SmallSetVector<Instruction *, 32> &DeadInsts) {
   Value *LoadedVal = DI->getOperand(0);
-  if (!LoadedVal->hasOneUse() || !isa<LoadInst, VPIntrinsic>(LoadedVal))
+  if (!LoadedVal->hasOneUse())
     return false;
 
+  auto *LI = dyn_cast<LoadInst>(LoadedVal);
+  auto *II = dyn_cast<IntrinsicInst>(LoadedVal);
+  if (!LI && !II)
+    return false;
+
   const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
   assert(Factor && "unexpected deinterleave intrinsic");
 
   Value *Mask = nullptr;
-  if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadedVal)) {
-    if (VPLoad->getIntrinsicID() != Intrinsic::vp_load)
+  if (LI) {
+    if (!LI->isSimple())
       return false;
 
+    LLVM_DEBUG(dbgs() << "IA: Found a load with deinterleave intrinsic " << *DI
+                      << " and factor = " << Factor << "\n");
+  } else {
+    assert(II);
+
     // Check mask operand. Handle both all-true/false and interleaved mask.
-    Value *WideMask = VPLoad->getOperand(1);
-    Mask = getMask(WideMask, Factor, getDeinterleavedVectorType(DI));
-    if (!Mask)
+    Value *WideMask;
+    switch (II->getIntrinsicID()) {
+    default:
       return false;
+    case Intrinsic::vp_load:
+      WideMask = II->getOperand(1);
+      break;
+    case Intrinsic::masked_load:
+      WideMask = II->getOperand(2);
+      break;
+    }
 
-    LLVM_DEBUG(dbgs() << "IA: Found a vp.load with deinterleave intrinsic "
-                      << *DI << " and factor = " << Factor << "\n");
-  } else {
-    auto *LI = cast<LoadInst>(LoadedVal);
-    if (!LI->isSimple())
+    Mask = getMask(WideMask, Factor, getDeinterleavedVectorType(DI));
+    if (!Mask)
       return false;
 
-    LLVM_DEBUG(dbgs() << "IA: Found a load with deinterleave intrinsic " << *DI
-                      << " and factor = " << Factor << "\n");
+    LLVM_DEBUG(dbgs() << "IA: Found a vp.load or masked.load with deinterleave"
+                      << " intrinsic " << *DI << " and factor = "
+                      << Factor << "\n");
   }
 
   // Try and match this with target specific intrinsics.
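The switch above is the heart of the change: the wide mask sits at a different operand index in the two supported intrinsics. For reference, the two call forms as defined in the LangRef (value names here are illustrative only):

  %v = call <vscale x 32 x i8> @llvm.vp.load.nxv32i8.p0(ptr %p, <vscale x 32 x i1> %wide.mask, i32 %evl)                              ; mask is operand 1
  %w = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr %p, i32 4, <vscale x 32 x i1> %wide.mask, <vscale x 32 x i8> poison)  ; mask is operand 2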
44 changes: 30 additions & 14 deletions llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -131,24 +131,40 @@ static bool getMemOperands(unsigned Factor, VectorType *VTy, Type *XLenTy,
              : Constant::getAllOnesValue(XLenTy);
     return true;
   }
-  auto *VPLdSt = cast<VPIntrinsic>(I);
-  assert((VPLdSt->getIntrinsicID() == Intrinsic::vp_load ||
-          VPLdSt->getIntrinsicID() == Intrinsic::vp_store) &&
-         "Unexpected intrinsic");
-  Ptr = VPLdSt->getMemoryPointerParam();
-  Alignment = VPLdSt->getPointerAlignment().value_or(
-      DL.getABITypeAlign(VTy->getElementType()));
+  if (auto *VPLdSt = dyn_cast<VPIntrinsic>(I)) {
+    assert((VPLdSt->getIntrinsicID() == Intrinsic::vp_load ||
+            VPLdSt->getIntrinsicID() == Intrinsic::vp_store) &&
+           "Unexpected intrinsic");
+    Ptr = VPLdSt->getMemoryPointerParam();
+    Alignment = VPLdSt->getPointerAlignment().value_or(
+        DL.getABITypeAlign(VTy->getElementType()));
 
-  assert(Mask && "vp.load and vp.store needs a mask!");
+    assert(Mask && "vp.load and vp.store needs a mask!");
+
+    Value *WideEVL = VPLdSt->getVectorLengthParam();
+    // Conservatively check if EVL is a multiple of factor, otherwise some
+    // (trailing) elements might be lost after the transformation.
+    if (!isMultipleOfN(WideEVL, I->getDataLayout(), Factor))
+      return false;
+
+    auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
+    VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
+    return true;
+  }
+  auto *II = cast<IntrinsicInst>(I);
+  assert(II->getIntrinsicID() == Intrinsic::masked_load &&
+         "Unexpected intrinsic");
+  Ptr = II->getOperand(0);
+  Alignment = cast<ConstantInt>(II->getArgOperand(1))->getAlignValue();
 
-  Value *WideEVL = VPLdSt->getVectorLengthParam();
-  // Conservatively check if EVL is a multiple of factor, otherwise some
-  // (trailing) elements might be lost after the transformation.
-  if (!isMultipleOfN(WideEVL, I->getDataLayout(), Factor))
+  if (!isa<UndefValue>(II->getOperand(3)))
     return false;
 
-  auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
-  VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
+  assert(Mask && "masked.load needs a mask!");
+
+  VL = isa<FixedVectorType>(VTy)
+           ? Builder.CreateElementCount(XLenTy, VTy->getElementCount())
+           : Constant::getAllOnesValue(XLenTy);
   return true;
 }
 
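Two details of the new masked.load path above are worth noting. First, the passthru (operand 3) must be undef or poison, because the lowered segment load has no way to forward a passthru value into masked-off lanes; a load such as the following sketch (illustrative, not from the tests) is deliberately left untransformed:

  %v = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr %p, i32 4, <vscale x 32 x i1> %mask, <vscale x 32 x i8> zeroinitializer)  ; non-poison passthru: not matched

Second, unlike vp.load there is no EVL operand, so VL is simply the full element count for fixed vectors and the all-ones sentinel (VLMAX) for scalable ones, as computed at the end of the hunk.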
71 changes: 3 additions & 68 deletions llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
@@ -542,10 +542,8 @@ define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x
 define {<vscale x 16 x i8>, <vscale x 16 x i8>} @masked_load_factor2(ptr %p) {
 ; CHECK-LABEL: masked_load_factor2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vl4r.v v12, (a0)
-; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT: vnsrl.wi v8, v12, 0
-; CHECK-NEXT: vnsrl.wi v10, v12, 8
+; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
 ; CHECK-NEXT: ret
 %vec = call <vscale x 32 x i8> @llvm.masked.load(ptr %p, i32 4, <vscale x 32 x i1> splat (i1 true), <vscale x 32 x i8> poison)
 %deinterleaved.results = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
@@ -555,23 +553,8 @@ define {<vscale x 16 x i8>, <vscale x 16 x i8>} @masked_load_factor2(ptr %p) {
 define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @masked_loat_factor4(ptr %p) {
 ; CHECK-LABEL: masked_loat_factor4:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 2
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
-; CHECK-NEXT: vl4r.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs4r.v v8, (a0)
 ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT: vlseg4e8.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
 ; CHECK-NEXT: ret
 %vec = call <vscale x 32 x i8> @llvm.masked.load(ptr %p, i32 4, <vscale x 32 x i1> splat (i1 true), <vscale x 32 x i8> poison)
 %deinterleaved.results = call {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave4.nxv32i8(<vscale x 32 x i8> %vec)
@@ -581,56 +564,8 @@ define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @masked_loat_factor4(ptr %p) {
 define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @masked_loat_factor4_mask(ptr %p, <vscale x 8 x i1> %mask) {
 ; CHECK-LABEL: masked_loat_factor4_mask:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: add a3, a1, a2
-; CHECK-NEXT: vmv.v.v v9, v8
-; CHECK-NEXT: srli a4, a2, 2
-; CHECK-NEXT: vmv.v.v v10, v8
-; CHECK-NEXT: srli a5, a2, 3
-; CHECK-NEXT: vmv.v.v v11, v8
-; CHECK-NEXT: vsseg4e8.v v8, (a1)
-; CHECK-NEXT: vl1r.v v8, (a1)
-; CHECK-NEXT: add a1, a4, a5
-; CHECK-NEXT: vl1r.v v9, (a3)
-; CHECK-NEXT: add a3, a3, a2
-; CHECK-NEXT: add a2, a3, a2
-; CHECK-NEXT: vl1r.v v10, (a3)
-; CHECK-NEXT: vl1r.v v11, (a2)
-; CHECK-NEXT: vmsne.vi v9, v9, 0
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: vmsne.vi v8, v10, 0
-; CHECK-NEXT: vmsne.vi v10, v11, 0
-; CHECK-NEXT: vsetvli zero, a4, e8, mf2, tu, ma
-; CHECK-NEXT: vslideup.vx v0, v9, a5
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, ma
-; CHECK-NEXT: vslideup.vx v0, v8, a4
-; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslideup.vx v0, v10, a1
-; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0), v0.t
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs4r.v v8, (a0)
-; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT: vlseg4e8.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: vlseg4e8.v v8, (a0), v0.t
 ; CHECK-NEXT: ret
 %interleaved.mask = tail call <vscale x 32 x i1> @llvm.vector.interleave4.nxv32i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask)
 %vec = call <vscale x 32 x i8> @llvm.masked.load(ptr %p, i32 4, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i8> poison)