
Commit ce23830

[RISCV] Combine a vsse from a vsseg with one active segment (#151198)
This is a rewrite of the current strided store optimization as a DAG combine. This allows it to kick in slightly more broadly, in particular for the scalable lowering paths.
1 parent 616cef0 · commit ce23830
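For context, here is a minimal sketch of the equivalence the combine relies on (the function below is illustrative only and not part of the patch): when exactly one field of an interleaved store is live, and the subtarget has no especially fast segment store for that factor, the vssegN store degenerates into a strided store of the live field, with stride NF * SEW/8 bytes and a base offset of Idx * SEW/8 bytes.

; Illustrative IR, assuming i32 elements (SEW/8 = 4 bytes) and factor NF = 3
; with only the middle field (Idx = 1) live: the elements of %a land at byte
; offsets 4, 16, 28, ... from %p, i.e. a strided store with base offset 4 and
; stride 12, which is the riscv_vsse_mask form the new combine produces.
define void @one_active_field_sketch(<vscale x 2 x i32> %a, ptr %p) {
  %v = call <vscale x 6 x i32> @llvm.vector.interleave3(<vscale x 2 x i32> poison, <vscale x 2 x i32> %a, <vscale x 2 x i32> poison)
  store <vscale x 6 x i32> %v, ptr %p
  ret void
}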

File tree: 4 files changed, +91 −28 lines

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 53 additions & 0 deletions
@@ -20751,6 +20751,53 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
         return DAG.getAllOnesConstant(DL, VT);
       return DAG.getConstant(0, DL, VT);
     }
+    case Intrinsic::riscv_vsseg2_mask:
+    case Intrinsic::riscv_vsseg3_mask:
+    case Intrinsic::riscv_vsseg4_mask:
+    case Intrinsic::riscv_vsseg5_mask:
+    case Intrinsic::riscv_vsseg6_mask:
+    case Intrinsic::riscv_vsseg7_mask:
+    case Intrinsic::riscv_vsseg8_mask: {
+      SDValue Tuple = N->getOperand(2);
+      unsigned NF = Tuple.getValueType().getRISCVVectorTupleNumFields();
+
+      if (Subtarget.hasOptimizedSegmentLoadStore(NF) || !Tuple.hasOneUse() ||
+          Tuple.getOpcode() != RISCVISD::TUPLE_INSERT ||
+          !Tuple.getOperand(0).isUndef())
+        return SDValue();
+
+      SDValue Val = Tuple.getOperand(1);
+      unsigned Idx = Tuple.getConstantOperandVal(2);
+
+      unsigned SEW = Val.getValueType().getScalarSizeInBits();
+      assert(Log2_64(SEW) == N->getConstantOperandVal(6) &&
+             "Type mismatch without bitcast?");
+      unsigned Stride = SEW / 8 * NF;
+      unsigned Offset = SEW / 8 * Idx;
+
+      SDValue Ops[] = {
+          /*Chain=*/N->getOperand(0),
+          /*IntID=*/
+          DAG.getTargetConstant(Intrinsic::riscv_vsse_mask, DL, XLenVT),
+          /*StoredVal=*/Val,
+          /*Ptr=*/
+          DAG.getNode(ISD::ADD, DL, XLenVT, N->getOperand(3),
+                      DAG.getConstant(Offset, DL, XLenVT)),
+          /*Stride=*/DAG.getConstant(Stride, DL, XLenVT),
+          /*Mask=*/N->getOperand(4),
+          /*VL=*/N->getOperand(5)};
+
+      auto *OldMemSD = cast<MemIntrinsicSDNode>(N);
+      // Match getTgtMemIntrinsic for non-unit stride case
+      EVT MemVT = OldMemSD->getMemoryVT().getScalarType();
+      MachineFunction &MF = DAG.getMachineFunction();
+      MachineMemOperand *MMO = MF.getMachineMemOperand(
+          OldMemSD->getMemOperand(), Offset, MemoryLocation::UnknownSize);
+
+      SDVTList VTs = DAG.getVTList(MVT::Other);
+      return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, VTs, Ops, MemVT,
+                                     MMO);
+    }
     }
   }
   case ISD::EXPERIMENTAL_VP_REVERSE:
@@ -20899,6 +20946,12 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
     DAG.ReplaceAllUsesOfValueWith(Tuple.getValue(1), Result.getValue(1));
     return Result.getValue(0);
   }
+  case RISCVISD::TUPLE_INSERT: {
+    // tuple_insert tuple, undef, idx -> tuple
+    if (N->getOperand(1).isUndef())
+      return N->getOperand(0);
+    break;
+  }
   }

   return SDValue();

llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp

Lines changed: 0 additions & 27 deletions
@@ -266,33 +266,6 @@ bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store,
   if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
     return false;

-  unsigned Index;
-  // If the segment store only has one active lane (i.e. the interleave is
-  // just a spread shuffle), we can use a strided store instead. This will
-  // be equally fast, and create less vector register pressure.
-  if (!Subtarget.hasOptimizedSegmentLoadStore(Factor) &&
-      isSpreadMask(Mask, Factor, Index)) {
-    unsigned ScalarSizeInBytes =
-        DL.getTypeStoreSize(ShuffleVTy->getElementType());
-    Value *Data = SVI->getOperand(0);
-    Data = Builder.CreateExtractVector(VTy, Data, uint64_t(0));
-    Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
-    Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes);
-    Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
-    // For rv64, need to truncate i64 to i32 to match signature. As VL is at
-    // most the number of active lanes (which is bounded by i32) this is safe.
-    VL = Builder.CreateTrunc(VL, Builder.getInt32Ty());
-
-    CallInst *CI =
-        Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_store,
-                                {VTy, BasePtr->getType(), Stride->getType()},
-                                {Data, BasePtr, Stride, LaneMask, VL});
-    Alignment = commonAlignment(Alignment, Index * ScalarSizeInBytes);
-    CI->addParamAttr(1,
-                     Attribute::getWithAlignment(CI->getContext(), Alignment));
-    return true;
-  }
-
   Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
       Store->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy});

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll

Lines changed: 2 additions & 1 deletion
@@ -1883,7 +1883,8 @@ define void @store_factor4_one_active_slidedown(ptr %ptr, <4 x i32> %v) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 1
-; CHECK-NEXT:    vsseg4e32.v v8, (a0)
+; CHECK-NEXT:    li a1, 16
+; CHECK-NEXT:    vsse32.v v8, (a0), a1
 ; CHECK-NEXT:    ret
   %v0 = shufflevector <4 x i32> %v, <4 x i32> poison, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 undef, i32 undef>
   store <16 x i32> %v0, ptr %ptr

llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll

Lines changed: 36 additions & 0 deletions
@@ -326,3 +326,39 @@ define void @masked_store_factor3_masked(<vscale x 2 x i32> %a, <vscale x 2 x i3
   call void @llvm.masked.store(<vscale x 6 x i32> %v, ptr %p, i32 4, <vscale x 6 x i1> %interleaved.mask)
   ret void
 }
+
+define void @store_factor2_oneactive(<vscale x 2 x i32> %a, ptr %p) {
+; CHECK-LABEL: store_factor2_oneactive:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vsseg2e32.v v8, (a0)
+; CHECK-NEXT:    ret
+  %v = call <vscale x 4 x i32> @llvm.vector.interleave2(<vscale x 2 x i32> %a, <vscale x 2 x i32> poison)
+  store <vscale x 4 x i32> %v, ptr %p
+  ret void
+}
+
+define void @store_factor3_oneactive(<vscale x 2 x i32> %a, ptr %p) {
+; CHECK-LABEL: store_factor3_oneactive:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 12
+; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vsse32.v v8, (a0), a1
+; CHECK-NEXT:    ret
+  %v = call <vscale x 6 x i32> @llvm.vector.interleave3(<vscale x 2 x i32> %a, <vscale x 2 x i32> poison, <vscale x 2 x i32> poison)
+  store <vscale x 6 x i32> %v, ptr %p
+  ret void
+}
+
+define void @store_factor7_oneactive(<vscale x 2 x i32> %a, ptr %p) {
+; CHECK-LABEL: store_factor7_oneactive:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a0, a0, 24
+; CHECK-NEXT:    li a1, 28
+; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vsse32.v v8, (a0), a1
+; CHECK-NEXT:    ret
+  %v = call <vscale x 14 x i32> @llvm.vector.interleave7(<vscale x 2 x i32> poison, <vscale x 2 x i32> poison, <vscale x 2 x i32> poison, <vscale x 2 x i32> poison, <vscale x 2 x i32> poison, <vscale x 2 x i32> poison, <vscale x 2 x i32> %a)
+  store <vscale x 14 x i32> %v, ptr %p
+  ret void
+}
