Skip to content

Commit b5a8f68

Browse files
committed
[ARM] Have custom lowering for ucmp and scmp
Limited to non-Thumb1 targets for scmp (ucmp also gets a Thumb1 sequence) and skipped on v8.1-M Mainline, but for i32 we can do this in 3 instructions, using subs to set the flags initially.
1 parent b9adc4a commit b5a8f68

File tree

6 files changed

+894
-268
lines changed

6 files changed

+894
-268
lines changed

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -802,6 +802,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
802802
setOperationAction(ISD::BSWAP, VT, Expand);
803803
}
804804

805+
if (!Subtarget->isThumb1Only() && !Subtarget->hasV8_1MMainlineOps())
806+
setOperationAction(ISD::SCMP, MVT::i32, Custom);
807+
808+
if (!Subtarget->hasV8_1MMainlineOps())
809+
setOperationAction(ISD::UCMP, MVT::i32, Custom);
810+
805811
setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
806812
setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
807813

@@ -1628,6 +1634,10 @@ bool ARMTargetLowering::useSoftFloat() const {
16281634
return Subtarget->useSoftFloat();
16291635
}
16301636

1637+
bool ARMTargetLowering::shouldExpandCmpUsingSelects(EVT VT) const {
1638+
return !Subtarget->isThumb1Only() && VT.getSizeInBits() <= 32;
1639+
}
1640+
16311641
// FIXME: It might make sense to define the representative register class as the
16321642
// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
16331643
// a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently,
@@ -10614,6 +10624,134 @@ SDValue ARMTargetLowering::LowerFP_TO_BF16(SDValue Op,
1061410624
return DAG.getBitcast(MVT::i32, Res);
1061510625
}
1061610626

10627+
SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
10628+
SDLoc dl(Op);
10629+
SDValue LHS = Op.getOperand(0);
10630+
SDValue RHS = Op.getOperand(1);
10631+
10632+
// Determine if this is signed or unsigned comparison
10633+
bool IsSigned = (Op.getOpcode() == ISD::SCMP);
10634+
10635+
// Special case for Thumb1 UCMP only
10636+
if (!IsSigned && Subtarget->isThumb1Only()) {
10637+
// For Thumb unsigned comparison, use this sequence:
10638+
// subs r2, r0, r1 ; r2 = LHS - RHS, sets flags
10639+
// sbc r2, r2 ; r2 = r2 - r2 - !carry
10640+
// cmp r1, r0 ; compare RHS with LHS
10641+
// sbc r1, r1 ; r1 = r1 - r1 - !carry
10642+
// subs r0, r2, r1 ; r0 = r2 - r1 (final result)
10643+
10644+
// First subtraction: LHS - RHS
10645+
SDValue Sub1WithFlags = DAG.getNode(
10646+
ARMISD::SUBC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10647+
SDValue Sub1Result = Sub1WithFlags.getValue(0);
10648+
SDValue Flags1 = Sub1WithFlags.getValue(1);
10649+
10650+
// SUBE: Sub1Result - Sub1Result - !carry
10651+
// This gives 0 if LHS >= RHS (unsigned), -1 if LHS < RHS (unsigned)
10652+
SDValue Sbc1 =
10653+
DAG.getNode(ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT),
10654+
Sub1Result, Sub1Result, Flags1);
10655+
SDValue Sbc1Result = Sbc1.getValue(0);
10656+
10657+
// Second comparison: RHS vs LHS (reverse comparison)
10658+
SDValue CmpFlags = DAG.getNode(ARMISD::CMP, dl, FlagsVT, RHS, LHS);
10659+
10660+
// SUBE: RHS - RHS - !carry
10661+
// This gives 0 if RHS >= LHS (unsigned), -1 if RHS < LHS (unsigned)
10662+
SDValue Sbc2 = DAG.getNode(
10663+
ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), RHS, RHS, CmpFlags);
10664+
SDValue Sbc2Result = Sbc2.getValue(0);
10665+
10666+
// Final subtraction: Sbc1Result - Sbc2Result (no flags needed)
10667+
SDValue Result =
10668+
DAG.getNode(ISD::SUB, dl, MVT::i32, Sbc1Result, Sbc2Result);
10669+
if (Op.getValueType() != MVT::i32)
10670+
Result = DAG.getSExtOrTrunc(Result, dl, Op.getValueType());
10671+
10672+
return Result;
10673+
}
10674+
10675+
// For the ARM assembly pattern:
10676+
// subs r0, r0, r1 ; subtract RHS from LHS and set flags
10677+
// movgt r0, #1 ; if LHS > RHS, set result to 1
10678+
// mvnlt r0, #0 ; if LHS < RHS, set result to -1
10679+
// ; (GT/LT for signed, HI/LO for unsigned)
10680+
// ; if LHS == RHS, result remains 0 from the subs
10681+
10682+
// Optimization: if RHS is a negation (0 - X), use ADDC (LHS + X) instead of SUBC
10683+
SDValue AddOperand;
10684+
unsigned Opcode = ARMISD::SUBC;
10685+
10686+
// Check if RHS is a subtraction against 0: (0 - X)
10687+
if (RHS.getOpcode() == ISD::SUB) {
10688+
SDValue SubLHS = RHS.getOperand(0);
10689+
SDValue SubRHS = RHS.getOperand(1);
10690+
10691+
// Check if it's 0 - X
10692+
if (isNullConstant(SubLHS)) {
10693+
bool CanUseAdd = false;
10694+
if (IsSigned) {
10695+
// For SCMP: only if X is known to never be INT_MIN (to avoid overflow)
10696+
if (RHS->getFlags().hasNoSignedWrap() || !DAG.computeKnownBits(SubRHS)
10697+
.getSignedMinValue()
10698+
.isMinSignedValue()) {
10699+
CanUseAdd = true;
10700+
}
10701+
} else {
10702+
// For UCMP: only if X is known to never be zero
10703+
if (DAG.isKnownNeverZero(SubRHS)) {
10704+
CanUseAdd = true;
10705+
}
10706+
}
10707+
10708+
if (CanUseAdd) {
10709+
Opcode = ARMISD::ADDC;
10710+
AddOperand = SubRHS; // Replace RHS with X, so we do LHS + X instead of
10711+
// LHS - (0 - X)
10712+
}
10713+
}
10714+
}
10715+
10716+
// Generate the operation with flags
10717+
SDValue OpWithFlags;
10718+
if (Opcode == ARMISD::ADDC) {
10719+
// Use ADDC: LHS + AddOperand (where RHS was 0 - AddOperand)
10720+
OpWithFlags = DAG.getNode(
10721+
ARMISD::ADDC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, AddOperand);
10722+
} else {
10723+
// Use ARMISD::SUBC to generate SUBS instruction (subtract with flags)
10724+
OpWithFlags = DAG.getNode(ARMISD::SUBC, dl,
10725+
DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10726+
}
10727+
10728+
SDValue OpResult = OpWithFlags.getValue(0); // The operation result
10729+
SDValue Flags = OpWithFlags.getValue(1); // The flags
10730+
10731+
// Constants for conditional moves
10732+
SDValue One = DAG.getConstant(1, dl, MVT::i32);
10733+
SDValue MinusOne = DAG.getAllOnesConstant(dl, MVT::i32);
10734+
10735+
// Select condition codes based on signed vs unsigned
10736+
ARMCC::CondCodes GTCond = IsSigned ? ARMCC::GT : ARMCC::HI;
10737+
ARMCC::CondCodes LTCond = IsSigned ? ARMCC::LT : ARMCC::LO;
10738+
10739+
// First conditional move: if greater than, set to 1
10740+
SDValue GTCondValue = DAG.getConstant(GTCond, dl, MVT::i32);
10741+
SDValue Result1 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, OpResult, One,
10742+
GTCondValue, Flags);
10743+
10744+
// Second conditional move: if less than, set to -1
10745+
SDValue LTCondValue = DAG.getConstant(LTCond, dl, MVT::i32);
10746+
SDValue Result2 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne,
10747+
LTCondValue, Flags);
10748+
10749+
if (Op.getValueType() != MVT::i32)
10750+
Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType());
10751+
10752+
return Result2;
10753+
}
10754+
1061710755
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
1061810756
LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
1061910757
switch (Op.getOpcode()) {
@@ -10742,6 +10880,9 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
1074210880
case ISD::FP_TO_BF16:
1074310881
return LowerFP_TO_BF16(Op, DAG);
1074410882
case ARMISD::WIN__DBZCHK: return SDValue();
10883+
case ISD::UCMP:
10884+
case ISD::SCMP:
10885+
return LowerCMP(Op, DAG);
1074510886
}
1074610887
}
1074710888

llvm/lib/Target/ARM/ARMISelLowering.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -607,6 +607,8 @@ class VectorType;
607607

608608
bool preferZeroCompareBranch() const override { return true; }
609609

610+
bool shouldExpandCmpUsingSelects(EVT VT) const override;
611+
610612
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
611613

612614
bool hasAndNotCompare(SDValue V) const override {
@@ -903,6 +905,7 @@ class VectorType;
903905
void LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
904906
SelectionDAG &DAG) const;
905907
SDValue LowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const;
908+
SDValue LowerCMP(SDValue Op, SelectionDAG &DAG) const;
906909

907910
Register getRegisterByName(const char* RegName, LLT VT,
908911
const MachineFunction &MF) const override;

llvm/test/CodeGen/ARM/scmp.ll

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,9 @@
44
define i8 @scmp_8_8(i8 signext %x, i8 signext %y) nounwind {
55
; CHECK-LABEL: scmp_8_8:
66
; CHECK: @ %bb.0:
7-
; CHECK-NEXT: cmp r0, r1
8-
; CHECK-NEXT: mov r0, #0
9-
; CHECK-NEXT: mov r2, #0
10-
; CHECK-NEXT: movwlt r0, #1
11-
; CHECK-NEXT: movwgt r2, #1
12-
; CHECK-NEXT: sub r0, r2, r0
7+
; CHECK-NEXT: subs r0, r0, r1
8+
; CHECK-NEXT: movwgt r0, #1
9+
; CHECK-NEXT: mvnlt r0, #0
1310
; CHECK-NEXT: bx lr
1411
%1 = call i8 @llvm.scmp(i8 %x, i8 %y)
1512
ret i8 %1
@@ -18,12 +15,9 @@ define i8 @scmp_8_8(i8 signext %x, i8 signext %y) nounwind {
1815
define i8 @scmp_8_16(i16 signext %x, i16 signext %y) nounwind {
1916
; CHECK-LABEL: scmp_8_16:
2017
; CHECK: @ %bb.0:
21-
; CHECK-NEXT: cmp r0, r1
22-
; CHECK-NEXT: mov r0, #0
23-
; CHECK-NEXT: mov r2, #0
24-
; CHECK-NEXT: movwlt r0, #1
25-
; CHECK-NEXT: movwgt r2, #1
26-
; CHECK-NEXT: sub r0, r2, r0
18+
; CHECK-NEXT: subs r0, r0, r1
19+
; CHECK-NEXT: movwgt r0, #1
20+
; CHECK-NEXT: mvnlt r0, #0
2721
; CHECK-NEXT: bx lr
2822
%1 = call i8 @llvm.scmp(i16 %x, i16 %y)
2923
ret i8 %1
@@ -32,12 +26,9 @@ define i8 @scmp_8_16(i16 signext %x, i16 signext %y) nounwind {
3226
define i8 @scmp_8_32(i32 %x, i32 %y) nounwind {
3327
; CHECK-LABEL: scmp_8_32:
3428
; CHECK: @ %bb.0:
35-
; CHECK-NEXT: cmp r0, r1
36-
; CHECK-NEXT: mov r0, #0
37-
; CHECK-NEXT: mov r2, #0
38-
; CHECK-NEXT: movwlt r0, #1
39-
; CHECK-NEXT: movwgt r2, #1
40-
; CHECK-NEXT: sub r0, r2, r0
29+
; CHECK-NEXT: subs r0, r0, r1
30+
; CHECK-NEXT: movwgt r0, #1
31+
; CHECK-NEXT: mvnlt r0, #0
4132
; CHECK-NEXT: bx lr
4233
%1 = call i8 @llvm.scmp(i32 %x, i32 %y)
4334
ret i8 %1
@@ -92,17 +83,26 @@ define i8 @scmp_8_128(i128 %x, i128 %y) nounwind {
9283
define i32 @scmp_32_32(i32 %x, i32 %y) nounwind {
9384
; CHECK-LABEL: scmp_32_32:
9485
; CHECK: @ %bb.0:
95-
; CHECK-NEXT: cmp r0, r1
96-
; CHECK-NEXT: mov r0, #0
97-
; CHECK-NEXT: mov r2, #0
98-
; CHECK-NEXT: movwlt r0, #1
99-
; CHECK-NEXT: movwgt r2, #1
100-
; CHECK-NEXT: sub r0, r2, r0
86+
; CHECK-NEXT: subs r0, r0, r1
87+
; CHECK-NEXT: movwgt r0, #1
88+
; CHECK-NEXT: mvnlt r0, #0
10189
; CHECK-NEXT: bx lr
10290
%1 = call i32 @llvm.scmp(i32 %x, i32 %y)
10391
ret i32 %1
10492
}
10593

94+
define i32 @scmp_neg(i32 %x, i32 %y) nounwind {
95+
; CHECK-LABEL: scmp_neg:
96+
; CHECK: @ %bb.0:
97+
; CHECK-NEXT: adds r0, r0, r1
98+
; CHECK-NEXT: movwgt r0, #1
99+
; CHECK-NEXT: mvnlt r0, #0
100+
; CHECK-NEXT: bx lr
101+
%yy = sub nsw i32 0, %y
102+
%1 = call i32 @llvm.scmp(i32 %x, i32 %yy)
103+
ret i32 %1
104+
}
105+
106106
define i32 @scmp_32_64(i64 %x, i64 %y) nounwind {
107107
; CHECK-LABEL: scmp_32_64:
108108
; CHECK: @ %bb.0:

llvm/test/CodeGen/ARM/ucmp.ll

Lines changed: 12 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,9 @@
44
define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind {
55
; CHECK-LABEL: ucmp_8_8:
66
; CHECK: @ %bb.0:
7-
; CHECK-NEXT: cmp r0, r1
8-
; CHECK-NEXT: mov r0, #0
9-
; CHECK-NEXT: mov r2, #0
10-
; CHECK-NEXT: movwlo r0, #1
11-
; CHECK-NEXT: movwhi r2, #1
12-
; CHECK-NEXT: sub r0, r2, r0
7+
; CHECK-NEXT: subs r0, r0, r1
8+
; CHECK-NEXT: movwhi r0, #1
9+
; CHECK-NEXT: mvnlo r0, #0
1310
; CHECK-NEXT: bx lr
1411
%1 = call i8 @llvm.ucmp(i8 %x, i8 %y)
1512
ret i8 %1
@@ -18,12 +15,9 @@ define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind {
1815
define i8 @ucmp_8_16(i16 zeroext %x, i16 zeroext %y) nounwind {
1916
; CHECK-LABEL: ucmp_8_16:
2017
; CHECK: @ %bb.0:
21-
; CHECK-NEXT: cmp r0, r1
22-
; CHECK-NEXT: mov r0, #0
23-
; CHECK-NEXT: mov r2, #0
24-
; CHECK-NEXT: movwlo r0, #1
25-
; CHECK-NEXT: movwhi r2, #1
26-
; CHECK-NEXT: sub r0, r2, r0
18+
; CHECK-NEXT: subs r0, r0, r1
19+
; CHECK-NEXT: movwhi r0, #1
20+
; CHECK-NEXT: mvnlo r0, #0
2721
; CHECK-NEXT: bx lr
2822
%1 = call i8 @llvm.ucmp(i16 %x, i16 %y)
2923
ret i8 %1
@@ -32,12 +26,9 @@ define i8 @ucmp_8_16(i16 zeroext %x, i16 zeroext %y) nounwind {
3226
define i8 @ucmp_8_32(i32 %x, i32 %y) nounwind {
3327
; CHECK-LABEL: ucmp_8_32:
3428
; CHECK: @ %bb.0:
35-
; CHECK-NEXT: cmp r0, r1
36-
; CHECK-NEXT: mov r0, #0
37-
; CHECK-NEXT: mov r2, #0
38-
; CHECK-NEXT: movwlo r0, #1
39-
; CHECK-NEXT: movwhi r2, #1
40-
; CHECK-NEXT: sub r0, r2, r0
29+
; CHECK-NEXT: subs r0, r0, r1
30+
; CHECK-NEXT: movwhi r0, #1
31+
; CHECK-NEXT: mvnlo r0, #0
4132
; CHECK-NEXT: bx lr
4233
%1 = call i8 @llvm.ucmp(i32 %x, i32 %y)
4334
ret i8 %1
@@ -92,12 +83,9 @@ define i8 @ucmp_8_128(i128 %x, i128 %y) nounwind {
9283
define i32 @ucmp_32_32(i32 %x, i32 %y) nounwind {
9384
; CHECK-LABEL: ucmp_32_32:
9485
; CHECK: @ %bb.0:
95-
; CHECK-NEXT: cmp r0, r1
96-
; CHECK-NEXT: mov r0, #0
97-
; CHECK-NEXT: mov r2, #0
98-
; CHECK-NEXT: movwlo r0, #1
99-
; CHECK-NEXT: movwhi r2, #1
100-
; CHECK-NEXT: sub r0, r2, r0
86+
; CHECK-NEXT: subs r0, r0, r1
87+
; CHECK-NEXT: movwhi r0, #1
88+
; CHECK-NEXT: mvnlo r0, #0
10189
; CHECK-NEXT: bx lr
10290
%1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
10391
ret i32 %1

0 commit comments

Comments
 (0)