Skip to content

Commit 5f9627e

Browse files
committed
[ARM] Have custom lowering for ucmp and scmp
Limited to non-thumb at the moment, but we can do this for i32 in 3 steps, using subs to set the flags initially.
1 parent 1e815ce commit 5f9627e

File tree

6 files changed

+894
-268
lines changed

6 files changed

+894
-268
lines changed

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -807,6 +807,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
807807
setOperationAction(ISD::BSWAP, VT, Expand);
808808
}
809809

810+
if (!Subtarget->isThumb1Only() && !Subtarget->hasV8_1MMainlineOps())
811+
setOperationAction(ISD::SCMP, MVT::i32, Custom);
812+
813+
if (!Subtarget->hasV8_1MMainlineOps())
814+
setOperationAction(ISD::UCMP, MVT::i32, Custom);
815+
810816
setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
811817
setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
812818

@@ -1639,6 +1645,10 @@ bool ARMTargetLowering::useSoftFloat() const {
16391645
return Subtarget->useSoftFloat();
16401646
}
16411647

1648+
/// Returns true when the subtarget is not Thumb1-only and \p VT is at most
/// 32 bits wide; returns false otherwise.
/// NOTE(review): presumably this steers the generic SCMP/UCMP expansion
/// towards a select-based sequence -- confirm against the TargetLowering hook
/// documentation.
bool ARMTargetLowering::shouldExpandCmpUsingSelects(EVT VT) const {
1649+
return !Subtarget->isThumb1Only() && VT.getSizeInBits() <= 32;
1650+
}
1651+
16421652
// FIXME: It might make sense to define the representative register class as the
16431653
// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
16441654
// a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently,
@@ -10617,6 +10627,134 @@ SDValue ARMTargetLowering::LowerFP_TO_BF16(SDValue Op,
1061710627
return DAG.getBitcast(MVT::i32, Res);
1061810628
}
1061910629

10630+
/// Custom lowering for ISD::SCMP and ISD::UCMP (reached from LowerOperation).
///
/// Operand 0 is the left-hand side and operand 1 the right-hand side of the
/// three-way comparison. The value is computed as an i32 and then
/// sign-extended or truncated to the node's result type when that differs.
///
/// Two strategies are used:
///  - Thumb1-only UCMP: a flag-setting subtract, a reversed compare, two
///    subtract-with-carry nodes and a final subtract.
///  - Everything else: one flag-setting SUBC (or ADDC when RHS is `0 - X` and
///    the rewrite is known to be safe) followed by two ARMISD::CMOV nodes that
///    overwrite the difference with 1 or -1.
SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
10631+
SDLoc dl(Op);
10632+
SDValue LHS = Op.getOperand(0);
10633+
SDValue RHS = Op.getOperand(1);
10634+
10635+
// Determine if this is signed or unsigned comparison
10636+
bool IsSigned = (Op.getOpcode() == ISD::SCMP);
10637+
10638+
// Special case for Thumb1 UCMP only
10639+
if (!IsSigned && Subtarget->isThumb1Only()) {
10640+
// For Thumb unsigned comparison, use this sequence:
10641+
// subs r2, r0, r1 ; r2 = LHS - RHS, sets flags
10642+
// sbc r2, r2 ; r2 = r2 - r2 - !carry
10643+
// cmp r1, r0 ; compare RHS with LHS
10644+
// sbc r1, r1 ; r1 = r1 - r1 - !carry
10645+
// subs r0, r2, r1 ; r0 = r2 - r1 (final result)
10646+
10647+
// First subtraction: LHS - RHS
10648+
SDValue Sub1WithFlags = DAG.getNode(
10649+
ARMISD::SUBC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10650+
SDValue Sub1Result = Sub1WithFlags.getValue(0);
10651+
SDValue Flags1 = Sub1WithFlags.getValue(1);
10652+
10653+
// SUBE: Sub1Result - Sub1Result - !carry
10654+
// This gives 0 if LHS >= RHS (unsigned), -1 if LHS < RHS (unsigned)
10655+
SDValue Sbc1 =
10656+
DAG.getNode(ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT),
10657+
Sub1Result, Sub1Result, Flags1);
10658+
SDValue Sbc1Result = Sbc1.getValue(0);
10659+
10660+
// Second comparison: RHS vs LHS (reverse comparison)
10661+
SDValue CmpFlags = DAG.getNode(ARMISD::CMP, dl, FlagsVT, RHS, LHS);
10662+
10663+
// SUBE: RHS - RHS - !carry
10664+
// This gives 0 if RHS >= LHS (unsigned), -1 if RHS < LHS (unsigned)
10665+
SDValue Sbc2 = DAG.getNode(
10666+
ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), RHS, RHS, CmpFlags);
10667+
SDValue Sbc2Result = Sbc2.getValue(0);
10668+
10669+
// Final subtraction: Sbc1Result - Sbc2Result (no flags needed)
// LHS < RHS: -1 - 0 = -1; LHS > RHS: 0 - (-1) = 1; LHS == RHS: 0 - 0 = 0.
10670+
SDValue Result =
10671+
DAG.getNode(ISD::SUB, dl, MVT::i32, Sbc1Result, Sbc2Result);
10672+
if (Op.getValueType() != MVT::i32)
10673+
Result = DAG.getSExtOrTrunc(Result, dl, Op.getValueType());
10674+
10675+
return Result;
10676+
}
10677+
10678+
// For the ARM assembly pattern:
10679+
// subs r0, r0, r1 ; subtract RHS from LHS and set flags
10680+
// movgt r0, #1 ; if LHS > RHS, set result to 1 (GT for signed, HI for unsigned)
10681+
// mvnlt r0, #0 ; if LHS < RHS, set result to -1 (LT for signed, LO for unsigned)
10682+
// ; if LHS == RHS, result remains 0 from the subs
10683+
// The two conditional writes are the ARMISD::CMOV nodes built further down.
10684+
10685+
// Optimization: if RHS is a subtraction against 0, use ADDC instead of SUBC
10686+
SDValue AddOperand;
10687+
unsigned Opcode = ARMISD::SUBC;
10688+
10689+
// Check if RHS is a subtraction against 0: (0 - X)
10690+
if (RHS.getOpcode() == ISD::SUB) {
10691+
SDValue SubLHS = RHS.getOperand(0);
10692+
SDValue SubRHS = RHS.getOperand(1);
10693+
10694+
// Check if it's 0 - X
10695+
if (isNullConstant(SubLHS)) {
10696+
bool CanUseAdd = false;
10697+
if (IsSigned) {
10698+
// For SCMP: only if X is known to never be INT_MIN (to avoid overflow)
// Accepted either via the nsw flag on the negation or via known bits.
10699+
if (RHS->getFlags().hasNoSignedWrap() || !DAG.computeKnownBits(SubRHS)
10700+
.getSignedMinValue()
10701+
.isMinSignedValue()) {
10702+
CanUseAdd = true;
10703+
}
10704+
} else {
10705+
// For UCMP: only if X is known to never be zero
10706+
if (DAG.isKnownNeverZero(SubRHS)) {
10707+
CanUseAdd = true;
10708+
}
10709+
}
10710+
10711+
if (CanUseAdd) {
10712+
Opcode = ARMISD::ADDC;
10713+
AddOperand = SubRHS; // Replace RHS with X, so we do LHS + X instead of
10714+
// LHS - (0 - X)
10715+
}
10716+
}
10717+
}
10718+
10719+
// Generate the operation with flags
10720+
SDValue OpWithFlags;
10721+
if (Opcode == ARMISD::ADDC) {
10722+
// Use ADDC: LHS + AddOperand (where RHS was 0 - AddOperand)
10723+
OpWithFlags = DAG.getNode(
10724+
ARMISD::ADDC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, AddOperand);
10725+
} else {
10726+
// Use ARMISD::SUBC to generate SUBS instruction (subtract with flags)
10727+
OpWithFlags = DAG.getNode(ARMISD::SUBC, dl,
10728+
DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10729+
}
10730+
10731+
SDValue OpResult = OpWithFlags.getValue(0); // The operation result
10732+
SDValue Flags = OpWithFlags.getValue(1); // The flags
10733+
10734+
// Constants for conditional moves
10735+
SDValue One = DAG.getConstant(1, dl, MVT::i32);
10736+
SDValue MinusOne = DAG.getAllOnesConstant(dl, MVT::i32);
10737+
10738+
// Select condition codes based on signed vs unsigned
10739+
ARMCC::CondCodes GTCond = IsSigned ? ARMCC::GT : ARMCC::HI;
10740+
ARMCC::CondCodes LTCond = IsSigned ? ARMCC::LT : ARMCC::LO;
10741+
10742+
// First conditional move: if greater than, set to 1
// (otherwise the arithmetic result OpResult is kept)
10743+
SDValue GTCondValue = DAG.getConstant(GTCond, dl, MVT::i32);
10744+
SDValue Result1 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, OpResult, One,
10745+
GTCondValue, Flags);
10746+
10747+
// Second conditional move: if less than, set to -1
// Both CMOV nodes consume the same Flags value from the single ADDC/SUBC.
10748+
SDValue LTCondValue = DAG.getConstant(LTCond, dl, MVT::i32);
10749+
SDValue Result2 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne,
10750+
LTCondValue, Flags);
10751+
10752+
if (Op.getValueType() != MVT::i32)
10753+
Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType());
10754+
10755+
return Result2;
10756+
}
10757+
1062010758
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
1062110759
LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
1062210760
switch (Op.getOpcode()) {
@@ -10745,6 +10883,9 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
1074510883
case ISD::FP_TO_BF16:
1074610884
return LowerFP_TO_BF16(Op, DAG);
1074710885
case ARMISD::WIN__DBZCHK: return SDValue();
10886+
case ISD::UCMP:
10887+
case ISD::SCMP:
10888+
return LowerCMP(Op, DAG);
1074810889
}
1074910890
}
1075010891

llvm/lib/Target/ARM/ARMISelLowering.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -607,6 +607,8 @@ class VectorType;
607607

608608
bool preferZeroCompareBranch() const override { return true; }
609609

610+
/// Overrides the base-class hook; the definition in ARMISelLowering.cpp
/// returns true for non-Thumb1-only subtargets when \p VT is at most 32 bits.
bool shouldExpandCmpUsingSelects(EVT VT) const override;
611+
610612
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
611613

612614
bool hasAndNotCompare(SDValue V) const override {
@@ -904,6 +906,7 @@ class VectorType;
904906
void LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
905907
SelectionDAG &DAG) const;
906908
SDValue LowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const;
909+
/// Custom lowering for ISD::SCMP and ISD::UCMP nodes; called from
/// LowerOperation.
SDValue LowerCMP(SDValue Op, SelectionDAG &DAG) const;
907910

908911
Register getRegisterByName(const char* RegName, LLT VT,
909912
const MachineFunction &MF) const override;

llvm/test/CodeGen/ARM/scmp.ll

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,9 @@
44
define i8 @scmp_8_8(i8 signext %x, i8 signext %y) nounwind {
55
; CHECK-LABEL: scmp_8_8:
66
; CHECK: @ %bb.0:
7-
; CHECK-NEXT: cmp r0, r1
8-
; CHECK-NEXT: mov r0, #0
9-
; CHECK-NEXT: mov r2, #0
10-
; CHECK-NEXT: movwlt r0, #1
11-
; CHECK-NEXT: movwgt r2, #1
12-
; CHECK-NEXT: sub r0, r2, r0
7+
; CHECK-NEXT: subs r0, r0, r1
8+
; CHECK-NEXT: movwgt r0, #1
9+
; CHECK-NEXT: mvnlt r0, #0
1310
; CHECK-NEXT: bx lr
1411
%1 = call i8 @llvm.scmp(i8 %x, i8 %y)
1512
ret i8 %1
@@ -18,12 +15,9 @@ define i8 @scmp_8_8(i8 signext %x, i8 signext %y) nounwind {
1815
define i8 @scmp_8_16(i16 signext %x, i16 signext %y) nounwind {
1916
; CHECK-LABEL: scmp_8_16:
2017
; CHECK: @ %bb.0:
21-
; CHECK-NEXT: cmp r0, r1
22-
; CHECK-NEXT: mov r0, #0
23-
; CHECK-NEXT: mov r2, #0
24-
; CHECK-NEXT: movwlt r0, #1
25-
; CHECK-NEXT: movwgt r2, #1
26-
; CHECK-NEXT: sub r0, r2, r0
18+
; CHECK-NEXT: subs r0, r0, r1
19+
; CHECK-NEXT: movwgt r0, #1
20+
; CHECK-NEXT: mvnlt r0, #0
2721
; CHECK-NEXT: bx lr
2822
%1 = call i8 @llvm.scmp(i16 %x, i16 %y)
2923
ret i8 %1
@@ -32,12 +26,9 @@ define i8 @scmp_8_16(i16 signext %x, i16 signext %y) nounwind {
3226
define i8 @scmp_8_32(i32 %x, i32 %y) nounwind {
3327
; CHECK-LABEL: scmp_8_32:
3428
; CHECK: @ %bb.0:
35-
; CHECK-NEXT: cmp r0, r1
36-
; CHECK-NEXT: mov r0, #0
37-
; CHECK-NEXT: mov r2, #0
38-
; CHECK-NEXT: movwlt r0, #1
39-
; CHECK-NEXT: movwgt r2, #1
40-
; CHECK-NEXT: sub r0, r2, r0
29+
; CHECK-NEXT: subs r0, r0, r1
30+
; CHECK-NEXT: movwgt r0, #1
31+
; CHECK-NEXT: mvnlt r0, #0
4132
; CHECK-NEXT: bx lr
4233
%1 = call i8 @llvm.scmp(i32 %x, i32 %y)
4334
ret i8 %1
@@ -92,17 +83,26 @@ define i8 @scmp_8_128(i128 %x, i128 %y) nounwind {
9283
define i32 @scmp_32_32(i32 %x, i32 %y) nounwind {
9384
; CHECK-LABEL: scmp_32_32:
9485
; CHECK: @ %bb.0:
95-
; CHECK-NEXT: cmp r0, r1
96-
; CHECK-NEXT: mov r0, #0
97-
; CHECK-NEXT: mov r2, #0
98-
; CHECK-NEXT: movwlt r0, #1
99-
; CHECK-NEXT: movwgt r2, #1
100-
; CHECK-NEXT: sub r0, r2, r0
86+
; CHECK-NEXT: subs r0, r0, r1
87+
; CHECK-NEXT: movwgt r0, #1
88+
; CHECK-NEXT: mvnlt r0, #0
10189
; CHECK-NEXT: bx lr
10290
%1 = call i32 @llvm.scmp(i32 %x, i32 %y)
10391
ret i32 %1
10492
}
10593

94+
; scmp where the right-hand operand is a negation carrying nsw
; (sub nsw i32 0, %y): the expected output sets flags with adds r0, r0, r1
; instead of subs, followed by the same movwgt / mvnlt pair.
define i32 @scmp_neg(i32 %x, i32 %y) nounwind {
95+
; CHECK-LABEL: scmp_neg:
96+
; CHECK: @ %bb.0:
97+
; CHECK-NEXT: adds r0, r0, r1
98+
; CHECK-NEXT: movwgt r0, #1
99+
; CHECK-NEXT: mvnlt r0, #0
100+
; CHECK-NEXT: bx lr
101+
%yy = sub nsw i32 0, %y
102+
%1 = call i32 @llvm.scmp(i32 %x, i32 %yy)
103+
ret i32 %1
104+
}
105+
106106
define i32 @scmp_32_64(i64 %x, i64 %y) nounwind {
107107
; CHECK-LABEL: scmp_32_64:
108108
; CHECK: @ %bb.0:

llvm/test/CodeGen/ARM/ucmp.ll

Lines changed: 12 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,9 @@
44
define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind {
55
; CHECK-LABEL: ucmp_8_8:
66
; CHECK: @ %bb.0:
7-
; CHECK-NEXT: cmp r0, r1
8-
; CHECK-NEXT: mov r0, #0
9-
; CHECK-NEXT: mov r2, #0
10-
; CHECK-NEXT: movwlo r0, #1
11-
; CHECK-NEXT: movwhi r2, #1
12-
; CHECK-NEXT: sub r0, r2, r0
7+
; CHECK-NEXT: subs r0, r0, r1
8+
; CHECK-NEXT: movwhi r0, #1
9+
; CHECK-NEXT: mvnlo r0, #0
1310
; CHECK-NEXT: bx lr
1411
%1 = call i8 @llvm.ucmp(i8 %x, i8 %y)
1512
ret i8 %1
@@ -18,12 +15,9 @@ define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind {
1815
define i8 @ucmp_8_16(i16 zeroext %x, i16 zeroext %y) nounwind {
1916
; CHECK-LABEL: ucmp_8_16:
2017
; CHECK: @ %bb.0:
21-
; CHECK-NEXT: cmp r0, r1
22-
; CHECK-NEXT: mov r0, #0
23-
; CHECK-NEXT: mov r2, #0
24-
; CHECK-NEXT: movwlo r0, #1
25-
; CHECK-NEXT: movwhi r2, #1
26-
; CHECK-NEXT: sub r0, r2, r0
18+
; CHECK-NEXT: subs r0, r0, r1
19+
; CHECK-NEXT: movwhi r0, #1
20+
; CHECK-NEXT: mvnlo r0, #0
2721
; CHECK-NEXT: bx lr
2822
%1 = call i8 @llvm.ucmp(i16 %x, i16 %y)
2923
ret i8 %1
@@ -32,12 +26,9 @@ define i8 @ucmp_8_16(i16 zeroext %x, i16 zeroext %y) nounwind {
3226
define i8 @ucmp_8_32(i32 %x, i32 %y) nounwind {
3327
; CHECK-LABEL: ucmp_8_32:
3428
; CHECK: @ %bb.0:
35-
; CHECK-NEXT: cmp r0, r1
36-
; CHECK-NEXT: mov r0, #0
37-
; CHECK-NEXT: mov r2, #0
38-
; CHECK-NEXT: movwlo r0, #1
39-
; CHECK-NEXT: movwhi r2, #1
40-
; CHECK-NEXT: sub r0, r2, r0
29+
; CHECK-NEXT: subs r0, r0, r1
30+
; CHECK-NEXT: movwhi r0, #1
31+
; CHECK-NEXT: mvnlo r0, #0
4132
; CHECK-NEXT: bx lr
4233
%1 = call i8 @llvm.ucmp(i32 %x, i32 %y)
4334
ret i8 %1
@@ -92,12 +83,9 @@ define i8 @ucmp_8_128(i128 %x, i128 %y) nounwind {
9283
define i32 @ucmp_32_32(i32 %x, i32 %y) nounwind {
9384
; CHECK-LABEL: ucmp_32_32:
9485
; CHECK: @ %bb.0:
95-
; CHECK-NEXT: cmp r0, r1
96-
; CHECK-NEXT: mov r0, #0
97-
; CHECK-NEXT: mov r2, #0
98-
; CHECK-NEXT: movwlo r0, #1
99-
; CHECK-NEXT: movwhi r2, #1
100-
; CHECK-NEXT: sub r0, r2, r0
86+
; CHECK-NEXT: subs r0, r0, r1
87+
; CHECK-NEXT: movwhi r0, #1
88+
; CHECK-NEXT: mvnlo r0, #0
10189
; CHECK-NEXT: bx lr
10290
%1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
10391
ret i32 %1

0 commit comments

Comments
 (0)