From 6c90abc80c39a8f58260e312a6408c3ea67e6b8b Mon Sep 17 00:00:00 2001
From: AZero13
Date: Tue, 5 Aug 2025 16:33:40 -0400
Subject: [PATCH 1/2] [ARM] Move BFI creation to ISelDAGToDAG

This prevents the creation of extraneous BFI nodes that do more harm
than good, as can be seen in select-imm.ll. (A minimal IR sketch of the
matched pattern is appended at the end of this series.)

---
 llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp | 165 ++++++++++++++++++++++++
 llvm/lib/Target/ARM/ARMISelLowering.cpp | 133 -------------------
 llvm/test/CodeGen/ARM/select-imm.ll     |  54 ++------
 3 files changed, 179 insertions(+), 173 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 9ad46df159c20..68285f83c58c0 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -303,6 +303,7 @@ class ARMDAGToDAGISel : public SelectionDAGISel {
 
   /// Try to select SBFX/UBFX instructions for ARM.
   bool tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned);
+  bool tryV6T2BitfieldInsertOp(SDNode *N);
 
   bool tryInsertVectorElt(SDNode *N);
 
@@ -3323,6 +3324,165 @@ bool ARMDAGToDAGISel::tryFMULFixed(SDNode *N, SDLoc dl) {
       N, N, LHS.getOpcode() == ISD::UINT_TO_FP, true);
 }
 
+bool ARMDAGToDAGISel::tryV6T2BitfieldInsertOp(SDNode *N) {
+  if (!Subtarget->hasV6T2Ops())
+    return false;
+
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i32)
+    return false;
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDLoc DL(N);
+
+  unsigned MaskImm;
+  // Must be a single-use AND with an immediate operand.
+  if (!N0.hasOneUse() ||
+      !isOpcWithIntImmediate(N0.getNode(), ISD::AND, MaskImm))
+    return false;
+
+  // If the mask is 0xffff, we can do better
+  // via a movt instruction, so don't use BFI in that case.
+  if (MaskImm == 0xffff)
+    return false;
+
+  SDValue N00 = N0.getOperand(0);
+  unsigned Mask = MaskImm;
+
+  // Case (1): or (and A, mask), val => ARMbfi A, val, mask
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  if (N1C) {
+    unsigned Val = N1C->getZExtValue();
+    if ((Val & ~Mask) != Val)
+      return false;
+
+    if (ARM::isBitFieldInvertedMask(Mask)) {
+      Val >>= llvm::countr_zero(~Mask);
+
+      // Materialize the constant to be inserted.
+      unsigned MovOpc = Subtarget->isThumb() ? ARM::t2MOVi32imm : ARM::MOVi32imm;
+      SDNode *ValNode = CurDAG->getMachineNode(
+          MovOpc, DL, VT, CurDAG->getTargetConstant(Val, DL, MVT::i32));
+
+      SDValue Ops[] = {N00, SDValue(ValNode, 0),
+                       CurDAG->getTargetConstant(Mask, DL, MVT::i32),
+                       getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32)};
+      unsigned Opc = Subtarget->isThumb() ? ARM::t2BFI : ARM::BFI;
+      CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+      return true;
+    }
+  } else if (N1.getOpcode() == ISD::AND) {
+    // Case (2): or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
+    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
+    if (!N11C)
+      return false;
+    unsigned Mask2 = N11C->getZExtValue();
+
+    // Mask and ~Mask2 (or vice versa) must be equivalent for the BFI
+    // pattern to match.
+    if (ARM::isBitFieldInvertedMask(Mask) && (Mask == ~Mask2)) {
+      // The pack halfword instruction works better for masks that fit it,
+      // so use that when it's available.
+      if (Subtarget->hasDSP() && (Mask == 0xffff || Mask == 0xffff0000))
+        return false;
+      // 2a
+      unsigned amt = llvm::countr_zero(Mask2);
+
+      // Create the machine shift instruction.
+      SDNode *Shift;
+      if (Subtarget->isThumb()) {
+        // For Thumb2, use t2LSRri: Rm, imm, pred, pred_reg, cc_out.
+        SDValue ShiftOps[] = {
+            N1.getOperand(0), CurDAG->getTargetConstant(amt, DL, MVT::i32),
+            getAL(CurDAG, DL),                // predicate
+            CurDAG->getRegister(0, MVT::i32), // pred_reg
+            CurDAG->getRegister(0, MVT::i32)  // cc_out
+        };
+        Shift = CurDAG->getMachineNode(ARM::t2LSRri, DL, VT, ShiftOps);
+      } else {
+        // For ARM mode, use MOVsi: Rm, so_reg_imm, pred, pred_reg, cc_out.
+        SDValue ShiftOps[] = {
+            N1.getOperand(0),
+            CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ARM_AM::lsr, amt), DL,
+                                      MVT::i32),
+            getAL(CurDAG, DL),                // predicate
+            CurDAG->getRegister(0, MVT::i32), // pred_reg
+            CurDAG->getRegister(0, MVT::i32)  // cc_out
+        };
+        Shift = CurDAG->getMachineNode(ARM::MOVsi, DL, VT, ShiftOps);
+      }
+
+      SDValue Ops[] = {N00, SDValue(Shift, 0),
+                       CurDAG->getTargetConstant(Mask, DL, MVT::i32),
+                       getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32)};
+      unsigned Opc = Subtarget->isThumb() ? ARM::t2BFI : ARM::BFI;
+      CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+      return true;
+    } else if (ARM::isBitFieldInvertedMask(~Mask) && (~Mask == Mask2)) {
+      // The pack halfword instruction works better for masks that fit it,
+      // so use that when it's available.
+      if (Subtarget->hasDSP() && (Mask2 == 0xffff || Mask2 == 0xffff0000))
+        return false;
+      // 2b
+      unsigned lsb = llvm::countr_zero(Mask);
+
+      // Create the machine shift instruction.
+      SDNode *Shift;
+      if (Subtarget->isThumb()) {
+        // For Thumb2, use t2LSRri: Rm, imm, pred, pred_reg, cc_out.
+        SDValue ShiftOps[] = {
+            N00, CurDAG->getTargetConstant(lsb, DL, MVT::i32),
+            getAL(CurDAG, DL),                // predicate
+            CurDAG->getRegister(0, MVT::i32), // pred_reg
+            CurDAG->getRegister(0, MVT::i32)  // cc_out
+        };
+        Shift = CurDAG->getMachineNode(ARM::t2LSRri, DL, VT, ShiftOps);
+      } else {
+        // For ARM mode, use MOVsi: Rm, so_reg_imm, pred, pred_reg, cc_out.
+        SDValue ShiftOps[] = {
+            N00,
+            CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ARM_AM::lsr, lsb), DL,
+                                      MVT::i32),
+            getAL(CurDAG, DL),                // predicate
+            CurDAG->getRegister(0, MVT::i32), // pred_reg
+            CurDAG->getRegister(0, MVT::i32)  // cc_out
+        };
+        Shift = CurDAG->getMachineNode(ARM::MOVsi, DL, VT, ShiftOps);
+      }
+
+      SDValue Ops[] = {N1.getOperand(0), SDValue(Shift, 0),
+                       CurDAG->getTargetConstant(Mask2, DL, MVT::i32),
+                       getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32)};
+      unsigned Opc = Subtarget->isThumb() ? ARM::t2BFI : ARM::BFI;
+      CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+      return true;
+    }
+  }
+
+  // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
+  // where lsb(mask) == #shamt and masked bits of B are known zero.
+  if (CurDAG->MaskedValueIsZero(N1, APInt(32, Mask)) &&
+      N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
+      ARM::isBitFieldInvertedMask(~Mask)) {
+    SDValue ShAmt = N00.getOperand(1);
+    unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
+    unsigned LSB = llvm::countr_zero(Mask);
+    if (ShAmtC != LSB)
+      return false;
+
+    SDValue Ops[] = {N1, N00.getOperand(0),
+                     CurDAG->getTargetConstant(~Mask, DL, MVT::i32),
+                     getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32)};
+
+    unsigned Opc = Subtarget->isThumb() ? ARM::t2BFI : ARM::BFI;
+    CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+    return true;
+  }
+
+  return false;
+}
+
 bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) {
   if (!Subtarget->hasV6T2Ops())
     return false;
@@ -3833,6 +3993,11 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
       }
     }
     break;
+  case ISD::OR: {
+    if (tryV6T2BitfieldInsertOp(N))
+      return;
+    break;
+  }
   case ISD::AND: {
     // Check for unsigned bitfield extract
     if (tryV6T2BitfieldExtractOp(N, false))
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 7f8b4460bb814..5058a35930be3 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -14579,132 +14579,6 @@ static SDValue PerformORCombineToSMULWBT(SDNode *OR,
   return SDValue(OR, 0);
 }
 
-static SDValue PerformORCombineToBFI(SDNode *N,
-                                     TargetLowering::DAGCombinerInfo &DCI,
-                                     const ARMSubtarget *Subtarget) {
-  // BFI is only available on V6T2+
-  if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
-    return SDValue();
-
-  EVT VT = N->getValueType(0);
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-  SelectionDAG &DAG = DCI.DAG;
-  SDLoc DL(N);
-  // 1) or (and A, mask), val => ARMbfi A, val, mask
-  //      iff (val & mask) == val
-  //
-  // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
-  //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
-  //          && mask == ~mask2
-  //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
-  //          && ~mask == mask2
-  //  (i.e., copy a bitfield value into another bitfield of the same width)
-
-  if (VT != MVT::i32)
-    return SDValue();
-
-  SDValue N00 = N0.getOperand(0);
-
-  // The value and the mask need to be constants so we can verify this is
-  // actually a bitfield set. If the mask is 0xffff, we can do better
-  // via a movt instruction, so don't use BFI in that case.
-  SDValue MaskOp = N0.getOperand(1);
-  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
-  if (!MaskC)
-    return SDValue();
-  unsigned Mask = MaskC->getZExtValue();
-  if (Mask == 0xffff)
-    return SDValue();
-  SDValue Res;
-  // Case (1): or (and A, mask), val => ARMbfi A, val, mask
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
-  if (N1C) {
-    unsigned Val = N1C->getZExtValue();
-    if ((Val & ~Mask) != Val)
-      return SDValue();
-
-    if (ARM::isBitFieldInvertedMask(Mask)) {
-      Val >>= llvm::countr_zero(~Mask);
-
-      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
-                        DAG.getConstant(Val, DL, MVT::i32),
-                        DAG.getConstant(Mask, DL, MVT::i32));
-
-      DCI.CombineTo(N, Res, false);
-      // Return value from the original node to inform the combiner than N is
-      // now dead.
-      return SDValue(N, 0);
-    }
-  } else if (N1.getOpcode() == ISD::AND) {
-    // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
-    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
-    if (!N11C)
-      return SDValue();
-    unsigned Mask2 = N11C->getZExtValue();
-
-    // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
-    // as is to match.
-    if (ARM::isBitFieldInvertedMask(Mask) &&
-        (Mask == ~Mask2)) {
-      // The pack halfword instruction works better for masks that fit it,
-      // so use that when it's available.
-      if (Subtarget->hasDSP() &&
-          (Mask == 0xffff || Mask == 0xffff0000))
-        return SDValue();
-      // 2a
-      unsigned amt = llvm::countr_zero(Mask2);
-      Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
-                        DAG.getConstant(amt, DL, MVT::i32));
-      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
-                        DAG.getConstant(Mask, DL, MVT::i32));
-      DCI.CombineTo(N, Res, false);
-      // Return value from the original node to inform the combiner than N is
-      // now dead.
-      return SDValue(N, 0);
-    } else if (ARM::isBitFieldInvertedMask(~Mask) &&
-               (~Mask == Mask2)) {
-      // The pack halfword instruction works better for masks that fit it,
-      // so use that when it's available.
-      if (Subtarget->hasDSP() &&
-          (Mask2 == 0xffff || Mask2 == 0xffff0000))
-        return SDValue();
-      // 2b
-      unsigned lsb = llvm::countr_zero(Mask);
-      Res = DAG.getNode(ISD::SRL, DL, VT, N00,
-                        DAG.getConstant(lsb, DL, MVT::i32));
-      Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
-                        DAG.getConstant(Mask2, DL, MVT::i32));
-      DCI.CombineTo(N, Res, false);
-      // Return value from the original node to inform the combiner than N is
-      // now dead.
-      return SDValue(N, 0);
-    }
-  }
-
-  if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
-      N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
-      ARM::isBitFieldInvertedMask(~Mask)) {
-    // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
-    // where lsb(mask) == #shamt and masked bits of B are known zero.
-    SDValue ShAmt = N00.getOperand(1);
-    unsigned ShAmtC = ShAmt->getAsZExtVal();
-    unsigned LSB = llvm::countr_zero(Mask);
-    if (ShAmtC != LSB)
-      return SDValue();
-
-    Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
-                      DAG.getConstant(~Mask, DL, MVT::i32));
-
-    DCI.CombineTo(N, Res, false);
-    // Return value from the original node to inform the combiner than N is
-    // now dead.
-    return SDValue(N, 0);
-  }
-
-  return SDValue();
-}
-
 static bool isValidMVECond(unsigned CC, bool IsFloat) {
   switch (CC) {
   case ARMCC::EQ:
@@ -14849,13 +14723,6 @@ static SDValue PerformORCombine(SDNode *N,
     }
   }
 
-  // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
-  // reasonable.
-  if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
-    if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
-      return Res;
-  }
-
   if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
     return Result;
 
diff --git a/llvm/test/CodeGen/ARM/select-imm.ll b/llvm/test/CodeGen/ARM/select-imm.ll
index 186276b50ceeb..b2e400f4ce923 100644
--- a/llvm/test/CodeGen/ARM/select-imm.ll
+++ b/llvm/test/CodeGen/ARM/select-imm.ll
@@ -696,27 +696,14 @@ define i1 @t11() {
 ; ARMT2:       @ %bb.0: @ %entry
 ; ARMT2-NEXT:    .pad #4
 ; ARMT2-NEXT:    sub sp, sp, #4
-; ARMT2-NEXT:    ldr r1, [sp]
-; ARMT2-NEXT:    mov r0, #33
-; ARMT2-NEXT:    movw r2, #39322
-; ARMT2-NEXT:    movt r2, #6553
-; ARMT2-NEXT:    bfi r1, r0, #0, #12
-; ARMT2-NEXT:    mov r0, #10
-; ARMT2-NEXT:    bfi r1, r0, #12, #13
-; ARMT2-NEXT:    mov r0, r1
-; ARMT2-NEXT:    bfc r0, #12, #20
-; ARMT2-NEXT:    umull r2, r3, r0, r2
-; ARMT2-NEXT:    add r2, r3, r3, lsl #2
-; ARMT2-NEXT:    sub r0, r0, r2, lsl #1
-; ARMT2-NEXT:    movw r2, #40960
-; ARMT2-NEXT:    movt r2, #65024
-; ARMT2-NEXT:    and r1, r1, r2
-; ARMT2-NEXT:    orr r0, r1, r0
+; ARMT2-NEXT:    ldr r0, [sp]
+; ARMT2-NEXT:    movw r1, #40960
+; ARMT2-NEXT:    movt r1, #65024
+; ARMT2-NEXT:    orr r0, r0, #40960
+; ARMT2-NEXT:    and r0, r0, r1
+; ARMT2-NEXT:    orr r0, r0, #3
 ; ARMT2-NEXT:    str r0, [sp]
-; ARMT2-NEXT:    and r0, r0, #15
-; ARMT2-NEXT:    sub r0, r0, #3
-; ARMT2-NEXT:    clz r0, r0
-; ARMT2-NEXT:    lsr r0, r0, #5
+; ARMT2-NEXT:    mov r0, #1
 ; ARMT2-NEXT:    add sp, sp, #4
 ; ARMT2-NEXT:    bx lr
 ;
@@ -758,27 +745,14 @@ define i1 @t11() {
 ; THUMB2:       @ %bb.0: @ %entry
 ; THUMB2-NEXT:    .pad #4
 ; THUMB2-NEXT:    sub sp, #4
-; THUMB2-NEXT:    ldr r1, [sp]
-; THUMB2-NEXT:    movs r0, #33
-; THUMB2-NEXT:    movw r2, #39322
-; THUMB2-NEXT:    bfi r1, r0, #0, #12
-; THUMB2-NEXT:    movs r0, #10
-; THUMB2-NEXT:    bfi r1, r0, #12, #13
-; THUMB2-NEXT:    mov r0, r1
-; THUMB2-NEXT:    movt r2, #6553
-; THUMB2-NEXT:    bfc r0, #12, #20
-; THUMB2-NEXT:    umull r2, r3, r0, r2
-; THUMB2-NEXT:    add.w r2, r3, r3, lsl #2
-; THUMB2-NEXT:    sub.w r0, r0, r2, lsl #1
-; THUMB2-NEXT:    movw r2, #40960
-; THUMB2-NEXT:    movt r2, #65024
-; THUMB2-NEXT:    ands r1, r2
-; THUMB2-NEXT:    orrs r0, r1
+; THUMB2-NEXT:    ldr r0, [sp]
+; THUMB2-NEXT:    movw r1, #40960
+; THUMB2-NEXT:    movt r1, #65024
+; THUMB2-NEXT:    orr r0, r0, #40960
+; THUMB2-NEXT:    ands r0, r1
+; THUMB2-NEXT:    adds r0, #3
 ; THUMB2-NEXT:    str r0, [sp]
-; THUMB2-NEXT:    and r0, r0, #15
-; THUMB2-NEXT:    subs r0, #3
-; THUMB2-NEXT:    clz r0, r0
-; THUMB2-NEXT:    lsrs r0, r0, #5
+; THUMB2-NEXT:    movs r0, #1
 ; THUMB2-NEXT:    add sp, #4
 ; THUMB2-NEXT:    bx lr
 ;

From 1ddae8173aa8fc2459ed09416f694ac84bef8054 Mon Sep 17 00:00:00 2001
From: AZero13
Date: Tue, 5 Aug 2025 16:36:36 -0400
Subject: [PATCH 2/2] Update bfi.ll test checks

---
 llvm/test/CodeGen/ARM/bfi.ll | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/llvm/test/CodeGen/ARM/bfi.ll b/llvm/test/CodeGen/ARM/bfi.ll
index 5aeb99695a5fe..7c8fc9cf448ca 100644
--- a/llvm/test/CodeGen/ARM/bfi.ll
+++ b/llvm/test/CodeGen/ARM/bfi.ll
@@ -9,7 +9,7 @@ define void @f1([1 x i32] %f.coerce0) nounwind {
 ; CHECK-LABEL: f1:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    movw r0, :lower16:X
-; CHECK-NEXT:    mov r2, #10
+; CHECK-NEXT:    movw r2, #10
 ; CHECK-NEXT:    movt r0, :upper16:X
 ; CHECK-NEXT:    ldr r1, [r0]
 ; CHECK-NEXT:    bfi r1, r2, #22, #4
@@ -68,7 +68,9 @@ define i32 @f4(i32 %a) nounwind {
 define i32 @f5(i32 %a, i32 %b) nounwind {
 ; CHECK-LABEL: f5:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    bfi r0, r1, #20, #4
+; CHECK-NEXT:    and r1, r1, #15
+; CHECK-NEXT:    bic r0, r0, #15728640
+; CHECK-NEXT:    orr r0, r0, r1, lsl #20
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = and i32 %a, -15728641
@@ -82,6 +84,8 @@ entry:
 define i32 @f6(i32 %a, i32 %b) nounwind readnone {
 ; CHECK-LABEL: f6:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    lsl r1, r1, #8
+; CHECK-NEXT:    lsr r1, r1, #8
 ; CHECK-NEXT:    bfi r0, r1, #8, #9
 ; CHECK-NEXT:    bx lr
 entry:
@@ -246,8 +250,9 @@ define void @bfi1_use(i32 %a, i32 %b) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    push {r11, lr}
 ; CHECK-NEXT:    mov r2, r1
+; CHECK-NEXT:    lsr r1, r0, #32
+; CHECK-NEXT:    bfi r2, r1, #0, #1
 ; CHECK-NEXT:    lsr r3, r0, #4
-; CHECK-NEXT:    bfi r2, r0, #0, #1
 ; CHECK-NEXT:    lsr r0, r0, #1
 ; CHECK-NEXT:    mov r1, r2
 ; CHECK-NEXT:    bfi r1, r3, #4, #1
@@ -305,13 +310,17 @@ define void @bfi2_uses(i32 %a, i32 %b) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    push {r11, lr}
 ; CHECK-NEXT:    mov r12, r1
-; CHECK-NEXT:    bfi r1, r0, #0, #2
-; CHECK-NEXT:    bfi r12, r0, #0, #1
-; CHECK-NEXT:    lsr r0, r0, #7
+; CHECK-NEXT:    lsr r1, r0, #32
+; CHECK-NEXT:    bfi r12, r1, #0, #1
+; CHECK-NEXT:    lsr r2, r0, #1
+; CHECK-NEXT:    lsr r3, r0, #7
+; CHECK-NEXT:    lsr r0, r0, #8
+; CHECK-NEXT:    mov r1, r12
+; CHECK-NEXT:    bfi r1, r2, #1, #1
 ; CHECK-NEXT:    mov r2, r1
-; CHECK-NEXT:    mov r3, r1
-; CHECK-NEXT:    bfi r2, r0, #7, #1
-; CHECK-NEXT:    bfi r3, r0, #7, #2
+; CHECK-NEXT:    bfi r2, r3, #7, #1
+; CHECK-NEXT:    mov r3, r2
+; CHECK-NEXT:    bfi r3, r0, #8, #1
 ; CHECK-NEXT:    mov r0, r12
 ; CHECK-NEXT:    bl use
 ; CHECK-NEXT:    pop {r11, pc}
@@ -366,8 +375,9 @@ define void @bfi3_uses(i32 %a, i32 %b) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    push {r11, lr}
 ; CHECK-NEXT:    mov r12, r1
+; CHECK-NEXT:    lsr r1, r0, #32
+; CHECK-NEXT:    bfi r12, r1, #0, #1
 ; CHECK-NEXT:    lsr r2, r0, #7
-; CHECK-NEXT:    bfi r12, r0, #0, #1
 ; CHECK-NEXT:    lsr r3, r0, #1
 ; CHECK-NEXT:    lsr r0, r0, #8
 ; CHECK-NEXT:    mov r1, r12
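
For reference, here is a minimal .ll sketch of the case (1) pattern that
tryV6T2BitfieldInsertOp matches: or (and A, mask), val, where mask is an
inverted bitfield mask and (val & ~mask) == val. This example is not part
of the patch; the function name and constants are illustrative only. On an
ARMv6T2 or later target (e.g. llc -mtriple=armv7-eabi), it should now be
selected directly to a bfi during isel instead of going through the
removed PerformORCombineToBFI combine:

define i32 @bfi_sketch(i32 %a) {
entry:
  ; mask = 0xffffff0f clears the 4-bit field at bits [4,7]
  %masked = and i32 %a, -241
  ; val = 0xa0 lies entirely inside the cleared field, so
  ; (val & ~mask) == val holds and case (1) applies
  %ins = or i32 %masked, 160
  ; expected selection: materialize 10 (0xa0 >> 4), then bfi ..., #4, #4
  ret i32 %ins
}

The one-use check on the AND is carried over from the removed combine, so
masks with multiple uses still go down the generic expansion path.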