diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index f16283be1b996..bc70e7dc3cbf5 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -338,6 +338,10 @@ class CodeGenPrepare {
   /// Keep track of instructions removed during promotion.
   SetOfInstrs RemovedInsts;
 
+  /// Keep track of seen mul_with_overflow intrinsics to avoid
+  /// reprocessing them.
+  DenseMap<Instruction *, bool> SeenMulWithOverflowInstrs;
+
   /// Keep track of sext chains based on their initial value.
   DenseMap<Value *, Instruction *> SeenChainsForSExt;
 
@@ -431,6 +435,10 @@ class CodeGenPrepare {
   bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Type *AccessTy,
                           unsigned AddrSpace);
   bool optimizeGatherScatterInst(Instruction *MemoryInst, Value *Ptr);
+  bool optimizeUMulWithOverflow(Instruction *I);
+  bool optimizeSMulWithOverflow(Instruction *I);
+  bool optimizeMulWithOverflow(Instruction *I, bool IsSigned,
+                               ModifyDT &ModifiedDT);
   bool optimizeInlineAsmInst(CallInst *CS);
   bool optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT);
   bool optimizeExt(Instruction *&I);
@@ -772,6 +780,7 @@ bool CodeGenPrepare::_run(Function &F) {
     verifyBFIUpdates(F);
 #endif
 
+  SeenMulWithOverflowInstrs.clear();
   return EverMadeChange;
 }
 
@@ -2792,6 +2801,10 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
       }
     }
     return false;
+  case Intrinsic::umul_with_overflow:
+    return optimizeMulWithOverflow(II, /*IsSigned=*/false, ModifiedDT);
+  case Intrinsic::smul_with_overflow:
+    return optimizeMulWithOverflow(II, /*IsSigned=*/true, ModifiedDT);
   }
 
   SmallVector<Value *> PtrOps;
@@ -6403,6 +6416,145 @@ bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst,
   return true;
 }
 
+// Rewrite a mul_with_overflow intrinsic by checking whether both operands'
+// values fit in the legal (half-width) type. If so, a cheaper multiplication
+// can be used. This rewrite naturally belongs in type legalization, but it
+// needs to reconstruct the IR, which is not possible there, so we do it here.
+bool CodeGenPrepare::optimizeMulWithOverflow(Instruction *I, bool IsSigned,
+                                             ModifyDT &ModifiedDT) {
+  // Enable this optimization only for AArch64.
+  if (!TLI->getTargetMachine().getTargetTriple().isAArch64())
+    return false;
+  // If we have already seen this instruction, don't process it again.
+  if (!SeenMulWithOverflowInstrs.insert(std::make_pair(I, true)).second)
+    return false;
+
+  if (TLI->getTypeAction(
+          I->getContext(),
+          TLI->getValueType(*DL, I->getType()->getContainedType(0))) !=
+      TargetLowering::TypeExpandInteger)
+    return false;
+
+  Value *LHS = I->getOperand(0);
+  Value *RHS = I->getOperand(1);
+  Type *Ty = LHS->getType();
+  unsigned VTBitWidth = Ty->getScalarSizeInBits();
+  unsigned VTHalfBitWidth = VTBitWidth / 2;
+  IntegerType *LegalTy =
+      IntegerType::getIntNTy(I->getContext(), VTHalfBitWidth);
+
+  // Skip the optimization if the type with half the bit width is not legal
+  // for the target.
+  if (TLI->getTypeAction(I->getContext(), TLI->getValueType(*DL, LegalTy)) !=
+      TargetLowering::TypeLegal)
+    return false;
+
+  // Make sure that I->getType() is a struct type with two elements.
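+  // (The intrinsic's result is {iN, i1}: the product and the overflow flag.)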
+  if (!I->getType()->isStructTy() || I->getType()->getStructNumElements() != 2)
+    return false;
+
+  I->getParent()->setName("overflow.res");
+  BasicBlock *OverflowResBB = I->getParent();
+  BasicBlock *OverflowEntryBB =
+      I->getParent()->splitBasicBlock(I, "overflow.entry", /*Before=*/true);
+  BasicBlock *NoOverflowBB = BasicBlock::Create(
+      I->getContext(), "overflow.no", I->getFunction(), OverflowResBB);
+  BasicBlock *OverflowBB = BasicBlock::Create(I->getContext(), "overflow",
+                                              I->getFunction(), OverflowResBB);
+  // The new blocks should be:
+  // overflow.entry:
+  //   if signed:
+  //     (lhs_hi != (lhs_lo >>s (HalfBits - 1))) ||
+  //         (rhs_hi != (rhs_lo >>s (HalfBits - 1))) ? overflow : overflow.no
+  //   else:
+  //     (lhs_hi != 0) || (rhs_hi != 0) ? overflow : overflow.no
+
+  // overflow.no:
+  // overflow:
+  // overflow.res:
+
+  // ----------------------------
+  // BB overflow.entry:
+  // Get Lo and Hi of LHS & RHS:
+  IRBuilder<> Builder(OverflowEntryBB->getTerminator());
+  Value *LoLHS = Builder.CreateTrunc(LHS, LegalTy, "lo.lhs");
+  Value *HiLHS = Builder.CreateLShr(LHS, VTHalfBitWidth, "lhs.lsr");
+  HiLHS = Builder.CreateTrunc(HiLHS, LegalTy, "hi.lhs");
+  Value *LoRHS = Builder.CreateTrunc(RHS, LegalTy, "lo.rhs");
+  Value *HiRHS = Builder.CreateLShr(RHS, VTHalfBitWidth, "rhs.lsr");
+  HiRHS = Builder.CreateTrunc(HiRHS, LegalTy, "hi.rhs");
+
+  Value *IsAnyBitTrue;
+  if (IsSigned) {
+    Value *SignLoLHS =
+        Builder.CreateAShr(LoLHS, VTHalfBitWidth - 1, "sign.lo.lhs");
+    Value *SignLoRHS =
+        Builder.CreateAShr(LoRHS, VTHalfBitWidth - 1, "sign.lo.rhs");
+    Value *XorLHS = Builder.CreateXor(HiLHS, SignLoLHS);
+    Value *XorRHS = Builder.CreateXor(HiRHS, SignLoRHS);
+    Value *Or = Builder.CreateOr(XorLHS, XorRHS, "or.lhs.rhs");
+    IsAnyBitTrue = Builder.CreateCmp(ICmpInst::ICMP_EQ, Or,
+                                     ConstantInt::get(Or->getType(), 1));
+  } else {
+    Value *CmpLHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiLHS,
+                                      ConstantInt::getNullValue(LegalTy));
+    Value *CmpRHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiRHS,
+                                      ConstantInt::getNullValue(LegalTy));
+    IsAnyBitTrue = Builder.CreateOr(CmpLHS, CmpRHS, "or.lhs.rhs");
+  }
+
+  Builder.CreateCondBr(IsAnyBitTrue, OverflowBB, NoOverflowBB);
+  OverflowEntryBB->getTerminator()->eraseFromParent();
+
+  // BB overflow.no:
+  Builder.SetInsertPoint(NoOverflowBB);
+  Value *ExtLoLHS, *ExtLoRHS;
+  if (IsSigned) {
+    ExtLoLHS = Builder.CreateSExt(LoLHS, Ty, "lo.lhs.ext");
+    ExtLoRHS = Builder.CreateSExt(LoRHS, Ty, "lo.rhs.ext");
+  } else {
+    ExtLoLHS = Builder.CreateZExt(LoLHS, Ty, "lo.lhs.ext");
+    ExtLoRHS = Builder.CreateZExt(LoRHS, Ty, "lo.rhs.ext");
+  }
+
+  Value *Mul = Builder.CreateMul(ExtLoLHS, ExtLoRHS, "mul.no.overflow");
+  Builder.CreateBr(OverflowResBB);
+
+  // BB overflow.res:
+  Builder.SetInsertPoint(OverflowResBB, OverflowResBB->getFirstInsertionPt());
+  PHINode *PHINode1 = Builder.CreatePHI(Ty, 2);
+  PHINode1->addIncoming(Mul, NoOverflowBB);
+  PHINode *PHINode2 =
+      Builder.CreatePHI(IntegerType::getInt1Ty(I->getContext()), 2);
+  PHINode2->addIncoming(ConstantInt::getFalse(I->getContext()), NoOverflowBB);
+
+  StructType *STy = StructType::get(
+      I->getContext(), {Ty, IntegerType::getInt1Ty(I->getContext())});
+  Value *StructValOverflowRes = PoisonValue::get(STy);
+  StructValOverflowRes =
+      Builder.CreateInsertValue(StructValOverflowRes, PHINode1, {0});
+  StructValOverflowRes =
+      Builder.CreateInsertValue(StructValOverflowRes, PHINode2, {1});
+  // Before moving the mul.overflow intrinsic to OverflowBB, replace all its
+  // uses by StructValOverflowRes.
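+  // Doing the RAUW first means the extractvalues created below in OverflowBB
+  // end up as the intrinsic's only remaining users.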
+  I->replaceAllUsesWith(StructValOverflowRes);
+  I->removeFromParent();
+
+  // BB overflow:
+  I->insertInto(OverflowBB, OverflowBB->end());
+  Builder.SetInsertPoint(OverflowBB, OverflowBB->end());
+  Value *MulOverflow = Builder.CreateExtractValue(I, {0}, "mul.overflow");
+  Value *OverflowFlag = Builder.CreateExtractValue(I, {1}, "overflow.flag");
+  Builder.CreateBr(OverflowResBB);
+
+  // Add the extracted values to the PHI nodes in the overflow.res block.
+  PHINode1->addIncoming(MulOverflow, OverflowBB);
+  PHINode2->addIncoming(OverflowFlag, OverflowBB);
+
+  ModifiedDT = ModifyDT::ModifyBBDT;
+  return true;
+}
+
 /// If there are any memory operands, use OptimizeMemoryInst to sink their
 /// address computing into the block when possible / profitable.
 bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {
diff --git a/llvm/test/CodeGen/AArch64/i128-math.ll b/llvm/test/CodeGen/AArch64/i128-math.ll
index 9e1c0c1b115ab..d7e71dc51dcb5 100644
--- a/llvm/test/CodeGen/AArch64/i128-math.ll
+++ b/llvm/test/CodeGen/AArch64/i128-math.ll
@@ -261,21 +261,29 @@ define i128 @u128_mul(i128 %x, i128 %y) {
 
 define { i128, i8 } @u128_checked_mul(i128 %x, i128 %y) {
 ; CHECK-LABEL: u128_checked_mul:
-; CHECK:       // %bb.0:
+; CHECK:       // %bb.0: // %overflow.entry
+; CHECK-NEXT:    orr x8, x1, x3
+; CHECK-NEXT:    cbz x8, .LBB17_2
+; CHECK-NEXT:  // %bb.1: // %overflow
 ; CHECK-NEXT:    mul x9, x3, x0
 ; CHECK-NEXT:    cmp x1, #0
 ; CHECK-NEXT:    ccmp x3, #0, #4, ne
-; CHECK-NEXT:    umulh x8, x1, x2
-; CHECK-NEXT:    umulh x10, x3, x0
+; CHECK-NEXT:    umulh x10, x1, x2
+; CHECK-NEXT:    umulh x8, x3, x0
 ; CHECK-NEXT:    madd x9, x1, x2, x9
-; CHECK-NEXT:    ccmp xzr, x8, #0, eq
-; CHECK-NEXT:    umulh x11, x0, x2
 ; CHECK-NEXT:    ccmp xzr, x10, #0, eq
+; CHECK-NEXT:    umulh x11, x0, x2
+; CHECK-NEXT:    ccmp xzr, x8, #0, eq
 ; CHECK-NEXT:    mul x0, x0, x2
 ; CHECK-NEXT:    cset w8, ne
 ; CHECK-NEXT:    adds x1, x11, x9
 ; CHECK-NEXT:    csinc w8, w8, wzr, lo
 ; CHECK-NEXT:    eor w2, w8, #0x1
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB17_2: // %overflow.no
+; CHECK-NEXT:    umulh x1, x0, x2
+; CHECK-NEXT:    mul x0, x0, x2
+; CHECK-NEXT:    eor w2, w8, #0x1
 ; CHECK-NEXT:    ret
   %1 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y)
   %2 = extractvalue { i128, i1 } %1, 0
@@ -289,20 +297,28 @@ define { i128, i8 } @u128_checked_mul(i128 %x, i128 %y) {
 
 define { i128, i8 } @u128_overflowing_mul(i128 %x, i128 %y) {
 ; CHECK-LABEL: u128_overflowing_mul:
-; CHECK:       // %bb.0:
+; CHECK:       // %bb.0: // %overflow.entry
+; CHECK-NEXT:    orr x8, x1, x3
+; CHECK-NEXT:    cbz x8, .LBB18_2
+; CHECK-NEXT:  // %bb.1: // %overflow
 ; CHECK-NEXT:    mul x9, x3, x0
 ; CHECK-NEXT:    cmp x1, #0
 ; CHECK-NEXT:    ccmp x3, #0, #4, ne
-; CHECK-NEXT:    umulh x8, x1, x2
-; CHECK-NEXT:    umulh x10, x3, x0
+; CHECK-NEXT:    umulh x10, x1, x2
+; CHECK-NEXT:    umulh x8, x3, x0
 ; CHECK-NEXT:    madd x9, x1, x2, x9
-; CHECK-NEXT:    ccmp xzr, x8, #0, eq
-; CHECK-NEXT:    umulh x11, x0, x2
 ; CHECK-NEXT:    ccmp xzr, x10, #0, eq
+; CHECK-NEXT:    umulh x11, x0, x2
+; CHECK-NEXT:    ccmp xzr, x8, #0, eq
 ; CHECK-NEXT:    mul x0, x0, x2
 ; CHECK-NEXT:    cset w8, ne
 ; CHECK-NEXT:    adds x1, x11, x9
 ; CHECK-NEXT:    csinc w2, w8, wzr, lo
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB18_2: // %overflow.no
+; CHECK-NEXT:    umulh x1, x0, x2
+; CHECK-NEXT:    mul x0, x0, x2
+; CHECK-NEXT:    mov w2, wzr
 ; CHECK-NEXT:    ret
   %1 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y)
   %2 = extractvalue { i128, i1 } %1, 0
@@ -315,20 +331,29 @@ define { i128, i8 } @u128_overflowing_mul(i128 %x, i128 %y) {
 
 define i128 @u128_saturating_mul(i128 %x, i128 %y) {
 ; CHECK-LABEL: u128_saturating_mul:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul x9, x3, x0
+; CHECK:       // %bb.0: // %overflow.entry
+; CHECK-NEXT:    orr x8, x1, x3
+; CHECK-NEXT:    cbz x8, .LBB19_2
+; CHECK-NEXT:  // %bb.1: // %overflow
+; CHECK-NEXT:    mul x8, x3, x0
 ; CHECK-NEXT:    cmp x1, #0
 ; CHECK-NEXT:    ccmp x3, #0, #4, ne
-; CHECK-NEXT:    umulh x8, x1, x2
-; CHECK-NEXT:    umulh x10, x3, x0
-; CHECK-NEXT:    madd x9, x1, x2, x9
-; CHECK-NEXT:    ccmp xzr, x8, #0, eq
-; CHECK-NEXT:    umulh x11, x0, x2
+; CHECK-NEXT:    umulh x10, x1, x2
+; CHECK-NEXT:    umulh x9, x3, x0
+; CHECK-NEXT:    madd x11, x1, x2, x8
 ; CHECK-NEXT:    ccmp xzr, x10, #0, eq
+; CHECK-NEXT:    umulh x12, x0, x2
+; CHECK-NEXT:    ccmp xzr, x9, #0, eq
 ; CHECK-NEXT:    mul x8, x0, x2
 ; CHECK-NEXT:    cset w10, ne
-; CHECK-NEXT:    adds x9, x11, x9
+; CHECK-NEXT:    adds x9, x12, x11
 ; CHECK-NEXT:    csinc w10, w10, wzr, lo
+; CHECK-NEXT:    b .LBB19_3
+; CHECK-NEXT:  .LBB19_2: // %overflow.no
+; CHECK-NEXT:    umulh x9, x0, x2
+; CHECK-NEXT:    mov w10, wzr
+; CHECK-NEXT:    mul x8, x0, x2
+; CHECK-NEXT:  .LBB19_3: // %overflow.res
 ; CHECK-NEXT:    cmp w10, #0
 ; CHECK-NEXT:    csinv x0, x8, xzr, eq
 ; CHECK-NEXT:    csinv x1, x9, xzr, eq
@@ -354,7 +379,13 @@ define i128 @i128_mul(i128 %x, i128 %y) {
 
 define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) {
 ; CHECK-LABEL: i128_checked_mul:
-; CHECK:       // %bb.0:
+; CHECK:       // %bb.0: // %overflow.entry
+; CHECK-NEXT:    eor x8, x3, x2, asr #63
+; CHECK-NEXT:    eor x9, x1, x0, asr #63
+; CHECK-NEXT:    orr x8, x9, x8
+; CHECK-NEXT:    cmp x8, #1
+; CHECK-NEXT:    b.ne .LBB21_2
+; CHECK-NEXT:  // %bb.1: // %overflow
 ; CHECK-NEXT:    asr x9, x1, #63
 ; CHECK-NEXT:    umulh x10, x0, x2
 ; CHECK-NEXT:    asr x13, x3, #63
@@ -364,24 +395,30 @@ define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) {
 ; CHECK-NEXT:    adds x10, x11, x10
 ; CHECK-NEXT:    mul x14, x0, x3
 ; CHECK-NEXT:    umulh x12, x0, x3
-; CHECK-NEXT:    adc x9, x8, x9
+; CHECK-NEXT:    adc x8, x8, x9
+; CHECK-NEXT:    mov x9, x1
 ; CHECK-NEXT:    mul x13, x0, x13
-; CHECK-NEXT:    adds x8, x14, x10
-; CHECK-NEXT:    mul x15, x1, x3
-; CHECK-NEXT:    smulh x10, x1, x3
-; CHECK-NEXT:    mov x1, x8
-; CHECK-NEXT:    adc x11, x12, x13
-; CHECK-NEXT:    asr x12, x9, #63
-; CHECK-NEXT:    asr x13, x11, #63
-; CHECK-NEXT:    adds x9, x9, x11
 ; CHECK-NEXT:    asr x11, x8, #63
+; CHECK-NEXT:    mul x15, x1, x3
+; CHECK-NEXT:    adds x1, x14, x10
+; CHECK-NEXT:    smulh x9, x9, x3
+; CHECK-NEXT:    adc x10, x12, x13
+; CHECK-NEXT:    asr x12, x10, #63
+; CHECK-NEXT:    adds x8, x8, x10
+; CHECK-NEXT:    asr x10, x1, #63
 ; CHECK-NEXT:    mul x0, x0, x2
-; CHECK-NEXT:    adc x12, x12, x13
-; CHECK-NEXT:    adds x9, x15, x9
-; CHECK-NEXT:    adc x10, x10, x12
-; CHECK-NEXT:    cmp x9, x11
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w2, eq
+; CHECK-NEXT:    adc x11, x11, x12
+; CHECK-NEXT:    adds x8, x15, x8
+; CHECK-NEXT:    adc x9, x9, x11
+; CHECK-NEXT:    cmp x8, x10
+; CHECK-NEXT:    ccmp x9, x10, #0, eq
+; CHECK-NEXT:    cset w8, ne
+; CHECK-NEXT:    eor w2, w8, #0x1
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB21_2: // %overflow.no
+; CHECK-NEXT:    smulh x1, x0, x2
+; CHECK-NEXT:    mul x0, x0, x2
+; CHECK-NEXT:    eor w2, wzr, #0x1
 ; CHECK-NEXT:    ret
   %1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
   %2 = extractvalue { i128, i1 } %1, 0
@@ -395,7 +432,13 @@ define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) {
 
 define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) {
 ; CHECK-LABEL: i128_overflowing_mul:
-; CHECK:       // %bb.0:
+; CHECK:       // %bb.0: // %overflow.entry
+; CHECK-NEXT:    eor x8, x3, x2, asr #63
+; CHECK-NEXT:    eor x9, x1, x0, asr #63
+; CHECK-NEXT:    orr x8, x9, x8
+; CHECK-NEXT:    cmp x8, #1
+; CHECK-NEXT:    b.ne .LBB22_2
+; CHECK-NEXT:  // %bb.1: // %overflow
 ; CHECK-NEXT:    asr x9, x1, #63
 ; CHECK-NEXT:    umulh x10, x0, x2
 ; CHECK-NEXT:    asr x13, x3, #63
@@ -405,24 +448,29 @@ define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) {
 ; CHECK-NEXT:    adds x10, x11, x10
 ; CHECK-NEXT:    mul x14, x0, x3
 ; CHECK-NEXT:    umulh x12, x0, x3
-; CHECK-NEXT:    adc x9, x8, x9
+; CHECK-NEXT:    adc x8, x8, x9
+; CHECK-NEXT:    mov x9, x1
 ; CHECK-NEXT:    mul x13, x0, x13
-; CHECK-NEXT:    adds x8, x14, x10
-; CHECK-NEXT:    mul x15, x1, x3
-; CHECK-NEXT:    smulh x10, x1, x3
-; CHECK-NEXT:    mov x1, x8
-; CHECK-NEXT:    adc x11, x12, x13
-; CHECK-NEXT:    asr x12, x9, #63
-; CHECK-NEXT:    asr x13, x11, #63
-; CHECK-NEXT:    adds x9, x9, x11
 ; CHECK-NEXT:    asr x11, x8, #63
+; CHECK-NEXT:    mul x15, x1, x3
+; CHECK-NEXT:    adds x1, x14, x10
+; CHECK-NEXT:    smulh x9, x9, x3
+; CHECK-NEXT:    adc x10, x12, x13
+; CHECK-NEXT:    asr x12, x10, #63
+; CHECK-NEXT:    adds x8, x8, x10
+; CHECK-NEXT:    asr x10, x1, #63
 ; CHECK-NEXT:    mul x0, x0, x2
-; CHECK-NEXT:    adc x12, x12, x13
-; CHECK-NEXT:    adds x9, x15, x9
-; CHECK-NEXT:    adc x10, x10, x12
-; CHECK-NEXT:    cmp x9, x11
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
+; CHECK-NEXT:    adc x11, x11, x12
+; CHECK-NEXT:    adds x8, x15, x8
+; CHECK-NEXT:    adc x9, x9, x11
+; CHECK-NEXT:    cmp x8, x10
+; CHECK-NEXT:    ccmp x9, x10, #0, eq
 ; CHECK-NEXT:    cset w2, ne
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB22_2: // %overflow.no
+; CHECK-NEXT:    smulh x1, x0, x2
+; CHECK-NEXT:    mul x0, x0, x2
+; CHECK-NEXT:    mov w2, wzr
 ; CHECK-NEXT:    ret
   %1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
   %2 = extractvalue { i128, i1 } %1, 0
@@ -435,7 +483,13 @@ define i128 @i128_saturating_mul(i128 %x, i128 %y) {
 ; CHECK-LABEL: i128_saturating_mul:
-; CHECK:       // %bb.0:
+; CHECK:       // %bb.0: // %overflow.entry
+; CHECK-NEXT:    eor x8, x3, x2, asr #63
+; CHECK-NEXT:    eor x9, x1, x0, asr #63
+; CHECK-NEXT:    orr x8, x9, x8
+; CHECK-NEXT:    cmp x8, #1
+; CHECK-NEXT:    b.ne .LBB23_2
+; CHECK-NEXT:  // %bb.1: // %overflow
 ; CHECK-NEXT:    asr x9, x1, #63
 ; CHECK-NEXT:    umulh x10, x0, x2
 ; CHECK-NEXT:    asr x13, x3, #63
@@ -445,29 +499,35 @@ define i128 @i128_saturating_mul(i128 %x, i128 %y) {
 ; CHECK-NEXT:    adds x10, x11, x10
 ; CHECK-NEXT:    mul x14, x0, x3
 ; CHECK-NEXT:    umulh x12, x0, x3
-; CHECK-NEXT:    adc x8, x8, x9
+; CHECK-NEXT:    adc x9, x8, x9
 ; CHECK-NEXT:    mul x13, x0, x13
-; CHECK-NEXT:    adds x9, x14, x10
-; CHECK-NEXT:    mul x11, x1, x3
-; CHECK-NEXT:    adc x10, x12, x13
-; CHECK-NEXT:    smulh x12, x1, x3
-; CHECK-NEXT:    asr x13, x8, #63
-; CHECK-NEXT:    asr x14, x10, #63
-; CHECK-NEXT:    adds x8, x8, x10
-; CHECK-NEXT:    adc x10, x13, x14
-; CHECK-NEXT:    adds x8, x11, x8
-; CHECK-NEXT:    asr x11, x9, #63
-; CHECK-NEXT:    mul x13, x0, x2
-; CHECK-NEXT:    adc x10, x12, x10
-; CHECK-NEXT:    eor x12, x3, x1
-; CHECK-NEXT:    eor x8, x8, x11
-; CHECK-NEXT:    eor x10, x10, x11
-; CHECK-NEXT:    asr x11, x12, #63
-; CHECK-NEXT:    orr x8, x8, x10
-; CHECK-NEXT:    eor x10, x11, #0x7fffffffffffffff
-; CHECK-NEXT:    cmp x8, #0
-; CHECK-NEXT:    csinv x0, x13, x11, eq
-; CHECK-NEXT:    csel x1, x10, x9, ne
+; CHECK-NEXT:    adds x8, x14, x10
+; CHECK-NEXT:    mul x15, x1, x3
+; CHECK-NEXT:    asr x14, x8, #63
+; CHECK-NEXT:    smulh x10, x1, x3
+; CHECK-NEXT:    adc x11, x12, x13
+; CHECK-NEXT:    asr x12, x9, #63
+; CHECK-NEXT:    asr x13, x11, #63
+; CHECK-NEXT:    adds x11, x9, x11
+; CHECK-NEXT:    mul x9, x0, x2
+; CHECK-NEXT:    adc x12, x12, x13
+; CHECK-NEXT:    adds x11, x15, x11
+; CHECK-NEXT:    adc x10, x10, x12
+; CHECK-NEXT:    cmp x11, x14
+; CHECK-NEXT:    ccmp x10, x14, #0, eq
+; CHECK-NEXT:    cset w10, ne
+; CHECK-NEXT:    b .LBB23_3
+; CHECK-NEXT:  .LBB23_2: // %overflow.no
+; CHECK-NEXT:    smulh x8, x0, x2
+; CHECK-NEXT:    mov w10, wzr
+; CHECK-NEXT:    mul x9, x0, x2
+; CHECK-NEXT:  .LBB23_3: // %overflow.res
+; CHECK-NEXT:    eor x11, x3, x1
+; CHECK-NEXT:    cmp w10, #0
+; CHECK-NEXT:    asr x11, x11, #63
+; CHECK-NEXT:    eor x12, x11, #0x7fffffffffffffff
+; CHECK-NEXT:    csinv x0, x9, x11, eq
+; CHECK-NEXT:    csel x1, x12, x8, ne
 ; CHECK-NEXT:    ret
   %1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
   %2 = extractvalue { i128, i1 } %1, 0
diff --git a/llvm/test/CodeGen/AArch64/i128_with_overflow.ll b/llvm/test/CodeGen/AArch64/i128_with_overflow.ll
index 9924b7c63f763..75e76472905c2 100644
--- a/llvm/test/CodeGen/AArch64/i128_with_overflow.ll
+++ b/llvm/test/CodeGen/AArch64/i128_with_overflow.ll
@@ -223,22 +223,30 @@ cleanup:
 
 define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) {
 ; CHECK-LABEL: test_umul_i128:
-; CHECK:       // %bb.0: // %entry
+; CHECK:       // %bb.0: // %overflow.entry
+; CHECK-NEXT:    orr x8, x1, x3
+; CHECK-NEXT:    cbz x8, .LBB4_2
+; CHECK-NEXT:  // %bb.1: // %overflow
 ; CHECK-NEXT:    mul x9, x3, x0
 ; CHECK-NEXT:    cmp x1, #0
 ; CHECK-NEXT:    ccmp x3, #0, #4, ne
-; CHECK-NEXT:    umulh x8, x1, x2
-; CHECK-NEXT:    umulh x10, x3, x0
+; CHECK-NEXT:    umulh x10, x1, x2
+; CHECK-NEXT:    umulh x8, x3, x0
 ; CHECK-NEXT:    madd x9, x1, x2, x9
-; CHECK-NEXT:    ccmp xzr, x8, #0, eq
-; CHECK-NEXT:    umulh x11, x0, x2
 ; CHECK-NEXT:    ccmp xzr, x10, #0, eq
+; CHECK-NEXT:    umulh x11, x0, x2
+; CHECK-NEXT:    ccmp xzr, x8, #0, eq
+; CHECK-NEXT:    mul x0, x0, x2
 ; CHECK-NEXT:    cset w8, ne
 ; CHECK-NEXT:    adds x1, x11, x9
 ; CHECK-NEXT:    csinc w8, w8, wzr, lo
-; CHECK-NEXT:    cmp w8, #1
-; CHECK-NEXT:    b.ne .LBB4_2
-; CHECK-NEXT:  // %bb.1: // %if.then
+; CHECK-NEXT:    cbnz w8, .LBB4_3
+; CHECK-NEXT:    b .LBB4_4
+; CHECK-NEXT:  .LBB4_2: // %overflow.no
+; CHECK-NEXT:    umulh x1, x0, x2
+; CHECK-NEXT:    mul x0, x0, x2
+; CHECK-NEXT:    cbz w8, .LBB4_4
+; CHECK-NEXT:  .LBB4_3: // %if.then
 ; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w30, -16
@@ -247,9 +255,7 @@ define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) {
 ; CHECK-NEXT:    sxtw x0, w0
 ; CHECK-NEXT:    asr x1, x0, #63
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB4_2: // %if.end
-; CHECK-NEXT:    mul x0, x0, x2
+; CHECK-NEXT:  .LBB4_4: // %cleanup
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y)
@@ -272,35 +278,48 @@ cleanup:
 
 define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) {
 ; CHECK-LABEL: test_smul_i128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    asr x10, x1, #63
-; CHECK-NEXT:    umulh x11, x0, x2
-; CHECK-NEXT:    asr x14, x3, #63
-; CHECK-NEXT:    mov x8, x1
-; CHECK-NEXT:    mul x12, x1, x2
-; CHECK-NEXT:    umulh x9, x1, x2
-; CHECK-NEXT:    mul x10, x10, x2
-; CHECK-NEXT:    adds x11, x12, x11
-; CHECK-NEXT:    mul x15, x0, x3
-; CHECK-NEXT:    umulh x13, x0, x3
-; CHECK-NEXT:    adc x9, x9, x10
-; CHECK-NEXT:    mul x14, x0, x14
-; CHECK-NEXT:    mul x16, x1, x3
-; CHECK-NEXT:    adds x1, x15, x11
-; CHECK-NEXT:    asr x11, x9, #63
-; CHECK-NEXT:    smulh x8, x8, x3
-; CHECK-NEXT:    adc x10, x13, x14
-; CHECK-NEXT:    asr x12, x10, #63
-; CHECK-NEXT:    adds x9, x9, x10
-; CHECK-NEXT:    adc x10, x11, x12
-; CHECK-NEXT:    adds x9, x16, x9
-; CHECK-NEXT:    asr x11, x1, #63
-; CHECK-NEXT:    adc x8, x8, x10
-; CHECK-NEXT:    eor x8, x8, x11
-; CHECK-NEXT:    eor x9, x9, x11
+; CHECK:       // %bb.0: // %overflow.entry
+; CHECK-NEXT:    eor x8, x3, x2, asr #63
+; CHECK-NEXT:    eor x9, x1, x0, asr #63
 ; CHECK-NEXT:    orr x8, x9, x8
-; CHECK-NEXT:    cbz x8, .LBB5_2
-; CHECK-NEXT:  // %bb.1: // %if.then
+; CHECK-NEXT:    cmp x8, #1
+; CHECK-NEXT:    b.ne .LBB5_2
+; CHECK-NEXT:  // %bb.1: // %overflow
+; CHECK-NEXT:    asr x9, x1, #63
+; CHECK-NEXT:    umulh x10, x0, x2
+; CHECK-NEXT:    asr x13, x3, #63
+; CHECK-NEXT:    mul x11, x1, x2
+; CHECK-NEXT:    umulh x8, x1, x2
+; CHECK-NEXT:    mul x9, x9, x2
+; CHECK-NEXT:    adds x10, x11, x10
+; CHECK-NEXT:    mul x14, x0, x3
+; CHECK-NEXT:    umulh x12, x0, x3
+; CHECK-NEXT:    adc x8, x8, x9
+; CHECK-NEXT:    mov x9, x1
+; CHECK-NEXT:    mul x13, x0, x13
+; CHECK-NEXT:    asr x11, x8, #63
+; CHECK-NEXT:    mul x15, x1, x3
+; CHECK-NEXT:    adds x1, x14, x10
+; CHECK-NEXT:    smulh x9, x9, x3
+; CHECK-NEXT:    adc x10, x12, x13
+; CHECK-NEXT:    asr x12, x10, #63
+; CHECK-NEXT:    adds x8, x8, x10
+; CHECK-NEXT:    asr x10, x1, #63
+; CHECK-NEXT:    mul x0, x0, x2
+; CHECK-NEXT:    adc x11, x11, x12
+; CHECK-NEXT:    adds x8, x15, x8
+; CHECK-NEXT:    adc x9, x9, x11
+; CHECK-NEXT:    cmp x8, x10
+; CHECK-NEXT:    ccmp x9, x10, #0, eq
+; CHECK-NEXT:    cset w8, ne
+; CHECK-NEXT:    cbnz w8, .LBB5_3
+; CHECK-NEXT:    b .LBB5_4
+; CHECK-NEXT:  .LBB5_2: // %overflow.no
+; CHECK-NEXT:    smulh x1, x0, x2
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    mul x0, x0, x2
+; CHECK-NEXT:    cbz w8, .LBB5_4
+; CHECK-NEXT:  .LBB5_3: // %if.then
 ; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w30, -16
@@ -309,9 +328,7 @@ define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) {
 ; CHECK-NEXT:    sxtw x0, w0
 ; CHECK-NEXT:    asr x1, x0, #63
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB5_2: // %if.end
-; CHECK-NEXT:    mul x0, x0, x2
+; CHECK-NEXT:  .LBB5_4: // %cleanup
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
diff --git a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
index edfd80b4f2706..f98438593262f 100644
--- a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
@@ -3,21 +3,29 @@
 define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; AARCH-LABEL: muloti_test:
-; AARCH:       // %bb.0: // %start
+; AARCH:       // %bb.0: // %overflow.entry
+; AARCH-NEXT:    orr x8, x1, x3
+; AARCH-NEXT:    cbz x8, .LBB0_2
+; AARCH-NEXT:  // %bb.1: // %overflow
 ; AARCH-NEXT:    mul x9, x3, x0
 ; AARCH-NEXT:    cmp x1, #0
 ; AARCH-NEXT:    ccmp x3, #0, #4, ne
-; AARCH-NEXT:    umulh x8, x1, x2
-; AARCH-NEXT:    umulh x10, x3, x0
+; AARCH-NEXT:    umulh x10, x1, x2
+; AARCH-NEXT:    umulh x8, x3, x0
 ; AARCH-NEXT:    madd x9, x1, x2, x9
-; AARCH-NEXT:    ccmp xzr, x8, #0, eq
-; AARCH-NEXT:    umulh x11, x0, x2
 ; AARCH-NEXT:    ccmp xzr, x10, #0, eq
+; AARCH-NEXT:    umulh x11, x0, x2
+; AARCH-NEXT:    ccmp xzr, x8, #0, eq
 ; AARCH-NEXT:    mul x0, x0, x2
 ; AARCH-NEXT:    cset w8, ne
 ; AARCH-NEXT:    adds x1, x11, x9
 ; AARCH-NEXT:    csinc w2, w8, wzr, lo
 ; AARCH-NEXT:    ret
+; AARCH-NEXT:  .LBB0_2: // %overflow.no
+; AARCH-NEXT:    umulh x1, x0, x2
+; AARCH-NEXT:    mul x0, x0, x2
+; AARCH-NEXT:    mov w2, wzr
+; AARCH-NEXT:    ret
 start:
   %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2
   %1 = extractvalue { i128, i1 } %0, 0
@@ -34,46 +42,58 @@ start:
 
 define i128 @__muloti4(i128 %0, i128 %1, ptr nocapture nonnull writeonly align 4 %2) #2 {
 ; AARCH-LABEL: __muloti4:
-; AARCH:       // %bb.0: // %Entry
-; AARCH-NEXT:    asr x11, x1, #63
-; AARCH-NEXT:    asr x9, x3, #63
-; AARCH-NEXT:    umulh x12, x0, x2
-; AARCH-NEXT:    mov x8, x1
+; AARCH:       // %bb.0: // %overflow.entry
+; AARCH-NEXT:    eor x8, x3, x2, asr #63
+; AARCH-NEXT:    eor x9, x1, x0, asr #63
 ; AARCH-NEXT:    str wzr, [x4]
-; AARCH-NEXT:    mul x13, x1, x2
-; AARCH-NEXT:    umulh x10, x1, x2
-; AARCH-NEXT:    mul x11, x11, x2
-; AARCH-NEXT:    adds x12, x13, x12
-; AARCH-NEXT:    mul x15, x0, x3
-; AARCH-NEXT:    umulh x14, x0, x3
-; AARCH-NEXT:    adc x10, x10, x11
-; AARCH-NEXT:    mul x9, x0, x9
-; AARCH-NEXT:    mul x16, x1, x3
-; AARCH-NEXT:    adds x1, x15, x12
-; AARCH-NEXT:    asr x12, x10, #63
-; AARCH-NEXT:    smulh x11, x8, x3
-; AARCH-NEXT:    adc x9, x14, x9
-; AARCH-NEXT:    asr x13, x9, #63
-; AARCH-NEXT:    adds x9, x10, x9
-; AARCH-NEXT:    asr x10, x1, #63
+; AARCH-NEXT:    orr x8, x9, x8
+; AARCH-NEXT:    cmp x8, #1
+; AARCH-NEXT:    b.ne .LBB1_2
+; AARCH-NEXT:  // %bb.1: // %overflow
+; AARCH-NEXT:    asr x9, x1, #63
+; AARCH-NEXT:    umulh x10, x0, x2
+; AARCH-NEXT:    asr x13, x3, #63
+; AARCH-NEXT:    mul x11, x1, x2
+; AARCH-NEXT:    umulh x8, x1, x2
+; AARCH-NEXT:    mul x9, x9, x2
+; AARCH-NEXT:    adds x10, x11, x10
+; AARCH-NEXT:    mul x14, x0, x3
+; AARCH-NEXT:    umulh x12, x0, x3
+; AARCH-NEXT:    adc x9, x8, x9
+; AARCH-NEXT:    mul x13, x0, x13
+; AARCH-NEXT:    adds x8, x14, x10
+; AARCH-NEXT:    mul x15, x1, x3
+; AARCH-NEXT:    smulh x10, x1, x3
+; AARCH-NEXT:    adc x11, x12, x13
+; AARCH-NEXT:    asr x12, x9, #63
+; AARCH-NEXT:    asr x13, x11, #63
+; AARCH-NEXT:    adds x9, x9, x11
+; AARCH-NEXT:    asr x11, x8, #63
 ; AARCH-NEXT:    mul x0, x0, x2
 ; AARCH-NEXT:    adc x12, x12, x13
-; AARCH-NEXT:    adds x9, x16, x9
-; AARCH-NEXT:    adc x11, x11, x12
-; AARCH-NEXT:    cmp x9, x10
-; AARCH-NEXT:    ccmp x11, x10, #0, eq
+; AARCH-NEXT:    adds x9, x15, x9
+; AARCH-NEXT:    adc x10, x10, x12
+; AARCH-NEXT:    cmp x9, x11
+; AARCH-NEXT:    ccmp x10, x11, #0, eq
 ; AARCH-NEXT:    cset w9, ne
-; AARCH-NEXT:    tbz x8, #63, .LBB1_2
-; AARCH-NEXT:  // %bb.1: // %Entry
-; AARCH-NEXT:    eor x8, x3, #0x8000000000000000
-; AARCH-NEXT:    orr x8, x2, x8
-; AARCH-NEXT:    cbz x8, .LBB1_3
-; AARCH-NEXT:  .LBB1_2: // %Else2
-; AARCH-NEXT:    cbz w9, .LBB1_4
-; AARCH-NEXT:  .LBB1_3: // %Then7
-; AARCH-NEXT:    mov w8, #1 // =0x1
-; AARCH-NEXT:    str w8, [x4]
-; AARCH-NEXT:  .LBB1_4: // %Block9
+; AARCH-NEXT:    tbnz x1, #63, .LBB1_3
+; AARCH-NEXT:    b .LBB1_4
+; AARCH-NEXT:  .LBB1_2: // %overflow.no
+; AARCH-NEXT:    smulh x8, x0, x2
+; AARCH-NEXT:    mov w9, wzr
+; AARCH-NEXT:    mul x0, x0, x2
+; AARCH-NEXT:    tbz x1, #63, .LBB1_4
+; AARCH-NEXT:  .LBB1_3: // %overflow.res
+; AARCH-NEXT:    eor x10, x3, #0x8000000000000000
+; AARCH-NEXT:    orr x10, x2, x10
+; AARCH-NEXT:    cbz x10, .LBB1_5
+; AARCH-NEXT:  .LBB1_4: // %Else2
+; AARCH-NEXT:    cbz w9, .LBB1_6
+; AARCH-NEXT:  .LBB1_5: // %Then7
+; AARCH-NEXT:    mov w9, #1 // =0x1
+; AARCH-NEXT:    str w9, [x4]
+; AARCH-NEXT:  .LBB1_6: // %Block9
+; AARCH-NEXT:    mov x1, x8
 ; AARCH-NEXT:    ret
 Entry:
   store i32 0, ptr %2, align 4
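
For reference, a rough sketch (not part of the patch) of the IR shape this rewrite produces for @llvm.umul.with.overflow.i128 on AArch64, for operands %x and %y. Block and value names follow those created above; the pass moves the original intrinsic into the "overflow" block rather than emitting a new call, which is written here only for readability:

  overflow.entry:
    %lo.lhs = trunc i128 %x to i64
    %lhs.lsr = lshr i128 %x, 64
    %hi.lhs = trunc i128 %lhs.lsr to i64
    %lo.rhs = trunc i128 %y to i64
    %rhs.lsr = lshr i128 %y, 64
    %hi.rhs = trunc i128 %rhs.lsr to i64
    %cmp.lhs = icmp ne i64 %hi.lhs, 0
    %cmp.rhs = icmp ne i64 %hi.rhs, 0
    %or.lhs.rhs = or i1 %cmp.lhs, %cmp.rhs
    br i1 %or.lhs.rhs, label %overflow, label %overflow.no

  overflow.no:                                      ; both high halves are zero
    %lo.lhs.ext = zext i64 %lo.lhs to i128
    %lo.rhs.ext = zext i64 %lo.rhs to i128
    %mul.no.overflow = mul i128 %lo.lhs.ext, %lo.rhs.ext
    br label %overflow.res

  overflow:                                         ; full expansion still needed
    %mul = call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y)
    %mul.overflow = extractvalue { i128, i1 } %mul, 0
    %overflow.flag = extractvalue { i128, i1 } %mul, 1
    br label %overflow.res

  overflow.res:
    %res = phi i128 [ %mul.no.overflow, %overflow.no ], [ %mul.overflow, %overflow ]
    %ovf = phi i1 [ false, %overflow.no ], [ %overflow.flag, %overflow ]
    ; {%res, %ovf} is then rebuilt with insertvalue and replaces the
    ; original intrinsic's uses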