diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index ed7b07f7d9367..04cacd5ba8591 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -8874,18 +8874,24 @@ LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
   auto Zero = MIRBuilder.buildConstant(Ty, 0);
 
-  // For an addition, the result should be less than one of the operands (LHS)
-  // if and only if the other operand (RHS) is negative, otherwise there will
-  // be overflow.
-  // For a subtraction, the result should be less than one of the operands
-  // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
-  // otherwise there will be overflow.
-  auto ResultLowerThanLHS =
-      MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, NewDst0, LHS);
-  auto ConditionRHS = MIRBuilder.buildICmp(
-      IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero);
-
-  MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS);
+  if (IsAdd) {
+    // For addition, the result should be less than one of the operands
+    // (LHS) if and only if the other operand (RHS) is negative, otherwise
+    // there will be overflow.
+    auto ResultLowerThanLHS =
+        MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, NewDst0, LHS);
+    auto RHSNegative =
+        MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, RHS, Zero);
+    MIRBuilder.buildXor(Dst1, RHSNegative, ResultLowerThanLHS);
+  } else {
+    // For subtraction, overflow occurs if and only if (LHS < RHS) does not
+    // match the sign of the result (Result < 0).
+    auto LHSLessThanRHS =
+        MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, LHS, RHS);
+    auto ResultNegative =
+        MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, NewDst0, Zero);
+    MIRBuilder.buildXor(Dst1, LHSLessThanRHS, ResultNegative);
+  }
 
   MIRBuilder.buildCopy(Dst0, NewDst0);
   MI.eraseFromParent();
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 1764910861df4..7d9c8e3865405 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -11441,19 +11441,25 @@ void TargetLowering::expandSADDSUBO(
   SDValue Zero = DAG.getConstant(0, dl, LHS.getValueType());
 
-  // For an addition, the result should be less than one of the operands (LHS)
-  // if and only if the other operand (RHS) is negative, otherwise there will
-  // be overflow.
-  // For a subtraction, the result should be less than one of the operands
-  // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
-  // otherwise there will be overflow.
-  SDValue ResultLowerThanLHS = DAG.getSetCC(dl, OType, Result, LHS, ISD::SETLT);
-  SDValue ConditionRHS =
-      DAG.getSetCC(dl, OType, RHS, Zero, IsAdd ? ISD::SETLT : ISD::SETGT);
-
-  Overflow = DAG.getBoolExtOrTrunc(
-      DAG.getNode(ISD::XOR, dl, OType, ConditionRHS, ResultLowerThanLHS), dl,
-      ResultType, ResultType);
+  if (IsAdd) {
+    // For addition, the result should be less than one of the operands
+    // (LHS) if and only if the other operand (RHS) is negative, otherwise
+    // there will be overflow.
+    SDValue ResultLowerThanLHS =
+        DAG.getSetCC(dl, OType, Result, LHS, ISD::SETLT);
+    SDValue ConditionRHS = DAG.getSetCC(dl, OType, RHS, Zero, ISD::SETLT);
+    Overflow = DAG.getBoolExtOrTrunc(
+        DAG.getNode(ISD::XOR, dl, OType, ConditionRHS, ResultLowerThanLHS), dl,
+        ResultType, ResultType);
+  } else {
+    // For subtraction, overflow occurs if and only if (LHS < RHS) does not
+    // match the sign of the result (Result < 0).
+    SDValue LHSLessThanRHS = DAG.getSetCC(dl, OType, LHS, RHS, ISD::SETLT);
+    SDValue ResultNegative = DAG.getSetCC(dl, OType, Result, Zero, ISD::SETLT);
+    Overflow = DAG.getBoolExtOrTrunc(
+        DAG.getNode(ISD::XOR, dl, OType, LHSLessThanRHS, ResultNegative), dl,
+        ResultType, ResultType);
+  }
 }
 
 bool TargetLowering::expandMULO(SDNode *Node, SDValue &Result,
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 607edd3d859f8..7450f01125454 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -14497,6 +14497,35 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
     Results.push_back(customLegalizeToWOp(N, DAG, ExtOpc));
     break;
   }
+  case ISD::SSUBO: {
+    assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+           "Unexpected custom legalisation");
+
+    // If the RHS is a constant, we can simplify the below. Otherwise
+    // use the default legalization.
+    if (!isa<ConstantSDNode>(N->getOperand(1)))
+      return;
+
+    SDValue LHS = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, N->getOperand(0));
+    SDValue RHS = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, N->getOperand(1));
+    SDValue Res = DAG.getNode(ISD::SUB, DL, MVT::i64, LHS, RHS);
+    Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, Res,
+                      DAG.getValueType(MVT::i32));
+
+    SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+
+    // For subtraction, overflow occurs if and only if (LHS < RHS) does not
+    // match the sign of the result (Res < 0).
+    EVT OType = N->getValueType(1);
+    SDValue LHSLessThanRHS = DAG.getSetCC(DL, OType, LHS, RHS, ISD::SETLT);
+    SDValue ResultNegative = DAG.getSetCC(DL, OType, Res, Zero, ISD::SETLT);
+    SDValue Overflow =
+        DAG.getNode(ISD::XOR, DL, OType, LHSLessThanRHS, ResultNegative);
+
+    Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
+    Results.push_back(Overflow);
+    return;
+  }
   case ISD::SADDO: {
     assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
            "Unexpected custom legalisation");
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubo.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubo.mir
index aa59de0118ad6..b03b651db1769 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubo.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubo.mir
@@ -14,12 +14,12 @@ body: |
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[COPY1]]
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SUB]], 7
-    ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 7
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 7
+    ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 7
     ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG]](s32), [[SEXT_INREG1]]
-    ; CHECK-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 7
-    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[SEXT_INREG2]](s32), [[C]]
-    ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]]
+    ;
CHECK-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SUB]], 7 + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG2]](s32), [[C]] + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[ICMP1]] ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 127 ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C1]] ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[XOR]](s1) @@ -48,13 +48,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[COPY1]] - ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SUB]], 16 - ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 16 + ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 16 + ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 16 ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG]](s32), [[SEXT_INREG1]] - ; CHECK-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 16 + ; CHECK-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SUB]], 16 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[SEXT_INREG2]](s32), [[C]] - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG2]](s32), [[C]] + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[ICMP1]] ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[XOR]](s1) ; CHECK-NEXT: $vgpr0 = COPY [[SUB]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[ZEXT]](s32) @@ -82,9 +82,9 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[COPY1]] ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SUB]](s32), [[COPY]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY1]](s32), [[C]] - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY]](s32), [[COPY1]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SUB]](s32), [[C]] + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[ICMP1]] ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SUB]](s32) ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[XOR]](s1) ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) @@ -114,9 +114,9 @@ body: | ; CHECK-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]] ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY1]](s64), [[C]] - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY]](s64), [[COPY1]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[C]] + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[ICMP1]] ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[XOR]](s1) ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](s64) @@ -155,22 +155,22 @@ body: | ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; CHECK-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST 
[[COPY]](<2 x s16>) ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SUB]], 16 - ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST3]], 16 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG]](s32), [[SEXT_INREG1]] - ; CHECK-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SUB1]], 16 - ; CHECK-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR2]], 16 - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG2]](s32), [[SEXT_INREG3]] ; CHECK-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) - ; CHECK-NEXT: [[SEXT_INREG4:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST4]], 16 + ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST3]], 16 + ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST4]], 16 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG]](s32), [[SEXT_INREG1]] + ; CHECK-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR2]], 16 + ; CHECK-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR3]], 16 + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG2]](s32), [[SEXT_INREG3]] + ; CHECK-NEXT: [[SEXT_INREG4:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SUB]], 16 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[SEXT_INREG4]](s32), [[COPY2]] - ; CHECK-NEXT: [[SEXT_INREG5:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR3]], 16 - ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[SEXT_INREG5]](s32), [[C2]] - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP2]], [[ICMP]] - ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP1]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG4]](s32), [[COPY2]] + ; CHECK-NEXT: [[SEXT_INREG5:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SUB1]], 16 + ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG5]](s32), [[C2]] + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[ICMP2]] + ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP3]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR]](s1) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY [[BITCAST2]](<2 x s16>) @@ -214,31 +214,31 @@ body: | ; CHECK-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) ; CHECK-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) - ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SUB]], 16 - ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST4]], 16 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG]](s32), [[SEXT_INREG1]] - ; CHECK-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SUB1]], 16 - ; CHECK-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR2]], 16 - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG2]](s32), [[SEXT_INREG3]] - ; CHECK-NEXT: [[SEXT_INREG4:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SUB2]], 16 - ; CHECK-NEXT: [[SEXT_INREG5:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST5]], 16 - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG4]](s32), [[SEXT_INREG5]] ; CHECK-NEXT: 
[[UV9:%[0-9]+]]:_(<2 x s16>), [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; CHECK-NEXT: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) ; CHECK-NEXT: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) - ; CHECK-NEXT: [[SEXT_INREG6:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST6]], 16 + ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST4]], 16 + ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST6]], 16 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG]](s32), [[SEXT_INREG1]] + ; CHECK-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR2]], 16 + ; CHECK-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR3]], 16 + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG2]](s32), [[SEXT_INREG3]] + ; CHECK-NEXT: [[SEXT_INREG4:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST5]], 16 + ; CHECK-NEXT: [[SEXT_INREG5:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST7]], 16 + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG4]](s32), [[SEXT_INREG5]] + ; CHECK-NEXT: [[SEXT_INREG6:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SUB]], 16 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C1]](s32) - ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[SEXT_INREG6]](s32), [[COPY2]] - ; CHECK-NEXT: [[SEXT_INREG7:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR3]], 16 + ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG6]](s32), [[COPY2]] + ; CHECK-NEXT: [[SEXT_INREG7:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SUB1]], 16 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C1]](s32) - ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[SEXT_INREG7]](s32), [[COPY3]] - ; CHECK-NEXT: [[SEXT_INREG8:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST7]], 16 - ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[SEXT_INREG8]](s32), [[C1]] - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP]] - ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP4]], [[ICMP1]] - ; CHECK-NEXT: [[XOR2:%[0-9]+]]:_(s1) = G_XOR [[ICMP5]], [[ICMP2]] + ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG7]](s32), [[COPY3]] + ; CHECK-NEXT: [[SEXT_INREG8:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SUB2]], 16 + ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG8]](s32), [[C1]] + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[ICMP3]] + ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP4]] + ; CHECK-NEXT: [[XOR2:%[0-9]+]]:_(s1) = G_XOR [[ICMP2]], [[ICMP5]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR]](s1) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1) ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR2]](s1) @@ -325,39 +325,39 @@ body: | ; CHECK-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) ; CHECK-NEXT: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; CHECK-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) - ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SUB]], 16 - ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST6]], 16 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG]](s32), [[SEXT_INREG1]] - ; CHECK-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SUB1]], 16 - ; CHECK-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG 
[[LSHR4]], 16 - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG2]](s32), [[SEXT_INREG3]] - ; CHECK-NEXT: [[SEXT_INREG4:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SUB2]], 16 - ; CHECK-NEXT: [[SEXT_INREG5:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST7]], 16 - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG4]](s32), [[SEXT_INREG5]] - ; CHECK-NEXT: [[SEXT_INREG6:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SUB3]], 16 - ; CHECK-NEXT: [[SEXT_INREG7:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR5]], 16 - ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG6]](s32), [[SEXT_INREG7]] ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) ; CHECK-NEXT: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) ; CHECK-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) ; CHECK-NEXT: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) ; CHECK-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32) - ; CHECK-NEXT: [[SEXT_INREG8:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST8]], 16 + ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST6]], 16 + ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST8]], 16 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG]](s32), [[SEXT_INREG1]] + ; CHECK-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR4]], 16 + ; CHECK-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR6]], 16 + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG2]](s32), [[SEXT_INREG3]] + ; CHECK-NEXT: [[SEXT_INREG4:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST7]], 16 + ; CHECK-NEXT: [[SEXT_INREG5:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST9]], 16 + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG4]](s32), [[SEXT_INREG5]] + ; CHECK-NEXT: [[SEXT_INREG6:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR5]], 16 + ; CHECK-NEXT: [[SEXT_INREG7:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR7]], 16 + ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG6]](s32), [[SEXT_INREG7]] + ; CHECK-NEXT: [[SEXT_INREG8:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SUB]], 16 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[SEXT_INREG8]](s32), [[COPY2]] - ; CHECK-NEXT: [[SEXT_INREG9:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR6]], 16 + ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG8]](s32), [[COPY2]] + ; CHECK-NEXT: [[SEXT_INREG9:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SUB1]], 16 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[SEXT_INREG9]](s32), [[COPY3]] - ; CHECK-NEXT: [[SEXT_INREG10:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST9]], 16 + ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG9]](s32), [[COPY3]] + ; CHECK-NEXT: [[SEXT_INREG10:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SUB2]], 16 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; CHECK-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[SEXT_INREG10]](s32), [[COPY4]] - ; CHECK-NEXT: [[SEXT_INREG11:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR7]], 16 - ; CHECK-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[SEXT_INREG11]](s32), [[C2]] - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP4]], [[ICMP]] - ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP5]], [[ICMP1]] - ; CHECK-NEXT: 
[[XOR2:%[0-9]+]]:_(s1) = G_XOR [[ICMP6]], [[ICMP2]] - ; CHECK-NEXT: [[XOR3:%[0-9]+]]:_(s1) = G_XOR [[ICMP7]], [[ICMP3]] + ; CHECK-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG10]](s32), [[COPY4]] + ; CHECK-NEXT: [[SEXT_INREG11:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SUB3]], 16 + ; CHECK-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SEXT_INREG11]](s32), [[C2]] + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[ICMP4]] + ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP5]] + ; CHECK-NEXT: [[XOR2:%[0-9]+]]:_(s1) = G_XOR [[ICMP2]], [[ICMP6]] + ; CHECK-NEXT: [[XOR3:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP7]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR]](s1) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1) ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR2]](s1) @@ -397,13 +397,13 @@ body: | ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SUB]](s32), [[SUB1]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SUB]](s32), [[UV4]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SUB1]](s32), [[UV5]] ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV6]](s32), [[C]] - ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV7]](s32), [[C]] - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP2]], [[ICMP]] - ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP1]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV4]](s32), [[UV6]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV5]](s32), [[UV7]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SUB]](s32), [[C]] + ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SUB1]](s32), [[C]] + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[ICMP2]] + ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP3]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR]](s1) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s32>) = COPY [[BUILD_VECTOR]](<2 x s32>) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir index 67c715ff5058d..c4b7e71fb4d67 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir @@ -952,9 +952,9 @@ body: | ; GFX6-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]] ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32) ; GFX6-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]] - ; GFX6-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY1]](s64), [[C]] - ; GFX6-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY]](s64), [[COPY1]] + ; GFX6-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[C]] + ; GFX6-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[ICMP1]] ; GFX6-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) 
= G_ASHR [[COPY2]], [[C1]](s32) @@ -978,9 +978,9 @@ body: | ; GFX8-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]] ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32) ; GFX8-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]] - ; GFX8-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY1]](s64), [[C]] - ; GFX8-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY]](s64), [[COPY1]] + ; GFX8-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[C]] + ; GFX8-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[ICMP1]] ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) @@ -1004,9 +1004,9 @@ body: | ; GFX9-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]] ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]] - ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY1]](s64), [[C]] - ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY]](s64), [[COPY1]] + ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[C]] + ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[ICMP1]] ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) @@ -1043,9 +1043,9 @@ body: | ; GFX6-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV5]], [[UV7]], [[USUBO1]] ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32) ; GFX6-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[UV]] - ; GFX6-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV2]](s64), [[C]] - ; GFX6-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV]](s64), [[UV2]] + ; GFX6-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[C]] + ; GFX6-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[ICMP1]] ; GFX6-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) @@ -1061,9 +1061,9 @@ body: | ; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV12]], [[UV14]] ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV13]], [[UV15]], [[USUBO3]] ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32) - ; GFX6-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[UV1]] - ; GFX6-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV3]](s64), [[C]] - ; GFX6-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP2]] + ; GFX6-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV1]](s64), [[UV3]] + ; GFX6-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[C]] + ; GFX6-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP2]], [[ICMP3]] ; 
GFX6-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[MV2]](s64) ; GFX6-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C1]](s32) ; GFX6-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR1]](s64) @@ -1088,9 +1088,9 @@ body: | ; GFX8-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV5]], [[UV7]], [[USUBO1]] ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32) ; GFX8-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[UV]] - ; GFX8-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV2]](s64), [[C]] - ; GFX8-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV]](s64), [[UV2]] + ; GFX8-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[C]] + ; GFX8-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[ICMP1]] ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) @@ -1106,9 +1106,9 @@ body: | ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV12]], [[UV14]] ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV13]], [[UV15]], [[USUBO3]] ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32) - ; GFX8-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[UV1]] - ; GFX8-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV3]](s64), [[C]] - ; GFX8-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP2]] + ; GFX8-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV1]](s64), [[UV3]] + ; GFX8-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[C]] + ; GFX8-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP2]], [[ICMP3]] ; GFX8-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[MV2]](s64) ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C1]](s32) ; GFX8-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR1]](s64) @@ -1133,9 +1133,9 @@ body: | ; GFX9-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV5]], [[UV7]], [[USUBO1]] ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[UV]] - ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV2]](s64), [[C]] - ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV]](s64), [[UV2]] + ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[C]] + ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[ICMP1]] ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) @@ -1151,9 +1151,9 @@ body: | ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV12]], [[UV14]] ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV13]], [[UV15]], [[USUBO3]] ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32) - ; GFX9-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[UV1]] - ; GFX9-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV3]](s64), [[C]] - ; GFX9-NEXT: 
[[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP2]] + ; GFX9-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV1]](s64), [[UV3]] + ; GFX9-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[C]] + ; GFX9-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP2]], [[ICMP3]] ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[MV2]](s64) ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C1]](s32) ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR1]](s64) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 2673ac4fb5bae..930cba57b2c3b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -1014,14 +1014,14 @@ define i24 @v_ssubsat_i24(i24 %lhs, i24 %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v0, v1 -; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 24 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0 -; GFX8-NEXT: v_bfe_i32 v0, v1, 0, 24 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[6:7], 0, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v0, 23, v3 +; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 24 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v0, v1 +; GFX8-NEXT: v_bfe_i32 v0, v2, 0, 24 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v0, 23, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xff800000, v0 -; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX8-NEXT: s_xor_b64 vcc, s[4:5], s[6:7] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1064,15 +1064,15 @@ define amdgpu_ps i24 @s_ssubsat_i24(i24 inreg %lhs, i24 inreg %rhs) { ; GFX8-LABEL: s_ssubsat_i24: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_i32 s2, s0, s1 -; GFX8-NEXT: s_bfe_i32 s3, s2, 0x180000 ; GFX8-NEXT: s_bfe_i32 s0, s0, 0x180000 -; GFX8-NEXT: s_cmp_lt_i32 s3, s0 -; GFX8-NEXT: s_cselect_b32 s0, 1, 0 ; GFX8-NEXT: s_bfe_i32 s1, s1, 0x180000 -; GFX8-NEXT: s_cmp_gt_i32 s1, 0 -; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: s_xor_b32 s0, s1, s0 -; GFX8-NEXT: s_ashr_i32 s1, s3, 23 +; GFX8-NEXT: s_cmp_lt_i32 s0, s1 +; GFX8-NEXT: s_cselect_b32 s0, 1, 0 +; GFX8-NEXT: s_bfe_i32 s1, s2, 0x180000 +; GFX8-NEXT: s_cmp_lt_i32 s1, 0 +; GFX8-NEXT: s_cselect_b32 s3, 1, 0 +; GFX8-NEXT: s_ashr_i32 s1, s1, 23 +; GFX8-NEXT: s_xor_b32 s0, s0, s3 ; GFX8-NEXT: s_add_i32 s1, s1, 0xff800000 ; GFX8-NEXT: s_cmp_lg_u32 s0, 0 ; GFX8-NEXT: s_cselect_b32 s0, s1, s2 @@ -4092,15 +4092,15 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) { ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 ; GFX6-NEXT: v_subb_u32_e32 v6, vcc, v1, v3, vcc -; GFX6-NEXT: v_bfe_i32 v5, v4, 0, 16 ; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16 ; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 -; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] -; GFX6-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3] +; GFX6-NEXT: v_bfe_i32 v5, v4, 0, 16 +; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[0:1], v[2:3] +; GFX6-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[4:5] ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xffff8000, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v5 -; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX6-NEXT: s_xor_b64 vcc, s[4:5], s[6:7] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4110,15 +4110,15 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX8-NEXT: v_sub_u32_e32 v4, vcc, v0, v2 ; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v1, v3, vcc -; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 16 ; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16 ; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16 -; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] -; GFX8-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3] +; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 16 +; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[0:1], v[2:3] +; GFX8-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[4:5] ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xffff8000, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v5 -; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX8-NEXT: s_xor_b64 vcc, s[4:5], s[6:7] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4130,11 +4130,11 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) { ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX9-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4147,9 +4147,9 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) { ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1] +; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[4:5] ; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 ; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo @@ -4164,9 +4164,9 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) { ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] ; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 ; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, v1, v3, vcc_lo -; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1] +; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[4:5] ; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 ; GFX11-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 @@ -4180,22 +4180,22 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX6-LABEL: s_ssubsat_i48: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_sub_u32 s4, s0, s2 -; GFX6-NEXT: s_subb_u32 s3, s1, s3 +; GFX6-NEXT: s_subb_u32 s5, s1, s3 +; GFX6-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x300000 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000 -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] -; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 -; GFX6-NEXT: s_ashr_i32 s2, s7, 31 -; GFX6-NEXT: s_ashr_i32 s5, s7, 15 -; GFX6-NEXT: s_addk_i32 s2, 0x8000 -; GFX6-NEXT: v_mov_b32_e32 
v0, s5 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX6-NEXT: s_bfe_i64 s[0:1], s[4:5], 0x300000 +; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0 +; GFX6-NEXT: s_ashr_i32 s0, s1, 31 +; GFX6-NEXT: s_ashr_i32 s1, s1, 15 +; GFX6-NEXT: s_addk_i32 s0, 0x8000 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s3 -; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX6-NEXT: v_mov_b32_e32 v3, s5 +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[2:3] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX6-NEXT: v_readfirstlane_b32 s0, v0 @@ -4205,22 +4205,22 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX8-LABEL: s_ssubsat_i48: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_u32 s4, s0, s2 -; GFX8-NEXT: s_subb_u32 s3, s1, s3 +; GFX8-NEXT: s_subb_u32 s5, s1, s3 +; GFX8-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x300000 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000 -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] -; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 -; GFX8-NEXT: s_ashr_i32 s2, s7, 31 -; GFX8-NEXT: s_ashr_i32 s5, s7, 15 -; GFX8-NEXT: s_addk_i32 s2, 0x8000 -; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX8-NEXT: s_bfe_i64 s[0:1], s[4:5], 0x300000 +; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0 +; GFX8-NEXT: s_ashr_i32 s0, s1, 31 +; GFX8-NEXT: s_ashr_i32 s1, s1, 15 +; GFX8-NEXT: s_addk_i32 s0, 0x8000 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 @@ -4232,18 +4232,18 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 ; GFX9-NEXT: s_sub_u32 s4, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_subb_u32 s5, s1, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX9-NEXT: s_ashr_i32 s2, s5, 31 ; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX9-NEXT: s_xor_b64 vcc, vcc, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4257,13 +4257,13 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 ; GFX10-NEXT: s_sub_u32 s4, s0, s2 ; GFX10-NEXT: s_subb_u32 s5, s1, s3 +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], s[2:3] +; GFX10-NEXT: v_cmp_lt_i64_e64 s1, 
s[4:5], 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] -; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: s_ashr_i32 s2, s5, 31 ; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000 -; GFX10-NEXT: s_xor_b32 s0, s1, s0 +; GFX10-NEXT: s_xor_b32 s0, s0, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 ; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4277,12 +4277,12 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 ; GFX11-NEXT: s_sub_u32 s4, s0, s2 ; GFX11-NEXT: s_subb_u32 s5, s1, s3 +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], s[2:3] +; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], 0 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] -; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0 ; GFX11-NEXT: s_ashr_i32 s2, s5, 31 ; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000 -; GFX11-NEXT: s_xor_b32 s0, s1, s0 +; GFX11-NEXT: s_xor_b32 s0, s0, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 ; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4299,15 +4299,15 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s0, v0 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v3, v1, vcc -; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 ; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 ; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16 -; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] -; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1] +; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], v[0:1] +; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[2:3] ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v3 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xffff8000, v0 -; GFX6-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX6-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -4318,15 +4318,15 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0 ; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v3, v1, vcc -; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16 ; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 ; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16 -; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] -; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1] +; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], v[0:1] +; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[2:3] ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v3 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xffff8000, v0 -; GFX8-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX8-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -4339,11 +4339,11 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v1, vcc -; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], 0, v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[2:3] ; 
GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX9-NEXT: s_xor_b64 vcc, vcc, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4356,11 +4356,11 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[0:1] ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] +; GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3] ; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 -; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4373,11 +4373,11 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0 ; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, s1, v1, vcc_lo +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[0:1] ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] -; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] +; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3] ; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 -; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX11-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -4394,11 +4394,11 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v0 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v1, v3, vcc -; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 ; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16 ; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 -; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] -; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 +; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[0:1] +; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[2:3] ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v3 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xffff8000, v0 @@ -4413,11 +4413,11 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v0 ; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v1, v3, vcc -; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16 ; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16 ; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 -; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] -; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 +; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[0:1] +; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[2:3] ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v3 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xffff8000, v0 @@ -4434,11 +4434,11 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v3, 
vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[2:3] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX9-NEXT: s_xor_b64 vcc, vcc, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4451,11 +4451,11 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0 +; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[0:1] ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] +; GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3] ; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 -; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4468,11 +4468,11 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0 ; GFX11-NEXT: v_subrev_co_ci_u32_e64 v3, null, s1, v1, vcc_lo -; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0 +; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[0:1] ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] +; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3] ; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 -; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX11-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -4489,11 +4489,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 ; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc -; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] -; GFX6-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3] +; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[0:1], v[2:3] +; GFX6-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[4:5] ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX6-NEXT: s_xor_b64 vcc, s[4:5], s[6:7] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4503,11 +4503,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v0, v2 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc -; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] -; GFX8-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3] +; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[0:1], v[2:3] +; GFX8-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[4:5] ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX8-NEXT: s_xor_b64 vcc, s[4:5], s[6:7] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-NEXT: 
s_setpc_b64 s[30:31] @@ -4517,11 +4517,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX9-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4531,11 +4531,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[2:3] +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] +; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[4:5] ; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 -; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo +; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4545,11 +4545,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 ; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, v1, v3, vcc_lo -; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[2:3] +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] +; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[4:5] ; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 -; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX11-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) @@ -4560,18 +4560,18 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX6-LABEL: s_ssubsat_i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_sub_u32 s4, s0, s2 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_subb_u32 s5, s1, s3 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] -; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: s_ashr_i32 s2, s5, 31 ; GFX6-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX6-NEXT: v_readfirstlane_b32 s0, v0 @@ -4581,18 +4581,18 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX8-LABEL: s_ssubsat_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_u32 s4, s0, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_subb_u32 s5, s1, s3 -; GFX8-NEXT: 
v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] -; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX8-NEXT: s_ashr_i32 s2, s5, 31 ; GFX8-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 @@ -4602,18 +4602,18 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX9-LABEL: s_ssubsat_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_u32 s4, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_subb_u32 s5, s1, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX9-NEXT: s_ashr_i32 s2, s5, 31 ; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX9-NEXT: s_xor_b64 vcc, vcc, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 @@ -4624,13 +4624,13 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_sub_u32 s4, s0, s2 ; GFX10-NEXT: s_subb_u32 s5, s1, s3 +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], s[2:3] +; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] -; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: s_ashr_i32 s2, s5, 31 ; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000 -; GFX10-NEXT: s_xor_b32 s0, s1, s0 +; GFX10-NEXT: s_xor_b32 s0, s0, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 @@ -4641,12 +4641,12 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_sub_u32 s4, s0, s2 ; GFX11-NEXT: s_subb_u32 s5, s1, s3 +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], s[2:3] +; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], 0 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] -; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0 ; GFX11-NEXT: s_ashr_i32 s2, s5, 31 ; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000 -; GFX11-NEXT: s_xor_b32 s0, s1, s0 +; GFX11-NEXT: s_xor_b32 s0, s0, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 @@ -4662,11 +4662,11 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s0, v0 ; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc -; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] -; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1] +; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], v[0:1] +; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[2:3] ; 
GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX6-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX6-NEXT: ; return to shader part epilog @@ -4676,11 +4676,11 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc -; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] -; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1] +; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], v[0:1] +; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[2:3] ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX8-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: ; return to shader part epilog @@ -4690,11 +4690,11 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v1, vcc -; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], 0, v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[2:3] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX9-NEXT: s_xor_b64 vcc, vcc, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: ; return to shader part epilog @@ -4703,11 +4703,11 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[0:1] ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] +; GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3] ; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 -; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX10-NEXT: ; return to shader part epilog @@ -4716,11 +4716,11 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0 ; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, s1, v1, vcc_lo +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[0:1] ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] -; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] +; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3] ; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 -; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX11-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) @@ -4734,8 +4734,8 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v0 ; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v1, 
v3, vcc -; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] -; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 +; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[0:1] +; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[2:3] ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0 ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] @@ -4748,8 +4748,8 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v0 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc -; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] -; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 +; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[0:1] +; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[2:3] ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0 ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] @@ -4762,11 +4762,11 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[2:3] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX9-NEXT: s_xor_b64 vcc, vcc, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: ; return to shader part epilog @@ -4775,11 +4775,11 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0 +; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[0:1] ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] +; GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3] ; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 -; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX10-NEXT: ; return to shader part epilog @@ -4788,11 +4788,11 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0 ; GFX11-NEXT: v_subrev_co_ci_u32_e64 v3, null, s1, v1, vcc_lo -; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0 +; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[0:1] ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] +; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3] ; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 -; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX11-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) @@ -4806,21 +4806,21 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v0, v4 ; GFX6-NEXT: v_subb_u32_e32 v9, vcc, v1, v5, vcc -; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1] -; GFX6-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[4:5] +; GFX6-NEXT: 
v_cmp_lt_i64_e64 s[4:5], v[0:1], v[4:5] +; GFX6-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[8:9] ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v9 ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v1 -; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX6-NEXT: s_xor_b64 vcc, s[4:5], s[6:7] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v2, v6 ; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v3, v7, vcc -; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3] -; GFX6-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[6:7] +; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[2:3], v[6:7] +; GFX6-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[4:5] ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v5 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v2 -; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX6-NEXT: s_xor_b64 vcc, s[4:5], s[6:7] ; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4830,21 +4830,21 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v0, v4 ; GFX8-NEXT: v_subb_u32_e32 v9, vcc, v1, v5, vcc -; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1] -; GFX8-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[4:5] +; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[0:1], v[4:5] +; GFX8-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[8:9] ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v9 ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v0, v1 -; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX8-NEXT: s_xor_b64 vcc, s[4:5], s[6:7] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v2, v6 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v3, v7, vcc -; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3] -; GFX8-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[6:7] +; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[2:3], v[6:7] +; GFX8-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[4:5] ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v5 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x80000000, v2 -; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX8-NEXT: s_xor_b64 vcc, s[4:5], s[6:7] ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4854,20 +4854,20 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v0, v4 ; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v1, v5, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[4:5] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[8:9] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v9 ; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX9-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v2, v6 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v7, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[6:7] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[6:7] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5 ; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 -; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX9-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX9-NEXT: 
v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4880,17 +4880,17 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, v2, v6 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo ; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v9 -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[4:5] +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[4:5] +; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[8:9] ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v11 -; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3] -; GFX10-NEXT: v_cmp_lt_i64_e64 s6, 0, v[6:7] +; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[2:3], v[6:7] +; GFX10-NEXT: v_cmp_gt_i64_e64 s6, 0, v[10:11] ; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v12 ; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v4 -; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo +; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo -; GFX10-NEXT: s_xor_b32 vcc_lo, s6, s5 +; GFX10-NEXT: s_xor_b32 vcc_lo, s5, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4903,16 +4903,16 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX11-NEXT: v_sub_co_u32 v10, vcc_lo, v2, v6 ; GFX11-NEXT: v_sub_co_ci_u32_e64 v11, null, v3, v7, vcc_lo ; GFX11-NEXT: v_ashrrev_i32_e32 v12, 31, v9 -; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] -; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[4:5] +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[4:5] +; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[8:9] ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v11 -; GFX11-NEXT: v_cmp_lt_i64_e64 s1, v[10:11], v[2:3] -; GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0, v[6:7] +; GFX11-NEXT: v_cmp_lt_i64_e64 s1, v[2:3], v[6:7] +; GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0, v[10:11] ; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v12 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v4 -; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX11-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v12 :: v_dual_cndmask_b32 v1, v9, v1 -; GFX11-NEXT: s_xor_b32 vcc_lo, s2, s1 +; GFX11-NEXT: s_xor_b32 vcc_lo, s1, s2 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v10, v4 :: v_dual_cndmask_b32 v3, v11, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) @@ -4923,33 +4923,33 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX6-LABEL: s_ssubsat_v2i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_sub_u32 s8, s0, s4 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_subb_u32 s9, s1, s5 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] -; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[8:9], 0 ; GFX6-NEXT: s_ashr_i32 s4, s9, 31 ; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 -; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc ; GFX6-NEXT: s_sub_u32 s0, s2, s6 -; GFX6-NEXT: v_mov_b32_e32 
v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_subb_u32 s1, s3, s7 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] -; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0 ; GFX6-NEXT: s_ashr_i32 s4, s1, 31 ; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v4, s0 ; GFX6-NEXT: v_mov_b32_e32 v5, s1 -; GFX6-NEXT: s_xor_b64 vcc, s[2:3], vcc +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[2:3] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX6-NEXT: v_readfirstlane_b32 s0, v2 @@ -4961,33 +4961,33 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX8-LABEL: s_ssubsat_v2i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_u32 s8, s0, s4 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_subb_u32 s9, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] -; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[8:9], 0 ; GFX8-NEXT: s_ashr_i32 s4, s9, 31 ; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 -; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc ; GFX8-NEXT: s_sub_u32 s0, s2, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_subb_u32 s1, s3, s7 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] -; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0 ; GFX8-NEXT: s_ashr_i32 s4, s1, 31 ; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: s_xor_b64 vcc, s[2:3], vcc +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-NEXT: v_readfirstlane_b32 s0, v2 @@ -4999,33 +4999,33 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX9-LABEL: s_ssubsat_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_u32 s8, s0, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_subb_u32 s9, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[8:9], 0 ; GFX9-NEXT: s_ashr_i32 s4, s9, 31 ; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX9-NEXT: s_xor_b64 vcc, vcc, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc ; GFX9-NEXT: s_sub_u32 s0, s2, s6 -; 
GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_subb_u32 s1, s3, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0 ; GFX9-NEXT: s_ashr_i32 s4, s1, 31 ; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: s_xor_b64 vcc, s[2:3], vcc +; GFX9-NEXT: s_xor_b64 vcc, vcc, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: v_readfirstlane_b32 s0, v2 @@ -5038,24 +5038,24 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX10: ; %bb.0: ; GFX10-NEXT: s_sub_u32 s8, s0, s4 ; GFX10-NEXT: s_subb_u32 s9, s1, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1] -; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[4:5], 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], s[4:5] +; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], 0 ; GFX10-NEXT: s_ashr_i32 s4, s9, 31 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: s_add_i32 s8, s4, 0x80000000 -; GFX10-NEXT: s_xor_b32 s5, s1, s0 +; GFX10-NEXT: s_xor_b32 s8, s0, s1 ; GFX10-NEXT: s_sub_u32 s0, s2, s6 ; GFX10-NEXT: s_subb_u32 s1, s3, s7 +; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], s[6:7] +; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], 0 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[0:1], s[2:3] -; GFX10-NEXT: v_cmp_gt_i64_e64 s3, s[6:7], 0 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8 ; GFX10-NEXT: s_ashr_i32 s4, s1, 31 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8 ; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000 -; GFX10-NEXT: s_xor_b32 s1, s3, s2 +; GFX10-NEXT: s_xor_b32 s1, s2, s3 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 @@ -5068,22 +5068,22 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX11: ; %bb.0: ; GFX11-NEXT: s_sub_u32 s8, s0, s4 ; GFX11-NEXT: s_subb_u32 s9, s1, s5 -; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 -; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1] -; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[4:5], 0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], s[4:5] +; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], 0 ; GFX11-NEXT: s_ashr_i32 s4, s9, 31 -; GFX11-NEXT: s_add_i32 s8, s4, 0x80000000 -; GFX11-NEXT: s_xor_b32 s5, s1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX11-NEXT: s_add_i32 s5, s4, 0x80000000 +; GFX11-NEXT: s_xor_b32 s8, s0, s1 ; GFX11-NEXT: s_sub_u32 s0, s2, s6 ; GFX11-NEXT: s_subb_u32 s1, s3, s7 +; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], s[6:7] +; GFX11-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], 0 ; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[0:1], s[2:3] -; GFX11-NEXT: v_cmp_gt_i64_e64 s3, s[6:7], 0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s5 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8 ; GFX11-NEXT: s_ashr_i32 s4, s1, 31 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s8, s5 +; GFX11-NEXT: v_cndmask_b32_e64 
v1, v1, s5, s8 ; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000 -; GFX11-NEXT: s_xor_b32 s1, s3, s2 +; GFX11-NEXT: s_xor_b32 s1, s2, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 @@ -5098,39 +5098,37 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX6-LABEL: s_ssubsat_i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_sub_u32 s8, s0, s4 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: s_subb_u32 s9, s1, s5 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: s_subb_u32 s10, s2, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] -; GFX6-NEXT: s_subb_u32 s11, s3, s7 -; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_sub_u32 s10, s0, s4 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX6-NEXT: s_subb_u32 s11, s1, s5 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: s_subb_u32 s8, s2, s6 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[2:3] -; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[4:5], 0 +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX6-NEXT: s_subb_u32 s9, s3, s7 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[2:3] -; GFX6-NEXT: v_mov_b32_e32 v3, s9 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3] +; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[8:9], 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0 -; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[6:7], 0 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_ashr_i32 s0, s11, 31 +; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[8:9], 0 +; GFX6-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] +; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_ashr_i32 s0, s9, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_mov_b32_e32 v3, s11 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc ; GFX6-NEXT: v_mov_b32_e32 v3, s1 -; GFX6-NEXT: v_mov_b32_e32 v4, s10 -; GFX6-NEXT: v_mov_b32_e32 v5, s11 +; GFX6-NEXT: v_mov_b32_e32 v4, s8 +; GFX6-NEXT: v_mov_b32_e32 v5, s9 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX6-NEXT: v_readfirstlane_b32 s0, v0 @@ -5141,45 +5139,43 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; ; GFX8-LABEL: s_ssubsat_i128: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sub_u32 s8, s0, s4 -; GFX8-NEXT: s_subb_u32 s9, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: s_subb_u32 s10, s2, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: s_subb_u32 s11, s3, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_cmp_eq_u64 s[10:11], s[2:3] -; GFX8-NEXT: s_cselect_b32 s0, 1, 0 +; GFX8-NEXT: s_sub_u32 s10, s0, s4 +; GFX8-NEXT: s_subb_u32 s11, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_subb_u32 s8, s2, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: 
s_subb_u32 s9, s3, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] +; GFX8-NEXT: s_cselect_b32 s4, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[2:3] -; GFX8-NEXT: s_and_b32 s0, 1, s0 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: s_and_b32 s0, 1, s4 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: v_cmp_gt_u64_e64 s[0:1], s[4:5], 0 +; GFX8-NEXT: s_cmp_eq_u64 s[8:9], 0 +; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[8:9], 0 +; GFX8-NEXT: s_cselect_b32 s2, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0 -; GFX8-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX8-NEXT: s_ashr_i32 s0, s11, 31 +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] +; GFX8-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_ashr_i32 s0, s9, 31 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_mov_b32_e32 v2, s8 -; GFX8-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: v_mov_b32_e32 v3, s11 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s10 -; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 @@ -5190,45 +5186,43 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; ; GFX9-LABEL: s_ssubsat_i128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_sub_u32 s8, s0, s4 -; GFX9-NEXT: s_subb_u32 s9, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_subb_u32 s10, s2, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_subb_u32 s11, s3, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: s_cmp_eq_u64 s[10:11], s[2:3] -; GFX9-NEXT: s_cselect_b32 s0, 1, 0 +; GFX9-NEXT: s_sub_u32 s10, s0, s4 +; GFX9-NEXT: s_subb_u32 s11, s1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_subb_u32 s8, s2, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_subb_u32 s9, s3, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] +; GFX9-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[2:3] -; GFX9-NEXT: s_and_b32 s0, 1, s0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX9-NEXT: s_and_b32 s0, 1, s4 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: v_cmp_gt_u64_e64 s[0:1], s[4:5], 0 +; GFX9-NEXT: s_cmp_eq_u64 s[8:9], 0 +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[8:9], 0 +; GFX9-NEXT: s_cselect_b32 s2, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-NEXT: 
s_cmp_eq_u64 s[6:7], 0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0 -; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: s_ashr_i32 s0, s11, 31 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] +; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX9-NEXT: s_ashr_i32 s0, s9, 31 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s10 -; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 @@ -5239,37 +5233,35 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; ; GFX10-LABEL: s_ssubsat_i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_sub_u32 s8, s0, s4 -; GFX10-NEXT: s_subb_u32 s9, s1, s5 -; GFX10-NEXT: s_subb_u32 s10, s2, s6 -; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1] -; GFX10-NEXT: s_subb_u32 s11, s3, s7 -; GFX10-NEXT: s_cmp_eq_u64 s[10:11], s[2:3] -; GFX10-NEXT: s_cselect_b32 s12, 1, 0 +; GFX10-NEXT: s_sub_u32 s10, s0, s4 +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[4:5] +; GFX10-NEXT: s_subb_u32 s11, s1, s5 +; GFX10-NEXT: s_subb_u32 s8, s2, s6 +; GFX10-NEXT: s_subb_u32 s9, s3, s7 +; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] +; GFX10-NEXT: v_mov_b32_e32 v3, s9 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[10:11], s[2:3] -; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[2:3], s[6:7] +; GFX10-NEXT: s_cselect_b32 s12, 1, 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[8:9], 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s0, 1, s12 -; GFX10-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX10-NEXT: s_cmp_eq_u64 s[8:9], 0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 -; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[6:7], 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_ashr_i32 s0, s11, 31 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: s_and_b32 s1, 1, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 +; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: s_ashr_i32 s0, s9, 31 ; GFX10-NEXT: s_add_i32 s1, s0, 0x80000000 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v2, s10 -; GFX10-NEXT: v_mov_b32_e32 v3, s11 -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, s10 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s9 +; GFX10-NEXT: v_mov_b32_e32 v0, s11 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo ; 
GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s0, vcc_lo @@ -5282,35 +5274,34 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; ; GFX11-LABEL: s_ssubsat_i128: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_sub_u32 s8, s0, s4 -; GFX11-NEXT: s_subb_u32 s9, s1, s5 -; GFX11-NEXT: s_subb_u32 s10, s2, s6 -; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1] -; GFX11-NEXT: s_subb_u32 s11, s3, s7 -; GFX11-NEXT: s_cmp_eq_u64 s[10:11], s[2:3] -; GFX11-NEXT: s_cselect_b32 s12, 1, 0 +; GFX11-NEXT: s_sub_u32 s10, s0, s4 +; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[4:5] +; GFX11-NEXT: s_subb_u32 s11, s1, s5 +; GFX11-NEXT: s_subb_u32 s8, s2, s6 +; GFX11-NEXT: s_subb_u32 s9, s3, s7 +; GFX11-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] +; GFX11-NEXT: v_mov_b32_e32 v3, s9 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[10:11], s[2:3] -; GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], 0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[2:3], s[6:7] +; GFX11-NEXT: s_cselect_b32 s12, 1, 0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[8:9], 0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX11-NEXT: s_and_b32 s0, 1, s12 -; GFX11-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX11-NEXT: s_cmp_eq_u64 s[8:9], 0 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 -; GFX11-NEXT: v_cmp_gt_i64_e64 s2, s[6:7], 0 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX11-NEXT: s_cselect_b32 s1, 1, 0 -; GFX11-NEXT: s_ashr_i32 s0, s11, 31 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX11-NEXT: s_and_b32 s1, 1, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: s_ashr_i32 s0, s9, 31 ; GFX11-NEXT: s_add_i32 s1, s0, 0x80000000 -; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v2 :: v_dual_mov_b32 v2, s10 -; GFX11-NEXT: v_mov_b32_e32 v3, s11 -; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_and_b32 v0, 1, v0 +; GFX11-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_dual_mov_b32 v1, s10 :: v_dual_and_b32 v0, 1, v0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s9 +; GFX11-NEXT: v_mov_b32_e32 v0, s11 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s0, vcc_lo @@ -5327,156 +5318,146 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX6-LABEL: ssubsat_i128_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v5, s1 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s0, v0 -; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc -; GFX6-NEXT: v_mov_b32_e32 v6, s2 -; GFX6-NEXT: v_mov_b32_e32 v7, s3 -; GFX6-NEXT: v_subb_u32_e32 v6, vcc, v6, v2, vcc -; GFX6-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc -; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[6:7] -; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[6:7] -; GFX6-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v4, s1 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s0, v0 +; GFX6-NEXT: v_subb_u32_e32 v7, vcc, v4, v1, vcc +; GFX6-NEXT: v_mov_b32_e32 v4, s2 +; GFX6-NEXT: v_mov_b32_e32 v5, s3 +; GFX6-NEXT: 
v_subb_u32_e32 v4, vcc, v4, v2, vcc +; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3] +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3] +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v5 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX6-NEXT: v_xor_b32_e32 v0, v0, v8 +; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: ssubsat_i128_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s0, v0 -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v6, s2 -; GFX8-NEXT: v_mov_b32_e32 v7, s3 -; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v6, v2, vcc -; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc -; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s0, v0 +; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v4, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v4, v2, vcc +; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3] +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX8-NEXT: v_xor_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX8-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, 
v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: ssubsat_i128_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s0, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NEXT: v_mov_b32_e32 v7, s3 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v2, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v3, vcc -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, s0, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v4, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v2, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v3, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3] +; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, v0, v8 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5] ; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: ssubsat_i128_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, s0, v0 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo -; GFX10-NEXT: v_sub_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo -; GFX10-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo -; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[4:5] -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[6:7] -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, s0, v0 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v4, vcc_lo, s2, v2, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, s3, v3, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1] ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[2:3] ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 
vcc_lo, s[2:3], v[6:7] -; GFX10-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] -; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[2:3] +; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5] ; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2 -; GFX10-NEXT: v_xor_b32_e32 v0, v0, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, 0, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: ssubsat_i128_sv: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, s0, v0 -; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo -; GFX11-NEXT: v_sub_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo -; GFX11-NEXT: v_sub_co_ci_u32_e64 v7, null, s3, v3, vcc_lo -; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[4:5] -; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[6:7] -; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_sub_co_u32 v6, vcc_lo, s0, v0 +; GFX11-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, s1, v1, vcc_lo +; GFX11-NEXT: v_sub_co_ci_u32_e32 v4, vcc_lo, s2, v2, vcc_lo +; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, s3, v3, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1] ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[6:7] -; GFX11-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] -; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[2:3] +; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v5 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2 -; GFX11-NEXT: v_xor_b32_e32 v0, v0, v8 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_cndmask_b32_e64 v1, v8, 0, vcc_lo +; GFX11-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v6, v2 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs) %cast = bitcast i128 %result to <4 x float> @@ -5486,169 +5467,146 @@ define amdgpu_ps <4 x float> 
@ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) { define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX6-LABEL: ssubsat_i128_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v5, s1 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s0, v0 -; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v1, v5, vcc -; GFX6-NEXT: v_mov_b32_e32 v6, s2 -; GFX6-NEXT: v_mov_b32_e32 v7, s3 -; GFX6-NEXT: v_subb_u32_e32 v6, vcc, v2, v6, vcc -; GFX6-NEXT: v_subb_u32_e32 v7, vcc, v3, v7, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1] -; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], 0 +; GFX6-NEXT: v_mov_b32_e32 v4, s1 +; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s0, v0 +; GFX6-NEXT: v_subb_u32_e32 v7, vcc, v1, v4, vcc +; GFX6-NEXT: v_mov_b32_e32 v4, s2 +; GFX6-NEXT: v_mov_b32_e32 v5, s3 +; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v2, v4, vcc +; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v3, v5, vcc +; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3] +; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[2:3] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3] +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v5 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 -; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[2:3], 0 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: ssubsat_i128_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s0, v0 -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v5, vcc -; GFX8-NEXT: v_mov_b32_e32 v6, s2 -; GFX8-NEXT: v_mov_b32_e32 v7, s3 -; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v2, v6, vcc -; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v3, v7, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1] -; GFX8-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s0, v0 +; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v1, v4, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v2, v4, vcc +; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v3, v5, vcc +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3] -; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] -; GFX8-NEXT: s_cselect_b32 
s4, 1, 0 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX8-NEXT: s_and_b32 s0, 1, s4 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX8-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: ssubsat_i128_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s0, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v5, vcc -; GFX9-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NEXT: v_mov_b32_e32 v7, s3 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v6, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v3, v7, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1] -; GFX9-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s0, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v1, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v2, v4, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v5, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: s_cselect_b32 s4, 1, 0 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3] +; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX9-NEXT: s_and_b32 s0, 1, s4 -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5] ; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-NEXT: 
v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: ssubsat_i128_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, s0 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] -; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], 0 -; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0 -; GFX10-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v0, s0 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v4, vcc_lo, s2, v2, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s3, v3, vcc_lo +; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3] -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 -; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], 0 +; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[2:3] ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 -; GFX10-NEXT: s_and_b32 s0, 1, s4 -; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[2:3] +; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5] ; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, 0, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: ssubsat_i128_vs: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, s0 -; GFX11-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo -; GFX11-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo -; GFX11-NEXT: v_subrev_co_ci_u32_e64 v7, null, s3, v3, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] -; GFX11-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], 0 -; GFX11-NEXT: s_cmp_eq_u64 s[2:3], 0 -; GFX11-NEXT: s_cselect_b32 s4, 1, 0 +; GFX11-NEXT: v_sub_co_u32 v6, vcc_lo, v0, s0 +; GFX11-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s1, v1, vcc_lo +; GFX11-NEXT: v_subrev_co_ci_u32_e32 v4, vcc_lo, s2, v2, vcc_lo +; GFX11-NEXT: v_subrev_co_ci_u32_e64 v5, null, s3, v3, vcc_lo +; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 -; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], 0 
+; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] -; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 -; GFX11-NEXT: s_and_b32 s0, 1, s4 -; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo -; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[2:3] +; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v5 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_cndmask_b32_e64 v1, v8, 0, vcc_lo +; GFX11-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v6, v2 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs) %cast = bitcast i128 %result to <4 x float> @@ -5659,275 +5617,256 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX6-LABEL: v_ssubsat_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v0, v8 -; GFX6-NEXT: v_subb_u32_e32 v17, vcc, v1, v9, vcc -; GFX6-NEXT: v_subb_u32_e32 v18, vcc, v2, v10, vcc -; GFX6-NEXT: v_subb_u32_e32 v19, vcc, v3, v11, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[0:1] +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v0, v8 +; GFX6-NEXT: v_subb_u32_e32 v19, vcc, v1, v9, vcc +; GFX6-NEXT: v_subb_u32_e32 v16, vcc, v2, v10, vcc +; GFX6-NEXT: v_subb_u32_e32 v17, vcc, v3, v11, vcc +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[8:9] ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[18:19], v[2:3] +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[10:11] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[18:19], v[2:3] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v17 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[8:9] +; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[16:17] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[10:11] -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v19 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; 
GFX6-NEXT: v_sub_i32_e32 v8, vcc, v4, v12 -; GFX6-NEXT: v_subb_u32_e32 v9, vcc, v5, v13, vcc -; GFX6-NEXT: v_subb_u32_e32 v10, vcc, v6, v14, vcc -; GFX6-NEXT: v_subb_u32_e32 v11, vcc, v7, v15, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v0, v18, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v19, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v4, v12 +; GFX6-NEXT: v_subb_u32_e32 v11, vcc, v5, v13, vcc +; GFX6-NEXT: v_subb_u32_e32 v8, vcc, v6, v14, vcc +; GFX6-NEXT: v_subb_u32_e32 v9, vcc, v7, v15, vcc +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13] ; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[14:15] ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[14:15] +; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v9 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[12:13] +; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[8:9] ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[14:15] -; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] -; GFX6-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GFX6-NEXT: v_xor_b32_e32 v4, v5, v4 -; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v11 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc +; GFX6-NEXT: v_xor_b32_e32 v4, v4, v5 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, 0x80000000, v6 ; GFX6-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v4, v10, v6, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v5, v11, v6, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v7, v9, v7, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v0, v8 -; GFX8-NEXT: v_subb_u32_e32 v17, vcc, v1, v9, vcc -; GFX8-NEXT: v_subb_u32_e32 v18, vcc, v2, v10, vcc -; GFX8-NEXT: v_subb_u32_e32 v19, vcc, v3, v11, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[0:1] +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v0, v8 +; GFX8-NEXT: v_subb_u32_e32 v19, vcc, v1, v9, vcc +; GFX8-NEXT: v_subb_u32_e32 v16, vcc, v2, v10, vcc +; GFX8-NEXT: v_subb_u32_e32 v17, vcc, v3, v11, vcc +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[18:19], v[2:3] +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[10:11] ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[18:19], v[2:3] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v17 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[8:9] +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[16:17] ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 
-; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v19 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX8-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v4, v12 -; GFX8-NEXT: v_subb_u32_e32 v9, vcc, v5, v13, vcc -; GFX8-NEXT: v_subb_u32_e32 v10, vcc, v6, v14, vcc -; GFX8-NEXT: v_subb_u32_e32 v11, vcc, v7, v15, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v18, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc +; GFX8-NEXT: v_sub_u32_e32 v10, vcc, v4, v12 +; GFX8-NEXT: v_subb_u32_e32 v11, vcc, v5, v13, vcc +; GFX8-NEXT: v_subb_u32_e32 v8, vcc, v6, v14, vcc +; GFX8-NEXT: v_subb_u32_e32 v9, vcc, v7, v15, vcc +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13] ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[14:15] ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[14:15] +; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v9 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[12:13] +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] -; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GFX8-NEXT: v_xor_b32_e32 v4, v5, v4 -; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v11 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc +; GFX8-NEXT: v_xor_b32_e32 v4, v4, v5 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x80000000, v6 ; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v10, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v11, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v7, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_sub_co_u32_e32 v16, vcc, v0, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v17, vcc, v1, v9, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v18, vcc, v2, v10, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v19, vcc, v3, v11, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[0:1] +; GFX9-NEXT: v_sub_co_u32_e32 v18, vcc, v0, v8 +; GFX9-NEXT: v_subb_co_u32_e32 v19, vcc, v1, v9, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v2, v10, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v17, vcc, v3, v11, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[18:19], v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[10:11] ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; 
GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[18:19], v[2:3] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v17 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[8:9] +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[16:17] +; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v19 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v4, v12 -; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v5, v13, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v6, v14, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, v7, v15, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v19, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v10, vcc, v4, v12 +; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, v5, v13, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v6, v14, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v7, v15, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13] ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[14:15] ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[14:15] +; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v9 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[12:13] +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[8:9] +; GFX9-NEXT: v_add_u32_e32 v7, 0x80000000, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GFX9-NEXT: v_xor_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v11 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc +; GFX9-NEXT: v_xor_b32_e32 v4, v4, v5 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX9-NEXT: v_add_u32_e32 v7, 0x80000000, v6 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v10, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v7, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ssubsat_v2i128: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_sub_co_u32 v16, vcc_lo, v0, v8 -; GFX10-NEXT: v_sub_co_ci_u32_e32 
v17, vcc_lo, v1, v9, vcc_lo -; GFX10-NEXT: v_sub_co_ci_u32_e32 v18, vcc_lo, v2, v10, vcc_lo -; GFX10-NEXT: v_sub_co_ci_u32_e32 v19, vcc_lo, v3, v11, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[0:1] +; GFX10-NEXT: v_sub_co_u32 v18, vcc_lo, v0, v8 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v19, vcc_lo, v1, v9, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v16, vcc_lo, v2, v10, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v17, vcc_lo, v3, v11, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[2:3] +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[10:11] ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[2:3] +; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[16:17] +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[10:11] ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[8:9] -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[10:11] -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v4, v12 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v5, v13, vcc_lo -; GFX10-NEXT: v_sub_co_ci_u32_e32 v20, vcc_lo, v6, v14, vcc_lo -; GFX10-NEXT: v_sub_co_ci_u32_e32 v21, vcc_lo, v7, v15, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] -; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[4:5] -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[16:17] +; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, 0, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, v4, v12 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v5, v13, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v8, vcc_lo, v6, v14, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v7, v15, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13] +; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[20:21], v[6:7] +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[14:15] ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[12:13] +; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[8:9] ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[14:15] -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[20:21], v[6:7] -; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v21 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[14:15] +; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v9 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[8:9] ; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1 -; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v19 +; GFX10-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v17 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo +; GFX10-NEXT: 
v_cndmask_b32_e32 v0, v18, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v19, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v20, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v21, v7, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v17, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v10, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v11, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v8, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v9, v7, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_ssubsat_v2i128: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_sub_co_u32 v16, vcc_lo, v0, v8 -; GFX11-NEXT: v_sub_co_ci_u32_e32 v17, vcc_lo, v1, v9, vcc_lo -; GFX11-NEXT: v_sub_co_ci_u32_e32 v18, vcc_lo, v2, v10, vcc_lo -; GFX11-NEXT: v_sub_co_ci_u32_e64 v19, null, v3, v11, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[0:1] +; GFX11-NEXT: v_sub_co_u32 v18, vcc_lo, v0, v8 +; GFX11-NEXT: v_sub_co_ci_u32_e32 v19, vcc_lo, v1, v9, vcc_lo +; GFX11-NEXT: v_sub_co_ci_u32_e32 v16, vcc_lo, v2, v10, vcc_lo +; GFX11-NEXT: v_sub_co_ci_u32_e64 v17, null, v3, v11, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[2:3] +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[10:11] ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[2:3] +; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[16:17] +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[10:11] ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[8:9] -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[10:11] -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-NEXT: v_sub_co_u32 v8, vcc_lo, v4, v12 -; GFX11-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v5, v13, vcc_lo -; GFX11-NEXT: v_sub_co_ci_u32_e32 v20, vcc_lo, v6, v14, vcc_lo -; GFX11-NEXT: v_sub_co_ci_u32_e64 v21, null, v7, v15, vcc_lo -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] -; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[4:5] -; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[16:17] +; GFX11-NEXT: v_cndmask_b32_e64 v1, v8, 0, vcc_lo +; GFX11-NEXT: v_sub_co_u32 v10, vcc_lo, v4, v12 +; GFX11-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v5, v13, vcc_lo +; GFX11-NEXT: v_sub_co_ci_u32_e32 v8, vcc_lo, v6, v14, vcc_lo +; GFX11-NEXT: v_sub_co_ci_u32_e64 v9, null, v7, v15, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13] +; GFX11-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[20:21], v[6:7] +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[14:15] ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[12:13] +; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[8:9] ; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[14:15] -; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[20:21], v[6:7] -; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v21 +; GFX11-NEXT: 
v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[14:15] +; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v9 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] -; GFX11-NEXT: v_dual_cndmask_b32 v2, v5, v4 :: v_dual_add_nc_u32 v7, 0x80000000, v6 -; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1 -; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v19 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[8:9] +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo +; GFX11-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v17 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v16, v2 :: v_dual_and_b32 v3, 1, v1 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v18, v2 :: v_dual_and_b32 v3, 1, v1 ; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v2 :: v_dual_cndmask_b32 v3, v19, v4 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v6, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, v6, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v20, v6, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v21, v7, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v19, v2, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v2, v16, v2 :: v_dual_cndmask_b32 v3, v17, v4 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v10, v6, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v11, v6, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v8, v6, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v9, v7, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) ret <2 x i128> %result @@ -5936,74 +5875,70 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs) { ; GFX6-LABEL: s_ssubsat_v2i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_sub_u32 s16, s0, s8 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: s_subb_u32 s17, s1, s9 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: s_subb_u32 s18, s2, s10 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1] -; GFX6-NEXT: s_subb_u32 s19, s3, s11 -; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NEXT: s_sub_u32 s18, s0, s8 +; GFX6-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX6-NEXT: s_subb_u32 s19, s1, s9 +; GFX6-NEXT: v_mov_b32_e32 v3, s11 +; GFX6-NEXT: s_subb_u32 s16, s2, s10 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[2:3] -; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[8:9], 0 +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX6-NEXT: s_subb_u32 s17, s3, s11 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[18:19], v[2:3] -; GFX6-NEXT: v_mov_b32_e32 v3, s17 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3] +; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[16:17], 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0 -; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[10:11], 0 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_ashr_i32 s0, s19, 31 +; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[16:17], 0 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: v_cndmask_b32_e64 
v1, v1, 0, s[0:1] +; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_ashr_i32 s0, s17, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v2, s16 +; GFX6-NEXT: v_mov_b32_e32 v3, s19 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX6-NEXT: v_mov_b32_e32 v0, s1 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: v_mov_b32_e32 v3, s19 +; GFX6-NEXT: v_mov_b32_e32 v2, s16 +; GFX6-NEXT: v_mov_b32_e32 v3, s17 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc -; GFX6-NEXT: s_sub_u32 s0, s4, s12 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: s_subb_u32 s1, s5, s13 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_subb_u32 s2, s6, s14 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX6-NEXT: s_subb_u32 s3, s7, s15 -; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mov_b32_e32 v1, s13 +; GFX6-NEXT: s_sub_u32 s8, s4, s12 +; GFX6-NEXT: v_mov_b32_e32 v2, s14 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] +; GFX6-NEXT: s_subb_u32 s9, s5, s13 +; GFX6-NEXT: v_mov_b32_e32 v3, s15 +; GFX6-NEXT: s_subb_u32 s0, s6, s14 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX6-NEXT: v_cmp_gt_u64_e64 s[4:5], s[12:13], 0 +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX6-NEXT: s_subb_u32 s1, s7, s15 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3] -; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[2:3] +; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], s[14:15], 0 -; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[14:15], 0 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_ashr_i32 s4, s3, 31 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[2:3], s[0:1], 0 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[2:3] +; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_ashr_i32 s2, s1, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_add_i32 s3, s2, 0x80000000 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc -; GFX6-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-NEXT: v_mov_b32_e32 v8, s2 -; GFX6-NEXT: v_mov_b32_e32 v9, s3 +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_mov_b32_e32 v8, s0 +; GFX6-NEXT: v_mov_b32_e32 v9, s1 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; GFX6-NEXT: v_readfirstlane_b32 s0, v4 @@ -6018,86 +5953,82 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; ; GFX8-LABEL: s_ssubsat_v2i128: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sub_u32 s16, s0, s8 -; GFX8-NEXT: s_subb_u32 s17, s1, s9 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: s_subb_u32 s18, s2, s10 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: s_subb_u32 s19, s3, s11 -; GFX8-NEXT: 
v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] -; GFX8-NEXT: s_cselect_b32 s0, 1, 0 +; GFX8-NEXT: s_sub_u32 s18, s0, s8 +; GFX8-NEXT: s_subb_u32 s19, s1, s9 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: s_subb_u32 s16, s2, s10 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: s_subb_u32 s17, s3, s11 +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[10:11] +; GFX8-NEXT: s_cselect_b32 s8, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[2:3] -; GFX8-NEXT: s_and_b32 s0, 1, s0 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: s_and_b32 s0, 1, s8 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: v_cmp_gt_u64_e64 s[0:1], s[8:9], 0 +; GFX8-NEXT: s_cmp_eq_u64 s[16:17], 0 +; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[16:17], 0 +; GFX8-NEXT: s_cselect_b32 s2, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX8-NEXT: s_cmp_eq_u64 s[10:11], 0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0 -; GFX8-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX8-NEXT: s_ashr_i32 s0, s19, 31 +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] +; GFX8-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_ashr_i32 s0, s17, 31 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_mov_b32_e32 v2, s16 -; GFX8-NEXT: v_mov_b32_e32 v3, s17 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: v_mov_b32_e32 v3, s19 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NEXT: v_mov_b32_e32 v3, s19 -; GFX8-NEXT: s_sub_u32 s0, s4, s12 +; GFX8-NEXT: v_mov_b32_e32 v2, s16 +; GFX8-NEXT: v_mov_b32_e32 v3, s17 +; GFX8-NEXT: s_sub_u32 s8, s4, s12 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc -; GFX8-NEXT: s_subb_u32 s1, s5, s13 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: s_subb_u32 s2, s6, s14 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_subb_u32 s3, s7, s15 -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] -; GFX8-NEXT: s_cselect_b32 s4, 1, 0 +; GFX8-NEXT: s_subb_u32 s9, s5, s13 +; GFX8-NEXT: v_mov_b32_e32 v0, s12 +; GFX8-NEXT: s_subb_u32 s0, s6, s14 +; GFX8-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NEXT: s_subb_u32 s1, s7, s15 +; GFX8-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s15 +; GFX8-NEXT: s_cmp_eq_u64 s[6:7], s[14:15] +; GFX8-NEXT: s_cselect_b32 s2, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: s_and_b32 s4, 1, s4 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: s_and_b32 s2, 1, s2 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX8-NEXT: 
v_cmp_gt_u64_e64 s[4:5], s[12:13], 0 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX8-NEXT: s_cmp_eq_u64 s[0:1], 0 +; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0 +; GFX8-NEXT: s_cselect_b32 s4, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX8-NEXT: s_cmp_eq_u64 s[14:15], 0 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], s[14:15], 0 -; GFX8-NEXT: s_cselect_b32 s6, 1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] -; GFX8-NEXT: s_and_b32 s4, 1, s6 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX8-NEXT: s_ashr_i32 s4, s3, 31 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; GFX8-NEXT: s_and_b32 s2, 1, s4 +; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, s2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[2:3] +; GFX8-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_ashr_i32 s2, s1, 31 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_add_i32 s3, s2, 0x80000000 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mov_b32_e32 v8, s2 -; GFX8-NEXT: v_mov_b32_e32 v9, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mov_b32_e32 v8, s0 +; GFX8-NEXT: v_mov_b32_e32 v9, s1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; GFX8-NEXT: v_readfirstlane_b32 s0, v4 @@ -6112,86 +6043,82 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; ; GFX9-LABEL: s_ssubsat_v2i128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_sub_u32 s16, s0, s8 -; GFX9-NEXT: s_subb_u32 s17, s1, s9 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_subb_u32 s18, s2, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_subb_u32 s19, s3, s11 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] -; GFX9-NEXT: s_cselect_b32 s0, 1, 0 +; GFX9-NEXT: s_sub_u32 s18, s0, s8 +; GFX9-NEXT: s_subb_u32 s19, s1, s9 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: s_subb_u32 s16, s2, s10 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_subb_u32 s17, s3, s11 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[10:11] +; GFX9-NEXT: s_cselect_b32 s8, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[2:3] -; GFX9-NEXT: s_and_b32 s0, 1, s0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX9-NEXT: s_and_b32 s0, 1, s8 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: v_cmp_gt_u64_e64 s[0:1], s[8:9], 0 +; GFX9-NEXT: s_cmp_eq_u64 s[16:17], 0 +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[16:17], 0 +; GFX9-NEXT: s_cselect_b32 s2, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-NEXT: s_cmp_eq_u64 s[10:11], 0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0 -; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 -; GFX9-NEXT: v_cmp_ne_u32_e64 
vcc, 0, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: s_ashr_i32 s0, s19, 31 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] +; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX9-NEXT: s_ashr_i32 s0, s17, 31 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-NEXT: s_sub_u32 s0, s4, s12 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: s_sub_u32 s8, s4, s12 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc -; GFX9-NEXT: s_subb_u32 s1, s5, s13 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: s_subb_u32 s2, s6, s14 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_subb_u32 s3, s7, s15 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] -; GFX9-NEXT: s_cselect_b32 s4, 1, 0 +; GFX9-NEXT: s_subb_u32 s9, s5, s13 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: s_subb_u32 s0, s6, s14 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_subb_u32 s1, s7, s15 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s15 +; GFX9-NEXT: s_cmp_eq_u64 s[6:7], s[14:15] +; GFX9-NEXT: s_cselect_b32 s2, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: s_and_b32 s4, 1, s4 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: s_and_b32 s2, 1, s2 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], s[12:13], 0 +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX9-NEXT: s_cmp_eq_u64 s[0:1], 0 +; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0 +; GFX9-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-NEXT: s_cmp_eq_u64 s[14:15], 0 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], s[14:15], 0 -; GFX9-NEXT: s_cselect_b32 s6, 1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] -; GFX9-NEXT: s_and_b32 s4, 1, s6 -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: s_ashr_i32 s4, s3, 31 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; GFX9-NEXT: s_and_b32 s2, 1, s4 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, s2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[2:3] +; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX9-NEXT: s_ashr_i32 s2, s1, 31 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s5 
-; GFX9-NEXT: v_mov_b32_e32 v8, s2 -; GFX9-NEXT: v_mov_b32_e32 v9, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v8, s0 +; GFX9-NEXT: v_mov_b32_e32 v9, s1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; GFX9-NEXT: v_readfirstlane_b32 s0, v4 @@ -6207,79 +6134,75 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-LABEL: s_ssubsat_v2i128: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_sub_u32 s18, s0, s8 +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[8:9] ; GFX10-NEXT: s_subb_u32 s19, s1, s9 ; GFX10-NEXT: s_subb_u32 s16, s2, s10 -; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[18:19], s[0:1] ; GFX10-NEXT: s_subb_u32 s17, s3, s11 -; GFX10-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] -; GFX10-NEXT: s_cselect_b32 s20, 1, 0 +; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[10:11] +; GFX10-NEXT: v_mov_b32_e32 v4, s17 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3] -; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[2:3], s[10:11] +; GFX10-NEXT: s_cselect_b32 s20, 1, 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[16:17], 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s0, 1, s20 -; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cmp_eq_u64 s[16:17], 0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 -; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_ashr_i32 s8, s17, 31 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: s_and_b32 s1, 1, s1 -; GFX10-NEXT: s_add_i32 s9, s8, 0x80000000 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 +; GFX10-NEXT: s_ashr_i32 s2, s17, 31 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 +; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000 +; GFX10-NEXT: s_sub_u32 s8, s4, s12 +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[4:5], s[12:13] ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10-NEXT: s_sub_u32 s0, s4, s12 -; GFX10-NEXT: s_subb_u32 s1, s5, s13 -; GFX10-NEXT: s_subb_u32 s2, s6, s14 -; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo -; GFX10-NEXT: s_subb_u32 s3, s7, s15 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 -; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] -; GFX10-NEXT: v_mov_b32_e32 v6, s2 -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 +; GFX10-NEXT: s_subb_u32 s9, s5, s13 +; GFX10-NEXT: s_subb_u32 s0, s6, s14 +; GFX10-NEXT: s_subb_u32 s1, s7, s15 +; GFX10-NEXT: s_cmp_eq_u64 s[6:7], s[14:15] +; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[2:3], s[6:7] -; GFX10-NEXT: v_cmp_gt_u64_e64 s6, s[12:13], 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[6:7], s[14:15] ; GFX10-NEXT: s_cselect_b32 s10, 1, 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[0:1], 0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_mov_b32_e32 v7, s3 +; GFX10-NEXT: v_mov_b32_e32 v5, s8 +; GFX10-NEXT: v_mov_b32_e32 v6, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 ; GFX10-NEXT: s_and_b32 s4, 1, s10 -; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0 +; GFX10-NEXT: s_cmp_eq_u64 s[0:1], 0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 -; GFX10-NEXT: v_cmp_gt_i64_e64 s6, s[14:15], 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 ; GFX10-NEXT: s_cselect_b32 s5, 1, 0 -; GFX10-NEXT: s_ashr_i32 s4, s3, 31 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 ; GFX10-NEXT: s_and_b32 s5, 1, s5 
-; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6 +; GFX10-NEXT: v_mov_b32_e32 v7, s1 +; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, s18 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s19 -; GFX10-NEXT: v_mov_b32_e32 v4, s17 -; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, 0, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s18 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v1, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, s16 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s9, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s2, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s3, v4 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo +; GFX10-NEXT: s_ashr_i32 s2, s1, 31 ; GFX10-NEXT: v_readfirstlane_b32 s1, v0 -; GFX10-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s4, vcc_lo +; GFX10-NEXT: s_add_i32 s0, s2, 0x80000000 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s0, v3 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s4, v5 ; GFX10-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-NEXT: v_readfirstlane_b32 s6, v6 @@ -6289,77 +6212,74 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX11-LABEL: s_ssubsat_v2i128: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_sub_u32 s18, s0, s8 +; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[8:9] ; GFX11-NEXT: s_subb_u32 s19, s1, s9 ; GFX11-NEXT: s_subb_u32 s16, s2, s10 -; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[18:19], s[0:1] ; GFX11-NEXT: s_subb_u32 s17, s3, s11 -; GFX11-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] -; GFX11-NEXT: s_cselect_b32 s20, 1, 0 +; GFX11-NEXT: s_cmp_eq_u64 s[2:3], s[10:11] ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3] -; GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[2:3], s[10:11] +; GFX11-NEXT: s_cselect_b32 s20, 1, 0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[16:17], 0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX11-NEXT: s_and_b32 s0, 1, s20 -; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cmp_eq_u64 s[16:17], 0 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 -; GFX11-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX11-NEXT: s_cselect_b32 s1, 1, 0 -; GFX11-NEXT: s_ashr_i32 s8, s17, 31 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX11-NEXT: s_and_b32 s1, 1, s1 -; GFX11-NEXT: s_add_i32 s9, s8, 0x80000000 -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 +; GFX11-NEXT: s_ashr_i32 s2, s17, 31 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 +; 
GFX11-NEXT: s_add_i32 s3, s2, 0x80000000 +; GFX11-NEXT: s_sub_u32 s8, s4, s12 +; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[4:5], s[12:13] ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX11-NEXT: s_sub_u32 s0, s4, s12 -; GFX11-NEXT: s_subb_u32 s1, s5, s13 -; GFX11-NEXT: s_subb_u32 s2, s6, s14 -; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] -; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo -; GFX11-NEXT: s_subb_u32 s3, s7, s15 -; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 -; GFX11-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] -; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 +; GFX11-NEXT: s_subb_u32 s9, s5, s13 +; GFX11-NEXT: s_subb_u32 s0, s6, s14 +; GFX11-NEXT: s_subb_u32 s1, s7, s15 +; GFX11-NEXT: s_cmp_eq_u64 s[6:7], s[14:15] +; GFX11-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 -; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[2:3], s[6:7] -; GFX11-NEXT: v_cmp_gt_u64_e64 s6, s[12:13], 0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[6:7], s[14:15] ; GFX11-NEXT: s_cselect_b32 s10, 1, 0 -; GFX11-NEXT: v_mov_b32_e32 v5, s0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[0:1], 0 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 ; GFX11-NEXT: s_and_b32 s4, 1, s10 -; GFX11-NEXT: s_cmp_eq_u64 s[14:15], 0 +; GFX11-NEXT: s_cmp_eq_u64 s[0:1], 0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 -; GFX11-NEXT: v_cmp_gt_i64_e64 s6, s[14:15], 0 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 ; GFX11-NEXT: s_cselect_b32 s5, 1, 0 -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 ; GFX11-NEXT: s_and_b32 s5, 1, s5 -; GFX11-NEXT: s_ashr_i32 s4, s3, 31 -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6 +; GFX11-NEXT: v_mov_b32_e32 v5, s8 +; GFX11-NEXT: v_cmp_ne_u32_e64 s4, 0, s5 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 -; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000 -; GFX11-NEXT: v_dual_cndmask_b32 v2, v4, v3 :: v_dual_mov_b32 v3, s18 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v3, 0, s4 +; GFX11-NEXT: v_mov_b32_e32 v3, s18 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, s17 -; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_xor_b32_e32 v1, v1, v2 ; GFX11-NEXT: v_mov_b32_e32 v0, s19 ; GFX11-NEXT: v_mov_b32_e32 v2, s16 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s8, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s3, vcc_lo ; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s8, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s8, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo +; GFX11-NEXT: s_ashr_i32 s2, s1, 31 ; GFX11-NEXT: v_readfirstlane_b32 s3, v4 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v1, s9 +; GFX11-NEXT: s_add_i32 s0, s2, 0x80000000 ; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: v_readfirstlane_b32 s2, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s4, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s2, 
vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc_lo ; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s2, v2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v5 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v6 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll index 017575b92143b..2d65433c082e3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll @@ -204,9 +204,9 @@ define i32 @v_ssubo_i32(i32 %a, i32 %b) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 -; GFX7-NEXT: v_cmp_lt_i32_e32 vcc, v2, v0 -; GFX7-NEXT: v_cmp_lt_i32_e64 s[4:5], 0, v1 -; GFX7-NEXT: s_xor_b64 s[4:5], s[4:5], vcc +; GFX7-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v2 +; GFX7-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v2, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -215,9 +215,9 @@ define i32 @v_ssubo_i32(i32 %a, i32 %b) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v0, v1 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v2, v0 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], 0, v1 -; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v2 +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -226,9 +226,9 @@ define i32 @v_ssubo_i32(i32 %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v2, v0, v1 -; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, v2, v0 -; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], 0, v1 -; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], vcc +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v2 +; GFX9-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX9-NEXT: v_sub_u32_e32 v0, v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -246,9 +246,9 @@ define i64 @v_ssubo_i64(i64 %a, i64 %b) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 ; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc -; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX7-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] -; GFX7-NEXT: s_xor_b64 s[4:5], s[4:5], vcc +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] +; GFX7-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 ; GFX7-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v5, vcc @@ -259,9 +259,9 @@ define i64 @v_ssubo_i64(i64 %a, i64 %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v0, v2 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] -; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], vcc +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v4, v0 ; GFX8-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v5, vcc @@ -272,9 +272,9 @@ define i64 @v_ssubo_i64(i64 %a, i64 %b) { ; GFX9-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] -; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], vcc +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] +; GFX9-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v5, vcc @@ -293,12 +293,12 @@ define <2 x i32> @v_ssubo_v2i32(<2 x i32> %a, <2 x i32> %b) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 ; GFX7-NEXT: v_sub_i32_e32 v5, vcc, v1, v3 -; GFX7-NEXT: v_cmp_lt_i32_e32 vcc, v4, v0 -; GFX7-NEXT: v_cmp_lt_i32_e64 s[4:5], v5, v1 -; GFX7-NEXT: v_cmp_lt_i32_e64 s[6:7], 0, v2 -; GFX7-NEXT: v_cmp_lt_i32_e64 s[8:9], 0, v3 -; GFX7-NEXT: s_xor_b64 s[6:7], s[6:7], vcc -; GFX7-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] +; GFX7-NEXT: v_cmp_lt_i32_e32 vcc, v0, v2 +; GFX7-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v3 +; GFX7-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v4 +; GFX7-NEXT: v_cmp_gt_i32_e64 s[8:9], 0, v5 +; GFX7-NEXT: s_xor_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7] ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 @@ -310,12 +310,12 @@ define <2 x i32> @v_ssubo_v2i32(<2 x i32> %a, <2 x i32> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v0, v2 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v1, v3 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v4, v0 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v5, v1 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[6:7], 0, v2 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[8:9], 0, v3 -; GFX8-NEXT: s_xor_b64 s[6:7], s[6:7], vcc -; GFX8-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v0, v2 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v3 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v4 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[8:9], 0, v5 +; GFX8-NEXT: s_xor_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7] ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v4, v0 @@ -327,12 +327,12 @@ define <2 x i32> @v_ssubo_v2i32(<2 x i32> %a, <2 x i32> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v4, v0, v2 ; GFX9-NEXT: v_sub_u32_e32 v5, v1, v3 -; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, v4, v0 -; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], v5, v1 -; GFX9-NEXT: v_cmp_lt_i32_e64 s[6:7], 0, v2 -; GFX9-NEXT: v_cmp_lt_i32_e64 s[8:9], 0, v3 -; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v3 +; GFX9-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v4 +; GFX9-NEXT: v_cmp_gt_i32_e64 s[8:9], 0, v5 +; GFX9-NEXT: s_xor_b64 s[6:7], vcc, s[6:7] +; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GFX9-NEXT: v_sub_u32_e32 v0, v4, v0 @@ -351,12 +351,12 @@ define i8 @v_ssubo_i8(i8 %a, i8 %b) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 -; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX7-NEXT: v_cmp_lt_i32_e32 vcc, v3, v0 -; 
GFX7-NEXT: v_bfe_i32 v0, v1, 0, 8 -; GFX7-NEXT: v_cmp_lt_i32_e64 s[4:5], 0, v0 -; GFX7-NEXT: s_xor_b64 s[4:5], s[4:5], vcc +; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX7-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1 +; GFX7-NEXT: v_bfe_i32 v0, v2, 0, 8 +; GFX7-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 +; GFX7-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v2, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -365,12 +365,12 @@ define i8 @v_ssubo_i8(i8 %a, i8 %b) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u16_e32 v2, v0, v1 -; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 8 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v3, v0 -; GFX8-NEXT: v_bfe_i32 v0, v1, 0, 8 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], 0, v0 -; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], vcc +; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1 +; GFX8-NEXT: v_bfe_i32 v0, v2, 0, 8 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX8-NEXT: v_sub_u16_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -379,10 +379,10 @@ define i8 @v_ssubo_i8(i8 %a, i8 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_u16_e32 v2, v0, v1 -; GFX9-NEXT: v_cmp_lt_i32_sdwa s[4:5], sext(v2), sext(v0) src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX9-NEXT: v_cmp_lt_i32_sdwa s[4:5], sext(v0), sext(v1) src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_cmp_gt_i32_sdwa s[6:7], sext(v1), v0 src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], s[4:5] +; GFX9-NEXT: v_cmp_lt_i32_sdwa s[6:7], sext(v2), v0 src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX9-NEXT: v_sub_u16_e32 v0, v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -399,12 +399,12 @@ define i7 @v_ssubo_i7(i7 %a, i7 %b) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 -; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 7 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 7 -; GFX7-NEXT: v_cmp_lt_i32_e32 vcc, v3, v0 -; GFX7-NEXT: v_bfe_i32 v0, v1, 0, 7 -; GFX7-NEXT: v_cmp_lt_i32_e64 s[4:5], 0, v0 -; GFX7-NEXT: s_xor_b64 s[4:5], s[4:5], vcc +; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 7 +; GFX7-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1 +; GFX7-NEXT: v_bfe_i32 v0, v2, 0, 7 +; GFX7-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 +; GFX7-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v2, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -413,12 +413,12 @@ define i7 @v_ssubo_i7(i7 %a, i7 %b) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u16_e32 v2, v0, v1 -; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 7 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 7 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v3, v0 -; GFX8-NEXT: v_bfe_i32 v0, v1, 0, 7 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], 0, v0 -; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], vcc +; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 7 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1 +; GFX8-NEXT: v_bfe_i32 v0, v2, 0, 7 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX8-NEXT: v_sub_u16_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -427,12 +427,12 @@ define i7 @v_ssubo_i7(i7 %a, i7 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_u16_e32 v2, v0, v1 -; GFX9-NEXT: v_bfe_i32 v3, v2, 0, 7 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 7 -; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, v3, v0 -; GFX9-NEXT: v_bfe_i32 v0, v1, 0, 7 -; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], 0, v0 -; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], vcc +; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 7 +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1 +; GFX9-NEXT: v_bfe_i32 v0, v2, 0, 7 +; GFX9-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 +; GFX9-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX9-NEXT: v_sub_u16_e32 v0, v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -635,33 +635,33 @@ define amdgpu_ps i32 @s_ssubo_i32(i32 inreg %a, i32 inreg %b) { ; GFX7-LABEL: s_ssubo_i32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_sub_i32 s2, s0, s1 -; GFX7-NEXT: s_cmp_lt_i32 s2, s0 +; GFX7-NEXT: s_cmp_lt_i32 s0, s1 ; GFX7-NEXT: s_cselect_b32 s0, 1, 0 -; GFX7-NEXT: s_cmp_gt_i32 s1, 0 +; GFX7-NEXT: s_cmp_lt_i32 s2, 0 ; GFX7-NEXT: s_cselect_b32 s1, 1, 0 -; GFX7-NEXT: s_xor_b32 s0, s1, s0 +; GFX7-NEXT: s_xor_b32 s0, s0, s1 ; GFX7-NEXT: s_sub_i32 s0, s2, s0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ssubo_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_i32 s2, s0, s1 -; GFX8-NEXT: s_cmp_lt_i32 s2, s0 +; GFX8-NEXT: s_cmp_lt_i32 s0, s1 ; GFX8-NEXT: s_cselect_b32 s0, 1, 0 -; GFX8-NEXT: s_cmp_gt_i32 s1, 0 +; GFX8-NEXT: s_cmp_lt_i32 s2, 0 ; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: s_xor_b32 s0, s1, s0 +; GFX8-NEXT: s_xor_b32 s0, s0, s1 ; GFX8-NEXT: s_sub_i32 s0, s2, s0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_ssubo_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_i32 s2, s0, s1 -; GFX9-NEXT: s_cmp_lt_i32 s2, s0 +; GFX9-NEXT: s_cmp_lt_i32 s0, s1 ; GFX9-NEXT: s_cselect_b32 s0, 1, 0 -; GFX9-NEXT: s_cmp_gt_i32 s1, 0 +; GFX9-NEXT: s_cmp_lt_i32 s2, 0 ; GFX9-NEXT: s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 ; GFX9-NEXT: s_sub_i32 s0, s2, s0 ; GFX9-NEXT: ; return to shader part epilog %ssubo = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) @@ -676,13 +676,13 @@ define amdgpu_ps i64 @s_ssubo_i64(i64 inreg %a, i64 inreg %b) { ; GFX7-LABEL: s_ssubo_i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_sub_u32 s4, s0, s2 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: s_subb_u32 s5, s1, s3 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] -; GFX7-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX7-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], vcc +; GFX7-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX7-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc @@ -693,13 +693,13 @@ define amdgpu_ps i64 @s_ssubo_i64(i64 inreg %a, i64 inreg %b) { ; GFX8-LABEL: s_ssubo_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_u32 s4, s0, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_subb_u32 s5, s1, s3 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] -; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; 
GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc @@ -710,13 +710,13 @@ define amdgpu_ps i64 @s_ssubo_i64(i64 inreg %a, i64 inreg %b) { ; GFX9-LABEL: s_ssubo_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_u32 s4, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_subb_u32 s5, s1, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 ; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc @@ -736,16 +736,16 @@ define amdgpu_ps <2 x i32> @s_ssubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b ; GFX7: ; %bb.0: ; GFX7-NEXT: s_sub_i32 s4, s0, s2 ; GFX7-NEXT: s_sub_i32 s5, s1, s3 -; GFX7-NEXT: s_cmp_lt_i32 s4, s0 +; GFX7-NEXT: s_cmp_lt_i32 s0, s2 ; GFX7-NEXT: s_cselect_b32 s0, 1, 0 -; GFX7-NEXT: s_cmp_lt_i32 s5, s1 +; GFX7-NEXT: s_cmp_lt_i32 s1, s3 ; GFX7-NEXT: s_cselect_b32 s1, 1, 0 -; GFX7-NEXT: s_cmp_gt_i32 s2, 0 +; GFX7-NEXT: s_cmp_lt_i32 s4, 0 ; GFX7-NEXT: s_cselect_b32 s2, 1, 0 -; GFX7-NEXT: s_cmp_gt_i32 s3, 0 +; GFX7-NEXT: s_cmp_lt_i32 s5, 0 ; GFX7-NEXT: s_cselect_b32 s3, 1, 0 -; GFX7-NEXT: s_xor_b32 s0, s2, s0 -; GFX7-NEXT: s_xor_b32 s1, s3, s1 +; GFX7-NEXT: s_xor_b32 s0, s0, s2 +; GFX7-NEXT: s_xor_b32 s1, s1, s3 ; GFX7-NEXT: s_sub_i32 s0, s4, s0 ; GFX7-NEXT: s_sub_i32 s1, s5, s1 ; GFX7-NEXT: ; return to shader part epilog @@ -754,16 +754,16 @@ define amdgpu_ps <2 x i32> @s_ssubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_i32 s4, s0, s2 ; GFX8-NEXT: s_sub_i32 s5, s1, s3 -; GFX8-NEXT: s_cmp_lt_i32 s4, s0 +; GFX8-NEXT: s_cmp_lt_i32 s0, s2 ; GFX8-NEXT: s_cselect_b32 s0, 1, 0 -; GFX8-NEXT: s_cmp_lt_i32 s5, s1 +; GFX8-NEXT: s_cmp_lt_i32 s1, s3 ; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: s_cmp_gt_i32 s2, 0 +; GFX8-NEXT: s_cmp_lt_i32 s4, 0 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-NEXT: s_cmp_gt_i32 s3, 0 +; GFX8-NEXT: s_cmp_lt_i32 s5, 0 ; GFX8-NEXT: s_cselect_b32 s3, 1, 0 -; GFX8-NEXT: s_xor_b32 s0, s2, s0 -; GFX8-NEXT: s_xor_b32 s1, s3, s1 +; GFX8-NEXT: s_xor_b32 s0, s0, s2 +; GFX8-NEXT: s_xor_b32 s1, s1, s3 ; GFX8-NEXT: s_sub_i32 s0, s4, s0 ; GFX8-NEXT: s_sub_i32 s1, s5, s1 ; GFX8-NEXT: ; return to shader part epilog @@ -772,16 +772,16 @@ define amdgpu_ps <2 x i32> @s_ssubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_i32 s4, s0, s2 ; GFX9-NEXT: s_sub_i32 s5, s1, s3 -; GFX9-NEXT: s_cmp_lt_i32 s4, s0 +; GFX9-NEXT: s_cmp_lt_i32 s0, s2 ; GFX9-NEXT: s_cselect_b32 s0, 1, 0 -; GFX9-NEXT: s_cmp_lt_i32 s5, s1 +; GFX9-NEXT: s_cmp_lt_i32 s1, s3 ; GFX9-NEXT: s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: s_cmp_gt_i32 s2, 0 +; GFX9-NEXT: s_cmp_lt_i32 s4, 0 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: s_cmp_gt_i32 s3, 0 +; GFX9-NEXT: s_cmp_lt_i32 s5, 0 ; GFX9-NEXT: s_cselect_b32 s3, 1, 0 -; GFX9-NEXT: s_xor_b32 s0, s2, s0 -; GFX9-NEXT: s_xor_b32 s1, s3, s1 +; GFX9-NEXT: s_xor_b32 s0, s0, s2 +; GFX9-NEXT: s_xor_b32 s1, s1, s3 ; GFX9-NEXT: s_sub_i32 s0, s4, s0 ; GFX9-NEXT: s_sub_i32 s1, s5, s1 ; GFX9-NEXT: ; return to shader part epilog @@ -798,12 +798,12 @@ define i8 @s_ssubo_i8(i8 
%a, i8 %b) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 -; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX7-NEXT: v_cmp_lt_i32_e32 vcc, v3, v0 -; GFX7-NEXT: v_bfe_i32 v0, v1, 0, 8 -; GFX7-NEXT: v_cmp_lt_i32_e64 s[4:5], 0, v0 -; GFX7-NEXT: s_xor_b64 s[4:5], s[4:5], vcc +; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX7-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1 +; GFX7-NEXT: v_bfe_i32 v0, v2, 0, 8 +; GFX7-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 +; GFX7-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v2, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -812,12 +812,12 @@ define i8 @s_ssubo_i8(i8 %a, i8 %b) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u16_e32 v2, v0, v1 -; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 8 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v3, v0 -; GFX8-NEXT: v_bfe_i32 v0, v1, 0, 8 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], 0, v0 -; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], vcc +; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1 +; GFX8-NEXT: v_bfe_i32 v0, v2, 0, 8 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX8-NEXT: v_sub_u16_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -826,10 +826,10 @@ define i8 @s_ssubo_i8(i8 %a, i8 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_u16_e32 v2, v0, v1 -; GFX9-NEXT: v_cmp_lt_i32_sdwa s[4:5], sext(v2), sext(v0) src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX9-NEXT: v_cmp_lt_i32_sdwa s[4:5], sext(v0), sext(v1) src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_cmp_gt_i32_sdwa s[6:7], sext(v1), v0 src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], s[4:5] +; GFX9-NEXT: v_cmp_lt_i32_sdwa s[6:7], sext(v2), v0 src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX9-NEXT: v_sub_u16_e32 v0, v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -846,12 +846,12 @@ define i7 @s_ssubo_i7(i7 %a, i7 %b) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 -; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 7 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 7 -; GFX7-NEXT: v_cmp_lt_i32_e32 vcc, v3, v0 -; GFX7-NEXT: v_bfe_i32 v0, v1, 0, 7 -; GFX7-NEXT: v_cmp_lt_i32_e64 s[4:5], 0, v0 -; GFX7-NEXT: s_xor_b64 s[4:5], s[4:5], vcc +; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 7 +; GFX7-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1 +; GFX7-NEXT: v_bfe_i32 v0, v2, 0, 7 +; GFX7-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 +; GFX7-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v2, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -860,12 +860,12 @@ define i7 @s_ssubo_i7(i7 %a, i7 %b) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u16_e32 v2, v0, v1 -; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 7 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 7 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v3, v0 -; GFX8-NEXT: v_bfe_i32 v0, v1, 0, 7 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], 0, v0 -; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], vcc +; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 7 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1 +; GFX8-NEXT: v_bfe_i32 v0, v2, 0, 7 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, 
s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX8-NEXT: v_sub_u16_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -874,12 +874,12 @@ define i7 @s_ssubo_i7(i7 %a, i7 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_u16_e32 v2, v0, v1 -; GFX9-NEXT: v_bfe_i32 v3, v2, 0, 7 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 7 -; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, v3, v0 -; GFX9-NEXT: v_bfe_i32 v0, v1, 0, 7 -; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], 0, v0 -; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], vcc +; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 7 +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1 +; GFX9-NEXT: v_bfe_i32 v0, v2, 0, 7 +; GFX9-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 +; GFX9-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX9-NEXT: v_sub_u16_e32 v0, v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -969,9 +969,9 @@ define amdgpu_ps i32 @ssubo_i32_sv(i32 inreg %a, i32 %b) { ; GFX7-LABEL: ssubo_i32_sv: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_sub_i32_e32 v1, vcc, s0, v0 -; GFX7-NEXT: v_cmp_gt_i32_e32 vcc, s0, v1 -; GFX7-NEXT: v_cmp_lt_i32_e64 s[0:1], 0, v0 -; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], vcc +; GFX7-NEXT: v_cmp_lt_i32_e32 vcc, s0, v0 +; GFX7-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v1 +; GFX7-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 @@ -980,9 +980,9 @@ define amdgpu_ps i32 @ssubo_i32_sv(i32 inreg %a, i32 %b) { ; GFX8-LABEL: ssubo_i32_sv: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s0, v0 -; GFX8-NEXT: v_cmp_gt_i32_e32 vcc, s0, v1 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[0:1], 0, v0 -; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, s0, v0 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v1 +; GFX8-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v1, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 @@ -991,9 +991,9 @@ define amdgpu_ps i32 @ssubo_i32_sv(i32 inreg %a, i32 %b) { ; GFX9-LABEL: ssubo_i32_sv: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_sub_u32_e32 v1, s0, v0 -; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, s0, v1 -; GFX9-NEXT: v_cmp_lt_i32_e64 s[0:1], 0, v0 -; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], vcc +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, s0, v0 +; GFX9-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v1 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 @@ -1010,12 +1010,12 @@ define amdgpu_ps i16 @ssubo_i16_sv(i16 inreg %a, i16 %b) { ; GFX7-LABEL: ssubo_i16_sv: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_sub_i32_e32 v1, vcc, s0, v0 -; GFX7-NEXT: v_bfe_i32 v2, v1, 0, 16 ; GFX7-NEXT: s_sext_i32_i16 s0, s0 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX7-NEXT: v_cmp_gt_i32_e32 vcc, s0, v2 -; GFX7-NEXT: v_cmp_lt_i32_e64 s[0:1], 0, v0 -; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], vcc +; GFX7-NEXT: v_cmp_lt_i32_e32 vcc, s0, v0 +; GFX7-NEXT: v_bfe_i32 v0, v1, 0, 16 +; GFX7-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v0 +; GFX7-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 @@ -1024,9 +1024,9 @@ define amdgpu_ps i16 @ssubo_i16_sv(i16 inreg %a, i16 %b) { ; GFX8-LABEL: ssubo_i16_sv: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_sub_u16_e32 v1, s0, v0 -; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, s0, v1 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[0:1], 0, v0 -; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], vcc +; 
GFX8-NEXT: v_cmp_lt_i16_e32 vcc, s0, v0 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[0:1], 0, v1 +; GFX8-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX8-NEXT: v_sub_u16_e32 v0, v1, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 @@ -1035,9 +1035,9 @@ define amdgpu_ps i16 @ssubo_i16_sv(i16 inreg %a, i16 %b) { ; GFX9-LABEL: ssubo_i16_sv: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_sub_u16_e32 v1, s0, v0 -; GFX9-NEXT: v_cmp_gt_i16_e32 vcc, s0, v1 -; GFX9-NEXT: v_cmp_lt_i16_e64 s[0:1], 0, v0 -; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], vcc +; GFX9-NEXT: v_cmp_lt_i16_e32 vcc, s0, v0 +; GFX9-NEXT: v_cmp_gt_i16_e64 s[0:1], 0, v1 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9-NEXT: v_sub_u16_e32 v0, v1, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/ssubo.ll b/llvm/test/CodeGen/AMDGPU/ssubo.ll index 382d8928a28b0..50e0eda9a6842 100644 --- a/llvm/test/CodeGen/AMDGPU/ssubo.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubo.ll @@ -17,18 +17,18 @@ define amdgpu_kernel void @ssubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: s_sub_u32 s10, s2, s8 -; SI-NEXT: s_subb_u32 s11, s3, s9 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1] -; SI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[8:9], 0 ; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; SI-NEXT: s_sub_u32 s2, s2, s8 +; SI-NEXT: s_subb_u32 s3, s3, s9 +; SI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], 0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc +; SI-NEXT: s_xor_b64 s[0:1], vcc, s[8:9] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v0 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -38,36 +38,36 @@ define amdgpu_kernel void @ssubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: s_sub_u32 s6, s2, s4 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: s_subb_u32 s7, s3, s5 -; VI-NEXT: v_cmp_gt_i64_e64 s[8:9], s[4:5], 0 -; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[1:2] ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_sub_u32 s4, s2, s4 +; VI-NEXT: s_subb_u32 s5, s3, s5 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2] +; VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[4:5], 0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc +; VI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3] ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: ssubo_i64_zext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: 
v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_sub_u32 s4, s2, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_subb_u32 s5, s3, s7 -; GFX9-NEXT: v_cmp_gt_i64_e64 s[8:9], s[6:7], 0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[4:5], 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -83,9 +83,9 @@ define amdgpu_kernel void @ssubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sub_u32 s4, s2, s6 ; GFX10-NEXT: s_subb_u32 s5, s3, s7 -; GFX10-NEXT: v_cmp_gt_i64_e64 s6, s[6:7], 0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3] -; GFX10-NEXT: s_xor_b32 s2, s6, s2 +; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], s[6:7] +; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[4:5], 0 +; GFX10-NEXT: s_xor_b32 s2, s2, s3 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX10-NEXT: v_add_co_u32 v0, s2, s4, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s5, 0, s2 @@ -101,9 +101,9 @@ define amdgpu_kernel void @ssubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s6, s2, s4 ; GFX11-NEXT: s_subb_u32 s7, s3, s5 -; GFX11-NEXT: v_cmp_gt_i64_e64 s4, s[4:5], 0 -; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], s[2:3] -; GFX11-NEXT: s_xor_b32 s2, s4, s2 +; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], s[4:5] +; GFX11-NEXT: v_cmp_lt_i64_e64 s3, s[6:7], 0 +; GFX11-NEXT: s_xor_b32 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX11-NEXT: v_add_co_u32 v0, s2, s6, v0 @@ -130,14 +130,14 @@ define amdgpu_kernel void @s_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_sub_i32 s12, s8, s9 -; SI-NEXT: s_cmp_gt_i32 s9, 0 -; SI-NEXT: s_cselect_b64 s[10:11], -1, 0 -; SI-NEXT: s_cmp_lt_i32 s12, s8 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_cmp_lt_i32 s8, s9 ; SI-NEXT: s_cselect_b64 s[8:9], -1, 0 +; SI-NEXT: s_cmp_lt_i32 s12, 0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_cselect_b64 s[10:11], -1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: s_xor_b64 s[4:5], s[10:11], s[8:9] +; SI-NEXT: s_xor_b64 s[4:5], s[8:9], s[10:11] ; SI-NEXT: s_mov_b32 s0, s2 ; SI-NEXT: s_mov_b32 s1, s3 ; SI-NEXT: s_mov_b32 s2, s6 @@ -154,10 +154,10 @@ define amdgpu_kernel void @s_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_sub_i32 s6, s4, s5 -; VI-NEXT: s_cmp_gt_i32 s5, 0 +; VI-NEXT: s_cmp_lt_i32 s4, s5 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_lt_i32 s6, s4 +; VI-NEXT: s_cmp_lt_i32 s6, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -247,8 +247,8 @@ define amdgpu_kernel void @v_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v1 -; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v0 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1 +; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v2 ; 
SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 @@ -271,8 +271,8 @@ define amdgpu_kernel void @v_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_u32_e32 v6, vcc, v4, v5 -; VI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 -; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], v6, v4 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, v4, v5 +; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v6 ; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: flat_store_dword v[0:1], v6 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -347,17 +347,17 @@ define amdgpu_kernel void @s_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sub_u32 s12, s4, s6 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] +; SI-NEXT: s_sub_u32 s4, s4, s6 +; SI-NEXT: s_subb_u32 s5, s5, s7 +; SI-NEXT: v_cmp_lt_i64_e64 s[6:7], s[4:5], 0 ; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: s_subb_u32 s13, s5, s7 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] -; SI-NEXT: v_cmp_gt_i64_e64 s[4:5], s[6:7], 0 -; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: s_xor_b64 s[4:5], s[4:5], vcc +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_xor_b64 s[4:5], vcc, s[6:7] ; SI-NEXT: s_mov_b32 s0, s2 ; SI-NEXT: s_mov_b32 s1, s3 ; SI-NEXT: s_mov_b32 s2, s10 @@ -373,18 +373,18 @@ define amdgpu_kernel void @s_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: s_sub_u32 s0, s4, s6 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: s_subb_u32 s1, s5, s7 -; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5] -; VI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[4:5] +; VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc +; VI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3] ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; VI-NEXT: flat_store_byte v[2:3], v0 @@ -395,15 +395,15 @@ define amdgpu_kernel void @s_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s14 ; GFX9-NEXT: s_sub_u32 s0, s12, s14 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v1, s15 ; GFX9-NEXT: s_subb_u32 s1, s13, s15 -; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[14:15], 0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[2:3] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9-NEXT: global_store_byte v2, v0, s[10:11] @@ -416,8 
+416,8 @@ define amdgpu_kernel void @s_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sub_u32 s0, s12, s14 ; GFX10-NEXT: s_subb_u32 s1, s13, s15 -; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[14:15], 0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[12:13] +; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[12:13], s[14:15] +; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_xor_b32 s0, s2, s3 @@ -432,11 +432,11 @@ define amdgpu_kernel void @s_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s8, s4, s6 ; GFX11-NEXT: s_subb_u32 s9, s5, s7 -; GFX11-NEXT: v_cmp_gt_i64_e64 s6, s[6:7], 0 -; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5] +; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[4:5], s[6:7] +; GFX11-NEXT: v_cmp_lt_i64_e64 s5, s[8:9], 0 ; GFX11-NEXT: v_mov_b32_e32 v0, s8 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9 -; GFX11-NEXT: s_xor_b32 s4, s6, s4 +; GFX11-NEXT: s_xor_b32 s4, s4, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 ; GFX11-NEXT: s_clause 0x1 @@ -473,12 +473,13 @@ define amdgpu_kernel void @v_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 -; SI-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc -; SI-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3] -; SI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1] -; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3] +; SI-NEXT: v_sub_i32_e64 v0, s[0:1], v0, v2 +; SI-NEXT: v_subb_u32_e64 v1, s[0:1], v1, v3, s[0:1] +; SI-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[0:1] +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -498,11 +499,11 @@ define amdgpu_kernel void @v_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_sub_u32_e32 v8, vcc, v0, v2 -; VI-NEXT: v_subb_u32_e32 v9, vcc, v1, v3, vcc -; VI-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3] -; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1] -; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9] +; VI-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_sub_u32_e64 v0, s[0:1], v0, v2 +; VI-NEXT: v_subb_u32_e64 v1, s[0:1], v1, v3, s[0:1] +; VI-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[0:1] +; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; VI-NEXT: flat_store_byte v[6:7], v0 @@ -511,19 +512,19 @@ define amdgpu_kernel void @v_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-LABEL: v_ssubo_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[12:13] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[14:15] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, 0, 
v[2:3] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1] -; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_sub_co_u32_e64 v0, s[0:1], v0, v2 +; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v1, v3, s[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[0:1] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] ; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX9-NEXT: global_store_byte v6, v0, s[10:11] +; GFX9-NEXT: global_store_byte v4, v0, s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_ssubo_i64: @@ -537,8 +538,8 @@ define amdgpu_kernel void @v_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1] +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] +; GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0, v[4:5] ; GFX10-NEXT: s_xor_b32 s0, vcc_lo, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX10-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9] @@ -557,8 +558,8 @@ define amdgpu_kernel void @v_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, v1, v3, vcc_lo -; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] -; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1] +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[4:5] ; GFX11-NEXT: s_xor_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 @@ -598,15 +599,15 @@ define amdgpu_kernel void @v_ssubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_sub_i32_e32 v5, vcc, v1, v3 ; SI-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 -; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 0, v3 -; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], v5, v1 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v2 -; SI-NEXT: v_cmp_lt_i32_e64 s[2:3], v4, v0 -; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; SI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3] +; SI-NEXT: v_sub_i32_e32 v5, vcc, v1, v3 +; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], v0, v2 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, v1, v3 +; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], 0, v4 +; SI-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v5 +; SI-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 @@ -627,15 +628,15 @@ define amdgpu_kernel void @v_ssubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_sub_u32_e32 v9, vcc, v1, v3 ; VI-NEXT: v_sub_u32_e32 v8, vcc, v0, v2 -; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], 0, v3 -; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v1 -; VI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v2 -; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v0 -; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] -; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; VI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3] +; VI-NEXT: v_sub_u32_e32 v9, vcc, v1, v3 +; VI-NEXT: 
v_cmp_lt_i32_e64 s[0:1], v0, v2 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, v1, v3 +; VI-NEXT: v_cmp_gt_i32_e64 s[2:3], 0, v8 +; VI-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v9 +; VI-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9] ; VI-NEXT: flat_store_dwordx2 v[6:7], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll index 40d80f5e83e36..24ad6af504b7b 100644 --- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -80,13 +80,13 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) { ; GFX8-LABEL: v_ssubsat_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v1 -; GFX8-NEXT: v_sub_u16_e32 v1, v0, v1 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0 -; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v1 -; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v0, v1 +; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v0 +; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_i16: @@ -120,25 +120,25 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) { ; GFX6-LABEL: v_ssubsat_i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v1 -; GFX6-NEXT: v_sub_i32_e64 v1, s[4:5], v0, v1 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v1 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1 +; GFX6-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v1 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v1 -; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v0, v1 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v1 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v1 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_i32: @@ -181,21 +181,21 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: v_sub_u16_e32 v4, v3, v2 -; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v4, v3 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v2 -; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v4 -; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 
0, v1 -; GFX8-NEXT: v_sub_u16_e32 v1, v0, v1 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0 -; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v1 -; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v3, v2 +; GFX8-NEXT: v_sub_u16_e32 v2, v3, v2 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v2 +; GFX8-NEXT: v_ashrrev_i16_e32 v3, 15, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, 0xffff8000, v3 +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v0, v1 +; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v0 +; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -244,27 +244,27 @@ define <3 x i16> @v_ssubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX8-NEXT: v_sub_u16_e32 v6, v5, v4 -; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v4 -; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 -; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4 -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v3 -; GFX8-NEXT: v_sub_u16_e32 v3, v1, v3 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 -; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3 -; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v5, v4 +; GFX8-NEXT: v_sub_u16_e32 v4, v5, v4 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v4 +; GFX8-NEXT: v_ashrrev_i16_e32 v5, 15, v4 +; GFX8-NEXT: v_xor_b32_e32 v5, 0xffff8000, v5 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v2 -; GFX8-NEXT: v_sub_u16_e32 v2, v0, v2 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0 -; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2 -; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v1, v3 +; GFX8-NEXT: v_sub_u16_e32 v1, v1, v3 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v1 +; GFX8-NEXT: v_ashrrev_i16_e32 v3, 15, v1 +; GFX8-NEXT: v_xor_b32_e32 v3, 0xffff8000, v3 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v0, v2 +; GFX8-NEXT: v_sub_u16_e32 v0, v0, v2 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v0 +; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v0 +; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -321,39 +321,39 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX8-NEXT: v_sub_u16_e32 v6, v5, v4 -; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v4 -; GFX8-NEXT: 
v_ashrrev_i16_e32 v4, 15, v6 -; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4 -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v2 -; GFX8-NEXT: v_sub_u16_e32 v2, v0, v2 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0 -; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2 -; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v5, v4 +; GFX8-NEXT: v_sub_u16_e32 v4, v5, v4 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v4 +; GFX8-NEXT: v_ashrrev_i16_e32 v5, 15, v4 +; GFX8-NEXT: v_xor_b32_e32 v5, 0xffff8000, v5 +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v0, v2 +; GFX8-NEXT: v_sub_u16_e32 v0, v0, v2 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v0 +; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v0 +; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX8-NEXT: v_sub_u16_e32 v5, v4, v2 -; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v5, v4 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v2 -; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v5 -; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v3 -; GFX8-NEXT: v_sub_u16_e32 v3, v1, v3 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 -; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3 -; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v4, v2 +; GFX8-NEXT: v_sub_u16_e32 v2, v4, v2 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v2 +; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v2 +; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4 +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v1, v3 +; GFX8-NEXT: v_sub_u16_e32 v1, v1, v3 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v1 +; GFX8-NEXT: v_ashrrev_i16_e32 v3, 15, v1 +; GFX8-NEXT: v_xor_b32_e32 v3, 0xffff8000, v3 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -379,39 +379,39 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX6-LABEL: v_ssubsat_v2i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v2 -; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v0, v2 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v2 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v0, v2 +; GFX6-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v2 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v0 +; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v3 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v2 -; 
GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v1, v3 +; GFX6-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v3 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v2i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v2 -; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v0, v2 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v2 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v0, v2 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v2 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v0 +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v1, v3 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v2 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v1, v3 +; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v3 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1 +; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v2i32: @@ -435,53 +435,53 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX6-LABEL: v_ssubsat_v3i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 -; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v0, v3 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v0, v3 +; GFX6-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v3 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v4 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v3 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v1, v4 +; GFX6-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v4 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v1 +; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v2, v5 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v3 -; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v2, v5 +; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v5 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v2 +; 
GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v3i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 -; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v0, v3 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v0, v3 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v3 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v1, v4 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v1 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v3 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v1, v4 +; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v4 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v1 +; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v2, v5 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v2 -; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v2, v5 +; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v5 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v2 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v3i32: @@ -507,67 +507,67 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX6-LABEL: v_ssubsat_v4i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 -; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v0, v4 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v0 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v4 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v0, v4 +; GFX6-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v4 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v0 +; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v1, v5 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v4 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v1, v5 +; GFX6-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v5 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v2, v6 -; 
GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v6 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v4 -; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v2, v6 +; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v6 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v2 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v2 +; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v7 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v7 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v4 -; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v3, v7 +; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v7 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v4i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 -; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v0, v4 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v4 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v0, v4 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v4 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v0 +; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v1, v5 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v1 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v4 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v1, v5 +; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v5 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1 +; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v2, v6 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v6 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v2 -; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v4 -; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v2, v6 +; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v6 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v2 +; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v2 +; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v7 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v7 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v3 -; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v4 -; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v3, v7 +; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v7 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3 +; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] 
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v4i32: @@ -595,123 +595,123 @@ define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) { ; GFX6-LABEL: v_ssubsat_v8i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v8 -; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v8 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v0 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v0, v8 +; GFX6-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v8 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v0 +; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v1, v9 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v9 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v1, v9 +; GFX6-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v9 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v1 +; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v2, v10 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v10 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v2, v10 +; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v10 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v2 +; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v2 +; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v3, v11 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v3, v11 +; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v11 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3 +; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v3 +; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v4, v12 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v12 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v4 -; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v4, v12 +; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v12 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v4 +; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v4 +; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v5, v13 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v13 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v5 -; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX6-NEXT: 
v_cmp_lt_i32_e32 vcc, v5, v13 +; GFX6-NEXT: v_sub_i32_e64 v5, s[4:5], v5, v13 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v5 +; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v5 +; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v6, v14 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v14 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v6 -; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 +; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v6, v14 +; GFX6-NEXT: v_sub_i32_e64 v6, s[4:5], v6, v14 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v6 +; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v6 +; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v7, v15 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v15 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v7 -; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 +; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v7, v15 +; GFX6-NEXT: v_sub_i32_e64 v7, s[4:5], v7, v15 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v7 +; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v8i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v8 -; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v0, v8 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v0, v8 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v8 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v0 +; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v1, v9 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v9 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v1 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v1, v9 +; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v9 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1 +; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v1 +; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v2, v10 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v10 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v2 -; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v2, v10 +; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v10 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v2 +; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v2 +; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v3, v11 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v3 -; GFX8-NEXT: v_ashrrev_i32_e32 v3, 
31, v8 -; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v3, v11 +; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v11 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3 +; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v3 +; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v4, v12 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v12 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v4 -; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v4, v12 +; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v4, v12 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v4 +; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v4 +; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v5, v13 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v13 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v5 -; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v5, v13 +; GFX8-NEXT: v_sub_u32_e64 v5, s[4:5], v5, v13 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v5 +; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v5 +; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v6, v14 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v14 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v6 -; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v6, v14 +; GFX8-NEXT: v_sub_u32_e64 v6, s[4:5], v6, v14 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v6 +; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v6 +; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v7, v15 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v15 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v7 -; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v7, v15 +; GFX8-NEXT: v_sub_u32_e64 v7, s[4:5], v7, v15 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v7 +; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v8i32: @@ -747,239 +747,239 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-LABEL: v_ssubsat_v16i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16 -; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v0, v16 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v0 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v0, v16 +; GFX6-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v16 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v0 +; GFX6-NEXT: v_xor_b32_e32 v16, 0x80000000, v16 ; 
GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v1, v17 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v17 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v1, v17 +; GFX6-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v17 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v1 +; GFX6-NEXT: v_xor_b32_e32 v16, 0x80000000, v16 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc -; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v2, v18 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v16, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v2, v18 +; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v18 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v2 +; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v2 +; GFX6-NEXT: v_xor_b32_e32 v16, 0x80000000, v16 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v3, v19 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v3, v19 +; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v19 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3 +; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v3 +; GFX6-NEXT: v_xor_b32_e32 v16, 0x80000000, v16 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v4, v20 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4 -; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v16, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v4, v20 +; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v20 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v4 +; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v4 +; GFX6-NEXT: v_xor_b32_e32 v16, 0x80000000, v16 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc ; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v5, v21 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5 -; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v5, v21 +; GFX6-NEXT: v_sub_i32_e64 v5, s[4:5], v5, v21 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v5 +; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v5 +; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v6, v22 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v6 -; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 +; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v6, v22 +; GFX6-NEXT: v_sub_i32_e64 v6, s[4:5], v6, v22 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v6 +; 
GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v6 +; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v6, v17, v6, vcc -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v7, v23 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v7 -; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 +; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v17, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v7, v23 +; GFX6-NEXT: v_sub_i32_e64 v7, s[4:5], v7, v23 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v7 +; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v7 +; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v8, v24 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v8 -; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 +; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v17, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v8, v24 +; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v24 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v8 +; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v8 +; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v9, v25 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v9 -; GFX6-NEXT: v_ashrrev_i32_e32 v9, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v9, 0x80000000, v9 +; GFX6-NEXT: v_cndmask_b32_e32 v8, v8, v17, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v9, v25 +; GFX6-NEXT: v_sub_i32_e64 v9, s[4:5], v9, v25 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v9 +; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v9 +; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v10, v26 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v10 -; GFX6-NEXT: v_ashrrev_i32_e32 v10, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v10, 0x80000000, v10 +; GFX6-NEXT: v_cndmask_b32_e32 v9, v9, v17, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v10, v26 +; GFX6-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v26 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v10 +; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v10 +; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v11, v27 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v11 -; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v11, 0x80000000, v11 +; GFX6-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v11, v27 +; GFX6-NEXT: v_sub_i32_e64 v11, s[4:5], v11, v27 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v11 +; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v11 +; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v12, v28 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v12 -; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v12, 0x80000000, v12 +; GFX6-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v12, v28 +; GFX6-NEXT: v_sub_i32_e64 
v12, s[4:5], v12, v28 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v12 +; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v12 +; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v12, v17, v12, vcc -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v13, v29 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v13 -; GFX6-NEXT: v_ashrrev_i32_e32 v13, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v13, 0x80000000, v13 +; GFX6-NEXT: v_cndmask_b32_e32 v12, v12, v17, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v13, v29 +; GFX6-NEXT: v_sub_i32_e64 v13, s[4:5], v13, v29 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v13 +; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v13 +; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v13, v17, v13, vcc -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v14, v30 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v14 -; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v14, 0x80000000, v14 +; GFX6-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v14, v30 +; GFX6-NEXT: v_sub_i32_e64 v14, s[4:5], v14, v30 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v14 +; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v14 +; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v14, v17, v14, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v14, v14, v17, vcc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16 -; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v15, v16 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15 -; GFX6-NEXT: v_ashrrev_i32_e32 v15, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v15, 0x80000000, v15 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v15, v16 +; GFX6-NEXT: v_sub_i32_e64 v15, s[4:5], v15, v16 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v15 +; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GFX6-NEXT: v_xor_b32_e32 v16, 0x80000000, v16 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v15, v15, v16, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v16i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16 -; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v0, v16 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v0, v16 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v16 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v0 +; GFX8-NEXT: v_xor_b32_e32 v16, 0x80000000, v16 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v1, v17 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v17 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v1 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v1, v17 +; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v17 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1 +; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v1 +; GFX8-NEXT: v_xor_b32_e32 v16, 0x80000000, v16 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v2, v18 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18 -; 
GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v2 -; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v16, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v2, v18 +; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v18 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v2 +; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v2 +; GFX8-NEXT: v_xor_b32_e32 v16, 0x80000000, v16 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v3, v19 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3 -; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v3, v19 +; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v19 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3 +; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v3 +; GFX8-NEXT: v_xor_b32_e32 v16, 0x80000000, v16 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v4, v20 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4 -; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v16, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v4, v20 +; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v4, v20 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v4 +; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v4 +; GFX8-NEXT: v_xor_b32_e32 v16, 0x80000000, v16 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc ; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v5, v21 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5 -; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v5, v21 +; GFX8-NEXT: v_sub_u32_e64 v5, s[4:5], v5, v21 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v5 +; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v5 +; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v6, v22 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v6 -; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v6, v22 +; GFX8-NEXT: v_sub_u32_e64 v6, s[4:5], v6, v22 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v6 +; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v6 +; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v17, v6, vcc -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v7, v23 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v7 -; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v17, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v7, v23 +; GFX8-NEXT: v_sub_u32_e64 v7, s[4:5], v7, v23 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v7 +; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v7 +; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc -; 
GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v8, v24 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v8 -; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v17, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v8, v24 +; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v8, v24 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v8 +; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v8 +; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v9, v25 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v9 -; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v9, 0x80000000, v9 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v17, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v9, v25 +; GFX8-NEXT: v_sub_u32_e64 v9, s[4:5], v9, v25 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v9 +; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v9 +; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v10, v26 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v10 -; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v10, 0x80000000, v10 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v17, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v10, v26 +; GFX8-NEXT: v_sub_u32_e64 v10, s[4:5], v10, v26 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v10 +; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v10 +; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v11, v27 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v11 -; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v11, 0x80000000, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v11, v27 +; GFX8-NEXT: v_sub_u32_e64 v11, s[4:5], v11, v27 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v11 +; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v11 +; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v12, v28 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v12 -; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v12, 0x80000000, v12 +; GFX8-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v12, v28 +; GFX8-NEXT: v_sub_u32_e64 v12, s[4:5], v12, v28 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v12 +; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v12 +; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v12, vcc -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v13, v29 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v13 -; GFX8-NEXT: v_ashrrev_i32_e32 v13, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v13, 0x80000000, v13 +; GFX8-NEXT: v_cndmask_b32_e32 v12, v12, v17, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v13, v29 +; GFX8-NEXT: v_sub_u32_e64 v13, s[4:5], v13, v29 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v13 +; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v13 +; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 ; 
GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v13, vcc -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v14, v30 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v14 -; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v14, 0x80000000, v14 +; GFX8-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v14, v30 +; GFX8-NEXT: v_sub_u32_e64 v14, s[4:5], v14, v30 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v14 +; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v14 +; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v14, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v14, v14, v17, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16 -; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v15, v16 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15 -; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v15, 0x80000000, v15 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v15, v16 +; GFX8-NEXT: v_sub_u32_e64 v15, s[4:5], v15, v16 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v15 +; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GFX8-NEXT: v_xor_b32_e32 v16, 0x80000000, v16 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v16, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v16i32: @@ -1059,43 +1059,43 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX6-LABEL: v_ssubsat_i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 -; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v5 -; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3] +; GFX6-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v2 +; GFX6-NEXT: v_subb_u32_e64 v1, s[4:5], v1, v3, s[4:5] +; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[0:1] +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v5 -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v2 +; GFX8-NEXT: v_subb_u32_e64 v1, s[4:5], v1, v3, s[4:5] +; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[0:1] +; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_i64: ; GFX9: ; %bb.0: 
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v5 -; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_sub_co_u32_e64 v0, s[4:5], v0, v2 +; GFX9-NEXT: v_subb_co_u32_e64 v1, s[4:5], v1, v3, s[4:5] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[0:1] +; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX9-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ssubsat_i64: @@ -1103,11 +1103,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[2:3] +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] +; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[4:5] ; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 -; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo +; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1117,11 +1117,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 ; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, v1, v3, vcc_lo -; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[2:3] +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] +; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[4:5] ; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 -; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX11-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv32.mir index 5288bb5e87011..35f20117b26cd 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv32.mir @@ -232,9 +232,9 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11 ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[COPY1]] ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[SUB]](s32), [[COPY]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[COPY1]](s32), [[C]] - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[ICMP1]], [[ICMP]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY]](s32), [[COPY1]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[SUB]](s32), [[C]] + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[ICMP]], [[ICMP1]] ; 
CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SUB]](s32) ; CHECK-NEXT: $x10 = COPY [[COPY2]](s32) ; CHECK-NEXT: $x11 = COPY [[XOR]](s32) @@ -266,15 +266,15 @@ body: | ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[COPY1]], [[COPY3]] ; CHECK-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SUB1]], [[ICMP]] ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[SUB]](s32), [[COPY]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[SUB2]](s32), [[COPY1]] - ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[SUB2]](s32), [[COPY1]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s32), [[COPY2]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY1]](s32), [[COPY3]] + ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s32), [[ICMP1]], [[ICMP2]] - ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY2]](s32), [[C]] - ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[COPY3]](s32), [[C]] - ; CHECK-NEXT: [[ICMP6:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY3]](s32), [[C]] + ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[SUB]](s32), [[C]] + ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[SUB2]](s32), [[C]] + ; CHECK-NEXT: [[ICMP6:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[SUB2]](s32), [[C]] ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP6]](s32), [[ICMP4]], [[ICMP5]] - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[SELECT1]], [[SELECT]] + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[SELECT]], [[SELECT1]] ; CHECK-NEXT: $x10 = COPY [[SUB]](s32) ; CHECK-NEXT: $x11 = COPY [[SUB2]](s32) ; CHECK-NEXT: $x12 = COPY [[XOR]](s32) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv64.mir index eed1aac8f6c13..22fe335b3ee35 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv64.mir @@ -246,9 +246,9 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[COPY]], [[COPY1]] ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(slt), [[SUB]](s64), [[COPY]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(sgt), [[COPY1]](s64), [[C]] - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[ICMP1]], [[ICMP]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(slt), [[COPY]](s64), [[COPY1]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(slt), [[SUB]](s64), [[C]] + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[ICMP]], [[ICMP1]] ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[SUB]](s64) ; CHECK-NEXT: $x10 = COPY [[COPY2]](s64) ; CHECK-NEXT: $x11 = COPY [[XOR]](s64) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv32.mir index 227a8cd7eb5ba..01bd20801ef2e 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv32.mir @@ -272,9 +272,9 @@ body: | ; RV32I-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11 ; RV32I-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[COPY1]] ; RV32I-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; 
RV32I-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[SUB]](s32), [[COPY]] - ; RV32I-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[COPY1]](s32), [[C]] - ; RV32I-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[ICMP1]], [[ICMP]] + ; RV32I-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY]](s32), [[COPY1]] + ; RV32I-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[SUB]](s32), [[C]] + ; RV32I-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[ICMP]], [[ICMP1]] ; RV32I-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SUB]](s32) ; RV32I-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 ; RV32I-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[COPY2]], [[C1]](s32) @@ -325,15 +325,15 @@ body: | ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[COPY1]], [[COPY3]] ; CHECK-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SUB1]], [[ICMP]] ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[SUB]](s32), [[COPY]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[SUB2]](s32), [[COPY1]] - ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[SUB2]](s32), [[COPY1]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s32), [[COPY2]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY1]](s32), [[COPY3]] + ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s32), [[ICMP1]], [[ICMP2]] - ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY2]](s32), [[C]] - ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[COPY3]](s32), [[C]] - ; CHECK-NEXT: [[ICMP6:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY3]](s32), [[C]] + ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[SUB]](s32), [[C]] + ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[SUB2]](s32), [[C]] + ; CHECK-NEXT: [[ICMP6:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[SUB2]](s32), [[C]] ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP6]](s32), [[ICMP4]], [[ICMP5]] - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[SELECT1]], [[SELECT]] + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[SELECT]], [[SELECT1]] ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SUB2]], [[C1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv64.mir index d162bfcca1bc0..35147fbbc97e6 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv64.mir @@ -281,9 +281,9 @@ body: | ; RV64I-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 ; RV64I-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[COPY]], [[COPY1]] ; RV64I-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; RV64I-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(slt), [[SUB]](s64), [[COPY]] - ; RV64I-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(sgt), [[COPY1]](s64), [[C]] - ; RV64I-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[ICMP1]], [[ICMP]] + ; RV64I-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(slt), [[COPY]](s64), [[COPY1]] + ; RV64I-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(slt), [[SUB]](s64), [[C]] + ; RV64I-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[ICMP]], [[ICMP1]] ; RV64I-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[SUB]](s64) ; RV64I-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 63 
; RV64I-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s64) diff --git a/llvm/test/CodeGen/RISCV/arith-with-overflow.ll b/llvm/test/CodeGen/RISCV/arith-with-overflow.ll index 4efc224ab1ca7..f8a9c6ae0fbf4 100644 --- a/llvm/test/CodeGen/RISCV/arith-with-overflow.ll +++ b/llvm/test/CodeGen/RISCV/arith-with-overflow.ll @@ -27,9 +27,9 @@ entry: define i1 @ssub(i32 %a, i32 %b, ptr %c) nounwind { ; RV32I-LABEL: ssub: ; RV32I: # %bb.0: # %entry -; RV32I-NEXT: sgtz a3, a1 +; RV32I-NEXT: slt a3, a0, a1 ; RV32I-NEXT: sub a1, a0, a1 -; RV32I-NEXT: slt a0, a1, a0 +; RV32I-NEXT: slti a0, a1, 0 ; RV32I-NEXT: xor a0, a3, a0 ; RV32I-NEXT: sw a1, 0(a2) ; RV32I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/ssub_sat.ll b/llvm/test/CodeGen/RISCV/ssub_sat.ll index ba4d170c719fc..5bffbbf13d09f 100644 --- a/llvm/test/CodeGen/RISCV/ssub_sat.ll +++ b/llvm/test/CodeGen/RISCV/ssub_sat.ll @@ -13,11 +13,10 @@ declare i64 @llvm.ssub.sat.i64(i64, i64) define signext i32 @func(i32 signext %x, i32 signext %y) nounwind { ; RV32-LABEL: func: ; RV32: # %bb.0: -; RV32-NEXT: mv a2, a0 -; RV32-NEXT: sgtz a3, a1 +; RV32-NEXT: slt a2, a0, a1 ; RV32-NEXT: sub a0, a0, a1 -; RV32-NEXT: slt a1, a0, a2 -; RV32-NEXT: beq a3, a1, .LBB0_2 +; RV32-NEXT: slti a1, a0, 0 +; RV32-NEXT: beq a2, a1, .LBB0_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: srai a0, a0, 31 ; RV32-NEXT: lui a1, 524288 @@ -73,11 +72,10 @@ define i64 @func2(i64 %x, i64 %y) nounwind { ; ; RV64-LABEL: func2: ; RV64: # %bb.0: -; RV64-NEXT: mv a2, a0 -; RV64-NEXT: sgtz a3, a1 +; RV64-NEXT: slt a2, a0, a1 ; RV64-NEXT: sub a0, a0, a1 -; RV64-NEXT: slt a1, a0, a2 -; RV64-NEXT: beq a3, a1, .LBB1_2 +; RV64-NEXT: slti a1, a0, 0 +; RV64-NEXT: beq a2, a1, .LBB1_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: srai a0, a0, 63 ; RV64-NEXT: li a1, -1 diff --git a/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll b/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll index 437c1e2a2e489..78cc2cb1eb4cf 100644 --- a/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll +++ b/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll @@ -13,12 +13,11 @@ declare i64 @llvm.ssub.sat.i64(i64, i64) define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind { ; RV32-LABEL: func32: ; RV32: # %bb.0: -; RV32-NEXT: mv a3, a0 -; RV32-NEXT: mul a0, a1, a2 -; RV32-NEXT: sgtz a1, a0 -; RV32-NEXT: sub a0, a3, a0 -; RV32-NEXT: slt a2, a0, a3 -; RV32-NEXT: beq a1, a2, .LBB0_2 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slt a2, a0, a1 +; RV32-NEXT: sub a0, a0, a1 +; RV32-NEXT: slti a1, a0, 0 +; RV32-NEXT: beq a2, a1, .LBB0_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: srai a0, a0, 31 ; RV32-NEXT: lui a1, 524288 @@ -77,11 +76,10 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind { ; ; RV64-LABEL: func64: ; RV64: # %bb.0: -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: sgtz a3, a2 +; RV64-NEXT: slt a1, a0, a2 ; RV64-NEXT: sub a0, a0, a2 -; RV64-NEXT: slt a1, a0, a1 -; RV64-NEXT: beq a3, a1, .LBB1_2 +; RV64-NEXT: slti a2, a0, 0 +; RV64-NEXT: beq a1, a2, .LBB1_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: srai a0, a0, 63 ; RV64-NEXT: li a1, -1 diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll index a30593d7d7afb..699e791fccf48 100644 --- a/llvm/test/CodeGen/RISCV/xaluo.ll +++ b/llvm/test/CodeGen/RISCV/xaluo.ll @@ -748,9 +748,9 @@ entry: define zeroext i1 @ssubo1.i32(i32 signext %v1, i32 signext %v2, ptr %res) { ; RV32-LABEL: ssubo1.i32: ; RV32: # %bb.0: # %entry -; RV32-NEXT: sgtz a3, a1 +; RV32-NEXT: slt a3, a0, a1 ; RV32-NEXT: sub a1, a0, a1 -; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: slti a0, a1, 0 ; RV32-NEXT: xor a0, a3, a0 ; RV32-NEXT: sw a1, 0(a2) ; RV32-NEXT: ret @@ 
-766,9 +766,9 @@ define zeroext i1 @ssubo1.i32(i32 signext %v1, i32 signext %v2, ptr %res) { ; ; RV32ZBA-LABEL: ssubo1.i32: ; RV32ZBA: # %bb.0: # %entry -; RV32ZBA-NEXT: sgtz a3, a1 +; RV32ZBA-NEXT: slt a3, a0, a1 ; RV32ZBA-NEXT: sub a1, a0, a1 -; RV32ZBA-NEXT: slt a0, a1, a0 +; RV32ZBA-NEXT: slti a0, a1, 0 ; RV32ZBA-NEXT: xor a0, a3, a0 ; RV32ZBA-NEXT: sw a1, 0(a2) ; RV32ZBA-NEXT: ret @@ -784,9 +784,9 @@ define zeroext i1 @ssubo1.i32(i32 signext %v1, i32 signext %v2, ptr %res) { ; ; RV32ZICOND-LABEL: ssubo1.i32: ; RV32ZICOND: # %bb.0: # %entry -; RV32ZICOND-NEXT: sgtz a3, a1 +; RV32ZICOND-NEXT: slt a3, a0, a1 ; RV32ZICOND-NEXT: sub a1, a0, a1 -; RV32ZICOND-NEXT: slt a0, a1, a0 +; RV32ZICOND-NEXT: slti a0, a1, 0 ; RV32ZICOND-NEXT: xor a0, a3, a0 ; RV32ZICOND-NEXT: sw a1, 0(a2) ; RV32ZICOND-NEXT: ret @@ -874,9 +874,9 @@ define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) { ; ; RV64-LABEL: ssubo.i64: ; RV64: # %bb.0: # %entry -; RV64-NEXT: sgtz a3, a1 +; RV64-NEXT: slt a3, a0, a1 ; RV64-NEXT: sub a1, a0, a1 -; RV64-NEXT: slt a0, a1, a0 +; RV64-NEXT: slti a0, a1, 0 ; RV64-NEXT: xor a0, a3, a0 ; RV64-NEXT: sd a1, 0(a2) ; RV64-NEXT: ret @@ -897,9 +897,9 @@ define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) { ; ; RV64ZBA-LABEL: ssubo.i64: ; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: sgtz a3, a1 +; RV64ZBA-NEXT: slt a3, a0, a1 ; RV64ZBA-NEXT: sub a1, a0, a1 -; RV64ZBA-NEXT: slt a0, a1, a0 +; RV64ZBA-NEXT: slti a0, a1, 0 ; RV64ZBA-NEXT: xor a0, a3, a0 ; RV64ZBA-NEXT: sd a1, 0(a2) ; RV64ZBA-NEXT: ret @@ -920,9 +920,9 @@ define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) { ; ; RV64ZICOND-LABEL: ssubo.i64: ; RV64ZICOND: # %bb.0: # %entry -; RV64ZICOND-NEXT: sgtz a3, a1 +; RV64ZICOND-NEXT: slt a3, a0, a1 ; RV64ZICOND-NEXT: sub a1, a0, a1 -; RV64ZICOND-NEXT: slt a0, a1, a0 +; RV64ZICOND-NEXT: slti a0, a1, 0 ; RV64ZICOND-NEXT: xor a0, a3, a0 ; RV64ZICOND-NEXT: sd a1, 0(a2) ; RV64ZICOND-NEXT: ret @@ -2527,9 +2527,9 @@ entry: define i32 @ssubo.select.i32(i32 signext %v1, i32 signext %v2) { ; RV32-LABEL: ssubo.select.i32: ; RV32: # %bb.0: # %entry -; RV32-NEXT: sgtz a2, a1 +; RV32-NEXT: slt a2, a0, a1 ; RV32-NEXT: sub a3, a0, a1 -; RV32-NEXT: slt a3, a3, a0 +; RV32-NEXT: slti a3, a3, 0 ; RV32-NEXT: bne a2, a3, .LBB36_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: mv a0, a1 @@ -2548,9 +2548,9 @@ define i32 @ssubo.select.i32(i32 signext %v1, i32 signext %v2) { ; ; RV32ZBA-LABEL: ssubo.select.i32: ; RV32ZBA: # %bb.0: # %entry -; RV32ZBA-NEXT: sgtz a2, a1 +; RV32ZBA-NEXT: slt a2, a0, a1 ; RV32ZBA-NEXT: sub a3, a0, a1 -; RV32ZBA-NEXT: slt a3, a3, a0 +; RV32ZBA-NEXT: slti a3, a3, 0 ; RV32ZBA-NEXT: bne a2, a3, .LBB36_2 ; RV32ZBA-NEXT: # %bb.1: # %entry ; RV32ZBA-NEXT: mv a0, a1 @@ -2569,9 +2569,9 @@ define i32 @ssubo.select.i32(i32 signext %v1, i32 signext %v2) { ; ; RV32ZICOND-LABEL: ssubo.select.i32: ; RV32ZICOND: # %bb.0: # %entry -; RV32ZICOND-NEXT: sgtz a2, a1 +; RV32ZICOND-NEXT: slt a2, a0, a1 ; RV32ZICOND-NEXT: sub a3, a0, a1 -; RV32ZICOND-NEXT: slt a3, a3, a0 +; RV32ZICOND-NEXT: slti a3, a3, 0 ; RV32ZICOND-NEXT: xor a2, a2, a3 ; RV32ZICOND-NEXT: czero.nez a1, a1, a2 ; RV32ZICOND-NEXT: czero.eqz a0, a0, a2 @@ -2597,9 +2597,9 @@ entry: define i1 @ssubo.not.i32(i32 signext %v1, i32 signext %v2) { ; RV32-LABEL: ssubo.not.i32: ; RV32: # %bb.0: # %entry -; RV32-NEXT: sgtz a2, a1 -; RV32-NEXT: sub a1, a0, a1 -; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: slt a2, a0, a1 +; RV32-NEXT: sub a0, a0, a1 +; RV32-NEXT: slti a0, a0, 0 ; RV32-NEXT: xor a0, a2, a0 ; RV32-NEXT: xori a0, a0, 1 ; RV32-NEXT: ret 
@@ -2614,9 +2614,9 @@ define i1 @ssubo.not.i32(i32 signext %v1, i32 signext %v2) { ; ; RV32ZBA-LABEL: ssubo.not.i32: ; RV32ZBA: # %bb.0: # %entry -; RV32ZBA-NEXT: sgtz a2, a1 -; RV32ZBA-NEXT: sub a1, a0, a1 -; RV32ZBA-NEXT: slt a0, a1, a0 +; RV32ZBA-NEXT: slt a2, a0, a1 +; RV32ZBA-NEXT: sub a0, a0, a1 +; RV32ZBA-NEXT: slti a0, a0, 0 ; RV32ZBA-NEXT: xor a0, a2, a0 ; RV32ZBA-NEXT: xori a0, a0, 1 ; RV32ZBA-NEXT: ret @@ -2631,9 +2631,9 @@ define i1 @ssubo.not.i32(i32 signext %v1, i32 signext %v2) { ; ; RV32ZICOND-LABEL: ssubo.not.i32: ; RV32ZICOND: # %bb.0: # %entry -; RV32ZICOND-NEXT: sgtz a2, a1 -; RV32ZICOND-NEXT: sub a1, a0, a1 -; RV32ZICOND-NEXT: slt a0, a1, a0 +; RV32ZICOND-NEXT: slt a2, a0, a1 +; RV32ZICOND-NEXT: sub a0, a0, a1 +; RV32ZICOND-NEXT: slti a0, a0, 0 ; RV32ZICOND-NEXT: xor a0, a2, a0 ; RV32ZICOND-NEXT: xori a0, a0, 1 ; RV32ZICOND-NEXT: ret @@ -2670,9 +2670,9 @@ define i64 @ssubo.select.i64(i64 %v1, i64 %v2) { ; ; RV64-LABEL: ssubo.select.i64: ; RV64: # %bb.0: # %entry -; RV64-NEXT: sgtz a2, a1 +; RV64-NEXT: slt a2, a0, a1 ; RV64-NEXT: sub a3, a0, a1 -; RV64-NEXT: slt a3, a3, a0 +; RV64-NEXT: slti a3, a3, 0 ; RV64-NEXT: bne a2, a3, .LBB38_2 ; RV64-NEXT: # %bb.1: # %entry ; RV64-NEXT: mv a0, a1 @@ -2696,9 +2696,9 @@ define i64 @ssubo.select.i64(i64 %v1, i64 %v2) { ; ; RV64ZBA-LABEL: ssubo.select.i64: ; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: sgtz a2, a1 +; RV64ZBA-NEXT: slt a2, a0, a1 ; RV64ZBA-NEXT: sub a3, a0, a1 -; RV64ZBA-NEXT: slt a3, a3, a0 +; RV64ZBA-NEXT: slti a3, a3, 0 ; RV64ZBA-NEXT: bne a2, a3, .LBB38_2 ; RV64ZBA-NEXT: # %bb.1: # %entry ; RV64ZBA-NEXT: mv a0, a1 @@ -2724,9 +2724,9 @@ define i64 @ssubo.select.i64(i64 %v1, i64 %v2) { ; ; RV64ZICOND-LABEL: ssubo.select.i64: ; RV64ZICOND: # %bb.0: # %entry -; RV64ZICOND-NEXT: sgtz a2, a1 +; RV64ZICOND-NEXT: slt a2, a0, a1 ; RV64ZICOND-NEXT: sub a3, a0, a1 -; RV64ZICOND-NEXT: slt a3, a3, a0 +; RV64ZICOND-NEXT: slti a3, a3, 0 ; RV64ZICOND-NEXT: xor a2, a2, a3 ; RV64ZICOND-NEXT: czero.nez a1, a1, a2 ; RV64ZICOND-NEXT: czero.eqz a0, a0, a2 @@ -2754,9 +2754,9 @@ define i1 @ssub.not.i64(i64 %v1, i64 %v2) { ; ; RV64-LABEL: ssub.not.i64: ; RV64: # %bb.0: # %entry -; RV64-NEXT: sgtz a2, a1 -; RV64-NEXT: sub a1, a0, a1 -; RV64-NEXT: slt a0, a1, a0 +; RV64-NEXT: slt a2, a0, a1 +; RV64-NEXT: sub a0, a0, a1 +; RV64-NEXT: slti a0, a0, 0 ; RV64-NEXT: xor a0, a2, a0 ; RV64-NEXT: xori a0, a0, 1 ; RV64-NEXT: ret @@ -2775,9 +2775,9 @@ define i1 @ssub.not.i64(i64 %v1, i64 %v2) { ; ; RV64ZBA-LABEL: ssub.not.i64: ; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: sgtz a2, a1 -; RV64ZBA-NEXT: sub a1, a0, a1 -; RV64ZBA-NEXT: slt a0, a1, a0 +; RV64ZBA-NEXT: slt a2, a0, a1 +; RV64ZBA-NEXT: sub a0, a0, a1 +; RV64ZBA-NEXT: slti a0, a0, 0 ; RV64ZBA-NEXT: xor a0, a2, a0 ; RV64ZBA-NEXT: xori a0, a0, 1 ; RV64ZBA-NEXT: ret @@ -2796,9 +2796,9 @@ define i1 @ssub.not.i64(i64 %v1, i64 %v2) { ; ; RV64ZICOND-LABEL: ssub.not.i64: ; RV64ZICOND: # %bb.0: # %entry -; RV64ZICOND-NEXT: sgtz a2, a1 -; RV64ZICOND-NEXT: sub a1, a0, a1 -; RV64ZICOND-NEXT: slt a0, a1, a0 +; RV64ZICOND-NEXT: slt a2, a0, a1 +; RV64ZICOND-NEXT: sub a0, a0, a1 +; RV64ZICOND-NEXT: slti a0, a0, 0 ; RV64ZICOND-NEXT: xor a0, a2, a0 ; RV64ZICOND-NEXT: xori a0, a0, 1 ; RV64ZICOND-NEXT: ret @@ -4196,9 +4196,9 @@ continue: define zeroext i1 @ssubo.br.i32(i32 signext %v1, i32 signext %v2) { ; RV32-LABEL: ssubo.br.i32: ; RV32: # %bb.0: # %entry -; RV32-NEXT: sgtz a2, a1 -; RV32-NEXT: sub a1, a0, a1 -; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: slt a2, a0, a1 +; RV32-NEXT: sub a0, a0, a1 +; RV32-NEXT: 
slti a0, a0, 0 ; RV32-NEXT: beq a2, a0, .LBB56_2 ; RV32-NEXT: # %bb.1: # %overflow ; RV32-NEXT: li a0, 0 @@ -4221,9 +4221,9 @@ define zeroext i1 @ssubo.br.i32(i32 signext %v1, i32 signext %v2) { ; ; RV32ZBA-LABEL: ssubo.br.i32: ; RV32ZBA: # %bb.0: # %entry -; RV32ZBA-NEXT: sgtz a2, a1 -; RV32ZBA-NEXT: sub a1, a0, a1 -; RV32ZBA-NEXT: slt a0, a1, a0 +; RV32ZBA-NEXT: slt a2, a0, a1 +; RV32ZBA-NEXT: sub a0, a0, a1 +; RV32ZBA-NEXT: slti a0, a0, 0 ; RV32ZBA-NEXT: beq a2, a0, .LBB56_2 ; RV32ZBA-NEXT: # %bb.1: # %overflow ; RV32ZBA-NEXT: li a0, 0 @@ -4246,9 +4246,9 @@ define zeroext i1 @ssubo.br.i32(i32 signext %v1, i32 signext %v2) { ; ; RV32ZICOND-LABEL: ssubo.br.i32: ; RV32ZICOND: # %bb.0: # %entry -; RV32ZICOND-NEXT: sgtz a2, a1 -; RV32ZICOND-NEXT: sub a1, a0, a1 -; RV32ZICOND-NEXT: slt a0, a1, a0 +; RV32ZICOND-NEXT: slt a2, a0, a1 +; RV32ZICOND-NEXT: sub a0, a0, a1 +; RV32ZICOND-NEXT: slti a0, a0, 0 ; RV32ZICOND-NEXT: beq a2, a0, .LBB56_2 ; RV32ZICOND-NEXT: # %bb.1: # %overflow ; RV32ZICOND-NEXT: li a0, 0 @@ -4300,9 +4300,9 @@ define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) { ; ; RV64-LABEL: ssubo.br.i64: ; RV64: # %bb.0: # %entry -; RV64-NEXT: sgtz a2, a1 -; RV64-NEXT: sub a1, a0, a1 -; RV64-NEXT: slt a0, a1, a0 +; RV64-NEXT: slt a2, a0, a1 +; RV64-NEXT: sub a0, a0, a1 +; RV64-NEXT: slti a0, a0, 0 ; RV64-NEXT: beq a2, a0, .LBB57_2 ; RV64-NEXT: # %bb.1: # %overflow ; RV64-NEXT: li a0, 0 @@ -4329,9 +4329,9 @@ define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) { ; ; RV64ZBA-LABEL: ssubo.br.i64: ; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: sgtz a2, a1 -; RV64ZBA-NEXT: sub a1, a0, a1 -; RV64ZBA-NEXT: slt a0, a1, a0 +; RV64ZBA-NEXT: slt a2, a0, a1 +; RV64ZBA-NEXT: sub a0, a0, a1 +; RV64ZBA-NEXT: slti a0, a0, 0 ; RV64ZBA-NEXT: beq a2, a0, .LBB57_2 ; RV64ZBA-NEXT: # %bb.1: # %overflow ; RV64ZBA-NEXT: li a0, 0 @@ -4358,9 +4358,9 @@ define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) { ; ; RV64ZICOND-LABEL: ssubo.br.i64: ; RV64ZICOND: # %bb.0: # %entry -; RV64ZICOND-NEXT: sgtz a2, a1 -; RV64ZICOND-NEXT: sub a1, a0, a1 -; RV64ZICOND-NEXT: slt a0, a1, a0 +; RV64ZICOND-NEXT: slt a2, a0, a1 +; RV64ZICOND-NEXT: sub a0, a0, a1 +; RV64ZICOND-NEXT: slti a0, a0, 0 ; RV64ZICOND-NEXT: beq a2, a0, .LBB57_2 ; RV64ZICOND-NEXT: # %bb.1: # %overflow ; RV64ZICOND-NEXT: li a0, 0 diff --git a/llvm/test/CodeGen/RISCV/xqcia.ll b/llvm/test/CodeGen/RISCV/xqcia.ll index c75bb9daefcf2..29beb221797f2 100644 --- a/llvm/test/CodeGen/RISCV/xqcia.ll +++ b/llvm/test/CodeGen/RISCV/xqcia.ll @@ -48,11 +48,10 @@ define i32 @addusat(i32 %a, i32 %b) { define i32 @subsat(i32 %a, i32 %b) { ; RV32I-LABEL: subsat: ; RV32I: # %bb.0: -; RV32I-NEXT: mv a2, a0 -; RV32I-NEXT: sgtz a3, a1 +; RV32I-NEXT: slt a2, a0, a1 ; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: slt a1, a0, a2 -; RV32I-NEXT: beq a3, a1, .LBB2_2 +; RV32I-NEXT: slti a1, a0, 0 +; RV32I-NEXT: beq a2, a1, .LBB2_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: srai a0, a0, 31 ; RV32I-NEXT: lui a1, 524288 diff --git a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll index bbc0ff9bd1be5..d566d0ddce6ba 100644 --- a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll +++ b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll @@ -179,47 +179,40 @@ entry: define arm_aapcs_vfpcc <2 x i64> @ssub_int64_t(<2 x i64> %src1, <2 x i64> %src2) { ; CHECK-LABEL: ssub_int64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: vmov r2, r3, d2 -; CHECK-NEXT: vmov r1, r0, d0 +; CHECK-NEXT: .save {r4, 
r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: mov.w r12, #1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: vmov r4, r5, d1 -; CHECK-NEXT: subs.w r12, r1, r2 -; CHECK-NEXT: sbc.w lr, r0, r3 -; CHECK-NEXT: subs.w r1, r12, r1 -; CHECK-NEXT: sbcs.w r0, lr, r0 -; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: sbcs.w r2, r1, r3 +; CHECK-NEXT: subs.w lr, r2, r0 +; CHECK-NEXT: sbcs.w r1, r3, r1 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: lsr.w r2, r1, #31 ; CHECK-NEXT: it lt -; CHECK-NEXT: eorlt r0, r0, #1 -; CHECK-NEXT: vmov r2, r3, d3 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: subs r6, r4, r2 -; CHECK-NEXT: sbc.w r7, r5, r3 -; CHECK-NEXT: subs r4, r6, r4 -; CHECK-NEXT: sbcs.w r4, r7, r5 -; CHECK-NEXT: vmov q0[2], q0[0], r12, r6 -; CHECK-NEXT: cset r4, lt +; CHECK-NEXT: eorlt.w r2, r12, r1, lsr #31 ; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: sbcs.w r2, r1, r3 -; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: bfi r3, r2, #0, #8 +; CHECK-NEXT: vmov r2, r0, d3 +; CHECK-NEXT: subs r2, r4, r2 +; CHECK-NEXT: sbcs.w r0, r5, r0 +; CHECK-NEXT: vmov q0[2], q0[0], lr, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: asr.w r1, r1, #31 +; CHECK-NEXT: lsr.w r4, r0, #31 ; CHECK-NEXT: it lt -; CHECK-NEXT: eorlt r4, r4, #1 -; CHECK-NEXT: rsbs r0, r4, #0 -; CHECK-NEXT: bfi r1, r0, #8, #8 -; CHECK-NEXT: asrs r0, r7, #31 -; CHECK-NEXT: vmsr p0, r1 -; CHECK-NEXT: asr.w r1, lr, #31 +; CHECK-NEXT: eorlt.w r4, r12, r0, lsr #31 +; CHECK-NEXT: asrs r0, r0, #31 ; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], lr, r7 +; CHECK-NEXT: rsbs r5, r4, #0 ; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 ; CHECK-NEXT: adr r0, .LCPI11_0 ; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: bfi r3, r5, #8, #8 +; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: veor q1, q1, q2 ; CHECK-NEXT: vpsel q0, q1, q0 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI11_0: diff --git a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll index dbfa69d497698..09c7c4b7a26f6 100644 --- a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll +++ b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll @@ -1821,67 +1821,60 @@ declare <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) define <4 x i32> @vp_ssub_sat_v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 zeroext %evl) { ; X86-LABEL: vp_ssub_sat_v4i32: ; X86: # %bb.0: -; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X86-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 -; X86-NEXT: vpsubd %xmm1, %xmm0, %xmm1 -; X86-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; X86-NEXT: vpsrad $31, %xmm1, %xmm2 -; X86-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm2 -; X86-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; X86-NEXT: vpsubd %xmm1, %xmm0, %xmm2 +; X86-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; X86-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; X86-NEXT: vpsrad $31, %xmm2, %xmm1 +; X86-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; X86-NEXT: retl ; ; SSE-LABEL: vp_ssub_sat_v4i32: ; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psubd %xmm1, %xmm3 -; SSE-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE-NEXT: pcmpgtd %xmm3, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: psrad $31, %xmm3 -; SSE-NEXT: 
pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE-NEXT: psubd %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: psrad $31, %xmm0 +; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: vp_ssub_sat_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm1 +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: vp_ssub_sat_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm2, %xmm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: vp_ssub_sat_v4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpgtd %xmm2, %xmm1, %k0 -; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 +; AVX512-NEXT: vpcmpgtd %xmm0, %xmm1, %k0 +; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 ; AVX512-NEXT: kxorw %k1, %k0, %k1 -; AVX512-NEXT: vpsrad $31, %xmm1, %xmm0 -; AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 {%k1} -; AVX512-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512-NEXT: vpsrad $31, %xmm0, %xmm1 +; AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 {%k1} ; AVX512-NEXT: retq %v = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) ret <4 x i32> %v diff --git a/llvm/test/CodeGen/X86/ssub_sat.ll b/llvm/test/CodeGen/X86/ssub_sat.ll index 8ecc8b39ac468..ee6b60c075630 100644 --- a/llvm/test/CodeGen/X86/ssub_sat.ll +++ b/llvm/test/CodeGen/X86/ssub_sat.ll @@ -207,18 +207,18 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; ; X64-LABEL: vec: ; X64: # %bb.0: -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: movdqa %xmm0, %xmm3 -; X64-NEXT: psubd %xmm1, %xmm3 -; X64-NEXT: pcmpgtd %xmm2, %xmm1 -; X64-NEXT: pcmpgtd %xmm3, %xmm0 -; X64-NEXT: pxor %xmm1, %xmm0 -; X64-NEXT: movdqa %xmm0, %xmm1 -; X64-NEXT: pandn %xmm3, %xmm1 -; X64-NEXT: psrad $31, %xmm3 -; X64-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; X64-NEXT: pand %xmm3, %xmm0 -; X64-NEXT: por 
%xmm1, %xmm0 +; X64-NEXT: movdqa %xmm1, %xmm2 +; X64-NEXT: pcmpgtd %xmm0, %xmm2 +; X64-NEXT: psubd %xmm1, %xmm0 +; X64-NEXT: pxor %xmm1, %xmm1 +; X64-NEXT: pcmpgtd %xmm0, %xmm1 +; X64-NEXT: pxor %xmm2, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm2 +; X64-NEXT: pandn %xmm0, %xmm2 +; X64-NEXT: psrad $31, %xmm0 +; X64-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: pand %xmm1, %xmm0 +; X64-NEXT: por %xmm2, %xmm0 ; X64-NEXT: retq %tmp = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y) ret <4 x i32> %tmp diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll index eb2ad4fdff92f..bdac954031fb5 100644 --- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -612,99 +612,91 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind { define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; SSE2-LABEL: v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psubd %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm1 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: psubd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v2i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: psubd %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pandn %xmm3, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 +; SSSE3-NEXT: psubd %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pandn %xmm0, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: por %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: psubd %xmm1, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrad $31, %xmm3 +; SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2 ; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v2i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: 
vpcmpgtd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm1 +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm2, %xmm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: v2i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vpsubd %xmm1, %xmm0, %xmm1 -; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX512F-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX512F-NEXT: vpsubd %xmm1, %xmm0, %xmm2 +; AVX512F-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpsrad $31, %xmm2, %xmm1 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX512F-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX512F-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-NEXT: vpcmpgtd %xmm2, %xmm1, %k0 -; AVX512BW-NEXT: vpsubd %xmm1, %xmm0, %xmm1 -; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 +; AVX512BW-NEXT: vpcmpgtd %xmm0, %xmm1, %k0 +; AVX512BW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 ; AVX512BW-NEXT: kxorw %k1, %k0, %k1 -; AVX512BW-NEXT: vpsrad $31, %xmm1, %xmm0 -; AVX512BW-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 {%k1} -; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512BW-NEXT: vpsrad $31, %xmm0, %xmm1 +; AVX512BW-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 {%k1} ; AVX512BW-NEXT: retq %z = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %x, <2 x i32> %y) ret <2 x i32> %z @@ -713,99 +705,91 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; SSE2-LABEL: v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psubd %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm1 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; 
SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: psubd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v4i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: psubd %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pandn %xmm3, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 +; SSSE3-NEXT: psubd %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pandn %xmm0, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: por %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: psubd %xmm1, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrad $31, %xmm3 +; SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2 ; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm1 +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm2, %xmm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; 
AVX2-NEXT: retq ; ; AVX512F-LABEL: v4i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vpsubd %xmm1, %xmm0, %xmm1 -; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX512F-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX512F-NEXT: vpsubd %xmm1, %xmm0, %xmm2 +; AVX512F-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpsrad $31, %xmm2, %xmm1 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX512F-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX512F-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v4i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-NEXT: vpcmpgtd %xmm2, %xmm1, %k0 -; AVX512BW-NEXT: vpsubd %xmm1, %xmm0, %xmm1 -; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 +; AVX512BW-NEXT: vpcmpgtd %xmm0, %xmm1, %k0 +; AVX512BW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 ; AVX512BW-NEXT: kxorw %k1, %k0, %k1 -; AVX512BW-NEXT: vpsrad $31, %xmm1, %xmm0 -; AVX512BW-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 {%k1} -; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512BW-NEXT: vpsrad $31, %xmm0, %xmm1 +; AVX512BW-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 {%k1} ; AVX512BW-NEXT: retq %z = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y) ret <4 x i32> %z @@ -814,145 +798,136 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { ; SSE2-LABEL: v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: psubd %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pandn %xmm5, %xmm2 -; SSE2-NEXT: psrad $31, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: psubd %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: pxor %xmm6, %xmm5 +; SSE2-NEXT: pxor %xmm6, %xmm0 ; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psubd %xmm3, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pxor %xmm6, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT: psubd %xmm3, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pxor %xmm6, %xmm1 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v8i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSSE3-NEXT: psubd %xmm2, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm4, 
%xmm2 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0 -; SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pandn %xmm5, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm5 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: psubd %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: pandn %xmm0, %xmm4 +; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: pxor %xmm6, %xmm5 +; SSSE3-NEXT: pxor %xmm6, %xmm0 ; SSSE3-NEXT: pand %xmm5, %xmm0 -; SSSE3-NEXT: por %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: psubd %xmm3, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm3, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: pandn %xmm2, %xmm3 -; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: pxor %xmm6, %xmm2 +; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 +; SSSE3-NEXT: psubd %xmm3, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pandn %xmm1, %xmm3 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pxor %xmm6, %xmm1 ; SSSE3-NEXT: pand %xmm2, %xmm1 ; SSSE3-NEXT: por %xmm3, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v8i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, %xmm5 -; SSE41-NEXT: pxor %xmm6, %xmm6 ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: psubd %xmm2, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm1 -; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; SSE41-NEXT: pxor %xmm2, %xmm1 -; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm4 -; SSE41-NEXT: movdqa %xmm5, %xmm1 -; SSE41-NEXT: psubd %xmm3, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: psrad $31, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pxor %xmm4, %xmm2 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: psrad $31, %xmm5 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSE41-NEXT: pxor %xmm6, %xmm5 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psubd %xmm3, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm3 ; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: pxor %xmm6, %xmm1 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movaps %xmm4, %xmm0 +; SSE41-NEXT: movaps %xmm2, %xmm1 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; 
AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm3 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm6 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vxorps %ymm6, %ymm0, %ymm0 +; AVX1-NEXT: vpsrad $31, %xmm5, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm4, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vblendvps %ymm0, %ymm1, %ymm3, %ymm0 +; AVX1-NEXT: vblendvps %ymm0, %ymm1, %ymm6, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtd %ymm2, %ymm1, %ymm2 -; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpsrad $31, %ymm1, %ymm2 +; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsrad $31, %ymm2, %ymm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: v8i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtd %ymm2, %ymm1, %ymm2 -; AVX512F-NEXT: vpsubd %ymm1, %ymm0, %ymm1 -; AVX512F-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpxor %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpsrad $31, %ymm1, %ymm2 +; AVX512F-NEXT: vpsubd %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrad $31, %ymm2, %ymm1 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX512F-NEXT: vpxor %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 +; AVX512F-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v8i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-NEXT: vpcmpgtd %ymm2, %ymm1, %k0 -; AVX512BW-NEXT: vpsubd %ymm1, %ymm0, %ymm1 -; AVX512BW-NEXT: vpcmpgtd %ymm1, %ymm0, %k1 +; AVX512BW-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 +; AVX512BW-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 ; AVX512BW-NEXT: kxorw %k1, %k0, %k1 -; AVX512BW-NEXT: vpsrad $31, %ymm1, %ymm0 -; AVX512BW-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1 {%k1} -; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512BW-NEXT: vpsrad $31, %ymm0, %ymm1 +; AVX512BW-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 {%k1} ; AVX512BW-NEXT: retq %z = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> %x, <8 x i32> %y) ret <8 x i32> %z @@ -961,223 +936,215 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind { ; SSE2-LABEL: v16i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: movdqa %xmm0, %xmm9 -; 
SSE2-NEXT: psubd %xmm4, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm10 -; SSE2-NEXT: pandn %xmm9, %xmm10 -; SSE2-NEXT: psrad $31, %xmm9 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: pxor %xmm4, %xmm9 +; SSE2-NEXT: movdqa %xmm4, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE2-NEXT: psubd %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pandn %xmm0, %xmm10 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: pand %xmm9, %xmm0 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm9 -; SSE2-NEXT: psubd %xmm5, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 -; SSE2-NEXT: pxor %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pandn %xmm9, %xmm5 -; SSE2-NEXT: psrad $31, %xmm9 -; SSE2-NEXT: pxor %xmm4, %xmm9 -; SSE2-NEXT: pand %xmm9, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: psubd %xmm6, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm2 -; SSE2-NEXT: pxor %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pandn %xmm5, %xmm6 -; SSE2-NEXT: psrad $31, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: psubd %xmm7, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm3 -; SSE2-NEXT: pxor %xmm7, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pandn %xmm5, %xmm6 -; SSE2-NEXT: psrad $31, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm3 -; SSE2-NEXT: por %xmm6, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm9 +; SSE2-NEXT: psubd %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm9, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pandn %xmm1, %xmm9 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: por %xmm9, %xmm1 +; SSE2-NEXT: movdqa %xmm6, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 +; SSE2-NEXT: psubd %xmm6, %xmm2 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE2-NEXT: pxor %xmm5, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm5 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: psubd %xmm7, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm3, %xmm5 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v16i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: movdqa %xmm0, %xmm9 -; SSSE3-NEXT: psubd %xmm4, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0 -; SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm10 -; SSSE3-NEXT: pandn %xmm9, %xmm10 -; SSSE3-NEXT: psrad $31, %xmm9 -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; 
SSSE3-NEXT: pxor %xmm4, %xmm9 +; SSSE3-NEXT: movdqa %xmm4, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm8 +; SSSE3-NEXT: psubd %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: pxor %xmm9, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm9 +; SSSE3-NEXT: pxor %xmm8, %xmm9 +; SSSE3-NEXT: movdqa %xmm9, %xmm10 +; SSSE3-NEXT: pandn %xmm0, %xmm10 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: pxor %xmm8, %xmm0 ; SSSE3-NEXT: pand %xmm9, %xmm0 ; SSSE3-NEXT: por %xmm10, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm9 -; SSSE3-NEXT: psubd %xmm5, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 -; SSSE3-NEXT: pxor %xmm5, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSSE3-NEXT: pandn %xmm9, %xmm5 -; SSSE3-NEXT: psrad $31, %xmm9 -; SSSE3-NEXT: pxor %xmm4, %xmm9 -; SSSE3-NEXT: pand %xmm9, %xmm1 -; SSSE3-NEXT: por %xmm5, %xmm1 -; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: psubd %xmm6, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm2 -; SSSE3-NEXT: pxor %xmm6, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm6 -; SSSE3-NEXT: pandn %xmm5, %xmm6 -; SSSE3-NEXT: psrad $31, %xmm5 -; SSSE3-NEXT: pxor %xmm4, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm2 -; SSSE3-NEXT: por %xmm6, %xmm2 -; SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-NEXT: psubd %xmm7, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm3 -; SSSE3-NEXT: pxor %xmm7, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm6 -; SSSE3-NEXT: pandn %xmm5, %xmm6 -; SSSE3-NEXT: psrad $31, %xmm5 -; SSSE3-NEXT: pxor %xmm4, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm3 -; SSSE3-NEXT: por %xmm6, %xmm3 +; SSSE3-NEXT: movdqa %xmm5, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm9 +; SSSE3-NEXT: psubd %xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 +; SSSE3-NEXT: pxor %xmm9, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm9 +; SSSE3-NEXT: pandn %xmm1, %xmm9 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pxor %xmm8, %xmm1 +; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: por %xmm9, %xmm1 +; SSSE3-NEXT: movdqa %xmm6, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 +; SSSE3-NEXT: psubd %xmm6, %xmm2 +; SSSE3-NEXT: pxor %xmm6, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 +; SSSE3-NEXT: pxor %xmm5, %xmm6 +; SSSE3-NEXT: movdqa %xmm6, %xmm5 +; SSSE3-NEXT: pandn %xmm2, %xmm5 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pxor %xmm8, %xmm2 +; SSSE3-NEXT: pand %xmm6, %xmm2 +; SSSE3-NEXT: por %xmm5, %xmm2 +; SSSE3-NEXT: movdqa %xmm7, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSSE3-NEXT: psubd %xmm7, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm5, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pandn %xmm3, %xmm5 +; SSSE3-NEXT: psrad $31, %xmm3 +; SSSE3-NEXT: pxor %xmm8, %xmm3 +; SSSE3-NEXT: pand %xmm4, %xmm3 +; SSSE3-NEXT: por %xmm5, %xmm3 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v16i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm3, %xmm11 -; SSE41-NEXT: movdqa %xmm2, %xmm10 +; SSE41-NEXT: movdqa %xmm4, %xmm8 ; SSE41-NEXT: movdqa %xmm1, %xmm9 -; SSE41-NEXT: pxor %xmm12, %xmm12 -; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: psubd %xmm4, %xmm8 -; SSE41-NEXT: pcmpgtd %xmm12, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm4, %xmm0 -; SSE41-NEXT: movdqa %xmm8, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psubd %xmm8, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pxor %xmm4, %xmm8 +; SSE41-NEXT: movdqa %xmm4, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = 
[2147483648,2147483648,2147483648,2147483648] -; SSE41-NEXT: pxor %xmm4, %xmm1 -; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm8 +; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648] +; SSE41-NEXT: pxor %xmm10, %xmm1 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm4 ; SSE41-NEXT: movdqa %xmm9, %xmm1 ; SSE41-NEXT: psubd %xmm5, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm12, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm9 -; SSE41-NEXT: pxor %xmm5, %xmm9 -; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm5 +; SSE41-NEXT: pxor %xmm1, %xmm5 +; SSE41-NEXT: movdqa %xmm1, %xmm8 +; SSE41-NEXT: psrad $31, %xmm8 +; SSE41-NEXT: pxor %xmm10, %xmm8 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm8, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm5 +; SSE41-NEXT: psubd %xmm6, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: movdqa %xmm5, %xmm2 ; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm10, %xmm2 -; SSE41-NEXT: psubd %xmm6, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm12, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm10 -; SSE41-NEXT: pxor %xmm6, %xmm10 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psrad $31, %xmm3 -; SSE41-NEXT: pxor %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm10, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm11, %xmm3 -; SSE41-NEXT: psubd %xmm7, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm12, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm11 -; SSE41-NEXT: pxor %xmm7, %xmm11 -; SSE41-NEXT: movdqa %xmm3, %xmm5 -; SSE41-NEXT: psrad $31, %xmm5 -; SSE41-NEXT: pxor %xmm4, %xmm5 -; SSE41-NEXT: movdqa %xmm11, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm3 -; SSE41-NEXT: movaps %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm2 +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm5 +; SSE41-NEXT: movdqa %xmm3, %xmm6 +; SSE41-NEXT: psubd %xmm7, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm7 +; SSE41-NEXT: pxor %xmm6, %xmm7 +; SSE41-NEXT: movdqa %xmm6, %xmm2 +; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: pxor %xmm10, %xmm2 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm6 +; SSE41-NEXT: movaps %xmm4, %xmm0 +; SSE41-NEXT: movaps %xmm5, %xmm2 +; SSE41-NEXT: movaps %xmm6, %xmm3 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v16i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpgtd %xmm5, %xmm4, %xmm6 -; AVX1-NEXT: vpcmpgtd %xmm5, %xmm2, %xmm7 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpsubd %xmm4, %xmm7, %xmm4 -; AVX1-NEXT: vpcmpgtd %xmm4, %xmm7, %xmm7 -; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %ymm0, %ymm6, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm6 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm6 +; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm8 +; AVX1-NEXT: vpcmpgtd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vxorps %ymm0, %ymm8, %ymm0 +; AVX1-NEXT: vpsrad $31, %xmm7, %xmm2 +; AVX1-NEXT: vpsrad $31, %xmm6, %xmm4 ; AVX1-NEXT: vinsertf128 $1, 
%xmm4, %ymm2, %ymm2 ; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] ; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm2 -; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm6, %ymm0 +; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm8, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm5, %xmm2, %xmm6 -; AVX1-NEXT: vpcmpgtd %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 -; AVX1-NEXT: vpsubd %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm6, %xmm6 -; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 -; AVX1-NEXT: vxorps %ymm1, %ymm5, %ymm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm5 -; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm6 +; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm8 +; AVX1-NEXT: vpcmpgtd %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vxorps %ymm1, %ymm8, %ymm1 +; AVX1-NEXT: vpsrad $31, %xmm7, %xmm2 +; AVX1-NEXT: vpsrad $31, %xmm6, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm2 -; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm5, %ymm1 +; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm8, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v16i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpgtd %ymm4, %ymm2, %ymm5 -; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm2 -; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vpsrad $31, %ymm2, %ymm5 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5 -; AVX2-NEXT: vblendvps %ymm0, %ymm5, %ymm2, %ymm0 -; AVX2-NEXT: vpcmpgtd %ymm4, %ymm3, %ymm2 -; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm3 -; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpsrad $31, %ymm3, %ymm2 -; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vblendvps %ymm1, %ymm2, %ymm3, %ymm1 +; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm4 +; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpsrad $31, %ymm4, %ymm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vpxor %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm4, %ymm0 +; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm2 +; AVX2-NEXT: vpcmpgtd %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsrad $31, %ymm2, %ymm3 +; AVX2-NEXT: vpxor %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vblendvps %ymm1, %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v16i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpgtd %zmm2, %zmm1, %k0 -; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm1 -; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 +; AVX512-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 +; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 ; AVX512-NEXT: kxorw %k1, %k0, %k1 -; AVX512-NEXT: vpsrad $31, %zmm1, %zmm0 -; AVX512-NEXT: vpxord 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-NEXT: vpsrad $31, %zmm0, %zmm1 +; AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 {%k1} ; AVX512-NEXT: retq %z = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %x, <16 x i32> %y) ret <16 x i32> %z @@ -1189,32 +1156,24 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: psubq %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -1224,32 +1183,24 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSSE3-NEXT: psubq %xmm1, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm4 -; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm2 -; 
SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pandn %xmm0, %xmm2 +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: psubq %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm3, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: pandn %xmm0, %xmm1 ; SSSE3-NEXT: por %xmm2, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq @@ -1260,23 +1211,15 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: por %xmm0, %xmm3 ; SSE41-NEXT: psubq %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pxor %xmm0, %xmm4 -; SSE41-NEXT: movdqa %xmm3, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm4 -; SSE41-NEXT: por %xmm3, %xmm4 -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm3 -; SSE41-NEXT: por %xmm1, %xmm3 -; SSE41-NEXT: pxor %xmm4, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm3 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -1287,55 +1230,47 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; ; AVX1-LABEL: v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: # xmm2 = mem[0,0] -; AVX1-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX2-NEXT: # xmm2 = mem[0,0] -; AVX2-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 +; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor 
%xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: # xmm1 = mem[0,0] +; AVX2-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: v2i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vpsubq %xmm1, %xmm0, %xmm1 -; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX512F-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX512F-NEXT: # xmm2 = mem[0,0] -; AVX512F-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX512F-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 +; AVX512F-NEXT: vpsubq %xmm1, %xmm0, %xmm2 +; AVX512F-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; AVX512F-NEXT: # xmm1 = mem[0,0] +; AVX512F-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512F-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-NEXT: vpcmpgtq %xmm2, %xmm1, %k0 -; AVX512BW-NEXT: vpsubq %xmm1, %xmm0, %xmm1 -; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 -; AVX512BW-NEXT: kxorw %k1, %k0, %k1 -; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm2, %k2 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 {%k2} = [9223372036854775807,9223372036854775807] -; AVX512BW-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} -; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512BW-NEXT: vpcmpgtq %xmm0, %xmm1, %k0 +; AVX512BW-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpcmpgtq %xmm0, %xmm1, %k1 +; AVX512BW-NEXT: kxorw %k1, %k0, %k2 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 {%k1} = [9223372036854775807,9223372036854775807] +; AVX512BW-NEXT: vmovdqa64 %xmm1, %xmm0 {%k2} ; AVX512BW-NEXT: retq %z = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %x, <2 x i64> %y) ret <2 x i64> %z @@ -1348,62 +1283,46 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] ; SSE2-NEXT: pxor %xmm5, %xmm0 ; SSE2-NEXT: psubq %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm8, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm6 ; SSE2-NEXT: pxor %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm7 -; SSE2-NEXT: pxor %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808] -; SSE2-NEXT: pxor %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; 
SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: pandn %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm0 +; SSE2-NEXT: pxor %xmm6, %xmm0 +; SSE2-NEXT: psrad $31, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808] +; SSE2-NEXT: pxor %xmm6, %xmm7 +; SSE2-NEXT: pand %xmm0, %xmm7 +; SSE2-NEXT: pandn %xmm4, %xmm0 ; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm5, %xmm2 -; SSE2-NEXT: psubq %xmm3, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm7 +; SSE2-NEXT: pxor %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm7 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm8, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pxor %xmm6, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm1, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: psubq %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm2 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: pxor %xmm6, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; @@ -1413,62 +1332,46 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] ; SSSE3-NEXT: pxor %xmm5, %xmm0 ; SSSE3-NEXT: psubq %xmm2, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm6 -; SSSE3-NEXT: pxor %xmm5, %xmm6 -; SSSE3-NEXT: movdqa %xmm0, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm0, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; SSSE3-NEXT: pand %xmm8, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm6 ; SSSE3-NEXT: pxor %xmm5, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm7 -; SSSE3-NEXT: pxor %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} 
xmm6 = [9223372036854775808,9223372036854775808] -; SSSE3-NEXT: pxor %xmm6, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSSE3-NEXT: pand %xmm7, %xmm0 -; SSSE3-NEXT: pandn %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3] +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm0 +; SSSE3-NEXT: pxor %xmm6, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm7 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808] +; SSSE3-NEXT: pxor %xmm6, %xmm7 +; SSSE3-NEXT: pand %xmm0, %xmm7 +; SSSE3-NEXT: pandn %xmm4, %xmm0 ; SSSE3-NEXT: por %xmm7, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pxor %xmm5, %xmm2 -; SSSE3-NEXT: psubq %xmm3, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSSE3-NEXT: pxor %xmm5, %xmm4 -; SSSE3-NEXT: movdqa %xmm2, %xmm7 +; SSSE3-NEXT: pxor %xmm3, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm7 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 ; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm8, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm4 -; SSSE3-NEXT: pxor %xmm5, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm5 -; SSSE3-NEXT: pxor %xmm4, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: pxor %xmm6, %xmm2 -; SSSE3-NEXT: pand %xmm5, %xmm2 -; SSSE3-NEXT: pandn %xmm1, %xmm5 -; SSSE3-NEXT: por %xmm5, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm8, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: psubq %xmm3, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm5, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm3 +; SSSE3-NEXT: pxor %xmm6, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pandn %xmm1, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, %xmm1 ; SSSE3-NEXT: retq ; @@ -1478,113 +1381,90 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm6, %xmm0 ; SSE41-NEXT: psubq %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pxor %xmm6, %xmm5 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm8 -; SSE41-NEXT: por %xmm0, %xmm8 ; SSE41-NEXT: pxor %xmm6, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm5 +; SSE41-NEXT: pand %xmm7, %xmm5 ; SSE41-NEXT: por %xmm2, %xmm5 -; SSE41-NEXT: pxor %xmm8, 
%xmm5 -; SSE41-NEXT: movapd {{.*#+}} xmm8 = [9223372036854775807,9223372036854775807] -; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movapd %xmm7, %xmm2 +; SSE41-NEXT: pxor %xmm4, %xmm5 +; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775807,9223372036854775807] +; SSE41-NEXT: movapd {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: movapd %xmm8, %xmm2 ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm2 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm2 +; SSE41-NEXT: por %xmm6, %xmm2 ; SSE41-NEXT: psubq %xmm3, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: pxor %xmm6, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm9 -; SSE41-NEXT: por %xmm0, %xmm9 -; SSE41-NEXT: pxor %xmm6, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm2 -; SSE41-NEXT: por %xmm3, %xmm2 -; SSE41-NEXT: pxor %xmm9, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm2 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 ; SSE41-NEXT: movapd %xmm4, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm5 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-NEXT: vxorpd %ymm0, %ymm4, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm4 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm5 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-NEXT: vxorpd %ymm1, %ymm4, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vblendvpd %ymm0, %ymm1, %ymm4, %ymm0 +; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, 
%ymm0 -; AVX2-NEXT: vpxor %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: v4i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 -; AVX512F-NEXT: vpsubq %ymm1, %ymm0, %ymm1 -; AVX512F-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpxor %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpsraq $63, %zmm1, %zmm2 +; AVX512F-NEXT: vpsubq %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpsraq $63, %zmm2, %zmm1 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX512F-NEXT: vpxor %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 +; AVX512F-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v4i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-NEXT: vpcmpgtq %ymm2, %ymm1, %k0 -; AVX512BW-NEXT: vpsubq %ymm1, %ymm0, %ymm1 -; AVX512BW-NEXT: vpcmpgtq %ymm1, %ymm0, %k1 +; AVX512BW-NEXT: vpcmpgtq %ymm0, %ymm1, %k0 +; AVX512BW-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpcmpgtq %ymm0, %ymm1, %k1 ; AVX512BW-NEXT: kxorw %k1, %k0, %k1 -; AVX512BW-NEXT: vpsraq $63, %ymm1, %ymm0 -; AVX512BW-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm1 {%k1} -; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512BW-NEXT: vpsraq $63, %ymm0, %ymm1 +; AVX512BW-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 {%k1} ; AVX512BW-NEXT: retq %z = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> %x, <4 x i64> %y) ret <4 x i64> %z @@ -1598,122 +1478,90 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] ; SSE2-NEXT: pxor %xmm9, %xmm0 ; SSE2-NEXT: psubq %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm10 -; SSE2-NEXT: pxor %xmm9, %xmm10 -; SSE2-NEXT: movdqa %xmm0, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] -; SSE2-NEXT: pand %xmm12, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm10 ; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm11 -; SSE2-NEXT: pxor %xmm10, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808] -; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm4, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm10 +; 
SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE2-NEXT: pand %xmm11, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm0 +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: psrad $31, %xmm11 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808] +; SSE2-NEXT: pxor %xmm10, %xmm11 +; SSE2-NEXT: pand %xmm0, %xmm11 +; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm11, %xmm0 ; SSE2-NEXT: movdqa %xmm8, %xmm1 ; SSE2-NEXT: pxor %xmm9, %xmm1 ; SSE2-NEXT: psubq %xmm5, %xmm8 -; SSE2-NEXT: movdqa %xmm8, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm1, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm11 +; SSE2-NEXT: pxor %xmm9, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm11 ; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] ; SSE2-NEXT: pand %xmm12, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm11 -; SSE2-NEXT: pxor %xmm4, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm1 -; SSE2-NEXT: pand %xmm11, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm8[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm1 +; SSE2-NEXT: pxor %xmm5, %xmm1 +; SSE2-NEXT: psrad $31, %xmm11 +; SSE2-NEXT: pxor %xmm10, %xmm11 +; SSE2-NEXT: pand %xmm1, %xmm11 +; SSE2-NEXT: pandn %xmm8, %xmm1 ; SSE2-NEXT: por %xmm11, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: psubq %xmm6, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: pxor %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm8 +; SSE2-NEXT: psubq %xmm6, %xmm2 +; SSE2-NEXT: pxor %xmm9, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm8 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 ; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm8[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pxor %xmm9, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm8, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm8 -; SSE2-NEXT: pxor %xmm5, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pxor %xmm10, %xmm4 -; SSE2-NEXT: pand %xmm8, %xmm4 -; SSE2-NEXT: pandn %xmm2, %xmm8 -; SSE2-NEXT: por %xmm8, %xmm4 +; SSE2-NEXT: 
pcmpeqd %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3] +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm5 +; SSE2-NEXT: pxor %xmm6, %xmm5 +; SSE2-NEXT: psrad $31, %xmm8 +; SSE2-NEXT: pxor %xmm10, %xmm8 +; SSE2-NEXT: pand %xmm5, %xmm8 +; SSE2-NEXT: pandn %xmm2, %xmm5 +; SSE2-NEXT: por %xmm8, %xmm5 ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pxor %xmm9, %xmm2 -; SSE2-NEXT: psubq %xmm7, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pxor %xmm7, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3] ; SSE2-NEXT: pand %xmm8, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm9, %xmm7 -; SSE2-NEXT: movdqa %xmm7, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm2 -; SSE2-NEXT: pxor %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE2-NEXT: psrad $31, %xmm5 -; SSE2-NEXT: pxor %xmm10, %xmm5 -; SSE2-NEXT: pand %xmm2, %xmm5 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm6 +; SSE2-NEXT: psubq %xmm7, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm6, %xmm4 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pxor %xmm10, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v8i64: @@ -1723,122 +1571,90 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] ; SSSE3-NEXT: pxor %xmm9, %xmm0 ; SSSE3-NEXT: psubq %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm10 -; SSSE3-NEXT: pxor %xmm9, %xmm10 -; SSSE3-NEXT: movdqa %xmm0, %xmm11 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm0, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] -; SSSE3-NEXT: pand %xmm12, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm10 ; SSSE3-NEXT: pxor %xmm9, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm11, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm11 -; SSSE3-NEXT: pxor %xmm10, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808] -; SSSE3-NEXT: pxor %xmm10, %xmm0 +; 
SSSE3-NEXT: movdqa %xmm4, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSSE3-NEXT: pand %xmm11, %xmm0 -; SSSE3-NEXT: pandn %xmm1, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3] +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm0 +; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm11 +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808] +; SSSE3-NEXT: pxor %xmm10, %xmm11 +; SSSE3-NEXT: pand %xmm0, %xmm11 +; SSSE3-NEXT: pandn %xmm1, %xmm0 ; SSSE3-NEXT: por %xmm11, %xmm0 ; SSSE3-NEXT: movdqa %xmm8, %xmm1 ; SSSE3-NEXT: pxor %xmm9, %xmm1 ; SSSE3-NEXT: psubq %xmm5, %xmm8 -; SSSE3-NEXT: movdqa %xmm8, %xmm4 -; SSSE3-NEXT: pxor %xmm9, %xmm4 -; SSSE3-NEXT: movdqa %xmm1, %xmm11 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm11 +; SSSE3-NEXT: pxor %xmm9, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm11 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm11 ; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] ; SSSE3-NEXT: pand %xmm12, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pxor %xmm9, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm11, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm11 -; SSSE3-NEXT: pxor %xmm4, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] -; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: pxor %xmm10, %xmm1 -; SSSE3-NEXT: pand %xmm11, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm8[1,1,3,3] +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm1 +; SSSE3-NEXT: pxor %xmm5, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm11 +; SSSE3-NEXT: pxor %xmm10, %xmm11 +; SSSE3-NEXT: pand %xmm1, %xmm11 +; SSSE3-NEXT: pandn %xmm8, %xmm1 ; SSSE3-NEXT: por %xmm11, %xmm1 -; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: pxor %xmm9, %xmm4 -; SSSE3-NEXT: psubq %xmm6, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, %xmm5 ; SSSE3-NEXT: pxor %xmm9, %xmm5 -; SSSE3-NEXT: movdqa %xmm4, %xmm8 +; SSSE3-NEXT: psubq %xmm6, %xmm2 +; SSSE3-NEXT: pxor %xmm9, %xmm6 +; SSSE3-NEXT: movdqa %xmm6, %xmm8 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 ; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm8[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm11, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: pxor %xmm9, %xmm6 -; SSSE3-NEXT: movdqa %xmm6, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: pand %xmm8, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm8 -; SSSE3-NEXT: pxor %xmm5, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSSE3-NEXT: 
psrad $31, %xmm4 -; SSSE3-NEXT: pxor %xmm10, %xmm4 -; SSSE3-NEXT: pand %xmm8, %xmm4 -; SSSE3-NEXT: pandn %xmm2, %xmm8 -; SSSE3-NEXT: por %xmm8, %xmm4 +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSSE3-NEXT: pand %xmm11, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3] +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm5 +; SSSE3-NEXT: pxor %xmm6, %xmm5 +; SSSE3-NEXT: psrad $31, %xmm8 +; SSSE3-NEXT: pxor %xmm10, %xmm8 +; SSSE3-NEXT: pand %xmm5, %xmm8 +; SSSE3-NEXT: pandn %xmm2, %xmm5 +; SSSE3-NEXT: por %xmm8, %xmm5 ; SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSSE3-NEXT: pxor %xmm9, %xmm2 -; SSSE3-NEXT: psubq %xmm7, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-NEXT: pxor %xmm9, %xmm5 -; SSSE3-NEXT: movdqa %xmm2, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 +; SSSE3-NEXT: pxor %xmm7, %xmm9 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3] ; SSSE3-NEXT: pand %xmm8, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm5 -; SSSE3-NEXT: pxor %xmm9, %xmm7 -; SSSE3-NEXT: movdqa %xmm7, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm2 -; SSSE3-NEXT: pxor %xmm5, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSSE3-NEXT: psrad $31, %xmm5 -; SSSE3-NEXT: pxor %xmm10, %xmm5 -; SSSE3-NEXT: pand %xmm2, %xmm5 -; SSSE3-NEXT: pandn %xmm3, %xmm2 -; SSSE3-NEXT: por %xmm2, %xmm5 -; SSSE3-NEXT: movdqa %xmm4, %xmm2 -; SSSE3-NEXT: movdqa %xmm5, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm6 +; SSSE3-NEXT: psubq %xmm7, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pxor %xmm6, %xmm4 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pxor %xmm10, %xmm2 +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: pandn %xmm3, %xmm4 +; SSSE3-NEXT: por %xmm2, %xmm4 +; SSSE3-NEXT: movdqa %xmm5, %xmm2 +; SSSE3-NEXT: movdqa %xmm4, %xmm3 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v8i64: @@ -1847,22 +1663,14 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm10 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm10, %xmm0 ; SSE41-NEXT: psubq %xmm4, %xmm8 -; SSE41-NEXT: movdqa %xmm8, %xmm9 -; SSE41-NEXT: pxor %xmm10, %xmm9 -; SSE41-NEXT: movdqa %xmm0, %xmm11 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm11 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm11, %xmm12 -; SSE41-NEXT: por %xmm0, %xmm12 ; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm11 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm11 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm9 +; SSE41-NEXT: pand %xmm11, %xmm9 ; SSE41-NEXT: por %xmm4, %xmm9 -; SSE41-NEXT: pxor %xmm12, %xmm9 +; SSE41-NEXT: pxor %xmm8, %xmm9 ; 
SSE41-NEXT: movapd {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807] ; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808] ; SSE41-NEXT: movapd %xmm11, %xmm4 @@ -1873,22 +1681,14 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm10, %xmm0 ; SSE41-NEXT: psubq %xmm5, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm9, %xmm13 -; SSE41-NEXT: por %xmm0, %xmm13 ; SSE41-NEXT: pxor %xmm10, %xmm5 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm5 +; SSE41-NEXT: movdqa %xmm5, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm4 +; SSE41-NEXT: pand %xmm9, %xmm4 ; SSE41-NEXT: por %xmm5, %xmm4 -; SSE41-NEXT: pxor %xmm13, %xmm4 +; SSE41-NEXT: pxor %xmm1, %xmm4 ; SSE41-NEXT: movapd %xmm11, %xmm5 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5 @@ -1897,22 +1697,14 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pxor %xmm10, %xmm0 ; SSE41-NEXT: psubq %xmm6, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm9 -; SSE41-NEXT: por %xmm0, %xmm9 ; SSE41-NEXT: pxor %xmm10, %xmm6 -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm4 +; SSE41-NEXT: pand %xmm5, %xmm4 ; SSE41-NEXT: por %xmm6, %xmm4 -; SSE41-NEXT: pxor %xmm9, %xmm4 +; SSE41-NEXT: pxor %xmm2, %xmm4 ; SSE41-NEXT: movapd %xmm11, %xmm5 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5 @@ -1920,23 +1712,15 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm10 +; SSE41-NEXT: movdqa %xmm10, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm4 +; SSE41-NEXT: por %xmm10, %xmm4 ; SSE41-NEXT: psubq %xmm7, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm6 -; SSE41-NEXT: por %xmm0, %xmm6 -; SSE41-NEXT: pxor %xmm10, %xmm7 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm4 -; SSE41-NEXT: por %xmm7, %xmm4 -; SSE41-NEXT: pxor %xmm6, %xmm4 +; SSE41-NEXT: pxor %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm11 ; SSE41-NEXT: movdqa %xmm4, %xmm0 @@ -1946,74 +1730,65 
@@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; ; AVX1-LABEL: v8i64: ; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm7 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpsubq %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm7 -; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 -; AVX1-NEXT: vxorpd %ymm0, %ymm6, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm6 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX1-NEXT: vxorpd %ymm5, %ymm2, %ymm2 -; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm6, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm7 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpsubq %xmm2, %xmm7, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm7, %xmm7 -; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 -; AVX1-NEXT: vxorpd %ymm1, %ymm6, %ymm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm6 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vxorpd %ymm5, %ymm2, %ymm2 -; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm6, %ymm1 +; AVX1-NEXT: vpsubq %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm7 +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm5, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 +; AVX1-NEXT: vxorpd %ymm2, %ymm6, %ymm6 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX1-NEXT: vxorpd %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vblendvpd %ymm6, %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm6, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm8 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm6 +; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm5, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-NEXT: vxorpd %ymm3, %ymm7, %ymm5 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vxorpd %ymm4, %ymm3, %ymm2 +; AVX1-NEXT: vblendvpd %ymm5, %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v8i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm4 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm5 -; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm2 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm5 ; AVX2-NEXT: 
vpbroadcastq {{.*#+}} ymm6 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5 -; AVX2-NEXT: vblendvpd %ymm0, %ymm5, %ymm2, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm2 -; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm3 -; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm2 +; AVX2-NEXT: vblendvpd %ymm0, %ymm5, %ymm4, %ymm0 +; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm4 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm1 +; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm4, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v8i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpgtq %zmm2, %zmm1, %k0 -; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm1 -; AVX512-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 +; AVX512-NEXT: vpcmpgtq %zmm0, %zmm1, %k0 +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 ; AVX512-NEXT: kxorw %k1, %k0, %k1 -; AVX512-NEXT: vpsraq $63, %zmm1, %zmm0 -; AVX512-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-NEXT: vpsraq $63, %zmm0, %zmm1 +; AVX512-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 {%k1} ; AVX512-NEXT: retq %z = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> %x, <8 x i64> %y) ret <8 x i64> %z diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll index 746c09e5e70db..57811c0eb8233 100644 --- a/llvm/test/CodeGen/X86/vec_ssubo.ll +++ b/llvm/test/CodeGen/X86/vec_ssubo.ll @@ -43,20 +43,21 @@ define <1 x i32> @ssubo_v1i32(<1 x i32> %a0, <1 x i32> %a1, ptr %p2) nounwind { define <2 x i32> @ssubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) nounwind { ; SSE-LABEL: ssubo_v2i32: ; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psubd %xmm1, %xmm3 -; SSE-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE-NEXT: pcmpgtd %xmm3, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: movq %xmm3, (%rdi) +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE-NEXT: psubd %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: movq %xmm0, (%rdi) +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: ssubo_v2i32: ; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 +; AVX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm2 ; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vmovq %xmm1, (%rdi) @@ -64,9 +65,9 @@ define <2 x i32> @ssubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) nounwind { ; ; AVX512-LABEL: ssubo_v2i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpgtd %xmm2, %xmm1, %k0 +; AVX512-NEXT: vpcmpgtd %xmm0, %xmm1, %k0 ; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 ; AVX512-NEXT: kxorw %k1, %k0, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 @@ -84,47 +85,50 @@ define <2 x i32> @ssubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) nounwind { define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; 
SSE2-LABEL: ssubo_v3i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psubd %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: movq %xmm3, (%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; SSE2-NEXT: movd %xmm1, 8(%rdi) +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: psubd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: movq %xmm0, (%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE2-NEXT: movd %xmm0, 8(%rdi) +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: ssubo_v3i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: psubd %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: movq %xmm3, (%rdi) -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; SSSE3-NEXT: movd %xmm1, 8(%rdi) +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 +; SSSE3-NEXT: psubd %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm1 +; SSSE3-NEXT: movq %xmm0, (%rdi) +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSSE3-NEXT: movd %xmm0, 8(%rdi) +; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: ssubo_v3i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psubd %xmm1, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: pextrd $2, %xmm3, 8(%rdi) -; SSE41-NEXT: movq %xmm3, (%rdi) +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: psubd %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm1 +; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdi) +; SSE41-NEXT: movq %xmm0, (%rdi) +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: ssubo_v3i32: ; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 +; AVX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm2 ; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vpextrd $2, %xmm1, 8(%rdi) @@ -133,9 +137,9 @@ define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; ; AVX512-LABEL: ssubo_v3i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpgtd %xmm2, %xmm1, %k0 +; AVX512-NEXT: vpcmpgtd %xmm0, %xmm1, %k0 ; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 ; AVX512-NEXT: kxorw %k1, %k0, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 @@ -154,20 +158,21 @@ define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { define <4 x i32> @ssubo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind { ; SSE-LABEL: ssubo_v4i32: ; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psubd %xmm1, %xmm3 -; SSE-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE-NEXT: pcmpgtd %xmm3, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm3, (%rdi) +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE-NEXT: psubd %xmm1, %xmm0 
+; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm0, (%rdi) +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: ssubo_v4i32: ; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 +; AVX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm2 ; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vmovdqa %xmm1, (%rdi) @@ -175,9 +180,9 @@ define <4 x i32> @ssubo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind { ; ; AVX512-LABEL: ssubo_v4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpgtd %xmm2, %xmm1, %k0 +; AVX512-NEXT: vpcmpgtd %xmm0, %xmm1, %k0 ; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 ; AVX512-NEXT: kxorw %k1, %k0, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 @@ -199,39 +204,40 @@ define <6 x i32> @ssubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: movd %r8d, %xmm0 +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movd %edx, %xmm3 +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: movd %r8d, %xmm1 -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: movd %edx, %xmm1 -; SSE2-NEXT: movd %esi, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE2-NEXT: movd %r9d, %xmm1 ; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: psubd %xmm0, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: psubd %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psubd %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movq %xmm3, 16(%rcx) -; SSE2-NEXT: movdqa %xmm4, (%rcx) +; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; 
SSE2-NEXT: psubd %xmm3, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: movq %xmm1, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: movq %xmm2, 16(%rdi) -; SSE2-NEXT: movdqa %xmm0, (%rdi) +; SSE2-NEXT: movdqa %xmm5, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: ssubo_v6i32: @@ -240,97 +246,99 @@ define <6 x i32> @ssubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSSE3-NEXT: movd %r8d, %xmm0 +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movd %edx, %xmm3 +; SSSE3-NEXT: movd %esi, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: movd %r8d, %xmm1 -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSSE3-NEXT: movd %edx, %xmm1 -; SSSE3-NEXT: movd %esi, %xmm3 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSSE3-NEXT: movd %r9d, %xmm1 ; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: psubd %xmm0, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: psubd %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm2 ; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0 -; SSSE3-NEXT: pxor %xmm3, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: psubd %xmm2, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm2 -; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: movq %xmm3, 16(%rcx) -; SSSE3-NEXT: movdqa %xmm4, (%rcx) +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 +; SSSE3-NEXT: psubd %xmm3, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: movq %xmm1, 16(%rcx) +; SSSE3-NEXT: movdqa %xmm0, (%rcx) ; SSSE3-NEXT: movq %xmm2, 16(%rdi) -; SSSE3-NEXT: movdqa %xmm0, (%rdi) +; SSSE3-NEXT: movdqa %xmm5, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: ssubo_v6i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movq %rdi, %rax -; SSE41-NEXT: movd %esi, %xmm1 -; SSE41-NEXT: pinsrd $1, %edx, %xmm1 -; SSE41-NEXT: pinsrd $2, %ecx, %xmm1 -; SSE41-NEXT: pinsrd $3, %r8d, %xmm1 -; SSE41-NEXT: movd %r9d, %xmm0 -; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm0 -; SSE41-NEXT: movd {{.*#+}} xmm2 = 
mem[0],zero,zero,zero +; SSE41-NEXT: movd %esi, %xmm0 +; SSE41-NEXT: pinsrd $1, %edx, %xmm0 +; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 +; SSE41-NEXT: pinsrd $3, %r8d, %xmm0 +; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm1 +; SSE41-NEXT: movd %r9d, %xmm2 ; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm2 ; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm3 ; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm3 ; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm3 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: psubd %xmm3, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: psubd %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: pxor %xmm5, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm3 -; SSE41-NEXT: pxor %xmm1, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psubd %xmm2, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movq %xmm1, 16(%rcx) -; SSE41-NEXT: movdqa %xmm4, (%rcx) -; SSE41-NEXT: movq %xmm0, 16(%rdi) -; SSE41-NEXT: movdqa %xmm3, (%rdi) +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pxor %xmm4, %xmm5 +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE41-NEXT: psubd %xmm1, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE41-NEXT: pxor %xmm4, %xmm3 +; SSE41-NEXT: movq %xmm2, 16(%rcx) +; SSE41-NEXT: movdqa %xmm0, (%rcx) +; SSE41-NEXT: movq %xmm3, 16(%rdi) +; SSE41-NEXT: movdqa %xmm5, (%rdi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: ssubo_v6i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm5 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-NEXT: vxorps %ymm0, %ymm4, %ymm0 ; AVX1-NEXT: vmovq %xmm2, 16(%rdi) ; AVX1-NEXT: vmovdqa %xmm1, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: ssubo_v6i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtd %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -340,9 +348,9 @@ define <6 x i32> @ssubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; ; AVX512-LABEL: ssubo_v6i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpgtd %ymm2, %ymm1, %k0 +; AVX512-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 ; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm1 +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k1 ; 
AVX512-NEXT: kxorw %k1, %k0, %k1 ; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 @@ -362,44 +370,47 @@ define <6 x i32> @ssubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { define <8 x i32> @ssubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind { ; SSE-LABEL: ssubo_v8i32: ; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE-NEXT: psubd %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm2 ; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: psubd %xmm2, %xmm5 -; SSE-NEXT: pcmpgtd %xmm4, %xmm2 -; SSE-NEXT: pcmpgtd %xmm5, %xmm0 -; SSE-NEXT: pxor %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: psubd %xmm3, %xmm2 -; SSE-NEXT: pcmpgtd %xmm4, %xmm3 -; SSE-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE-NEXT: pxor %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm2, 16(%rdi) -; SSE-NEXT: movdqa %xmm5, (%rdi) +; SSE-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE-NEXT: pxor %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE-NEXT: psubd %xmm3, %xmm1 +; SSE-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE-NEXT: pxor %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm1, 16(%rdi) +; SSE-NEXT: movdqa %xmm0, (%rdi) +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: ssubo_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm5 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-NEXT: vxorps %ymm0, %ymm4, %ymm0 ; AVX1-NEXT: vmovdqa %xmm2, 16(%rdi) ; AVX1-NEXT: vmovdqa %xmm1, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: ssubo_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtd %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqa %ymm1, (%rdi) @@ -407,9 +418,9 @@ define <8 x i32> @ssubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind { ; ; AVX512-LABEL: ssubo_v8i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpgtd %ymm2, %ymm1, %k0 +; AVX512-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 ; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm1 +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k1 ; AVX512-NEXT: kxorw %k1, %k0, %k1 ; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 @@ -427,57 +438,64 @@ define <8 x i32> @ssubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind { define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwind { ; SSE-LABEL: ssubo_v16i32: ; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm9, %xmm9 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: psubd %xmm4, %xmm8 -; SSE-NEXT: pcmpgtd %xmm9, %xmm4 -; 
SSE-NEXT: pcmpgtd %xmm8, %xmm0 -; SSE-NEXT: pxor %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: psubd %xmm5, %xmm4 -; SSE-NEXT: pcmpgtd %xmm9, %xmm5 -; SSE-NEXT: pcmpgtd %xmm4, %xmm1 -; SSE-NEXT: pxor %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: psubd %xmm6, %xmm5 -; SSE-NEXT: pcmpgtd %xmm9, %xmm6 -; SSE-NEXT: pcmpgtd %xmm5, %xmm2 -; SSE-NEXT: pxor %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: psubd %xmm7, %xmm6 -; SSE-NEXT: pcmpgtd %xmm9, %xmm7 -; SSE-NEXT: pcmpgtd %xmm6, %xmm3 -; SSE-NEXT: pxor %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm6, 48(%rdi) -; SSE-NEXT: movdqa %xmm5, 32(%rdi) -; SSE-NEXT: movdqa %xmm4, 16(%rdi) -; SSE-NEXT: movdqa %xmm8, (%rdi) +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE-NEXT: psubd %xmm4, %xmm0 +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: pxor %xmm8, %xmm8 +; SSE-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE-NEXT: pxor %xmm9, %xmm8 +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pcmpgtd %xmm1, %xmm9 +; SSE-NEXT: psubd %xmm5, %xmm1 +; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE-NEXT: pxor %xmm9, %xmm5 +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: pcmpgtd %xmm2, %xmm9 +; SSE-NEXT: psubd %xmm6, %xmm2 +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE-NEXT: pxor %xmm9, %xmm6 +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: pcmpgtd %xmm3, %xmm9 +; SSE-NEXT: psubd %xmm7, %xmm3 +; SSE-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE-NEXT: pxor %xmm9, %xmm4 +; SSE-NEXT: movdqa %xmm3, 48(%rdi) +; SSE-NEXT: movdqa %xmm2, 32(%rdi) +; SSE-NEXT: movdqa %xmm1, 16(%rdi) +; SSE-NEXT: movdqa %xmm0, (%rdi) +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: retq ; ; AVX1-LABEL: ssubo_v16i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm6 +; AVX1-NEXT: vpsubd %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpgtd %xmm5, %xmm4, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpsubd %xmm4, %xmm7, %xmm4 -; AVX1-NEXT: vpcmpgtd %xmm4, %xmm7, %xmm7 +; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm7 ; AVX1-NEXT: vpxor %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpcmpgtd %xmm5, %xmm3, %xmm7 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm7 ; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm5, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm7, %xmm1 ; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 -; AVX1-NEXT: vpcmpgtd %xmm5, %xmm6, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm8 -; AVX1-NEXT: vpsubd %xmm6, %xmm8, %xmm6 -; AVX1-NEXT: vpcmpgtd %xmm6, %xmm8, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 +; AVX1-NEXT: vpcmpgtd %xmm6, %xmm7, %xmm8 +; AVX1-NEXT: vpsubd %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpcmpgtd %xmm6, %xmm5, %xmm7 ; AVX1-NEXT: vpxor %xmm7, %xmm8, %xmm7 -; AVX1-NEXT: vpcmpgtd %xmm5, %xmm2, %xmm5 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm8 ; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm5, %xmm0 +; AVX1-NEXT: vpxor %xmm0, %xmm8, %xmm0 ; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm5 @@ -497,18 +515,18 @@ define <16 x i32> 
@ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin ; ; AVX2-LABEL: ssubo_v16i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpgtd %ymm4, %ymm3, %ymm5 +; AVX2-NEXT: vpcmpgtd %ymm1, %ymm3, %ymm4 ; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm3 -; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm1, %ymm5, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-NEXT: vpackssdw %xmm5, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpgtd %ymm4, %ymm2, %ymm4 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm5 +; AVX2-NEXT: vpxor %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-NEXT: vpackssdw %xmm5, %xmm4, %xmm4 +; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm5 ; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm2 -; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtd %ymm2, %ymm1, %ymm0 +; AVX2-NEXT: vpxor %ymm0, %ymm5, %ymm0 +; AVX2-NEXT: vpacksswb %xmm4, %xmm4, %xmm1 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28] ; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi) @@ -517,9 +535,9 @@ define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin ; ; AVX512-LABEL: ssubo_v16i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpgtd %zmm2, %zmm1, %k0 +; AVX512-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 ; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm1 +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 ; AVX512-NEXT: kxorw %k1, %k0, %k1 ; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 @@ -761,26 +779,22 @@ define <2 x i32> @ssubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; SSE2-NEXT: movdqa %xmm0, (%rdi) -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: ssubo_v2i64: @@ -788,26 +802,22 @@ define <2 x i32> @ssubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: pxor %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, 
%xmm4 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm3 ; SSSE3-NEXT: psubq %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm3, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; SSSE3-NEXT: movdqa %xmm0, (%rdi) -; SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm3 -; SSSE3-NEXT: pxor %xmm2, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm3, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: ssubo_v2i64: @@ -815,33 +825,29 @@ define <2 x i32> @ssubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: pxor %xmm2, %xmm3 +; SSE41-NEXT: pxor %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm2, %xmm3 ; SSE41-NEXT: psubq %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE41-NEXT: pxor %xmm3, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; SSE41-NEXT: movdqa %xmm0, (%rdi) -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm0, %xmm3 -; SSE41-NEXT: pxor %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE41-NEXT: retq ; ; AVX-LABEL: ssubo_v2i64: ; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 +; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -850,9 +856,9 @@ define <2 x i32> @ssubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { ; ; AVX512-LABEL: ssubo_v2i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpgtq %xmm2, %xmm1, %k0 +; AVX512-NEXT: vpcmpgtq %xmm0, %xmm1, %k0 ; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; 
AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 ; AVX512-NEXT: kxorw %k1, %k0, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
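
Aside (not part of the patch): the regenerated CHECK lines above all follow the same pattern per lane — compare the operands against each other (`pcmpgtd`/`vpcmpgtq` of LHS vs. RHS), compare the wrapped difference against zero, and XOR the two masks to get the overflow bit. A minimal host-side sketch of that identity is below; it assumes a GCC/Clang-style `__builtin_sub_overflow` as the reference oracle, and the function and variable names (`ssuboViaCompares`, `Samples`) are illustrative only.

```cpp
// Sanity-check the overflow rule the updated checks encode:
// signed X - Y overflows iff (X < Y) XOR ((X - Y, wrapped) < 0).
#include <cstdint>
#include <cstdio>

static bool ssuboViaCompares(int32_t X, int32_t Y) {
  // Wrapping subtract, matching what psubd/vpsubd produce per lane.
  int32_t Res = static_cast<int32_t>(static_cast<uint32_t>(X) -
                                     static_cast<uint32_t>(Y));
  return (X < Y) ^ (Res < 0);
}

int main() {
  const int32_t Samples[] = {INT32_MIN, INT32_MIN + 1, -7, -1, 0,
                             1, 42, INT32_MAX - 1, INT32_MAX};
  for (int32_t X : Samples)
    for (int32_t Y : Samples) {
      int32_t Ignored;
      // GCC/Clang builtin used purely as a reference for the expected bit.
      bool Expected = __builtin_sub_overflow(X, Y, &Ignored);
      if (Expected != ssuboViaCompares(X, Y)) {
        std::printf("mismatch: X=%d Y=%d\n", X, Y);
        return 1;
      }
    }
  std::puts("compare-based ssubo matches __builtin_sub_overflow");
  return 0;
}
```

If there is no overflow, `Res` equals the true difference, so both compares agree and the XOR is zero; on overflow the wrapped result changes sign relative to the true difference, so exactly one compare fires — which is why the new lowering needs no compare against the pre-subtraction LHS.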