@@ -554,12 +554,13 @@ let OtherPredicates = [isGFX10Plus, Has16BitInsts], True16Predicate = NotHasTrue
554554 defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_U16_gfx9_e64>;
555555} // End OtherPredicates = [isGFX10Plus, Has16BitInsts], True16Predicate = NotHasTrue16BitInsts
556556
557- class ThreeOpFragSDAG<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
557+ class ThreeOpFragSDAG<SDPatternOperator op1, SDPatternOperator op2, bit op1IsRight = 0 > : PatFrag<
558558 (ops node:$x, node:$y, node:$z),
559559 // When the inner operation is used multiple times, selecting 3-op
560560 // instructions may still be beneficial -- if the other users can be
561561 // combined similarly. Let's be conservative for now.
562- (op2 (HasOneUseBinOp<op1> node:$x, node:$y), node:$z),
562+ !if(op1IsRight, (op2 node:$z, (HasOneUseBinOp<op1> node:$x, node:$y)),
563+ (op2 (HasOneUseBinOp<op1> node:$x, node:$y), node:$z)),
563564 [{
564565 // Only use VALU ops when the result is divergent.
565566 if (!N->isDivergent())
@@ -586,7 +587,10 @@ class ThreeOpFragSDAG<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
586587 let PredicateCodeUsesOperands = 1;
587588}
588589
589- class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : ThreeOpFragSDAG<op1, op2> {
590+ // Matches (op2 (op1 x, y), z) if op1IsRight = 0 and
591+ // matches (op2 z, (op1 x, y)) if op1IsRight = 1.
592+ class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2,
593+ bit op1IsRight = 0> : ThreeOpFragSDAG<op1, op2, op1IsRight> {
590594 // The divergence predicate is irrelevant in GlobalISel, as we have
591595 // proper register bank checks. We just need to verify the constant
592596 // bus restriction when all the sources are considered.
@@ -938,12 +942,19 @@ def : GCNPat<
938942 (DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1),
939943 (V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>;
940944
941- let SubtargetPredicate = HasLshlAddU64Inst in
945+ let SubtargetPredicate = HasLshlAddU64Inst in {
942946def : GCNPat<
943947 (ThreeOpFrag<shl_0_to_4, add> i64:$src0, i32:$src1, i64:$src2),
944948 (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2)
945949>;
946950
951+ def : GCNPat <
952+ // (ptradd z, (shl x, y)) -> ((x << y) + z)
953+ (ThreeOpFrag<shl_0_to_4, ptradd, /*op1IsRight=*/1> i64:$src0, i32:$src1, i64:$src2),
954+ (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2)
955+ >;
956+ } // End SubtargetPredicate = HasLshlAddU64Inst
957+
947958let SubtargetPredicate = HasAddMinMaxInsts in {
948959def : ThreeOp_i32_Pats<add, smax, V_ADD_MAX_I32_e64>;
949960def : ThreeOp_i32_Pats<add, umax, V_ADD_MAX_U32_e64>;
@@ -1019,19 +1030,24 @@ multiclass IMAD32_Pats <VOP3_Pseudo inst> {
10191030
10201031// Handle cases where amdgpu-codegenprepare-mul24 made a mul24 instead of a normal mul.
10211032// We need to separate this because otherwise OtherPredicates would be overridden.
1022- class IMAD32_Mul24_Pat<VOP3_Pseudo inst>: GCNPat <
1023- (i64 (add (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2)),
1024- (inst $src0, $src1, $src2, 0 /* clamp */)
1025- >;
1033+ class IMAD32_Mul24_Pats_Impl<VOP3_Pseudo inst, SDPatternOperator AddOp, bit mulIsRight = 0> : GCNPat <
1034+ !if(mulIsRight, (i64 (AddOp i64:$src2, (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)))),
1035+ (i64 (AddOp (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2))),
1036+ (inst $src0, $src1, $src2, 0 /* clamp */)>;
1037+
1038+ multiclass IMAD32_Mul24_Pats<VOP3_Pseudo inst> {
1039+ def : IMAD32_Mul24_Pats_Impl<inst, add>;
1040+ def : IMAD32_Mul24_Pats_Impl<inst, ptradd, /*mulIsRight=*/1>;
1041+ }
10261042
10271043// exclude pre-GFX9 where it was slow
10281044let OtherPredicates = [HasNotMADIntraFwdBug], SubtargetPredicate = isGFX9Plus in {
10291045 defm : IMAD32_Pats<V_MAD_U64_U32_e64>;
1030- def : IMAD32_Mul24_Pat <V_MAD_U64_U32_e64>;
1046+ defm : IMAD32_Mul24_Pats <V_MAD_U64_U32_e64>;
10311047}
10321048let OtherPredicates = [HasMADIntraFwdBug], SubtargetPredicate = isGFX11Only in {
10331049 defm : IMAD32_Pats<V_MAD_U64_U32_gfx11_e64>;
1034- def : IMAD32_Mul24_Pat <V_MAD_U64_U32_gfx11_e64>;
1050+ defm : IMAD32_Mul24_Pats <V_MAD_U64_U32_gfx11_e64>;
10351051}
10361052
10371053def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3_OPSEL> {
0 commit comments