Skip to content

Commit dba82e5

Browse files
committed
Use ptradd_commutative PatFrags to make ptradd commutable in some DAG patterns
1 parent b787cb8 commit dba82e5

File tree

2 files changed

+75
-38
lines changed

2 files changed

+75
-38
lines changed

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,12 @@
99
def BITOP3_32 : ComplexPattern<i32, 4, "SelectBITOP3", [and, or, xor]>;
1010
def BITOP3_16 : ComplexPattern<i16, 4, "SelectBITOP3", [and, or, xor]>;
1111

12+
// Matches PTRADD as a commutative operation. Patterns using this PatFrag must
13+
// set GISelShouldIgnore = 1 as commuting the corresponding G_PTR_ADD is
14+
// invalid.
15+
def ptradd_commutative : PatFrags<(ops node:$src0, node:$src1),
16+
[(ptradd node:$src0, node:$src1), (ptradd node:$src1, node:$src0)]>;
17+
1218
// Special case for v_div_fmas_{f32|f64}, since it seems to be the
1319
// only VOP instruction that implicitly reads VCC.
1420
let Asm64 = " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod" in {
@@ -514,13 +520,12 @@ let OtherPredicates = [isGFX10Plus, Has16BitInsts], True16Predicate = NotHasTrue
514520
defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_U16_gfx9_e64>;
515521
} // End OtherPredicates = [isGFX10Plus, Has16BitInsts], True16Predicate = NotHasTrue16BitInsts
516522

517-
class ThreeOpFragSDAG<SDPatternOperator op1, SDPatternOperator op2, bit op1IsRight = 0> : PatFrag<
523+
class ThreeOpFragSDAG<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
518524
(ops node:$x, node:$y, node:$z),
519525
// When the inner operation is used multiple times, selecting 3-op
520526
// instructions may still be beneficial -- if the other users can be
521527
// combined similarly. Let's be conservative for now.
522-
!if(op1IsRight, (op2 node:$z, (HasOneUseBinOp<op1> node:$x, node:$y)),
523-
(op2 (HasOneUseBinOp<op1> node:$x, node:$y), node:$z)),
528+
(op2 (HasOneUseBinOp<op1> node:$x, node:$y), node:$z),
524529
[{
525530
// Only use VALU ops when the result is divergent.
526531
if (!N->isDivergent())
@@ -547,10 +552,7 @@ class ThreeOpFragSDAG<SDPatternOperator op1, SDPatternOperator op2, bit op1IsRig
547552
let PredicateCodeUsesOperands = 1;
548553
}
549554

550-
// Matches (op2 (op1 x, y), z) if op1IsRight = 0 and
551-
// matches (op2 z, (op1, x, y)) if op1IsRight = 1.
552-
class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2,
553-
bit op1IsRight = 0> : ThreeOpFragSDAG<op1, op2, op1IsRight> {
555+
class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : ThreeOpFragSDAG<op1, op2> {
554556
// The divergence predicate is irrelevant in GlobalISel, as we have
555557
// proper register bank checks. We just need to verify the constant
556558
// bus restriction when all the sources are considered.
@@ -847,10 +849,11 @@ def : GCNPat<
847849
>;
848850

849851
def : GCNPat <
850-
// (ptradd z, (shl x, y)) -> ((x << y) + z)
851-
(ThreeOpFrag<shl_0_to_4, ptradd, /*op1IsRight=*/1> i64:$src0, i32:$src1, i64:$src2),
852-
(V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2)
853-
>;
852+
// (ptradd z, (shl x, y)) or (ptradd (shl x, y), z) -> ((x << y) + z)
853+
(ThreeOpFrag<shl_0_to_4, ptradd_commutative> i64:$src0, i32:$src1, i64:$src2),
854+
(V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2)> {
855+
let GISelShouldIgnore = 1;
856+
}
854857
} // End SubtargetPredicate = HasLshlAddU64Inst
855858

856859
def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>;
@@ -921,14 +924,16 @@ multiclass IMAD32_Pats <VOP3_Pseudo inst> {
921924

922925
// Handle cases where amdgpu-codegenprepare-mul24 made a mul24 instead of a normal mul.
923926
// We need to separate this because otherwise OtherPredicates would be overridden.
924-
class IMAD32_Mul24_Pats_Impl<VOP3_Pseudo inst, SDPatternOperator AddOp, bit mulIsRight = 0> : GCNPat <
925-
!if(mulIsRight, (i64 (AddOp i64:$src2, (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)))),
926-
(i64 (AddOp (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2))),
927-
(inst $src0, $src1, $src2, 0 /* clamp */)>;
927+
class IMAD32_Mul24_Pats_Impl<VOP3_Pseudo inst, SDPatternOperator AddOp> : GCNPat <
928+
(i64 (AddOp (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2)),
929+
(inst $src0, $src1, $src2, 0 /* clamp */)
930+
>;
928931

929932
multiclass IMAD32_Mul24_Pats<VOP3_Pseudo inst> {
930933
def : IMAD32_Mul24_Pats_Impl<inst, add>;
931-
def : IMAD32_Mul24_Pats_Impl<inst, ptradd, /*mulIsRight=*/1>;
934+
def : IMAD32_Mul24_Pats_Impl<inst, ptradd_commutative> {
935+
let GISelShouldIgnore = 1;
936+
}
932937
}
933938

934939
// exclude pre-GFX9 where it was slow

llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll

Lines changed: 54 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -686,17 +686,33 @@ define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32_offset_8388607(ptr %vbase, i3
686686

687687
; Cannot push the shift into 32-bits, and cannot match.
688688
define amdgpu_ps float @flat_load_saddr_f32_natural_addressing(ptr inreg %sbase, ptr %voffset.ptr) {
689-
; GFX1250-LABEL: flat_load_saddr_f32_natural_addressing:
690-
; GFX1250: ; %bb.0:
691-
; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
692-
; GFX1250-NEXT: s_wait_xcnt 0x0
693-
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
694-
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
695-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
696-
; GFX1250-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, s[2:3]
697-
; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
698-
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
699-
; GFX1250-NEXT: ; return to shader part epilog
689+
; GFX1250-SDAG-LABEL: flat_load_saddr_f32_natural_addressing:
690+
; GFX1250-SDAG: ; %bb.0:
691+
; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1]
692+
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
693+
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
694+
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
695+
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
696+
; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, s[2:3]
697+
; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1]
698+
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
699+
; GFX1250-SDAG-NEXT: ; return to shader part epilog
700+
;
701+
; GFX1250-GISEL-LABEL: flat_load_saddr_f32_natural_addressing:
702+
; GFX1250-GISEL: ; %bb.0:
703+
; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1]
704+
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
705+
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0
706+
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
707+
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
708+
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
709+
; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
710+
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
711+
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
712+
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
713+
; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1]
714+
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
715+
; GFX1250-GISEL-NEXT: ; return to shader part epilog
700716
%voffset = load i32, ptr %voffset.ptr
701717
%zext.offset = zext i32 %voffset to i64
702718
%gep = getelementptr inbounds float, ptr %sbase, i64 %zext.offset
@@ -758,17 +774,33 @@ define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range_imm_offset(ptr inreg
758774

759775
; Range is 1 beyond the limit where we can move the shift into 32-bits.
760776
define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range_too_large(ptr inreg %sbase, ptr %voffset.ptr) {
761-
; GFX1250-LABEL: flat_load_f32_saddr_zext_vgpr_range_too_large:
762-
; GFX1250: ; %bb.0:
763-
; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
764-
; GFX1250-NEXT: s_wait_xcnt 0x0
765-
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
766-
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
767-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
768-
; GFX1250-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, s[2:3]
769-
; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
770-
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
771-
; GFX1250-NEXT: ; return to shader part epilog
777+
; GFX1250-SDAG-LABEL: flat_load_f32_saddr_zext_vgpr_range_too_large:
778+
; GFX1250-SDAG: ; %bb.0:
779+
; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1]
780+
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
781+
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
782+
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
783+
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
784+
; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, s[2:3]
785+
; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1]
786+
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
787+
; GFX1250-SDAG-NEXT: ; return to shader part epilog
788+
;
789+
; GFX1250-GISEL-LABEL: flat_load_f32_saddr_zext_vgpr_range_too_large:
790+
; GFX1250-GISEL: ; %bb.0:
791+
; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1]
792+
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
793+
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0
794+
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
795+
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
796+
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
797+
; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
798+
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
799+
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
800+
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
801+
; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1]
802+
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
803+
; GFX1250-GISEL-NEXT: ; return to shader part epilog
772804
%voffset = load i32, ptr %voffset.ptr, !range !1, !noundef !{}
773805
%zext.offset = zext i32 %voffset to i64
774806
%gep = getelementptr inbounds float, ptr %sbase, i64 %zext.offset

0 commit comments

Comments
 (0)