From dd691618fd7a431ab1edb2cd1c7de8d71d019355 Mon Sep 17 00:00:00 2001
From: shore <372660931@qq.com>
Date: Fri, 18 Jul 2025 10:35:30 +0800
Subject: [PATCH 1/3] Add FABS to canCreateUndefOrPoison

fabs only clears the sign bit, so ISD::FABS cannot create undef or
poison that was not already present in its operand. Marking it as such
lets the DAG combiner fold freeze(fabs(x)) to fabs(freeze(x)), which in
turn allows fabs to fold into source modifiers on AMDGPU instead of
surviving as a separate v_and_b32 with 0x7fffffff.
---
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |   1 +
 llvm/test/CodeGen/AMDGPU/freeze.ll            | 240 +++++++++++++++++-
 2 files changed, 239 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 682d93d0abf3f..56c8bb441ddf8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5569,6 +5569,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
   case ISD::BUILD_VECTOR:
   case ISD::BUILD_PAIR:
   case ISD::SPLAT_VECTOR:
+  case ISD::FABS:
     return false;
 
   case ISD::ABS:
diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll
index ac438062ae208..0476bc47e2366 100644
--- a/llvm/test/CodeGen/AMDGPU/freeze.ll
+++ b/llvm/test/CodeGen/AMDGPU/freeze.ll
@@ -14592,5 +14592,241 @@ define void @freeze_v4i1_vcc(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
   store <4 x i1> %freeze, ptr addrspace(1) %ptrb
   ret void
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX8-SDAG: {{.*}}
+
+define double @tgt(float %a, double %b, double %c) {
+; GFX6-SDAG-LABEL: tgt:
+; GFX6-SDAG: ; %bb.0:
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v5, v0
+; GFX6-SDAG-NEXT: v_add_f64 v[0:1], |v[4:5]|, v[1:2]
+; GFX6-SDAG-NEXT: v_add_f64 v[2:3], |v[4:5]|, v[3:4]
+; GFX6-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: tgt:
+; GFX6-GISEL: ; %bb.0:
+; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT: v_and_b32_e32 v5, 0x7fffffff, v0
+; GFX6-GISEL-NEXT: v_add_f64 v[0:1], v[4:5], v[1:2]
+; GFX6-GISEL-NEXT: v_add_f64 v[2:3], v[4:5], v[3:4]
+; GFX6-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: tgt:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_mov_b32_e32 v5, v0
+; GFX7-SDAG-NEXT: v_add_f64 v[0:1], |v[4:5]|, v[1:2]
+; GFX7-SDAG-NEXT: v_add_f64 v[2:3], |v[4:5]|, v[3:4]
+; GFX7-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: tgt:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_and_b32_e32 v5, 0x7fffffff, v0
+; GFX7-GISEL-NEXT: v_add_f64 v[0:1], v[4:5], v[1:2]
+; GFX7-GISEL-NEXT: v_add_f64 v[2:3], v[4:5], v[3:4]
+; GFX7-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: tgt:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-SDAG-NEXT: v_add_f64 v[0:1], |v[4:5]|, v[1:2]
+; GFX8-SDAG-NEXT: v_add_f64 v[2:3], |v[4:5]|, v[3:4]
+; GFX8-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: tgt:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_and_b32_e32 v5, 0x7fffffff, v0
+; GFX8-GISEL-NEXT: v_add_f64 v[0:1], v[4:5], v[1:2]
+; GFX8-GISEL-NEXT: v_add_f64 v[2:3], v[4:5], v[3:4]
+; GFX8-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: tgt:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_and_b32_e32 v5, 0x7fffffff, v0
+; GFX9-GISEL-NEXT: v_add_f64 v[0:1], v[4:5], v[1:2]
+; GFX9-GISEL-NEXT: v_add_f64 v[2:3], v[4:5], v[3:4]
+; GFX9-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: tgt:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-SDAG-NEXT: v_add_f64 v[0:1], |v[4:5]|, v[1:2]
+; GFX10-SDAG-NEXT: v_add_f64 v[2:3], |v[4:5]|, v[3:4]
+; GFX10-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: tgt:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_and_b32_e32 v5, 0x7fffffff, v0
+; GFX10-GISEL-NEXT: v_add_f64 v[0:1], v[4:5], v[1:2]
+; GFX10-GISEL-NEXT: v_add_f64 v[2:3], v[4:5], v[3:4]
+; GFX10-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: tgt:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v5, v0
+; GFX11-SDAG-NEXT: v_add_f64 v[0:1], |v[4:5]|, v[1:2]
+; GFX11-SDAG-NEXT: v_add_f64 v[2:3], |v[4:5]|, v[3:4]
+; GFX11-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: tgt:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_and_b32_e32 v5, 0x7fffffff, v0
+; GFX11-GISEL-NEXT: v_add_f64 v[0:1], v[4:5], v[1:2]
+; GFX11-GISEL-NEXT: v_add_f64 v[2:3], v[4:5], v[3:4]
+; GFX11-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+  %pv = insertelement <2 x float> poison, float %a, i32 1
+  %d = bitcast <2 x float> %pv to double
+  %r = call double @llvm.fabs.f64(double %d)
+  %fr = freeze double %r
+  %add1 = fadd double %fr, %b
+  %add2 = fadd double %fr, %c
+  %add = fadd double %add1, %add2
+  ret double %add
+}
+
+define <4 x float> @src(<4 x float> %A, <4 x float> %B) {
+; GFX6-SDAG-LABEL: src:
+; GFX6-SDAG: ; %bb.0:
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX6-SDAG-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX6-SDAG-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX6-SDAG-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
+; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: src:
+; GFX6-GISEL: ; %bb.0:
+; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX6-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX6-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX6-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
+; GFX6-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX6-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX6-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX6-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
+; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: src:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX7-SDAG-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: src:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX7-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX7-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
+; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX7-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX7-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: src:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX8-SDAG-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX8-SDAG-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX8-SDAG-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: src:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX8-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX8-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX8-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
+; GFX8-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX8-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX8-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX8-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: src:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX9-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX9-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: src:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX10-SDAG-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX10-SDAG-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX10-SDAG-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: src:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX10-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
+; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX10-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: src:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX11-SDAG-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX11-SDAG-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: src:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX11-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX11-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX11-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX11-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+  %A0 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %A)
+  %F1 = freeze <4 x float> %A0
+  %A1 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %F1)
+  ret <4 x float> %A1
+}

From e3d844b2ff071c4bc916fd7611dab66ae0babaf8 Mon Sep 17 00:00:00 2001
From: shore <372660931@qq.com>
Date: Fri, 18 Jul 2025 12:13:08 +0800
Subject: [PATCH 2/3] Fix test case names

---
 llvm/test/CodeGen/AMDGPU/freeze.ll | 48 +++++++++++++++---------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll
index 0476bc47e2366..9a347d71bf430 100644
--- a/llvm/test/CodeGen/AMDGPU/freeze.ll
+++ b/llvm/test/CodeGen/AMDGPU/freeze.ll
@@ -14593,8 +14593,8 @@ define void @freeze_v4i1_vcc(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
   ret void
 }
 
-define double @tgt(float %a, double %b, double %c) {
-; GFX6-SDAG-LABEL: tgt:
+define double @freeze_fabs_double(float %a, double %b, double %c) {
+; GFX6-SDAG-LABEL: freeze_fabs_double:
 ; GFX6-SDAG: ; %bb.0:
 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT: v_mov_b32_e32 v5, v0
@@ -14603,7 +14603,7 @@ define double @tgt(float %a, double %b, double %c) {
 ; GFX6-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
 ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX6-GISEL-LABEL: tgt:
+; GFX6-GISEL-LABEL: freeze_fabs_double:
 ; GFX6-GISEL: ; %bb.0:
 ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT: v_and_b32_e32 v5, 0x7fffffff, v0
@@ -14612,7 +14612,7 @@ define double @tgt(float %a, double %b, double %c) {
 ; GFX6-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
 ; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX7-SDAG-LABEL: tgt:
+; GFX7-SDAG-LABEL: freeze_fabs_double:
 ; GFX7-SDAG: ; %bb.0:
 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT: v_mov_b32_e32 v5, v0
@@ -14621,7 +14621,7 @@ define double @tgt(float %a, double %b, double %c) {
 ; GFX7-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX7-GISEL-LABEL: tgt:
+; GFX7-GISEL-LABEL: freeze_fabs_double:
 ; GFX7-GISEL: ; %bb.0:
 ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT: v_and_b32_e32 v5, 0x7fffffff, v0
@@ -14630,7 +14630,7 @@ define double @tgt(float %a, double %b, double %c) {
 ; GFX7-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX8-SDAG-LABEL: tgt:
+; GFX8-SDAG-LABEL: freeze_fabs_double:
 ; GFX8-SDAG: ; %bb.0:
 ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT: v_mov_b32_e32 v5, v0
@@ -14639,7 +14639,7 @@ define double @tgt(float %a, double %b, double %c) {
 ; GFX8-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX8-GISEL-LABEL: tgt:
+; GFX8-GISEL-LABEL: freeze_fabs_double:
 ; GFX8-GISEL: ; %bb.0:
 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT: v_and_b32_e32 v5, 0x7fffffff, v0
@@ -14648,7 +14648,7 @@ define double @tgt(float %a, double %b, double %c) {
 ; GFX8-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: tgt:
+; GFX9-GISEL-LABEL: freeze_fabs_double:
 ; GFX9-GISEL: ; %bb.0:
 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT: v_and_b32_e32 v5, 0x7fffffff, v0
@@ -14657,7 +14657,7 @@ define double @tgt(float %a, double %b, double %c) {
 ; GFX9-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: tgt:
+; GFX10-SDAG-LABEL: freeze_fabs_double:
 ; GFX10-SDAG: ; %bb.0:
 ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v5, v0
@@ -14666,7 +14666,7 @@ define double @tgt(float %a, double %b, double %c) {
 ; GFX10-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: tgt:
+; GFX10-GISEL-LABEL: freeze_fabs_double:
 ; GFX10-GISEL: ; %bb.0:
 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT: v_and_b32_e32 v5, 0x7fffffff, v0
@@ -14675,7 +14675,7 @@ define double @tgt(float %a, double %b, double %c) {
 ; GFX10-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: tgt:
+; GFX11-SDAG-LABEL: freeze_fabs_double:
 ; GFX11-SDAG: ; %bb.0:
 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v5, v0
@@ -14684,7 +14684,7 @@ define double @tgt(float %a, double %b, double %c) {
 ; GFX11-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: tgt:
+; GFX11-GISEL-LABEL: freeze_fabs_double:
 ; GFX11-GISEL: ; %bb.0:
 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT: v_and_b32_e32 v5, 0x7fffffff, v0
@@ -14702,8 +14702,8 @@ define double @tgt(float %a, double %b, double %c) {
   ret double %add
 }
 
-define <4 x float> @src(<4 x float> %A, <4 x float> %B) {
-; GFX6-SDAG-LABEL: src:
+define <4 x float> @freeze_fabs_v4float(<4 x float> %A, <4 x float> %B) {
+; GFX6-SDAG-LABEL: freeze_fabs_v4float:
 ; GFX6-SDAG: ; %bb.0:
 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
@@ -14712,7 +14712,7 @@ define <4 x float> @src(<4 x float> %A, <4 x float> %B) {
 ; GFX6-SDAG-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
 ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX6-GISEL-LABEL: src:
+; GFX6-GISEL-LABEL: freeze_fabs_v4float:
 ; GFX6-GISEL: ; %bb.0:
 ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
@@ -14725,7 +14725,7 @@ define <4 x float> @src(<4 x float> %A, <4 x float> %B) {
 ; GFX6-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
 ; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX7-SDAG-LABEL: src:
+; GFX7-SDAG-LABEL: freeze_fabs_v4float:
 ; GFX7-SDAG: ; %bb.0:
 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
@@ -14734,7 +14734,7 @@ define <4 x float> @src(<4 x float> %A, <4 x float> %B) {
 ; GFX7-SDAG-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX7-GISEL-LABEL: src:
+; GFX7-GISEL-LABEL: freeze_fabs_v4float:
 ; GFX7-GISEL: ; %bb.0:
 ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
@@ -14747,7 +14747,7 @@ define <4 x float> @src(<4 x float> %A, <4 x float> %B) {
 ; GFX7-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX8-SDAG-LABEL: src:
+; GFX8-SDAG-LABEL: freeze_fabs_v4float:
 ; GFX8-SDAG: ; %bb.0:
 ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
@@ -14756,7 +14756,7 @@ define <4 x float> @src(<4 x float> %A, <4 x float> %B) {
 ; GFX8-SDAG-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX8-GISEL-LABEL: src:
+; GFX8-GISEL-LABEL: freeze_fabs_v4float:
 ; GFX8-GISEL: ; %bb.0:
 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
@@ -14769,7 +14769,7 @@ define <4 x float> @src(<4 x float> %A, <4 x float> %B) {
 ; GFX8-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: src:
+; GFX9-GISEL-LABEL: freeze_fabs_v4float:
 ; GFX9-GISEL: ; %bb.0:
 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
@@ -14782,7 +14782,7 @@ define <4 x float> @src(<4 x float> %A, <4 x float> %B) {
 ; GFX9-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: src:
+; GFX10-SDAG-LABEL: freeze_fabs_v4float:
 ; GFX10-SDAG: ; %bb.0:
 ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
@@ -14791,7 +14791,7 @@ define <4 x float> @src(<4 x float> %A, <4 x float> %B) {
 ; GFX10-SDAG-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: src:
+; GFX10-GISEL-LABEL: freeze_fabs_v4float:
 ; GFX10-GISEL: ; %bb.0:
 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
@@ -14804,7 +14804,7 @@ define <4 x float> @src(<4 x float> %A, <4 x float> %B) {
 ; GFX10-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: src:
+; GFX11-SDAG-LABEL: freeze_fabs_v4float:
 ; GFX11-SDAG: ; %bb.0:
 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
@@ -14813,7 +14813,7 @@ define <4 x float> @src(<4 x float> %A, <4 x float> %B) {
 ; GFX11-SDAG-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: src:
+; GFX11-GISEL-LABEL: freeze_fabs_v4float:
 ; GFX11-GISEL: ; %bb.0:
 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0

From d64a98a0e25952ba6746b9ff12e7b28af4a5da63 Mon Sep 17 00:00:00 2001
From: shore <372660931@qq.com>
Date: Fri, 18 Jul 2025 13:45:47 +0800
Subject: [PATCH 3/3] Fix test cases changed by #148796

---
 llvm/test/CodeGen/AMDGPU/fmaximum3.ll      |  43 +--
 llvm/test/CodeGen/AMDGPU/fminimum3.ll      |  43 +--
 llvm/test/CodeGen/AMDGPU/fnearbyint.ll     |  21 +-
 llvm/test/CodeGen/AMDGPU/fract-match.ll    |  57 ++-
 llvm/test/CodeGen/AMDGPU/llvm.frexp.ll     |  57 +--
 llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll  |   2 +-
 llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll | 381 ++++++++++-----------
 llvm/test/CodeGen/AMDGPU/lround.ll         |  51 ++-
 llvm/test/CodeGen/AMDGPU/roundeven.ll      |  37 +-
 9 files changed, 298 insertions(+), 394 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index e5fe4160a4b05..069a47ec97bfe 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -3272,10 +3272,9 @@ define double @v_fmaximum3_f64_fabs0(double %a, double %b, double %c) {
 ; GFX9-LABEL: v_fmaximum3_f64_fabs0:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
-; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
+; GFX9-NEXT: v_max_f64 v[6:7], |v[0:1]|, v[2:3]
 ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
-; GFX9-NEXT:
v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, v[2:3] ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc @@ -3307,10 +3306,9 @@ define double @v_fmaximum3_f64_fabs1(double %a, double %b, double %c) { ; GFX9-LABEL: v_fmaximum3_f64_fabs1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 -; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], |v[2:3]| ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[2:3]| ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc @@ -3345,12 +3343,11 @@ define double @v_fmaximum3_f64_fabs2(double %a, double %b, double %c) { ; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc -; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], |v[4:5]| +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]| ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc @@ -3377,17 +3374,14 @@ define double @v_fmaximum3_f64_fabs_all(double %a, double %b, double %c) { ; GFX9-LABEL: v_fmaximum3_f64_fabs_all: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 -; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX9-NEXT: v_max_f64 v[6:7], |v[0:1]|, |v[2:3]| ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[2:3]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc -; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], |v[4:5]| +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]| ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc @@ -3452,17 +3446,14 @@ define double @v_fmaximum3_f64_fneg_fabs_all(double %a, double %b, double %c) { ; GFX9-LABEL: v_fmaximum3_f64_fneg_fabs_all: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 -; GFX9-NEXT: v_max_f64 v[6:7], -v[0:1], -v[2:3] +; GFX9-NEXT: v_max_f64 v[6:7], -|v[0:1]|, -|v[2:3]| ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[2:3] -; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -|v[0:1]|, -|v[2:3]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc -; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], -v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5] +; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], -|v[4:5]| +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -|v[4:5]| ; GFX9-NEXT: s_nop 1 ; 
GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll index 6873c617c64a1..d8746b58b16b7 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll @@ -3272,10 +3272,9 @@ define double @v_fminimum3_f64_fabs0(double %a, double %b, double %c) { ; GFX9-LABEL: v_fminimum3_f64_fabs0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX9-NEXT: v_min_f64 v[6:7], |v[0:1]|, v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, v[2:3] ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc @@ -3307,10 +3306,9 @@ define double @v_fminimum3_f64_fabs1(double %a, double %b, double %c) { ; GFX9-LABEL: v_fminimum3_f64_fabs1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 -; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], |v[2:3]| ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[2:3]| ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc @@ -3345,12 +3343,11 @@ define double @v_fminimum3_f64_fabs2(double %a, double %b, double %c) { ; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc -; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], |v[4:5]| +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]| ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc @@ -3377,17 +3374,14 @@ define double @v_fminimum3_f64_fabs_all(double %a, double %b, double %c) { ; GFX9-LABEL: v_fminimum3_f64_fabs_all: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 -; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX9-NEXT: v_min_f64 v[6:7], |v[0:1]|, |v[2:3]| ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[2:3]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc -; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], |v[4:5]| +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]| ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc @@ -3452,17 +3446,14 @@ define double @v_fminimum3_f64_fneg_fabs_all(double %a, double %b, double %c) { ; GFX9-LABEL: v_fminimum3_f64_fneg_fabs_all: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, 
v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 -; GFX9-NEXT: v_min_f64 v[6:7], -v[0:1], -v[2:3] +; GFX9-NEXT: v_min_f64 v[6:7], -|v[0:1]|, -|v[2:3]| ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[2:3] -; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -|v[0:1]|, -|v[2:3]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc -; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], -v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5] +; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], -|v[4:5]| +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -|v[4:5]| ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll index 193cee967f3c4..e9fd6119d0c36 100644 --- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll +++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll @@ -223,9 +223,8 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) { ; SI-NEXT: v_bfi_b32 v1, s8, v1, v6 ; SI-NEXT: v_mov_b32_e32 v7, s2 ; SI-NEXT: v_add_f64 v[4:5], s[2:3], v[0:1] -; SI-NEXT: s_bitset0_b32 s3, 31 ; SI-NEXT: v_add_f64 v[0:1], v[4:5], -v[0:1] -; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[2:3], v[2:3] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -285,16 +284,14 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> % ; SI-NEXT: v_mov_b32_e32 v9, s5 ; SI-NEXT: v_mov_b32_e32 v10, s4 ; SI-NEXT: v_add_f64 v[2:3], s[6:7], v[0:1] -; SI-NEXT: s_bitset0_b32 s7, 31 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], -v[0:1] ; SI-NEXT: v_bfi_b32 v1, s10, v6, v9 -; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[4:5] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[4:5] ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc ; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[0:1] -; SI-NEXT: s_bitset0_b32 s5, 31 ; SI-NEXT: v_add_f64 v[0:1], v[6:7], -v[0:1] -; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[4:5] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[4:5] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -368,30 +365,26 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> % ; SI-NEXT: v_mov_b32_e32 v14, s5 ; SI-NEXT: v_mov_b32_e32 v15, s4 ; SI-NEXT: v_add_f64 v[0:1], s[2:3], v[4:5] -; SI-NEXT: s_bitset0_b32 s3, 31 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5] ; SI-NEXT: v_bfi_b32 v5, s14, v10, v7 -; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[2:3], v[8:9] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[8:9] ; SI-NEXT: v_cndmask_b32_e32 v3, v1, v2, vcc ; SI-NEXT: v_cndmask_b32_e32 v2, v0, v6, vcc ; SI-NEXT: v_add_f64 v[0:1], s[0:1], v[4:5] -; SI-NEXT: s_bitset0_b32 s1, 31 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5] ; SI-NEXT: v_bfi_b32 v5, s14, v10, v12 -; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[0:1], v[8:9] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[0:1]|, v[8:9] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc ; SI-NEXT: v_add_f64 v[6:7], s[6:7], v[4:5] -; SI-NEXT: s_bitset0_b32 s7, 31 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], -v[4:5] ; SI-NEXT: v_bfi_b32 v5, s14, v10, v14 -; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[8:9] +; SI-NEXT: v_cmp_gt_f64_e64 
vcc, |s[6:7]|, v[8:9] ; SI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc ; SI-NEXT: v_cndmask_b32_e32 v6, v6, v13, vcc ; SI-NEXT: v_add_f64 v[10:11], s[4:5], v[4:5] -; SI-NEXT: s_bitset0_b32 s5, 31 ; SI-NEXT: v_add_f64 v[4:5], v[10:11], -v[4:5] -; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[8:9] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[8:9] ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v14, vcc ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v15, vcc ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll index f50944cc8a5b1..d97ea042b50fc 100644 --- a/llvm/test/CodeGen/AMDGPU/fract-match.ll +++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll @@ -2356,11 +2356,10 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) writeonly capture ; GFX6-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] ; GFX6-NEXT: v_min_f64 v[6:7], v[6:7], s[8:9] ; GFX6-NEXT: s_mov_b32 s8, 0 -; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 ; GFX6-NEXT: s_mov_b32 s9, 0x7ff00000 +; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc -; GFX6-NEXT: v_cmp_neq_f64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_cmp_neq_f64_e64 vcc, |v[0:1]|, s[8:9] ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 @@ -2375,18 +2374,17 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) writeonly capture ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, 0 -; GFX7-NEXT: v_floor_f64_e32 v[4:5], v[0:1] -; GFX7-NEXT: v_fract_f64_e32 v[6:7], v[0:1] -; GFX7-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 ; GFX7-NEXT: s_mov_b32 s5, 0x7ff00000 -; GFX7-NEXT: v_cmp_neq_f64_e32 vcc, s[4:5], v[0:1] +; GFX7-NEXT: v_fract_f64_e32 v[4:5], v[0:1] +; GFX7-NEXT: v_cmp_neq_f64_e64 vcc, |v[0:1]|, s[4:5] +; GFX7-NEXT: v_floor_f64_e32 v[6:7], v[0:1] ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_store_dwordx2 v[4:5], v[2:3], s[4:7], 0 addr64 -; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc +; GFX7-NEXT: buffer_store_dwordx2 v[6:7], v[2:3], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2394,27 +2392,25 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) writeonly capture ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s4, 0 -; GFX8-NEXT: v_floor_f64_e32 v[4:5], v[0:1] -; GFX8-NEXT: v_fract_f64_e32 v[6:7], v[0:1] -; GFX8-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 ; GFX8-NEXT: s_mov_b32 s5, 0x7ff00000 -; GFX8-NEXT: v_cmp_neq_f64_e32 vcc, s[4:5], v[0:1] -; GFX8-NEXT: global_store_dwordx2 v[2:3], v[4:5], off -; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc +; GFX8-NEXT: v_fract_f64_e32 v[4:5], v[0:1] +; GFX8-NEXT: v_cmp_neq_f64_e64 vcc, |v[0:1]|, s[4:5] +; GFX8-NEXT: v_floor_f64_e32 v[6:7], v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc +; GFX8-NEXT: global_store_dwordx2 v[2:3], v[6:7], off ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: safe_math_fract_f64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX11-NEXT: v_floor_f64_e32 v[4:5], v[0:1] -; GFX11-NEXT: v_fract_f64_e32 v[6:7], v[0:1] -; GFX11-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_neq_f64_e32 vcc_lo, 0x7ff00000, v[0:1] -; GFX11-NEXT: global_store_b64 v[2:3], v[4:5], off -; GFX11-NEXT: v_dual_cndmask_b32 v0, 0, v6 :: v_dual_cndmask_b32 v1, 0, v7 +; GFX11-NEXT: v_fract_f64_e32 v[4:5], v[0:1] +; GFX11-NEXT: v_cmp_neq_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| +; GFX11-NEXT: v_floor_f64_e32 v[6:7], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5 +; GFX11-NEXT: global_store_b64 v[2:3], v[6:7], off ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: safe_math_fract_f64: @@ -2424,14 +2420,13 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) writeonly capture ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_floor_f64_e32 v[4:5], v[0:1] -; GFX12-NEXT: v_fract_f64_e32 v[6:7], v[0:1] -; GFX12-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmp_neq_f64_e32 vcc_lo, 0x7ff00000, v[0:1] -; GFX12-NEXT: global_store_b64 v[2:3], v[4:5], off +; GFX12-NEXT: v_fract_f64_e32 v[4:5], v[0:1] +; GFX12-NEXT: v_cmp_neq_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| +; GFX12-NEXT: v_floor_f64_e32 v[6:7], v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_dual_cndmask_b32 v0, 0, v6 :: v_dual_cndmask_b32 v1, 0, v7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5 +; GFX12-NEXT: global_store_b64 v[2:3], v[6:7], off ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call double @llvm.floor.f64(double %x) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll index 0bb973c0e5512..3a4bf1c81ed58 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll @@ -1759,13 +1759,11 @@ define double @test_frexp_f64_i32_only_use_fract(double %a) { ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-SDAG-NEXT: s_mov_b32 s4, 0 -; GFX6-SDAG-NEXT: v_and_b32_e32 v3, 0x7fffffff, v1 -; GFX6-SDAG-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-SDAG-NEXT: s_mov_b32 s5, 0x7ff00000 -; GFX6-SDAG-NEXT: v_frexp_mant_f64_e32 v[4:5], v[0:1] -; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX6-SDAG-NEXT: v_frexp_mant_f64_e32 v[2:3], v[0:1] +; GFX6-SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_frexp_f64_i32_only_use_fract: @@ -1961,24 +1959,20 @@ define { <2 x double>, <2 x i32> } @test_frexp_v2f64_v2i32(<2 x double> %a) { } define <2 x double> @test_frexp_v2f64_v2i32_only_use_fract(<2 x double> %a) { -; GFX6-SDAG-LABEL: test_frexp_v2f64_v2i32_only_use_fract: -; GFX6-SDAG: ; %bb.0: -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SDAG-NEXT: s_mov_b32 s4, 0 -; GFX6-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v1 -; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-SDAG-NEXT: s_mov_b32 s5, 0x7ff00000 -; GFX6-SDAG-NEXT: v_frexp_mant_f64_e32 v[6:7], v[0:1] -; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[4:5] -; GFX6-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v3 -; 
GFX6-SDAG-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX6-SDAG-NEXT: v_frexp_mant_f64_e32 v[6:7], v[2:3] -; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[4:5] -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: test_frexp_v2f64_v2i32_only_use_fract: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: s_mov_b32 s5, 0x7ff00000 +; GFX6-NEXT: v_frexp_mant_f64_e32 v[4:5], v[0:1] +; GFX6-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX6-NEXT: v_frexp_mant_f64_e32 v[4:5], v[2:3] +; GFX6-NEXT: v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_frexp_v2f64_v2i32_only_use_fract: ; GFX8: ; %bb.0: @@ -2011,21 +2005,6 @@ define <2 x double> @test_frexp_v2f64_v2i32_only_use_fract(<2 x double> %a) { ; GFX12-NEXT: v_frexp_mant_f64_e32 v[0:1], v[0:1] ; GFX12-NEXT: v_frexp_mant_f64_e32 v[2:3], v[2:3] ; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-GISEL-LABEL: test_frexp_v2f64_v2i32_only_use_fract: -; GFX6-GISEL: ; %bb.0: -; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX6-GISEL-NEXT: s_mov_b32 s5, 0x7ff00000 -; GFX6-GISEL-NEXT: v_frexp_mant_f64_e32 v[4:5], v[0:1] -; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] -; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX6-GISEL-NEXT: v_frexp_mant_f64_e32 v[4:5], v[2:3] -; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[4:5] -; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a) %result.0 = extractvalue { <2 x double>, <2 x i32> } %result, 0 ret <2 x double> %result.0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll index 53660ffffa691..c6cf6f64db1eb 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll @@ -8,7 +8,7 @@ ; SI-DAG: v_add_f64 ; SI-DAG: v_add_f64 -; SI-DAG: v_cmp_gt_f64_e32 +; SI-DAG: v_cmp_gt_f64_e64 ; SI: v_cndmask_b32 ; SI: v_cndmask_b32 ; SI: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index 2500af1ae109f..355f77acfd302 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -9,33 +9,32 @@ define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, double %x) #0 { ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s5, 0xfffff ; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s8, s3, 0xb0014 -; SI-NEXT: s_addk_i32 s8, 0xfc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; SI-NEXT: s_bfe_u32 s7, s3, 0xb0014 +; SI-NEXT: s_addk_i32 s7, 0xfc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s7 +; SI-NEXT: s_and_b32 s8, s3, 0x80000000 ; SI-NEXT: s_andn2_b64 s[4:5], s[2:3], s[4:5] -; SI-NEXT: s_and_b32 s9, s3, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s8, 0 +; SI-NEXT: s_cmp_lt_i32 s7, 0 ; SI-NEXT: 
s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s9, s5 -; SI-NEXT: s_cmp_gt_i32 s8, 51 +; SI-NEXT: s_cselect_b32 s5, s8, s5 +; SI-NEXT: s_cmp_gt_i32 s7, 51 ; SI-NEXT: s_cselect_b32 s8, s2, s4 ; SI-NEXT: s_cselect_b32 s9, s3, s5 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_add_f64 v[0:1], s[2:3], -v[0:1] ; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: s_and_b64 s[10:11], vcc, exec -; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5 +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: s_and_b64 s[10:11], s[10:11], exec +; SI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v1, s2, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_add_f64 v[0:1], s[8:9], v[0:1] +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -50,10 +49,9 @@ define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, double %x) #0 { ; CI-NEXT: v_trunc_f64_e32 v[0:1], s[2:3] ; CI-NEXT: s_mov_b32 s4, s0 ; CI-NEXT: v_add_f64 v[2:3], s[2:3], -v[0:1] -; CI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 -; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[2:3] +; CI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[2:3]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v2, s3 -; CI-NEXT: s_and_b64 s[2:3], vcc, exec +; CI-NEXT: s_and_b64 s[2:3], s[8:9], exec ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 ; CI-NEXT: v_mov_b32_e32 v3, s0 ; CI-NEXT: v_bfi_b32 v3, s5, v3, v2 @@ -78,12 +76,13 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_movk_i32 s4, 0xfc01 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s3, 0xfffff -; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfe_u32 v4, v3, 20, 11 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0xfffffc01, v4 +; SI-NEXT: v_add_i32_e32 v6, vcc, s4, v4 ; SI-NEXT: v_lshr_b64 v[4:5], s[2:3], v6 ; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v3 ; SI-NEXT: v_not_b32_e32 v5, v5 @@ -97,14 +96,13 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; SI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5] -; SI-NEXT: v_mov_b32_e32 v2, 0x3ff00000 -; SI-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 -; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[6:7] -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; SI-NEXT: v_bfi_b32 v3, s4, v2, v3 +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 +; SI-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc +; SI-NEXT: v_bfi_b32 v3, s2, v2, v3 ; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; @@ -118,14 +116,13 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; CI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 ; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: 
s_waitcnt vmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[2:3] ; CI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5] -; CI-NEXT: v_mov_b32_e32 v2, 0x3ff00000 -; CI-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 -; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[6:7] -; CI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 +; CI-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; CI-NEXT: v_bfi_b32 v3, s2, v2, v3 ; CI-NEXT: v_mov_b32_e32 v2, v1 ; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] @@ -164,37 +161,35 @@ define amdgpu_kernel void @round_v2f64(ptr addrspace(1) %out, <2 x double> %in) ; SI-NEXT: v_mov_b32_e32 v1, s13 ; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] -; SI-NEXT: s_brev_b32 s3, -2 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: s_bfe_u32 s4, s9, 0xb0014 -; SI-NEXT: s_add_i32 s10, s4, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s10 +; SI-NEXT: v_cmp_ge_f64_e64 s[14:15], |v[0:1]|, 0.5 +; SI-NEXT: s_brev_b32 s10, -2 +; SI-NEXT: s_and_b64 s[4:5], s[14:15], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: s_bfe_u32 s3, s9, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s3 ; SI-NEXT: s_andn2_b64 s[4:5], s[8:9], s[4:5] ; SI-NEXT: s_and_b32 s6, s9, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s10, 0 +; SI-NEXT: s_cmp_lt_i32 s3, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 ; SI-NEXT: s_cselect_b32 s5, s6, s5 -; SI-NEXT: s_cmp_gt_i32 s10, 51 +; SI-NEXT: s_cmp_gt_i32 s3, 51 ; SI-NEXT: s_cselect_b32 s4, s8, s4 ; SI-NEXT: s_cselect_b32 s5, s9, s5 ; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: v_mov_b32_e32 v3, s5 ; SI-NEXT: v_add_f64 v[2:3], s[8:9], -v[2:3] ; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 -; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[2:3] -; SI-NEXT: v_bfi_b32 v1, s3, v0, v1 -; SI-NEXT: s_and_b64 s[6:7], vcc, exec +; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[2:3]|, 0.5 +; SI-NEXT: v_bfi_b32 v1, s10, v0, v1 +; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec ; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 ; SI-NEXT: v_add_f64 v[2:3], s[12:13], v[0:1] -; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: v_mov_b32_e32 v4, s9 -; SI-NEXT: v_bfi_b32 v1, s3, v1, v4 +; SI-NEXT: v_bfi_b32 v1, s10, v1, v4 ; SI-NEXT: v_add_f64 v[0:1], s[4:5], v[0:1] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -213,16 +208,14 @@ define amdgpu_kernel void @round_v2f64(ptr addrspace(1) %out, <2 x double> %in) ; CI-NEXT: v_trunc_f64_e32 v[6:7], s[8:9] ; CI-NEXT: v_add_f64 v[4:5], s[10:11], -v[2:3] ; CI-NEXT: v_mov_b32_e32 v1, s11 -; CI-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 -; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[4:5] +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[4:5]|, 0.5 ; CI-NEXT: v_add_f64 v[4:5], s[8:9], -v[6:7] -; CI-NEXT: s_and_b64 s[4:5], vcc, exec -; CI-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 -; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[4:5] +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_mov_b32_e32 v8, s4 -; CI-NEXT: s_and_b64 s[4:5], vcc, exec +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[4:5]|, 0.5 ; CI-NEXT: v_bfi_b32 v1, s2, v8, v1 +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[2:3], v[2:3], 
v[0:1] ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -261,80 +254,76 @@ define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] +; SI-NEXT: v_cmp_ge_f64_e64 s[18:19], |v[0:1]|, 0.5 ; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: s_bfe_u32 s4, s9, 0xb0014 -; SI-NEXT: s_add_i32 s10, s4, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s10 +; SI-NEXT: s_and_b64 s[4:5], s[18:19], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: s_bfe_u32 s3, s9, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s3 ; SI-NEXT: s_andn2_b64 s[4:5], s[8:9], s[4:5] -; SI-NEXT: s_and_b32 s11, s9, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s10, 0 +; SI-NEXT: s_and_b32 s10, s9, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s11, s5 -; SI-NEXT: s_cmp_gt_i32 s10, 51 -; SI-NEXT: s_brev_b32 s3, -2 +; SI-NEXT: s_cselect_b32 s5, s10, s5 +; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_brev_b32 s18, -2 ; SI-NEXT: s_cselect_b32 s4, s8, s4 -; SI-NEXT: v_bfi_b32 v5, s3, v0, v1 +; SI-NEXT: v_bfi_b32 v5, s18, v0, v1 ; SI-NEXT: s_cselect_b32 s5, s9, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_add_f64 v[0:1], s[8:9], -v[0:1] ; SI-NEXT: v_add_f64 v[2:3], s[16:17], v[4:5] -; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] +; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5 ; SI-NEXT: v_mov_b32_e32 v6, s9 -; SI-NEXT: s_and_b64 s[10:11], vcc, exec -; SI-NEXT: s_cselect_b32 s8, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v5, s8 -; SI-NEXT: s_bfe_u32 s8, s15, 0xb0014 -; SI-NEXT: s_add_i32 s10, s8, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], s10 +; SI-NEXT: s_and_b64 s[10:11], s[10:11], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v5, s3 +; SI-NEXT: s_bfe_u32 s3, s15, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], s3 ; SI-NEXT: s_andn2_b64 s[8:9], s[14:15], s[8:9] -; SI-NEXT: s_and_b32 s11, s15, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s10, 0 +; SI-NEXT: s_and_b32 s10, s15, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 ; SI-NEXT: s_cselect_b32 s8, 0, s8 -; SI-NEXT: s_cselect_b32 s9, s11, s9 -; SI-NEXT: s_cmp_gt_i32 s10, 51 +; SI-NEXT: s_cselect_b32 s9, s10, s9 +; SI-NEXT: s_cmp_gt_i32 s3, 51 ; SI-NEXT: s_cselect_b32 s8, s14, s8 ; SI-NEXT: s_cselect_b32 s9, s15, s9 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_add_f64 v[0:1], s[14:15], -v[0:1] -; SI-NEXT: v_bfi_b32 v5, s3, v5, v6 -; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] +; SI-NEXT: v_bfi_b32 v5, s18, v5, v6 +; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5 ; SI-NEXT: v_add_f64 v[0:1], s[4:5], v[4:5] -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: s_bfe_u32 s4, s13, 0xb0014 -; SI-NEXT: s_add_i32 s10, s4, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s10 +; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v8, s3 +; SI-NEXT: s_bfe_u32 s3, s13, 0xb0014 +; SI-NEXT: 
s_addk_i32 s3, 0xfc01
+; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s3
 ; SI-NEXT: s_andn2_b64 s[4:5], s[12:13], s[4:5]
 ; SI-NEXT: s_and_b32 s6, s13, 0x80000000
-; SI-NEXT: s_cmp_lt_i32 s10, 0
+; SI-NEXT: s_cmp_lt_i32 s3, 0
 ; SI-NEXT: s_cselect_b32 s4, 0, s4
 ; SI-NEXT: s_cselect_b32 s5, s6, s5
-; SI-NEXT: s_cmp_gt_i32 s10, 51
+; SI-NEXT: s_cmp_gt_i32 s3, 51
 ; SI-NEXT: s_cselect_b32 s5, s13, s5
 ; SI-NEXT: s_cselect_b32 s4, s12, s4
 ; SI-NEXT: v_mov_b32_e32 v6, s5
 ; SI-NEXT: v_mov_b32_e32 v5, s4
 ; SI-NEXT: v_add_f64 v[6:7], s[12:13], -v[5:6]
 ; SI-NEXT: v_mov_b32_e32 v9, s15
-; SI-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7
-; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[6:7]
-; SI-NEXT: v_bfi_b32 v5, s3, v8, v9
-; SI-NEXT: s_and_b64 s[6:7], vcc, exec
-; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0
+; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[6:7]|, 0.5
+; SI-NEXT: v_bfi_b32 v5, s18, v8, v9
+; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec
+; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0
 ; SI-NEXT: v_add_f64 v[6:7], s[8:9], v[4:5]
-; SI-NEXT: v_mov_b32_e32 v5, s6
+; SI-NEXT: v_mov_b32_e32 v5, s3
 ; SI-NEXT: v_mov_b32_e32 v8, s13
-; SI-NEXT: v_bfi_b32 v5, s3, v5, v8
+; SI-NEXT: v_bfi_b32 v5, s18, v5, v8
 ; SI-NEXT: v_add_f64 v[4:5], s[4:5], v[4:5]
 ; SI-NEXT: s_mov_b32 s3, 0xf000
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -354,35 +343,31 @@ define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in)
 ; CI-NEXT: v_trunc_f64_e32 v[6:7], s[8:9]
 ; CI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1]
 ; CI-NEXT: v_mov_b32_e32 v5, s11
-; CI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
-; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[2:3]
+; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[2:3]|, 0.5
 ; CI-NEXT: v_add_f64 v[2:3], s[8:9], -v[6:7]
-; CI-NEXT: s_and_b64 s[4:5], vcc, exec
-; CI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
+; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec
 ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0
-; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[2:3]
 ; CI-NEXT: v_mov_b32_e32 v8, s4
+; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[2:3]|, 0.5
 ; CI-NEXT: v_bfi_b32 v5, s2, v8, v5
-; CI-NEXT: s_and_b64 s[4:5], vcc, exec
 ; CI-NEXT: v_trunc_f64_e32 v[8:9], s[14:15]
-; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0
+; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec
 ; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5]
+; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0
+; CI-NEXT: v_add_f64 v[0:1], s[14:15], -v[8:9]
 ; CI-NEXT: v_mov_b32_e32 v5, s4
 ; CI-NEXT: v_mov_b32_e32 v10, s9
-; CI-NEXT: v_add_f64 v[0:1], s[14:15], -v[8:9]
 ; CI-NEXT: v_bfi_b32 v5, s2, v5, v10
+; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[0:1]|, 0.5
 ; CI-NEXT: v_trunc_f64_e32 v[10:11], s[12:13]
-; CI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
-; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1]
 ; CI-NEXT: v_add_f64 v[0:1], v[6:7], v[4:5]
+; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec
 ; CI-NEXT: v_add_f64 v[6:7], s[12:13], -v[10:11]
-; CI-NEXT: s_and_b64 s[4:5], vcc, exec
-; CI-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7
-; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[6:7]
 ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0
 ; CI-NEXT: v_mov_b32_e32 v5, s4
+; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[6:7]|, 0.5
 ; CI-NEXT: v_mov_b32_e32 v12, s15
-; CI-NEXT: s_and_b64 s[4:5], vcc, exec
+; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec
 ; CI-NEXT: v_bfi_b32 v5, s2, v5, v12
 ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0
 ; CI-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
@@ -423,10 +408,9 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in)
 ; SI-NEXT: v_mov_b32_e32 v1, s25
 ; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1]
 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
-; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1]
+; SI-NEXT: v_cmp_ge_f64_e64 s[26:27], |v[0:1]|, 0.5
 ; SI-NEXT: v_mov_b32_e32 v1, s11
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_and_b64 s[4:5], s[26:27], exec
 ; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0
 ; SI-NEXT: v_mov_b32_e32 v0, s4
 ; SI-NEXT: s_bfe_u32 s4, s9, 0xb0014
@@ -446,10 +430,9 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in)
 ; SI-NEXT: v_mov_b32_e32 v1, s5
 ; SI-NEXT: v_add_f64 v[0:1], s[8:9], -v[0:1]
 ; SI-NEXT: v_mov_b32_e32 v5, s9
-; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
-; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1]
+; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5
 ; SI-NEXT: v_add_f64 v[2:3], s[24:25], v[8:9]
-; SI-NEXT: s_and_b64 s[10:11], vcc, exec
+; SI-NEXT: s_and_b64 s[10:11], s[10:11], exec
 ; SI-NEXT: s_cselect_b32 s8, 0x3ff00000, 0
 ; SI-NEXT: v_mov_b32_e32 v4, s8
 ; SI-NEXT: s_bfe_u32 s8, s15, 0xb0014
@@ -467,10 +450,9 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in)
 ; SI-NEXT: v_mov_b32_e32 v1, s9
 ; SI-NEXT: v_add_f64 v[0:1], s[14:15], -v[0:1]
 ; SI-NEXT: v_bfi_b32 v9, s3, v4, v5
-; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
-; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1]
+; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5
 ; SI-NEXT: v_add_f64 v[0:1], s[4:5], v[8:9]
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec
 ; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0
 ; SI-NEXT: v_mov_b32_e32 v6, s4
 ; SI-NEXT: s_bfe_u32 s4, s13, 0xb0014
@@ -488,11 +470,10 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in)
 ; SI-NEXT: v_mov_b32_e32 v5, s5
 ; SI-NEXT: v_add_f64 v[4:5], s[12:13], -v[4:5]
 ; SI-NEXT: v_mov_b32_e32 v7, s15
-; SI-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5
-; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[4:5]
+; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[4:5]|, 0.5
 ; SI-NEXT: v_bfi_b32 v9, s3, v6, v7
 ; SI-NEXT: v_add_f64 v[6:7], s[8:9], v[8:9]
-; SI-NEXT: s_and_b64 s[8:9], vcc, exec
+; SI-NEXT: s_and_b64 s[8:9], s[10:11], exec
 ; SI-NEXT: s_cselect_b32 s8, 0x3ff00000, 0
 ; SI-NEXT: v_mov_b32_e32 v9, s8
 ; SI-NEXT: s_bfe_u32 s8, s19, 0xb0014
@@ -510,11 +491,10 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in)
 ; SI-NEXT: v_mov_b32_e32 v5, s9
 ; SI-NEXT: v_add_f64 v[4:5], s[18:19], -v[4:5]
 ; SI-NEXT: v_mov_b32_e32 v10, s13
-; SI-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5
-; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[4:5]
+; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[4:5]|, 0.5
 ; SI-NEXT: v_bfi_b32 v9, s3, v9, v10
 ; SI-NEXT: v_add_f64 v[4:5], s[4:5], v[8:9]
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec
 ; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0
 ; SI-NEXT: v_mov_b32_e32 v12, s4
 ; SI-NEXT: s_bfe_u32 s4, s17, 0xb0014
@@ -532,11 +512,10 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in)
 ; SI-NEXT: v_mov_b32_e32 v9, s4
 ; SI-NEXT: v_add_f64 v[10:11], s[16:17], -v[9:10]
 ; SI-NEXT: v_mov_b32_e32 v13, s19
-; SI-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11
-; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[10:11]
+; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[10:11]|, 0.5
 ; SI-NEXT: v_bfi_b32 v9, s3, v12, v13
 ; SI-NEXT: v_add_f64 v[12:13], s[8:9], v[8:9]
-; SI-NEXT: s_and_b64 s[8:9], vcc, exec
+; SI-NEXT: s_and_b64 s[8:9], s[10:11], exec
 ; SI-NEXT: s_cselect_b32 s8, 0x3ff00000, 0
 ; SI-NEXT: v_mov_b32_e32 v14, s8
 ; SI-NEXT: s_bfe_u32 s8, s23, 0xb0014
@@ -554,11 +533,10 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in)
 ; SI-NEXT: v_mov_b32_e32 v9, s8
 ; SI-NEXT: v_add_f64 v[10:11], s[22:23], -v[9:10]
 ; SI-NEXT: v_mov_b32_e32 v15, s17
-; SI-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11
-; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[10:11]
+; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[10:11]|, 0.5
 ; SI-NEXT: v_bfi_b32 v9, s3, v14, v15
 ; SI-NEXT: v_add_f64 v[10:11], s[4:5], v[8:9]
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec
 ; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0
 ; SI-NEXT: v_mov_b32_e32 v9, s4
 ; SI-NEXT: s_bfe_u32 s4, s21, 0xb0014
@@ -576,10 +554,9 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in)
 ; SI-NEXT: v_mov_b32_e32 v14, s4
 ; SI-NEXT: v_add_f64 v[14:15], s[20:21], -v[14:15]
 ; SI-NEXT: v_mov_b32_e32 v16, s23
-; SI-NEXT: v_and_b32_e32 v15, 0x7fffffff, v15
-; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[14:15]
+; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[14:15]|, 0.5
 ; SI-NEXT: v_bfi_b32 v9, s3, v9, v16
-; SI-NEXT: s_and_b64 s[6:7], vcc, exec
+; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec
 ; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0
 ; SI-NEXT: v_add_f64 v[16:17], s[8:9], v[8:9]
 ; SI-NEXT: v_mov_b32_e32 v9, s6
@@ -598,95 +575,87 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in)
 ; CI: ; %bb.0:
 ; CI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19
 ; CI-NEXT: s_brev_b32 s6, -2
-; CI-NEXT: v_mov_b32_e32 v4, 0
+; CI-NEXT: v_mov_b32_e32 v12, 0
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
 ; CI-NEXT: v_trunc_f64_e32 v[0:1], s[10:11]
-; CI-NEXT: v_trunc_f64_e32 v[6:7], s[8:9]
 ; CI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1]
-; CI-NEXT: v_add_f64 v[8:9], s[8:9], -v[6:7]
-; CI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
-; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[2:3]
-; CI-NEXT: v_and_b32_e32 v9, 0x7fffffff, v9
-; CI-NEXT: s_and_b64 s[2:3], vcc, exec
-; CI-NEXT: v_cmp_le_f64_e64 s[0:1], 0.5, v[8:9]
-; CI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
-; CI-NEXT: v_mov_b32_e32 v5, s11
-; CI-NEXT: v_mov_b32_e32 v2, s2
-; CI-NEXT: v_trunc_f64_e32 v[8:9], s[14:15]
-; CI-NEXT: v_bfi_b32 v5, s6, v2, v5
+; CI-NEXT: v_trunc_f64_e32 v[4:5], s[8:9]
+; CI-NEXT: v_add_f64 v[6:7], s[8:9], -v[4:5]
+; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[2:3]|, 0.5
+; CI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[6:7]|, 0.5
 ; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec
-; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5]
+; CI-NEXT: s_cselect_b32 s7, 0x3ff00000, 0
+; CI-NEXT: v_mov_b32_e32 v8, s11
+; CI-NEXT: s_and_b64 s[0:1], s[2:3], exec
+; CI-NEXT: v_mov_b32_e32 v2, s7
+; CI-NEXT: v_trunc_f64_e32 v[6:7], s[14:15]
+; CI-NEXT: v_bfi_b32 v13, s6, v2, v8
 ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0
-; CI-NEXT: v_add_f64 v[0:1], s[14:15], -v[8:9]
-; CI-NEXT: v_mov_b32_e32 v5, s0
-; CI-NEXT: v_mov_b32_e32 v10, s9
-; CI-NEXT: v_bfi_b32 v5, s6, v5, v10
-; CI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
-; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1]
-; CI-NEXT: v_add_f64 v[0:1], v[6:7], v[4:5]
-; CI-NEXT: v_trunc_f64_e32 v[6:7], s[12:13]
-; CI-NEXT: s_and_b64 s[0:1], vcc, exec
-; CI-NEXT: v_add_f64 v[10:11], s[12:13], -v[6:7]
+; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[12:13]
+; CI-NEXT: v_mov_b32_e32 v8, s0
+; CI-NEXT: v_mov_b32_e32 v9, s9
+; CI-NEXT: v_add_f64 v[0:1], s[14:15], -v[6:7]
+; CI-NEXT: v_bfi_b32 v13, s6, v8, v9
+; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[0:1]|, 0.5
+; CI-NEXT: v_add_f64 v[0:1], v[4:5], v[12:13]
+; CI-NEXT: v_trunc_f64_e32 v[4:5], s[12:13]
+; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; CI-NEXT: v_add_f64 v[8:9], s[12:13], -v[4:5]
 ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0
-; CI-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11
-; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[10:11]
-; CI-NEXT: v_mov_b32_e32 v5, s0
-; CI-NEXT: v_mov_b32_e32 v12, s15
-; CI-NEXT: s_and_b64 s[0:1], vcc, exec
-; CI-NEXT: v_trunc_f64_e32 v[10:11], s[18:19]
-; CI-NEXT: v_bfi_b32 v5, s6, v5, v12
+; CI-NEXT: v_mov_b32_e32 v10, s0
+; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[8:9]|, 0.5
+; CI-NEXT: v_trunc_f64_e32 v[8:9], s[18:19]
+; CI-NEXT: v_mov_b32_e32 v11, s15
+; CI-NEXT: v_bfi_b32 v13, s6, v10, v11
+; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; CI-NEXT: v_add_f64 v[10:11], s[18:19], -v[8:9]
 ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0
-; CI-NEXT: v_add_f64 v[8:9], v[8:9], v[4:5]
-; CI-NEXT: v_mov_b32_e32 v5, s0
+; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[12:13]
+; CI-NEXT: v_mov_b32_e32 v13, s0
 ; CI-NEXT: v_mov_b32_e32 v14, s13
-; CI-NEXT: v_add_f64 v[12:13], s[18:19], -v[10:11]
-; CI-NEXT: v_bfi_b32 v5, s6, v5, v14
+; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[10:11]|, 0.5
+; CI-NEXT: v_bfi_b32 v13, s6, v13, v14
 ; CI-NEXT: v_trunc_f64_e32 v[14:15], s[16:17]
-; CI-NEXT: v_and_b32_e32 v13, 0x7fffffff, v13
-; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[12:13]
-; CI-NEXT: v_add_f64 v[12:13], s[16:17], -v[14:15]
-; CI-NEXT: s_and_b64 s[0:1], vcc, exec
-; CI-NEXT: v_and_b32_e32 v13, 0x7fffffff, v13
-; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[12:13]
+; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec
 ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0
-; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[4:5]
-; CI-NEXT: v_mov_b32_e32 v5, s0
+; CI-NEXT: v_add_f64 v[10:11], s[16:17], -v[14:15]
+; CI-NEXT: v_add_f64 v[4:5], v[4:5], v[12:13]
+; CI-NEXT: v_mov_b32_e32 v13, s0
 ; CI-NEXT: v_mov_b32_e32 v16, s19
-; CI-NEXT: s_and_b64 s[0:1], vcc, exec
-; CI-NEXT: v_bfi_b32 v5, s6, v5, v16
-; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0
+; CI-NEXT: v_bfi_b32 v13, s6, v13, v16
+; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[10:11]|, 0.5
 ; CI-NEXT: v_trunc_f64_e32 v[16:17], s[22:23]
-; CI-NEXT: v_add_f64 v[12:13], v[10:11], v[4:5]
-; CI-NEXT: v_mov_b32_e32 v5, s0
-; CI-NEXT: v_mov_b32_e32 v10, s17
-; CI-NEXT: v_bfi_b32 v5, s6, v5, v10
+; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec
 ; CI-NEXT: v_add_f64 v[18:19], s[22:23], -v[16:17]
-; CI-NEXT: v_add_f64 v[10:11], v[14:15], v[4:5]
-; CI-NEXT: v_trunc_f64_e32 v[14:15], s[20:21]
-; CI-NEXT: v_and_b32_e32 v19, 0x7fffffff, v19
-; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[18:19]
-; CI-NEXT: v_add_f64 v[18:19], s[20:21], -v[14:15]
-; CI-NEXT: s_and_b64 s[0:1], vcc, exec
-; CI-NEXT: v_and_b32_e32 v19, 0x7fffffff, v19
-; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[18:19]
 ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0
+; CI-NEXT: v_add_f64 v[10:11], v[8:9], v[12:13]
+; CI-NEXT: v_mov_b32_e32 v8, s0
+; CI-NEXT: v_mov_b32_e32 v9, s17
+; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[18:19]|, 0.5
+; CI-NEXT: v_trunc_f64_e32 v[18:19], s[20:21]
+; CI-NEXT: v_bfi_b32 v13, s6, v8, v9
+; CI-NEXT: v_add_f64 v[8:9], v[14:15], v[12:13]
+; CI-NEXT: v_add_f64 v[13:14], s[20:21], -v[18:19]
+; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[13:14]|, 0.5
+; CI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
+; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec
 ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0
-; CI-NEXT: v_mov_b32_e32 v5, s0
-; CI-NEXT: s_and_b64 s[0:1], vcc, exec
-; CI-NEXT: v_mov_b32_e32 v18, s23
+; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0
-; CI-NEXT: v_bfi_b32 v5, s6, v5, v18
-; CI-NEXT: v_mov_b32_e32 v18, s0
+; CI-NEXT: v_mov_b32_e32 v13, s2
+; CI-NEXT: v_mov_b32_e32 v14, s23
+; CI-NEXT: v_mov_b32_e32 v20, s0
 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; CI-NEXT: v_mov_b32_e32 v19, s21
-; CI-NEXT: v_add_f64 v[16:17], v[16:17], v[4:5]
-; CI-NEXT: v_bfi_b32 v5, s6, v18, v19
-; CI-NEXT: v_add_f64 v[14:15], v[14:15], v[4:5]
+; CI-NEXT: v_bfi_b32 v13, s6, v13, v14
+; CI-NEXT: v_mov_b32_e32 v21, s21
+; CI-NEXT: v_add_f64 v[14:15], v[16:17], v[12:13]
+; CI-NEXT: v_bfi_b32 v13, s6, v20, v21
+; CI-NEXT: v_add_f64 v[12:13], v[18:19], v[12:13]
 ; CI-NEXT: s_mov_b32 s3, 0xf000
 ; CI-NEXT: s_mov_b32 s2, -1
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48
-; CI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32
-; CI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16
+; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; CI-NEXT: s_endpgm
 %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1
diff --git a/llvm/test/CodeGen/AMDGPU/lround.ll b/llvm/test/CodeGen/AMDGPU/lround.ll
index 5e2412742ec69..8036e32f90eb0 100644
--- a/llvm/test/CodeGen/AMDGPU/lround.ll
+++ b/llvm/test/CodeGen/AMDGPU/lround.ll
@@ -101,8 +101,7 @@ define i32 @intrinsic_lround_i32_f64(double %arg) {
 ; GFX9-SDAG-NEXT: s_brev_b32 s4, -2
 ; GFX9-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3]
 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x3ff00000
-; GFX9-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5
-; GFX9-SDAG-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[4:5]
+; GFX9-SDAG-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
 ; GFX9-SDAG-NEXT: v_bfi_b32 v1, s4, v0, v1
 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0
@@ -130,9 +129,8 @@ define i32 @intrinsic_lround_i32_f64(double %arg) {
 ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT: v_trunc_f64_e32 v[2:3], v[0:1]
 ; GFX10-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3]
-; GFX10-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5
-; GFX10-SDAG-NEXT: v_cmp_le_f64_e32 vcc_lo, 0.5, v[4:5]
-; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x3ff00000, vcc_lo
+; GFX10-SDAG-NEXT: v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x3ff00000, s4
 ; GFX10-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v0, v1
 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0
 ; GFX10-SDAG-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1]
@@ -158,10 +156,9 @@ define i32 @intrinsic_lround_i32_f64(double %arg) {
 ; GFX11-SDAG-NEXT: v_trunc_f64_e32 v[2:3], v[0:1]
 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3]
-; GFX11-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_cmp_le_f64_e32 vcc_lo, 0.5, v[4:5]
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x3ff00000, vcc_lo
+; GFX11-SDAG-NEXT: v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x3ff00000, s0
 ; GFX11-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v0, v1
 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -355,8 +352,7 @@ define i64 @intrinsic_lround_i64_f64(double %arg) {
 ; GFX9-SDAG-NEXT: s_brev_b32 s4, -2
 ; GFX9-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3]
 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5
-; GFX9-SDAG-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[4:5]
+; GFX9-SDAG-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc
 ; GFX9-SDAG-NEXT: v_bfi_b32 v1, s4, v4, v1
 ; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1]
@@ -401,9 +397,8 @@ define i64 @intrinsic_lround_i64_f64(double %arg) {
 ; GFX10-SDAG-NEXT: v_trunc_f64_e32 v[2:3], v[0:1]
 ; GFX10-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3]
 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5
-; GFX10-SDAG-NEXT: v_cmp_le_f64_e32 vcc_lo, 0.5, v[4:5]
-; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, vcc_lo
+; GFX10-SDAG-NEXT: v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4
 ; GFX10-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v4, v1
 ; GFX10-SDAG-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1]
 ; GFX10-SDAG-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
@@ -436,12 +431,12 @@ define i64 @intrinsic_lround_i64_f64(double %arg) {
 ; GFX11-SDAG: ; %bb.0: ; %entry
 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT: v_trunc_f64_e32 v[2:3], v[0:1]
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3]
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v5, 0x7fffffff, v5
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_cmp_le_f64_e32 vcc_lo, 0.5, v[4:5]
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, vcc_lo
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-SDAG-NEXT: v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0
 ; GFX11-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v4, v1
 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1]
@@ -648,8 +643,7 @@ define i64 @intrinsic_llround_i64_f64(double %arg) {
 ; GFX9-SDAG-NEXT: s_brev_b32 s4, -2
 ; GFX9-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3]
 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5
-; GFX9-SDAG-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[4:5]
+; GFX9-SDAG-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc
 ; GFX9-SDAG-NEXT: v_bfi_b32 v1, s4, v4, v1
 ; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1]
@@ -694,9 +688,8 @@ define i64 @intrinsic_llround_i64_f64(double %arg) {
 ; GFX10-SDAG-NEXT: v_trunc_f64_e32 v[2:3], v[0:1]
 ; GFX10-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3]
 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5
-; GFX10-SDAG-NEXT: v_cmp_le_f64_e32 vcc_lo, 0.5, v[4:5]
-; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, vcc_lo
+; GFX10-SDAG-NEXT: v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4
 ; GFX10-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v4, v1
 ; GFX10-SDAG-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1]
 ; GFX10-SDAG-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
@@ -729,12 +722,12 @@ define i64 @intrinsic_llround_i64_f64(double %arg) {
 ; GFX11-SDAG: ; %bb.0: ; %entry
 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT: v_trunc_f64_e32 v[2:3], v[0:1]
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3]
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v5, 0x7fffffff, v5
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_cmp_le_f64_e32 vcc_lo, 0.5, v[4:5]
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, vcc_lo
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-SDAG-NEXT: v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0
 ; GFX11-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v4, v1
 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/roundeven.ll b/llvm/test/CodeGen/AMDGPU/roundeven.ll
index 3b9462cd690d5..59a1fe041bf90 100644
--- a/llvm/test/CodeGen/AMDGPU/roundeven.ll
+++ b/llvm/test/CodeGen/AMDGPU/roundeven.ll
@@ -1125,18 +1125,16 @@ define double @v_roundeven_f64(double %x) {
 ; SDAG_GFX6: ; %bb.0:
 ; SDAG_GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX6-NEXT: s_brev_b32 s6, -2
-; SDAG_GFX6-NEXT: v_mov_b32_e32 v4, 0x43300000
-; SDAG_GFX6-NEXT: v_bfi_b32 v5, s6, v4, v1
-; SDAG_GFX6-NEXT: v_mov_b32_e32 v4, 0
+; SDAG_GFX6-NEXT: v_mov_b32_e32 v2, 0x43300000
+; SDAG_GFX6-NEXT: v_bfi_b32 v3, s6, v2, v1
+; SDAG_GFX6-NEXT: v_mov_b32_e32 v2, 0
 ; SDAG_GFX6-NEXT: s_mov_b32 s4, -1
-; SDAG_GFX6-NEXT: v_add_f64 v[6:7], v[0:1], v[4:5]
-; SDAG_GFX6-NEXT: v_and_b32_e32 v3, 0x7fffffff, v1
-; SDAG_GFX6-NEXT: v_mov_b32_e32 v2, v0
+; SDAG_GFX6-NEXT: v_add_f64 v[4:5], v[0:1], v[2:3]
 ; SDAG_GFX6-NEXT: s_mov_b32 s5, 0x432fffff
-; SDAG_GFX6-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5]
-; SDAG_GFX6-NEXT: v_cmp_lt_f64_e32 vcc, s[4:5], v[2:3]
-; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; SDAG_GFX6-NEXT: v_add_f64 v[2:3], v[4:5], -v[2:3]
+; SDAG_GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SDAG_GFX6-NEXT: s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX7-LABEL: v_roundeven_f64:
@@ -1217,10 +1215,9 @@ define double @v_roundeven_f64_fneg(double %x) {
 ; SDAG_GFX6-NEXT: v_mov_b32_e32 v2, 0
 ; SDAG_GFX6-NEXT: v_add_f64 v[4:5], -v[0:1], v[2:3]
 ; SDAG_GFX6-NEXT: s_mov_b32 s4, -1
-; SDAG_GFX6-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
 ; SDAG_GFX6-NEXT: s_mov_b32 s5, 0x432fffff
 ; SDAG_GFX6-NEXT: v_add_f64 v[2:3], v[4:5], -v[2:3]
-; SDAG_GFX6-NEXT: v_cmp_lt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG_GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
 ; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
 ; SDAG_GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -1308,24 +1305,20 @@ define <2 x double> @v_roundeven_v2f64(<2 x double> %x) {
 ; SDAG_GFX6: ; %bb.0:
 ; SDAG_GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX6-NEXT: s_brev_b32 s6, -2
-; SDAG_GFX6-NEXT: v_mov_b32_e32 v9, 0x43300000
-; SDAG_GFX6-NEXT: v_bfi_b32 v5, s6, v9, v1
+; SDAG_GFX6-NEXT: v_mov_b32_e32 v8, 0x43300000
+; SDAG_GFX6-NEXT: v_bfi_b32 v5, s6, v8, v1
 ; SDAG_GFX6-NEXT: v_mov_b32_e32 v4, 0
 ; SDAG_GFX6-NEXT: v_add_f64 v[6:7], v[0:1], v[4:5]
 ; SDAG_GFX6-NEXT: s_mov_b32 s4, -1
-; SDAG_GFX6-NEXT: v_add_f64 v[5:6], v[6:7], -v[4:5]
-; SDAG_GFX6-NEXT: v_and_b32_e32 v8, 0x7fffffff, v1
-; SDAG_GFX6-NEXT: v_mov_b32_e32 v7, v0
 ; SDAG_GFX6-NEXT: s_mov_b32 s5, 0x432fffff
-; SDAG_GFX6-NEXT: v_cmp_lt_f64_e32 vcc, s[4:5], v[7:8]
+; SDAG_GFX6-NEXT: v_add_f64 v[5:6], v[6:7], -v[4:5]
+; SDAG_GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
 ; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
-; SDAG_GFX6-NEXT: v_bfi_b32 v5, s6, v9, v3
+; SDAG_GFX6-NEXT: v_bfi_b32 v5, s6, v8, v3
 ; SDAG_GFX6-NEXT: v_add_f64 v[7:8], v[2:3], v[4:5]
 ; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
 ; SDAG_GFX6-NEXT: v_add_f64 v[4:5], v[7:8], -v[4:5]
-; SDAG_GFX6-NEXT: v_and_b32_e32 v7, 0x7fffffff, v3
-; SDAG_GFX6-NEXT: v_mov_b32_e32 v6, v2
-; SDAG_GFX6-NEXT: v_cmp_lt_f64_e32 vcc, s[4:5], v[6:7]
+; SDAG_GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[2:3]|, s[4:5]
 ; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
 ; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
 ; SDAG_GFX6-NEXT: s_setpc_b64 s[30:31]