https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/177630
This treats it as free on targets without legal f16. This matches the existing logic in fneg, and they should be the same. The test changes are mostly neutral with a few improvements. >From 4f09b7a8242056adab73cfd59d13eba4c997406d Mon Sep 17 00:00:00 2001 From: Matt Arsenault <[email protected]> Date: Fri, 23 Jan 2026 19:02:01 +0100 Subject: [PATCH] AMDGPU: Ignore type legality in isFAbsFree This treats it as free on targets without legal f16. This matches the existing logic in fneg, and they should be the same. The test changes are mostly neutral with a few improvements. --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 4 +- llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll | 268 ++++++++---------- llvm/test/CodeGen/AMDGPU/fabs.f16.ll | 8 +- .../test/CodeGen/AMDGPU/fmed3-cast-combine.ll | 32 +-- llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll | 16 +- llvm/test/CodeGen/AMDGPU/fp-classify.ll | 27 +- llvm/test/CodeGen/AMDGPU/fract-match.ll | 46 ++- llvm/test/CodeGen/AMDGPU/frem.ll | 176 ++++++------ .../AMDGPU/select-fabs-fneg-extract.f16.ll | 6 +- 9 files changed, 275 insertions(+), 308 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 7652351e2e247..1efe47c61f1a8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -954,8 +954,8 @@ bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const { assert(VT.isFloatingPoint()); // Packed operations do not have a fabs modifier. - return VT == MVT::f32 || VT == MVT::f64 || - (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16)); + // Report this based on the end legalized type. + return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16; } bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll index a43bfb5d45679..62e97279b606a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll @@ -5,35 +5,35 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0xd ; CI-NEXT: ; implicit-def: $vgpr1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: s_load_dword s3, s[4:5], 0x2 -; CI-NEXT: s_mov_b32 s4, 1 +; CI-NEXT: s_load_dword s0, s[10:11], 0x0 +; CI-NEXT: s_load_dword s1, s[2:3], 0x2 +; CI-NEXT: s_mov_b32 s2, 1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e64 v2, |s2| -; CI-NEXT: v_cvt_f32_f16_e64 v0, |s3| +; CI-NEXT: v_cvt_f32_f16_e64 v2, |s0| +; CI-NEXT: v_cvt_f32_f16_e64 v0, |s1| ; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v0 ; CI-NEXT: s_cbranch_vccz .LBB0_2 ; CI-NEXT: ; %bb.1: ; %frem.else -; CI-NEXT: s_and_b32 s4, s2, 0x8000 +; CI-NEXT: s_and_b32 s2, s0, 0x8000 ; CI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v0 -; CI-NEXT: v_mov_b32_e32 v1, s4 -; CI-NEXT: v_mov_b32_e32 v3, s2 +; CI-NEXT: v_mov_b32_e32 v1, s2 +; CI-NEXT: v_mov_b32_e32 v3, s0 ; CI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; CI-NEXT: s_mov_b32 s4, 0 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: .LBB0_2: ; %Flow18 -; CI-NEXT: s_xor_b32 s4, s4, 1 -; CI-NEXT: s_cmp_lg_u32 s4, 0 +; CI-NEXT: s_xor_b32 s2, s2, 1 +; CI-NEXT: s_cmp_lg_u32 s2, 0 ; CI-NEXT: s_cbranch_scc1 .LBB0_8 ; CI-NEXT: ; %bb.3: 
; %frem.compute ; CI-NEXT: v_frexp_mant_f32_e32 v1, v2 ; CI-NEXT: v_ldexp_f32_e64 v4, v1, 11 ; CI-NEXT: v_frexp_mant_f32_e32 v1, v0 ; CI-NEXT: v_ldexp_f32_e64 v1, v1, 1 -; CI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; CI-NEXT: v_div_scale_f32 v3, s[2:3], v1, v1, 1.0 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v5, v2 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v0 ; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5 @@ -83,23 +83,21 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: s_and_b32 s4, s2, 0x8000 +; CI-NEXT: s_and_b32 s2, s0, 0x8000 ; CI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; CI-NEXT: v_or_b32_e32 v1, s4, v0 +; CI-NEXT: v_or_b32_e32 v1, s2, v0 ; CI-NEXT: .LBB0_8: ; %Flow19 -; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 -; CI-NEXT: s_and_b32 s2, s2, 0x7fff -; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00 -; CI-NEXT: s_cselect_b32 s2, 1, 0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 +; CI-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v0 +; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| +; CI-NEXT: v_cmp_nge_f32_e64 s[0:1], v0, v2 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 -; CI-NEXT: s_and_b32 s2, 1, s2 ; CI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc -; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 -; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] +; CI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_f16: @@ -1117,30 +1115,30 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; CI-NEXT: v_or_b32_e32 v0, s2, v0 ; CI-NEXT: .LBB9_8: ; %Flow58 -; CI-NEXT: s_lshr_b32 s2, s0, 16 -; CI-NEXT: s_lshr_b32 s3, s1, 16 -; CI-NEXT: v_cvt_f32_f16_e64 v3, |s2| -; CI-NEXT: v_cvt_f32_f16_e64 v2, |s3| -; CI-NEXT: s_mov_b32 s4, 1 +; CI-NEXT: s_lshr_b32 s4, s0, 16 +; CI-NEXT: s_lshr_b32 s2, s1, 16 +; CI-NEXT: v_cvt_f32_f16_e64 v3, |s4| +; CI-NEXT: v_cvt_f32_f16_e64 v2, |s2| +; CI-NEXT: s_mov_b32 s3, 1 ; CI-NEXT: ; implicit-def: $vgpr1 ; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 ; CI-NEXT: s_cbranch_vccz .LBB9_10 ; CI-NEXT: ; %bb.9: ; %frem.else -; CI-NEXT: s_and_b32 s4, s2, 0x8000 +; CI-NEXT: s_and_b32 s3, s4, 0x8000 ; CI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2 -; CI-NEXT: v_mov_b32_e32 v1, s4 -; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_mov_b32_e32 v4, s4 ; CI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CI-NEXT: s_mov_b32 s4, 0 +; CI-NEXT: s_mov_b32 s3, 0 ; CI-NEXT: .LBB9_10: ; %Flow53 -; CI-NEXT: s_xor_b32 s4, s4, 1 -; CI-NEXT: s_cmp_lg_u32 s4, 0 +; CI-NEXT: s_xor_b32 s3, s3, 1 +; CI-NEXT: s_cmp_lg_u32 s3, 0 ; CI-NEXT: s_cbranch_scc1 .LBB9_16 ; CI-NEXT: ; %bb.11: ; %frem.compute ; CI-NEXT: v_frexp_mant_f32_e32 v4, v2 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2 ; CI-NEXT: v_ldexp_f32_e64 v2, v4, 1 -; CI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, 1.0 +; CI-NEXT: v_div_scale_f32 v4, s[6:7], v2, v2, 1.0 ; CI-NEXT: v_frexp_mant_f32_e32 v1, v3 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v3 ; CI-NEXT: v_add_i32_e32 v3, vcc, -1, v6 @@ -1191,35 +1189,30 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; CI-NEXT: v_ldexp_f32_e32 v1, v2, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; 
CI-NEXT: s_and_b32 s4, s2, 0x8000 +; CI-NEXT: s_and_b32 s3, s4, 0x8000 ; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; CI-NEXT: v_or_b32_e32 v1, s4, v1 +; CI-NEXT: v_or_b32_e32 v1, s3, v1 ; CI-NEXT: .LBB9_16: ; %Flow54 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s1 -; CI-NEXT: s_and_b32 s0, s0, 0x7fff -; CI-NEXT: s_cmpk_lg_i32 s0, 0x7c00 -; CI-NEXT: s_cselect_b32 s4, 1, 0 -; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 -; CI-NEXT: s_and_b32 s2, s2, 0x7fff -; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00 -; CI-NEXT: s_cselect_b32 s2, 1, 0 -; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], 0, v2 +; CI-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v2 +; CI-NEXT: v_cvt_f32_f16_e64 v2, |s0| +; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: v_cmp_nge_f32_e64 s[0:1], v2, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s2 +; CI-NEXT: v_cmp_nlg_f32_e64 s[2:3], 0, v2 +; CI-NEXT: v_cvt_f32_f16_e64 v2, |s4| +; CI-NEXT: v_cmp_nge_f32_e64 s[4:5], v2, v3 ; CI-NEXT: v_mov_b32_e32 v2, 0x7e00 -; CI-NEXT: s_and_b32 s3, 1, s4 -; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; CI-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[2:3] ; CI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s3 -; CI-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; CI-NEXT: s_and_b32 s0, 1, s2 -; CI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; CI-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5] +; CI-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 -; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; CI-NEXT: s_endpgm ; @@ -1413,23 +1406,23 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; CI-NEXT: ; implicit-def: $vgpr0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0 -; CI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; CI-NEXT: s_load_dwordx2 s[8:9], s[18:19], 0x0 +; CI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x8 ; CI-NEXT: s_mov_b32 s0, 1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e64 v2, |s4| -; CI-NEXT: v_cvt_f32_f16_e64 v1, |s2| +; CI-NEXT: v_cvt_f32_f16_e64 v2, |s8| +; CI-NEXT: v_cvt_f32_f16_e64 v1, |s6| ; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1 ; CI-NEXT: s_cbranch_vccz .LBB10_2 ; CI-NEXT: ; %bb.1: ; %frem.else86 -; CI-NEXT: s_and_b32 s0, s4, 0x8000 +; CI-NEXT: s_and_b32 s0, s8, 0x8000 ; CI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v3, s4 +; CI-NEXT: v_mov_b32_e32 v3, s8 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; CI-NEXT: s_mov_b32 s0, 0 ; CI-NEXT: .LBB10_2: ; %Flow135 @@ -1491,34 +1484,34 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: s_and_b32 s0, s4, 0x8000 +; CI-NEXT: s_and_b32 s0, s8, 0x8000 ; CI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; CI-NEXT: v_or_b32_e32 v0, s0, v0 ; CI-NEXT: .LBB10_8: ; %Flow136 -; CI-NEXT: 
s_lshr_b32 s6, s4, 16 -; CI-NEXT: s_lshr_b32 s0, s2, 16 -; CI-NEXT: v_cvt_f32_f16_e64 v3, |s6| -; CI-NEXT: v_cvt_f32_f16_e64 v2, |s0| -; CI-NEXT: s_mov_b32 s1, 1 +; CI-NEXT: s_lshr_b32 s4, s8, 16 +; CI-NEXT: s_lshr_b32 s2, s6, 16 +; CI-NEXT: v_cvt_f32_f16_e64 v3, |s4| +; CI-NEXT: v_cvt_f32_f16_e64 v2, |s2| +; CI-NEXT: s_mov_b32 s0, 1 ; CI-NEXT: ; implicit-def: $vgpr1 ; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 ; CI-NEXT: s_cbranch_vccz .LBB10_10 ; CI-NEXT: ; %bb.9: ; %frem.else53 -; CI-NEXT: s_and_b32 s1, s6, 0x8000 +; CI-NEXT: s_and_b32 s0, s4, 0x8000 ; CI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_mov_b32_e32 v4, s6 +; CI-NEXT: v_mov_b32_e32 v1, s0 +; CI-NEXT: v_mov_b32_e32 v4, s4 ; CI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CI-NEXT: s_mov_b32 s1, 0 +; CI-NEXT: s_mov_b32 s0, 0 ; CI-NEXT: .LBB10_10: ; %Flow131 -; CI-NEXT: s_xor_b32 s1, s1, 1 -; CI-NEXT: s_cmp_lg_u32 s1, 0 +; CI-NEXT: s_xor_b32 s0, s0, 1 +; CI-NEXT: s_cmp_lg_u32 s0, 0 ; CI-NEXT: s_cbranch_scc1 .LBB10_16 ; CI-NEXT: ; %bb.11: ; %frem.compute52 ; CI-NEXT: v_frexp_mant_f32_e32 v4, v2 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2 ; CI-NEXT: v_ldexp_f32_e64 v2, v4, 1 -; CI-NEXT: v_div_scale_f32 v4, s[10:11], v2, v2, 1.0 +; CI-NEXT: v_div_scale_f32 v4, s[0:1], v2, v2, 1.0 ; CI-NEXT: v_frexp_mant_f32_e32 v1, v3 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v3 ; CI-NEXT: v_add_i32_e32 v3, vcc, -1, v6 @@ -1569,32 +1562,32 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; CI-NEXT: v_ldexp_f32_e32 v1, v2, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: s_and_b32 s1, s6, 0x8000 +; CI-NEXT: s_and_b32 s0, s4, 0x8000 ; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; CI-NEXT: v_or_b32_e32 v1, s1, v1 +; CI-NEXT: v_or_b32_e32 v1, s0, v1 ; CI-NEXT: .LBB10_16: ; %Flow132 -; CI-NEXT: v_cvt_f32_f16_e64 v4, |s5| -; CI-NEXT: v_cvt_f32_f16_e64 v3, |s3| -; CI-NEXT: s_mov_b32 s1, 1 +; CI-NEXT: v_cvt_f32_f16_e64 v4, |s9| +; CI-NEXT: v_cvt_f32_f16_e64 v3, |s7| +; CI-NEXT: s_mov_b32 s0, 1 ; CI-NEXT: ; implicit-def: $vgpr2 ; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3 ; CI-NEXT: s_cbranch_vccz .LBB10_18 ; CI-NEXT: ; %bb.17: ; %frem.else20 -; CI-NEXT: s_and_b32 s1, s5, 0x8000 +; CI-NEXT: s_and_b32 s0, s9, 0x8000 ; CI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3 -; CI-NEXT: v_mov_b32_e32 v2, s1 -; CI-NEXT: v_mov_b32_e32 v5, s5 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_mov_b32_e32 v5, s9 ; CI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; CI-NEXT: s_mov_b32 s1, 0 +; CI-NEXT: s_mov_b32 s0, 0 ; CI-NEXT: .LBB10_18: ; %Flow127 -; CI-NEXT: s_xor_b32 s1, s1, 1 -; CI-NEXT: s_cmp_lg_u32 s1, 0 +; CI-NEXT: s_xor_b32 s0, s0, 1 +; CI-NEXT: s_cmp_lg_u32 s0, 0 ; CI-NEXT: s_cbranch_scc1 .LBB10_24 ; CI-NEXT: ; %bb.19: ; %frem.compute19 ; CI-NEXT: v_frexp_mant_f32_e32 v5, v3 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v8, v3 ; CI-NEXT: v_ldexp_f32_e64 v3, v5, 1 -; CI-NEXT: v_div_scale_f32 v5, s[10:11], v3, v3, 1.0 +; CI-NEXT: v_div_scale_f32 v5, s[0:1], v3, v3, 1.0 ; CI-NEXT: v_frexp_mant_f32_e32 v2, v4 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v4 ; CI-NEXT: v_add_i32_e32 v4, vcc, -1, v7 @@ -1645,34 +1638,34 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; CI-NEXT: v_ldexp_f32_e32 v2, v3, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: s_and_b32 s1, s5, 0x8000 +; CI-NEXT: s_and_b32 s0, s9, 0x8000 ; CI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; CI-NEXT: v_or_b32_e32 v2, s1, v2 +; CI-NEXT: v_or_b32_e32 v2, s0, v2 ; 
CI-NEXT: .LBB10_24: ; %Flow128 -; CI-NEXT: s_lshr_b32 s7, s5, 16 -; CI-NEXT: s_lshr_b32 s10, s3, 16 -; CI-NEXT: v_cvt_f32_f16_e64 v5, |s7| +; CI-NEXT: s_lshr_b32 s12, s9, 16 +; CI-NEXT: s_lshr_b32 s10, s7, 16 +; CI-NEXT: v_cvt_f32_f16_e64 v5, |s12| ; CI-NEXT: v_cvt_f32_f16_e64 v4, |s10| -; CI-NEXT: s_mov_b32 s1, 1 +; CI-NEXT: s_mov_b32 s0, 1 ; CI-NEXT: ; implicit-def: $vgpr3 ; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v5, v4 ; CI-NEXT: s_cbranch_vccz .LBB10_26 ; CI-NEXT: ; %bb.25: ; %frem.else -; CI-NEXT: s_and_b32 s1, s7, 0x8000 +; CI-NEXT: s_and_b32 s0, s12, 0x8000 ; CI-NEXT: v_cmp_eq_f32_e32 vcc, v5, v4 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v6, s7 +; CI-NEXT: v_mov_b32_e32 v3, s0 +; CI-NEXT: v_mov_b32_e32 v6, s12 ; CI-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; CI-NEXT: s_mov_b32 s1, 0 +; CI-NEXT: s_mov_b32 s0, 0 ; CI-NEXT: .LBB10_26: ; %Flow123 -; CI-NEXT: s_xor_b32 s1, s1, 1 -; CI-NEXT: s_cmp_lg_u32 s1, 0 +; CI-NEXT: s_xor_b32 s0, s0, 1 +; CI-NEXT: s_cmp_lg_u32 s0, 0 ; CI-NEXT: s_cbranch_scc1 .LBB10_32 ; CI-NEXT: ; %bb.27: ; %frem.compute ; CI-NEXT: v_frexp_mant_f32_e32 v6, v4 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v9, v4 ; CI-NEXT: v_ldexp_f32_e64 v4, v6, 1 -; CI-NEXT: v_div_scale_f32 v6, s[12:13], v4, v4, 1.0 +; CI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, 1.0 ; CI-NEXT: v_frexp_mant_f32_e32 v3, v5 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v8, v5 ; CI-NEXT: v_add_i32_e32 v5, vcc, -1, v8 @@ -1723,58 +1716,47 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc ; CI-NEXT: v_ldexp_f32_e32 v3, v4, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: s_and_b32 s1, s7, 0x8000 +; CI-NEXT: s_and_b32 s0, s12, 0x8000 ; CI-NEXT: v_and_b32_e32 v3, 0x7fff, v3 -; CI-NEXT: v_or_b32_e32 v3, s1, v3 +; CI-NEXT: v_or_b32_e32 v3, s0, v3 ; CI-NEXT: .LBB10_32: ; %Flow124 -; CI-NEXT: v_cvt_f32_f16_e32 v4, s2 -; CI-NEXT: s_and_b32 s1, s4, 0x7fff -; CI-NEXT: s_cmpk_lg_i32 s1, 0x7c00 -; CI-NEXT: s_cselect_b32 s11, 1, 0 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s6 +; CI-NEXT: v_mov_b32_e32 v5, 0x7f800000 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v4, s0 -; CI-NEXT: s_and_b32 s2, s6, 0x7fff -; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00 -; CI-NEXT: s_cselect_b32 s6, 1, 0 -; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], 0, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v4, s3 -; CI-NEXT: s_and_b32 s4, s5, 0x7fff -; CI-NEXT: s_cmpk_lg_i32 s4, 0x7c00 -; CI-NEXT: s_cselect_b32 s12, 1, 0 +; CI-NEXT: v_cvt_f32_f16_e64 v4, |s8| +; CI-NEXT: s_mov_b32 s18, -1 +; CI-NEXT: s_mov_b32 s19, 0xf000 +; CI-NEXT: v_cmp_nge_f32_e64 s[0:1], v4, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s2 ; CI-NEXT: v_cmp_nlg_f32_e64 s[2:3], 0, v4 +; CI-NEXT: v_cvt_f32_f16_e64 v4, |s4| +; CI-NEXT: v_cmp_nge_f32_e64 s[4:5], v4, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s7 +; CI-NEXT: v_cmp_nlg_f32_e64 s[6:7], 0, v4 +; CI-NEXT: v_cvt_f32_f16_e64 v4, |s9| +; CI-NEXT: v_cmp_nge_f32_e64 s[8:9], v4, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v4, s10 -; CI-NEXT: s_and_b32 s7, s7, 0x7fff -; CI-NEXT: s_cmpk_lg_i32 s7, 0x7c00 -; CI-NEXT: s_cselect_b32 s7, 1, 0 -; CI-NEXT: v_cmp_nlg_f32_e64 s[4:5], 0, v4 -; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_cmp_nlg_f32_e64 s[10:11], 0, v4 +; CI-NEXT: v_cvt_f32_f16_e64 v4, |s12| +; CI-NEXT: v_cmp_nge_f32_e64 s[12:13], v4, v5 ; CI-NEXT: v_mov_b32_e32 v4, 0x7e00 -; CI-NEXT: s_and_b32 s10, 1, s11 -; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; CI-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3] ; 
CI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s10 -; CI-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] -; CI-NEXT: s_and_b32 s0, 1, s6 -; CI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; CI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; CI-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5] +; CI-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[0:1] ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; CI-NEXT: s_and_b32 s0, 1, s12 -; CI-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3] -; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; CI-NEXT: s_and_b32 s0, 1, s7 -; CI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CI-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] -; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; CI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; CI-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[10:11] +; CI-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[6:7] +; CI-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[12:13] +; CI-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[8:9] ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; CI-NEXT: v_or_b32_e32 v1, v1, v2 -; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index 27cf49aec8229..e9014e212b76f 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -279,11 +279,11 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| -; CI-NEXT: s_lshr_b32 s0, s0, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 +; CI-NEXT: s_lshr_b32 s1, s0, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 +; CI-NEXT: v_cvt_f32_f16_e64 v1, |s0| ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; CI-NEXT: v_mul_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll index c0ff9b5a041ef..d8014962eb3bd 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll @@ -426,25 +426,15 @@ define half @fmed3_f32_fpext_f16_k0_k2(half %arg1) #1 { } define half @fmed3_f32_fpext_f16_fabs(half %arg0, half %arg1, half %arg2) #1 { -; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_fabs: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_fabs: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; GFX7-GISEL-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmed3_f32_fpext_f16_fabs: +; GFX7: ; %bb.0: +; GFX7-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; GFX7-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX7-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_fabs: ; GFX8-SDAG: ; %bb.0: @@ -649,9 +639,9 @@ define half @fmed3_f32_fpext_f16_fneg_fabs(half %arg0, half %arg1, half %arg2) # ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_fneg_fabs: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v2, |v2| ; GFX7-SDAG-NEXT: v_med3_f32 v0, -v0, -v1, -v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index 9d9a851a5507e..cbd4017c6cf1c 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -13,11 +13,11 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| -; CI-NEXT: s_lshr_b32 s0, s0, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 +; CI-NEXT: s_lshr_b32 s1, s0, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 +; CI-NEXT: v_cvt_f32_f16_e64 v1, |s0| ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CI-NEXT: v_sub_f32_e32 v0, v1, v0 +; CI-NEXT: v_sub_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -93,10 +93,10 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s1, s0, 0x7fff -; CI-NEXT: s_lshr_b32 s0, s0, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; CI-NEXT: v_cvt_f32_f16_e64 v1, -|s1| +; CI-NEXT: s_lshr_b32 s1, s0, 16 +; CI-NEXT: s_and_b32 s0, s0, 0x7fff +; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 +; CI-NEXT: v_cvt_f32_f16_e64 v1, -|s0| ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mul_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll index 4b800e4d47172..7ff5bbf4821b7 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll @@ -595,14 +595,14 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isinf_pattern_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s6, s[4:5], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_movk_i32 s5, 0x204 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s6, 0x7fff -; SI-NEXT: s_cmpk_eq_i32 s4, 0x7c00 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, s5 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -642,17 +642,14 @@ define 
amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isfinite_pattern_0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s6, s[4:5], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_movk_i32 s5, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: s_and_b32 s4, s6, 0x7fff -; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 -; SI-NEXT: s_cmpk_lg_i32 s4, 0x7c00 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, s5 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -694,14 +691,14 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isfinite_pattern_4_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s6, s[4:5], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_movk_i32 s5, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s6, 0x7fff -; SI-NEXT: s_cmpk_lt_i32 s4, 0x7c00 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, s5 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll index 5ff03c8dd4543..d8660617c7677 100644 --- a/llvm/test/CodeGen/AMDGPU/fract-match.ll +++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll @@ -2486,20 +2486,19 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) writeonly captures(no ; GFX6-LABEL: safe_math_fract_f16: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX6-NEXT: s_movk_i32 s4, 0x7c00 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: s_mov_b32 s4, 0x7f800000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_floor_f32_e32 v4, v3 -; GFX6-NEXT: v_sub_f32_e32 v5, v3, v4 -; GFX6-NEXT: v_min_f32_e32 v5, 0x3f7fe000, v5 -; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: v_floor_f32_e32 v3, v0 +; GFX6-NEXT: v_sub_f32_e32 v4, v0, v3 +; GFX6-NEXT: v_min_f32_e32 v4, 0x3f7fe000, v4 +; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GFX6-NEXT: v_cmp_neq_f32_e64 vcc, |v0|, s4 +; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 @@ -2509,20 +2508,19 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) writeonly captures(no ; GFX7-LABEL: safe_math_fract_f16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7-NEXT: s_movk_i32 s4, 0x7c00 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: s_mov_b32 s4, 0x7f800000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_floor_f32_e32 v4, v3 -; GFX7-NEXT: v_sub_f32_e32 v5, v3, v4 -; GFX7-NEXT: v_min_f32_e32 v5, 0x3f7fe000, v5 -; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: v_floor_f32_e32 v3, v0 +; GFX7-NEXT: v_sub_f32_e32 v4, v0, v3 +; GFX7-NEXT: v_min_f32_e32 v4, 0x3f7fe000, v4 +; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GFX7-NEXT: v_cmp_neq_f32_e64 vcc, |v0|, s4 +; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index c17a1d181015c..88f6427d94042 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -119,16 +119,16 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v1 +; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; SI-NEXT: s_movk_i32 s0, 0x7c00 -; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, v0 -; SI-NEXT: s_or_b64 vcc, s[0:1], vcc +; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SI-NEXT: s_mov_b32 s0, 0x7f800000 +; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s0, v0 +; SI-NEXT: s_and_b64 vcc, s[0:1], vcc ; SI-NEXT: v_mov_b32_e32 v0, 0x7fc00000 -; SI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -225,18 +225,18 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; CI-NEXT: .LBB0_8: ; %Flow19 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: s_movk_i32 s0, 0x7c00 +; CI-NEXT: s_mov_b32 s0, 0x7f800000 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; CI-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, v0 -; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000 -; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; CI-NEXT: s_or_b64 vcc, s[0:1], vcc +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s0, v0 +; CI-NEXT: s_and_b64 vcc, s[0:1], vcc +; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; CI-NEXT: s_endpgm @@ -5011,29 +5011,29 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB9_16: ; %Flow54 ; SI-NEXT: 
v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v3 +; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; SI-NEXT: s_movk_i32 s2, 0x7c00 -; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v2 -; SI-NEXT: s_or_b64 vcc, s[0:1], vcc +; SI-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; SI-NEXT: s_mov_b32 s2, 0x7f800000 +; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v2 +; SI-NEXT: s_and_b64 vcc, s[0:1], vcc ; SI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v1 +; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v0 -; SI-NEXT: s_or_b64 vcc, s[0:1], vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v0 +; SI-NEXT: s_and_b64 vcc, s[0:1], vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v3, v0 @@ -5208,30 +5208,30 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_bfi_b32 v5, s0, v5, v0 ; CI-NEXT: .LBB9_16: ; %Flow54 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: s_movk_i32 s2, 0x7c00 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: s_mov_b32 s2, 0x7f800000 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; CI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v2 -; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v4 -; CI-NEXT: s_or_b64 vcc, s[0:1], vcc +; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v2 +; CI-NEXT: s_and_b64 vcc, s[0:1], vcc ; CI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; CI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc -; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc +; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; CI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v0 -; CI-NEXT: s_or_b64 vcc, s[0:1], vcc +; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v0 +; CI-NEXT: s_and_b64 vcc, s[0:1], vcc ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_or_b32_e32 v0, v3, v0 @@ -7509,51 +7509,51 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB10_32: ; %Flow124 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v7 +; SI-NEXT: v_cmp_lg_f32_e32 
vcc, 0, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_and_b32_e32 v6, 0x7fff, v6 -; SI-NEXT: s_movk_i32 s2, 0x7c00 -; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v6 -; SI-NEXT: s_or_b64 vcc, s[0:1], vcc +; SI-NEXT: v_cvt_f32_f16_e64 v6, |v6| +; SI-NEXT: s_mov_b32 s2, 0x7f800000 +; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v6 +; SI-NEXT: s_and_b64 vcc, s[0:1], vcc ; SI-NEXT: v_mov_b32_e32 v6, 0x7fc00000 -; SI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v5 +; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_and_b32_e32 v4, 0x7fff, v4 -; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v4 -; SI-NEXT: s_or_b64 vcc, s[0:1], vcc -; SI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; SI-NEXT: v_cvt_f32_f16_e64 v4, |v4| +; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v4 +; SI-NEXT: s_and_b64 vcc, s[0:1], vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v3 +; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v2 -; SI-NEXT: s_or_b64 vcc, s[0:1], vcc -; SI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc +; SI-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v2 +; SI-NEXT: s_and_b64 vcc, s[0:1], vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v1 +; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v0 -; SI-NEXT: s_or_b64 vcc, s[0:1], vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc +; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v0 +; SI-NEXT: s_and_b64 vcc, s[0:1], vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v1, v2, v0 @@ -7882,52 +7882,52 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_bfi_b32 v11, s0, v11, v0 ; CI-NEXT: .LBB10_32: ; %Flow124 ; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: s_movk_i32 s2, 0x7c00 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: s_mov_b32 s2, 0x7f800000 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_and_b32_e32 v6, 0x7fff, v6 -; CI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v6 -; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v7 ; CI-NEXT: v_cvt_f16_f32_e32 v7, v8 -; CI-NEXT: s_or_b64 vcc, s[0:1], vcc +; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v6 +; CI-NEXT: s_and_b64 vcc, s[0:1], vcc ; CI-NEXT: v_mov_b32_e32 v6, 0x7fc00000 
-; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc -; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e64 v4, |v4| +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc +; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v5, v9 -; CI-NEXT: v_and_b32_e32 v4, 0x7fff, v4 -; CI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v4 -; CI-NEXT: s_or_b64 vcc, s[0:1], vcc +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v4 +; CI-NEXT: s_and_b64 vcc, s[0:1], vcc ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v3 +; CI-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc +; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v10 -; CI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; CI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v2 -; CI-NEXT: s_or_b64 vcc, s[0:1], vcc +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v2 +; CI-NEXT: s_and_b64 vcc, s[0:1], vcc ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; CI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v0 +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc -; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc +; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v11 -; CI-NEXT: s_or_b64 vcc, s[0:1], vcc +; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v0 +; CI-NEXT: s_and_b64 vcc, s[0:1], vcc ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc +; CI-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_or_b32_e32 v1, v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll index 9ae6e60385bbb..6d4b1c4621054 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll @@ -300,8 +300,8 @@ define half @add_select_fabs_var_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_fabs_var_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc @@ -1772,8 +1772,8 @@ define half @add_select_negfabs_fabs_f16(i32 %c, half %x, half %y, half %z) { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| ; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1| +; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc @@ -1924,8 +1924,8 
@@ define half @add_select_neg_fabs_f16(i32 %c, half %x, half %y, half %z) { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| ; CI-NEXT: v_cvt_f32_f16_e64 v1, -v1 +; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
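For reference, a minimal LLVM IR sketch (not taken from the patch; the function and value names are illustrative only) of the kind of pattern the affected tests cover: an f16 fabs feeding an arithmetic use on a subtarget without legal f16, where the operation is promoted to f32. With isFAbsFree now reporting f16/bf16 as free regardless of has16BitInsts, the combiner treats the fabs the same way it already treats fneg when deciding how to fold it, which is where the mostly neutral scheduling and operand-order differences in the test diffs come from.

define half @fabs_fold_example(half %x, half %y) {
  ; fabs on f16 is now reported free even when f16 is not a legal type
  %fabs = call half @llvm.fabs.f16(half %x)
  ; on pre-gfx8 subtargets this multiply is promoted to f32
  %mul = fmul half %fabs, %y
  ret half %mul
}
declare half @llvm.fabs.f16(half)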
