llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) <details> <summary>Changes</summary> As expected, the generated code is much worse, but more correct. We could do a better job with source modifier management around fp16_to_fp/fp_to_fp16. --- Patch is 11.21 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/177419.diff 122 Files Affected: - (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h (-2) - (modified) llvm/lib/Target/AMDGPU/R600ISelLowering.h (+2) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+17895-22432) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll (+1271-1543) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll (+564-726) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll (+62-61) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll (+1039-1413) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll (+761-989) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll (+2577-3133) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll (+1005-1299) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll (+2998-3522) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll (+297-371) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll (+1185-1543) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll (+2132-2825) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll (+2430-3197) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll (+161-179) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+6837-7654) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll (+3363-4997) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll (+3950-5879) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll (+649-785) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll 
(+4526-6764) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll (+5221-7834) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll (+5837-8743) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll (+6300-9763) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll (+7043-10980) - (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll (+770-890) - (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+30-40) - (modified) llvm/test/CodeGen/AMDGPU/br_cc.f16.ll (+16-26) - (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll (+336-487) - (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll (+168-239) - (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll (+168-239) - (modified) llvm/test/CodeGen/AMDGPU/calling-conventions.ll (+16-16) - (modified) llvm/test/CodeGen/AMDGPU/clamp-modifier.ll (+150-25) - (modified) llvm/test/CodeGen/AMDGPU/clamp.ll (+139-35) - (modified) llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll (+292-294) - (modified) llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll (+32-21) - (modified) llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll (+1-1) - (modified) llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll (+119-175) - (modified) llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll (+43-68) - (modified) llvm/test/CodeGen/AMDGPU/fabs.f16.ll (+12-12) - (modified) llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll (+12) - (modified) llvm/test/CodeGen/AMDGPU/fadd.f16.ll (+19-19) - (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll (+163-163) - (modified) llvm/test/CodeGen/AMDGPU/fcmp.f16.ll (+126-126) - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+8-20) - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+1025-1320) - (modified) llvm/test/CodeGen/AMDGPU/fdiv.f16.ll (+22-22) - (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll (+232-366) - (modified) 
llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll (+170-266) - (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll (+170-266) - (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll (+170-266) - (modified) llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll (+144-84) - (modified) llvm/test/CodeGen/AMDGPU/fmax3.ll (+42-26) - (modified) llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll (+153-144) - (modified) llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll (+154-369) - (modified) llvm/test/CodeGen/AMDGPU/fmed3.ll (+72-25) - (modified) llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll (+144-84) - (modified) llvm/test/CodeGen/AMDGPU/fmin3.ll (+42-26) - (modified) llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll (+153-144) - (modified) llvm/test/CodeGen/AMDGPU/fmul.f16.ll (+41-41) - (modified) llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll (+1738-441) - (modified) llvm/test/CodeGen/AMDGPU/fneg-combines.ll (+7-8) - (modified) llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll (+15-23) - (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll (+7-8) - (modified) llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll (+12-28) - (modified) llvm/test/CodeGen/AMDGPU/fneg.f16.ll (+7-7) - (modified) llvm/test/CodeGen/AMDGPU/fpext.f16.ll (+13-13) - (modified) llvm/test/CodeGen/AMDGPU/fpow.ll (+11-11) - (modified) llvm/test/CodeGen/AMDGPU/fptosi.f16.ll (+3-3) - (modified) llvm/test/CodeGen/AMDGPU/fract-match.ll (+144-94) - (modified) llvm/test/CodeGen/AMDGPU/freeze.ll (+37-15) - (modified) llvm/test/CodeGen/AMDGPU/frem.ll (+1549-1427) - (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll (+608-928) - (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll (+368-568) - (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll (+368-568) - (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll (+368-568) - (modified) llvm/test/CodeGen/AMDGPU/half.ll (+268-291) - (modified) llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll (+103-140) - (modified) 
llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll (+7-7) - (modified) llvm/test/CodeGen/AMDGPU/llvm.exp.ll (+47-59) - (modified) llvm/test/CodeGen/AMDGPU/llvm.exp10.ll (+55-61) - (modified) llvm/test/CodeGen/AMDGPU/llvm.exp2.ll (+17-41) - (modified) llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll (+897-187) - (modified) llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll (+49-36) - (modified) llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll (+30-30) - (modified) llvm/test/CodeGen/AMDGPU/llvm.log.ll (+39-64) - (modified) llvm/test/CodeGen/AMDGPU/llvm.log10.ll (+39-64) - (modified) llvm/test/CodeGen/AMDGPU/llvm.log2.ll (+29-52) - (modified) llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll (+269-265) - (modified) llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll (+60-60) - (modified) llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll (+64-64) - (modified) llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll (+7-7) - (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll (+108-184) - (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll (+108-184) - (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll (+108-184) - (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll (+108-184) - (modified) llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll (+26-18) - (modified) llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll (+183-135) - (modified) llvm/test/CodeGen/AMDGPU/mad-mix.ll (+154-272) - (modified) llvm/test/CodeGen/AMDGPU/maximumnum.ll (+456-403) - (modified) llvm/test/CodeGen/AMDGPU/minimumnum.ll (+439-395) - (modified) llvm/test/CodeGen/AMDGPU/omod.ll (+9-3) - (modified) llvm/test/CodeGen/AMDGPU/repeated-divisor.ll (+99-85) - (modified) llvm/test/CodeGen/AMDGPU/roundeven.ll (+14-14) - (modified) llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll (+306-179) - (modified) llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll (+560-458) - (modified) llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll (+286-268) - (modified) llvm/test/CodeGen/AMDGPU/select-phi-s16-fp.ll (+7-13) - (modified) 
llvm/test/CodeGen/AMDGPU/select.f16.ll (+261-600) - (modified) llvm/test/CodeGen/AMDGPU/strict_fpext.ll (+26-15) - (modified) llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll (+5-36) - (modified) llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll (+33-57) - (modified) llvm/test/CodeGen/AMDGPU/v_cndmask.ll (+2-3) - (modified) llvm/test/CodeGen/AMDGPU/v_mac.ll (+10-2) - (modified) llvm/test/CodeGen/AMDGPU/v_mac_f16.ll (+170-109) - (modified) llvm/test/CodeGen/AMDGPU/v_madak_f16.ll (+21-13) - (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll (+128-72) - (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll (+117-74) - (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll (+157-111) - (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll (+117-74) - (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll (+157-111) - (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll (+128-72) ``````````diff diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index dde84e2090b90..2f8777fffdc92 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -405,8 +405,6 @@ class AMDGPUTargetLowering : public TargetLowering { // are using vector compares until that is fixed. 
return true; } - - bool softPromoteHalfType() const override { return false; } }; } // End namespace llvm diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/llvm/lib/Target/AMDGPU/R600ISelLowering.h index 661efb8684813..bb7fc46a98cbd 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.h +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.h @@ -117,6 +117,8 @@ class R600TargetLowering final : public AMDGPUTargetLowering { TargetLowering::AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *RMW) const override; + + bool softPromoteHalfType() const override { return false; } }; } // End namespace llvm; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 2ce67c3848bae..a62a1828a6e93 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -30728,6 +30728,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v32i32_to_v64f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -30744,650 +30746,219 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: 
; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; 
SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; 
implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v27 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v56, v2 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v35, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v36, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v37, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v49, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v53, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v55, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v42, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v44, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v47, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; SI-NEXT: 
v_cvt_f32_f16_e32 v63, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill 
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v58, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 
-; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 ; SI-NEXT: .LBB20_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v62 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v44 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 -; SI-NEXT: v_mov_b32_e32 v32, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_lshrrev_b32_e32 v63, 
16, v19 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v59 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: ... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/177419 _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
