llvmbot wrote:

<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-globalisel

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

<details>
<summary>Changes</summary>

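As expected, the generated code is much worse, but more correct.
We could do a better job with source modifier management around
fp16_to_fp/fp_to_fp16.

This drops the `softPromoteHalfType() { return false; }` override from
`AMDGPUTargetLowering` and keeps it only in `R600TargetLowering`.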

---

Patch is 11.21 MiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/177419.diff


122 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h (-2) 
- (modified) llvm/lib/Target/AMDGPU/R600ISelLowering.h (+2) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+17895-22432) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll (+1271-1543) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll (+564-726) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll (+62-61) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll (+1039-1413) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll (+761-989) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll (+2577-3133) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll (+1005-1299) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll (+2998-3522) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll (+297-371) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll (+1185-1543) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll (+2132-2825) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll (+2430-3197) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll (+161-179) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+6837-7654) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll (+3363-4997) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll (+3950-5879) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll (+649-785) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll (+4526-6764) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll (+5221-7834) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll (+5837-8743) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll (+6300-9763) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll (+7043-10980) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll (+770-890) 
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+30-40)
- (modified) llvm/test/CodeGen/AMDGPU/br_cc.f16.ll (+16-26) 
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll (+336-487)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll (+168-239)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll (+168-239)
- (modified) llvm/test/CodeGen/AMDGPU/calling-conventions.ll (+16-16) 
- (modified) llvm/test/CodeGen/AMDGPU/clamp-modifier.ll (+150-25) 
- (modified) llvm/test/CodeGen/AMDGPU/clamp.ll (+139-35) 
- (modified) llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll (+292-294)
- (modified) llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll (+32-21) 
- (modified) llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll (+119-175) 
- (modified) llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll (+43-68) 
- (modified) llvm/test/CodeGen/AMDGPU/fabs.f16.ll (+12-12) 
- (modified) llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll (+12) 
- (modified) llvm/test/CodeGen/AMDGPU/fadd.f16.ll (+19-19) 
- (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll (+163-163) 
- (modified) llvm/test/CodeGen/AMDGPU/fcmp.f16.ll (+126-126) 
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+8-20) 
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+1025-1320) 
- (modified) llvm/test/CodeGen/AMDGPU/fdiv.f16.ll (+22-22) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll (+232-366) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll (+170-266) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll (+170-266) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll (+170-266) 
- (modified) llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll (+144-84) 
- (modified) llvm/test/CodeGen/AMDGPU/fmax3.ll (+42-26) 
- (modified) llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll (+153-144) 
- (modified) llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll (+154-369) 
- (modified) llvm/test/CodeGen/AMDGPU/fmed3.ll (+72-25) 
- (modified) llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll (+144-84) 
- (modified) llvm/test/CodeGen/AMDGPU/fmin3.ll (+42-26) 
- (modified) llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll (+153-144) 
- (modified) llvm/test/CodeGen/AMDGPU/fmul.f16.ll (+41-41) 
- (modified) llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll (+1738-441) 
- (modified) llvm/test/CodeGen/AMDGPU/fneg-combines.ll (+7-8) 
- (modified) llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll (+15-23) 
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll (+7-8) 
- (modified) llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll (+12-28) 
- (modified) llvm/test/CodeGen/AMDGPU/fneg.f16.ll (+7-7) 
- (modified) llvm/test/CodeGen/AMDGPU/fpext.f16.ll (+13-13) 
- (modified) llvm/test/CodeGen/AMDGPU/fpow.ll (+11-11) 
- (modified) llvm/test/CodeGen/AMDGPU/fptosi.f16.ll (+3-3) 
- (modified) llvm/test/CodeGen/AMDGPU/fract-match.ll (+144-94) 
- (modified) llvm/test/CodeGen/AMDGPU/freeze.ll (+37-15) 
- (modified) llvm/test/CodeGen/AMDGPU/frem.ll (+1549-1427) 
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll (+608-928) 
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll (+368-568) 
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll (+368-568) 
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll (+368-568) 
- (modified) llvm/test/CodeGen/AMDGPU/half.ll (+268-291) 
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll (+103-140) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll (+7-7) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp.ll (+47-59) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp10.ll (+55-61) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp2.ll (+17-41) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll (+897-187) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll (+49-36) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll (+30-30) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log.ll (+39-64) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log10.ll (+39-64) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log2.ll (+29-52) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll (+269-265) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll (+60-60) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll (+64-64) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll (+7-7) 
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll (+108-184) 
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll (+108-184) 
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll (+108-184) 
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll (+108-184) 
- (modified) llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll (+26-18) 
- (modified) llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll (+183-135) 
- (modified) llvm/test/CodeGen/AMDGPU/mad-mix.ll (+154-272) 
- (modified) llvm/test/CodeGen/AMDGPU/maximumnum.ll (+456-403) 
- (modified) llvm/test/CodeGen/AMDGPU/minimumnum.ll (+439-395) 
- (modified) llvm/test/CodeGen/AMDGPU/omod.ll (+9-3) 
- (modified) llvm/test/CodeGen/AMDGPU/repeated-divisor.ll (+99-85) 
- (modified) llvm/test/CodeGen/AMDGPU/roundeven.ll (+14-14) 
- (modified) llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll (+306-179)
- (modified) llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll (+560-458)
- (modified) llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll (+286-268) 
- (modified) llvm/test/CodeGen/AMDGPU/select-phi-s16-fp.ll (+7-13) 
- (modified) llvm/test/CodeGen/AMDGPU/select.f16.ll (+261-600) 
- (modified) llvm/test/CodeGen/AMDGPU/strict_fpext.ll (+26-15) 
- (modified) llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll (+5-36) 
- (modified) llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll (+33-57) 
- (modified) llvm/test/CodeGen/AMDGPU/v_cndmask.ll (+2-3) 
- (modified) llvm/test/CodeGen/AMDGPU/v_mac.ll (+10-2) 
- (modified) llvm/test/CodeGen/AMDGPU/v_mac_f16.ll (+170-109) 
- (modified) llvm/test/CodeGen/AMDGPU/v_madak_f16.ll (+21-13) 
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll (+128-72) 
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll (+117-74) 
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll (+157-111) 
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll (+117-74) 
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll (+157-111) 
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll (+128-72) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index dde84e2090b90..2f8777fffdc92 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -405,8 +405,6 @@ class AMDGPUTargetLowering : public TargetLowering {
     // are using vector compares until that is fixed.
     return true;
   }
-
-  bool softPromoteHalfType() const override { return false; }
 };
 
 } // End namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/llvm/lib/Target/AMDGPU/R600ISelLowering.h
index 661efb8684813..bb7fc46a98cbd 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.h
@@ -117,6 +117,8 @@ class R600TargetLowering final : public AMDGPUTargetLowering {
 
   TargetLowering::AtomicExpansionKind
   shouldExpandAtomicRMWInIR(const AtomicRMWInst *RMW) const override;
+
+  bool softPromoteHalfType() const override { return false; }
 };
 
 } // End namespace llvm;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 2ce67c3848bae..a62a1828a6e93 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -30728,6 +30728,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-LABEL: bitcast_v32i32_to_v64f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -30744,650 +30746,219 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32
-; SI-NEXT:    ; implicit-def: $vgpr60
-; SI-NEXT:    ; implicit-def: $vgpr61
-; SI-NEXT:    ; implicit-def: $vgpr58
-; SI-NEXT:    ; implicit-def: $vgpr59
-; SI-NEXT:    ; implicit-def: $vgpr56
-; SI-NEXT:    ; implicit-def: $vgpr57
-; SI-NEXT:    ; implicit-def: $vgpr44
 ; SI-NEXT:    ; implicit-def: $vgpr47
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr43
-; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    ; implicit-def: $vgpr63
+; SI-NEXT:    ; implicit-def: $vgpr44
+; SI-NEXT:    ; implicit-def: $vgpr62
+; SI-NEXT:    ; implicit-def: $vgpr42
+; SI-NEXT:    ; implicit-def: $vgpr61
 ; SI-NEXT:    ; implicit-def: $vgpr55
+; SI-NEXT:    ; implicit-def: $vgpr60
 ; SI-NEXT:    ; implicit-def: $vgpr53
-; SI-NEXT:    ; implicit-def: $vgpr51
+; SI-NEXT:    ; implicit-def: $vgpr59
+; SI-NEXT:    ; implicit-def: $vgpr50
+; SI-NEXT:    ; implicit-def: $vgpr58
 ; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; implicit-def: $vgpr39
-; SI-NEXT:    ; implicit-def: $vgpr37
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    ; implicit-def: $vgpr63
+; SI-NEXT:    ; implicit-def: $vgpr57
 ; SI-NEXT:    ; implicit-def: $vgpr48
-; SI-NEXT:    ; implicit-def: $vgpr50
+; SI-NEXT:    ; implicit-def: $vgpr56
+; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    ; implicit-def: $vgpr46
-; SI-NEXT:    ; implicit-def: $vgpr52
 ; SI-NEXT:    ; implicit-def: $vgpr38
-; SI-NEXT:    ; implicit-def: $vgpr54
-; SI-NEXT:    ; implicit-def: $vgpr40
+; SI-NEXT:    ; implicit-def: $vgpr45
+; SI-NEXT:    ; implicit-def: $vgpr37
+; SI-NEXT:    ; implicit-def: $vgpr43
 ; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr42
+; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    ; implicit-def: $vgpr35
+; SI-NEXT:    ; implicit-def: $vgpr40
 ; SI-NEXT:    ; implicit-def: $vgpr34
+; SI-NEXT:    ; implicit-def: $vgpr54
+; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr52
+; SI-NEXT:    ; implicit-def: $vgpr51
+; SI-NEXT:    s_waitcnt vmcnt(14)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v32
 ; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB20_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v62
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v29
-; SI-NEXT:    v_cvt_f32_f16_e32 v40, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v28
-; SI-NEXT:    v_cvt_f32_f16_e32 v54, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v27
-; SI-NEXT:    v_cvt_f32_f16_e32 v52, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v26
-; SI-NEXT:    v_cvt_f32_f16_e32 v50, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v25
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v24
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
-; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
-; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte 
Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v23
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; SI-NEXT:    v_cvt_f32_f16_e32 v34, v62
-; SI-NEXT:    v_cvt_f32_f16_e32 v36, v30
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte 
Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v22
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
-; SI-NEXT:    v_cvt_f32_f16_e32 v38, v28
-; SI-NEXT:    v_cvt_f32_f16_e32 v46, v27
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte 
Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v21
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte 
Folded Spill
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte 
Folded Spill
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte 
Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v20
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
-; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte 
Folded Spill
-; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte 
Folded Spill
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte 
Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v19
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte 
Folded Spill
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte 
Folded Spill
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte 
Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v18
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte 
Folded Spill
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte 
Folded Spill
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte 
Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v17
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte 
Folded Spill
-; SI-NEXT:    v_cvt_f32_f16_e32 v44, v3
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte 
Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v16
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte 
Folded Spill
-; SI-NEXT:    v_cvt_f32_f16_e32 v56, v2
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte 
Folded Spill
+; SI-NEXT:    v_alignbit_b32 v32, v31, v30, 16
+; SI-NEXT:    v_alignbit_b32 v33, v29, v28, 16
+; SI-NEXT:    v_alignbit_b32 v34, v27, v26, 16
+; SI-NEXT:    v_alignbit_b32 v35, v25, v24, 16
+; SI-NEXT:    v_alignbit_b32 v36, v23, v22, 16
+; SI-NEXT:    v_alignbit_b32 v37, v21, v20, 16
+; SI-NEXT:    v_alignbit_b32 v38, v19, v18, 16
+; SI-NEXT:    v_alignbit_b32 v39, v17, v16, 16
+; SI-NEXT:    v_alignbit_b32 v48, v15, v14, 16
+; SI-NEXT:    v_alignbit_b32 v49, v13, v12, 16
+; SI-NEXT:    v_alignbit_b32 v50, v11, v10, 16
+; SI-NEXT:    v_alignbit_b32 v53, v9, v8, 16
+; SI-NEXT:    v_alignbit_b32 v55, v7, v6, 16
+; SI-NEXT:    v_alignbit_b32 v42, v5, v4, 16
+; SI-NEXT:    v_alignbit_b32 v44, v3, v2, 16
+; SI-NEXT:    v_alignbit_b32 v47, v1, v0, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v29
+; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v27
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v25
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v23
+; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v15
+; SI-NEXT:    s_waitcnt expcnt(6)
+; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v13
+; SI-NEXT:    s_waitcnt expcnt(5)
+; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v11
+; SI-NEXT:    s_waitcnt expcnt(4)
+; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v9
+; SI-NEXT:    s_waitcnt expcnt(3)
+; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v7
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v5
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v3
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v15
-; SI-NEXT:    v_cvt_f32_f16_e32 v63, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v14
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v13
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v12
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v39, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v49, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v51, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v53, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v55, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v43, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v45, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v47, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v57, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v59, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v0
-; SI-NEXT:    v_cvt_f32_f16_e32 v61, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v29
-; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte 
Folded Spill
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte 
Folded Spill
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte 
Folded Spill
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte 
Folded Spill
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte 
Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte 
Folded Spill
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte 
Folded Spill
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte 
Folded Spill
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte 
Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte 
Folded Spill
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte 
Folded Spill
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte 
Folded Spill
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte 
Folded Spill
-; SI-NEXT:    v_cvt_f32_f16_e32 v58, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v60, v0
-; SI-NEXT:    ; implicit-def: $vgpr0
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr20
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr30
-; SI-NEXT:    ; implicit-def: $vgpr62
+; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v1
 ; SI-NEXT:  .LBB20_2: ; %Flow
 ; SI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB20_4
 ; SI-NEXT:  ; %bb.3: ; %cmp.true
-; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 3, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT:    v_add_i32_e32 v24, vcc, 3, v24
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v24
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte 
Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v38
-; SI-NEXT:    v_add_i32_e32 v23, vcc, 3, v23
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v23
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:76 ; 4-byte 
Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v36
-; SI-NEXT:    v_add_i32_e32 v22, vcc, 3, v22
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_add_i32_e32 v44, vcc, 3, v62
-; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v44
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte 
Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v62
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte 
Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v46
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 3, v20
-; SI-NEXT:    v_add_i32_e32 v21, vcc, 3, v21
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v21
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte 
Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v32
-; SI-NEXT:    v_mov_b32_e32 v32, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v31
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 3, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v19
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte 
Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v63
-; SI-NEXT:    v_add_i32_e32 v18, vcc, 3, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v18
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte 
Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v61
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 3, v4
-; SI-NEXT:    v_add_i32_e32 v17, vcc, 3, v17
-; SI-NEXT:    v_add_i32_e32 v33, vcc, 3, v0
-; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v17
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte 
Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v59
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v1
-; SI-NEXT:   ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/177419
_______________________________________________
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

Reply via email to