Module: Mesa
Branch: main
Commit: 9ecfd7919baeeabd462b72ca10d4c24818db7b21
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=9ecfd7919baeeabd462b72ca10d4c24818db7b21

Author: Georg Lehmann <[email protected]>
Date:   Tue Dec 19 22:16:33 2023 +0100

aco: optimize 32bit fsign by using fmulz with Inf

2 instruction fsign with the power of cursed DX9 floating point rules.

Foz-DB Navi31:
Totals from 3803 (4.86% of 78196) affected shaders:
Instrs: 8436366 -> 8412549 (-0.28%); split: -0.29%, +0.00%
CodeSize: 43174284 -> 43114676 (-0.14%); split: -0.14%, +0.01%
SpillSGPRs: 3241 -> 3247 (+0.19%)
Latency: 66333841 -> 66287361 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 10331902 -> 10316916 (-0.15%); split: -0.15%, +0.01%
VClause: 165455 -> 165472 (+0.01%); split: -0.01%, +0.02%
SClause: 242352 -> 242335 (-0.01%); split: -0.02%, +0.01%
Copies: 604086 -> 605781 (+0.28%); split: -0.04%, +0.32%
Branches: 214017 -> 214013 (-0.00%)
PreSGPRs: 209413 -> 209726 (+0.15%)

Reviewed-by: Daniel Schürmann <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26765>

---

 src/amd/compiler/aco_instruction_selection.cpp | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 866d6de71f7..63573876f2e 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -2896,16 +2896,13 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
             bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
          }
       } else if (dst.regClass() == v1) {
-         if (ctx->block->fp_mode.denorm32 == fp_denorm_flush) {
-            /* If denormals are flushed, then v_mul_legacy_f32(2.0, src) can become omod. */
-            src =
-               bld.vop2(aco_opcode::v_mul_legacy_f32, bld.def(v1), Operand::c32(0x40000000), src);
-         } else {
-            src = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::zero(), src);
-         }
-         src =
-            bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::c32(-1), src, Operand::c32(1u));
-         bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
+         /* Legacy multiply with +Inf means +-0.0 becomes +0.0 and all other numbers
+          * the correctly signed Inf. After that, we only need to clamp between -1.0 and +1.0.
+          */
+         Temp inf = bld.copy(bld.def(s1), Operand::c32(0x7f800000));
+         src = bld.vop2(aco_opcode::v_mul_legacy_f32, bld.def(v1), inf, src);
+         bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::c32(0x3f800000), src,
+                  Operand::c32(0xbf800000));
       } else if (dst.regClass() == v2) {
          Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.def(bld.lm), Operand::zero(), src);
          Temp tmp = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));

Reply via email to