Module: Mesa
Branch: main
Commit: 9ecfd7919baeeabd462b72ca10d4c24818db7b21
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=9ecfd7919baeeabd462b72ca10d4c24818db7b21
Author: Georg Lehmann <[email protected]>
Date: Tue Dec 19 22:16:33 2023 +0100

aco: optimize 32bit fsign by using fmulz with Inf

2 instruction fsign with the power of cursed DX9 floating point rules.

Foz-DB Navi31:
Totals from 3803 (4.86% of 78196) affected shaders:
Instrs: 8436366 -> 8412549 (-0.28%); split: -0.29%, +0.00%
CodeSize: 43174284 -> 43114676 (-0.14%); split: -0.14%, +0.01%
SpillSGPRs: 3241 -> 3247 (+0.19%)
Latency: 66333841 -> 66287361 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 10331902 -> 10316916 (-0.15%); split: -0.15%, +0.01%
VClause: 165455 -> 165472 (+0.01%); split: -0.01%, +0.02%
SClause: 242352 -> 242335 (-0.01%); split: -0.02%, +0.01%
Copies: 604086 -> 605781 (+0.28%); split: -0.04%, +0.32%
Branches: 214017 -> 214013 (-0.00%)
PreSGPRs: 209413 -> 209726 (+0.15%)

Reviewed-by: Daniel Schürmann <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26765>

---

 src/amd/compiler/aco_instruction_selection.cpp | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 866d6de71f7..63573876f2e 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -2896,16 +2896,13 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
             bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
          }
       } else if (dst.regClass() == v1) {
-         if (ctx->block->fp_mode.denorm32 == fp_denorm_flush) {
-            /* If denormals are flushed, then v_mul_legacy_f32(2.0, src) can become omod. */
-            src =
-               bld.vop2(aco_opcode::v_mul_legacy_f32, bld.def(v1), Operand::c32(0x40000000), src);
-         } else {
-            src = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::zero(), src);
-         }
-         src =
-            bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::c32(-1), src, Operand::c32(1u));
-         bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
+         /* Legacy multiply with +Inf means +-0.0 becomes +0.0 and all other numbers
+          * the correctly signed Inf. After that, we only need to clamp between -1.0 and +1.0.
+          */
+         Temp inf = bld.copy(bld.def(s1), Operand::c32(0x7f800000));
+         src = bld.vop2(aco_opcode::v_mul_legacy_f32, bld.def(v1), inf, src);
+         bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::c32(0x3f800000), src,
+                  Operand::c32(0xbf800000));
       } else if (dst.regClass() == v2) {
          Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.def(bld.lm), Operand::zero(), src);
          Temp tmp = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
