Module: Mesa
Branch: main
Commit: 1092f37805464d9c694ad7fb73d31241d74e5f20
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=1092f37805464d9c694ad7fb73d31241d74e5f20
Author: Rhys Perry <[email protected]>
Date: Mon Jan 17 16:52:10 2022 +0000

aco: use v_fma_mix to combine mul/add/fma output conversions

fossil-db (Sienna Cichlid):
Totals from 42 (0.03% of 134913) affected shaders:
CodeSize: 596904 -> 596332 (-0.10%); split: -0.10%, +0.00%
Instrs: 110194 -> 109902 (-0.26%)
Latency: 1205239 -> 1204915 (-0.03%); split: -0.03%, +0.00%
InvThroughput: 189697 -> 189375 (-0.17%)
VClause: 1365 -> 1366 (+0.07%)
Copies: 5429 -> 5414 (-0.28%); split: -0.33%, +0.06%
Branches: 4034 -> 4026 (-0.20%)

fossil-db (Navi):
Totals from 42 (0.03% of 134913) affected shaders:
CodeSize: 596044 -> 595488 (-0.09%); split: -0.10%, +0.00%
Instrs: 110845 -> 110540 (-0.28%)
Latency: 1206131 -> 1205747 (-0.03%)
InvThroughput: 190178 -> 189809 (-0.19%)
VClause: 1372 -> 1370 (-0.15%); split: -0.29%, +0.15%
Copies: 5671 -> 5641 (-0.53%); split: -0.56%, +0.04%
Branches: 4033 -> 4025 (-0.20%)

fossil-db (Vega):
Totals from 42 (0.03% of 135048) affected shaders:
CodeSize: 605824 -> 605352 (-0.08%); split: -0.08%, +0.00%
Instrs: 115975 -> 115706 (-0.23%)
Latency: 1399845 -> 1398912 (-0.07%)
InvThroughput: 489901 -> 489442 (-0.09%)
VClause: 1314 -> 1311 (-0.23%); split: -0.38%, +0.15%
Copies: 9673 -> 9666 (-0.07%); split: -0.12%, +0.05%
Branches: 4025 -> 4024 (-0.02%)

Signed-off-by: Rhys Perry <[email protected]>
Reviewed-by: Daniel Schürmann <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>

---

 src/amd/compiler/aco_optimizer.cpp | 52 +++++++++++++++++++++++++++++++++++---
 1 file changed, 48 insertions(+), 4 deletions(-)

diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index 7109c2e4f7b..933ce834628 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -123,6 +123,7 @@ enum Label {
    label_dpp16 = 1ull << 35,
    label_dpp8 = 1ull << 36,
    label_f2f32 = 1ull << 37,
+   label_f2f16 = 1ull << 38,
 };
 
 static constexpr uint64_t instr_usedef_labels =
@@ -130,7 +131,7
@@ static constexpr uint64_t instr_usedef_labels = label_uniform_bitwise | label_minmax | label_vopc | label_usedef | label_extract | label_dpp16 | label_dpp8 | label_f2f32; static constexpr uint64_t instr_mod_labels = - label_omod2 | label_omod4 | label_omod5 | label_clamp | label_insert; + label_omod2 | label_omod4 | label_omod5 | label_clamp | label_insert | label_f2f16; static constexpr uint64_t instr_labels = instr_usedef_labels | instr_mod_labels; static constexpr uint64_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f | @@ -326,6 +327,14 @@ struct ssa_info { bool is_clamp() { return label & label_clamp; } + void set_f2f16(Instruction* conv) + { + add_label(label_f2f16); + instr = conv; + } + + bool is_f2f16() { return label & label_f2f16; } + void set_undefined() { add_label(label_undefined); } bool is_undefined() { return label & label_undefined; } @@ -1889,6 +1898,11 @@ label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr) ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get()); break; } + case aco_opcode::v_cvt_f16_f32: { + if (instr->operands[0].isTemp()) + ctx.info[instr->operands[0].tempId()].set_f2f16(instr.get()); + break; + } case aco_opcode::v_cvt_f32_f16: { if (instr->operands[0].isTemp()) ctx.info[instr->definitions[0].tempId()].set_f2f32(instr.get()); @@ -3060,7 +3074,7 @@ apply_omod_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr) } instr->definitions[0].swapTemp(def_info.instr->definitions[0]); - ctx.info[instr->definitions[0].tempId()].label &= label_clamp | label_insert; + ctx.info[instr->definitions[0].tempId()].label &= label_clamp | label_insert | label_f2f16; ctx.uses[def_info.instr->definitions[0].tempId()]--; return true; @@ -3499,11 +3513,41 @@ to_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr) vop3p->clamp = instr->isVOP3() && instr->vop3().clamp; instr = std::move(vop3p); - ctx.info[instr->definitions[0].tempId()].label &= label_clamp | label_mul; + 
ctx.info[instr->definitions[0].tempId()].label &= label_f2f16 | label_clamp | label_mul;
    if (ctx.info[instr->definitions[0].tempId()].label & label_mul)
       ctx.info[instr->definitions[0].tempId()].instr = instr.get();
 }
 
+bool
+combine_output_conversion(opt_ctx& ctx, aco_ptr<Instruction>& instr)
+{
+   ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];
+   if (!def_info.is_f2f16())
+      return false;
+   Instruction* conv = def_info.instr;
+
+   if (!can_use_mad_mix(ctx, instr) || ctx.uses[instr->definitions[0].tempId()] != 1)
+      return false;
+
+   if (!ctx.uses[conv->definitions[0].tempId()])
+      return false;
+
+   if (conv->usesModifiers())
+      return false;
+
+   if (!instr->isVOP3P())
+      to_mad_mix(ctx, instr);
+
+   instr->opcode = aco_opcode::v_fma_mixlo_f16;
+   instr->definitions[0].swapTemp(conv->definitions[0]);
+   if (conv->definitions[0].isPrecise())
+      instr->definitions[0].setPrecise(true);
+   ctx.info[instr->definitions[0].tempId()].label &= label_clamp;
+   ctx.uses[conv->definitions[0].tempId()]--;
+
+   return true;
+}
+
 void
 combine_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)
 {
@@ -3603,7 +3647,7 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
       if (can_apply_sgprs(ctx, instr))
          apply_sgprs(ctx, instr);
       combine_mad_mix(ctx, instr);
-      while (apply_omod_clamp(ctx, instr))
+      while (apply_omod_clamp(ctx, instr) | combine_output_conversion(ctx, instr))
          ;
       apply_insert(ctx, instr);
    }
