From: Ian Romanick <ian.d.roman...@intel.com> I have CC'ed everyone responsible for drivers that set lower_flrp32 or lower_flrp64.
No changes on any other Intel platforms. Iron Lake total instructions in shared programs: 7752306 -> 7716901 (-0.46%) instructions in affected programs: 1160861 -> 1125456 (-3.05%) helped: 4020 HURT: 10 helped stats (abs) min: 1 max: 40 x̄: 8.81 x̃: 9 helped stats (rel) min: 0.20% max: 86.96% x̄: 4.99% x̃: 3.05% HURT stats (abs) min: 1 max: 2 x̄: 1.20 x̃: 1 HURT stats (rel) min: 1.06% max: 3.92% x̄: 1.62% x̃: 1.06% 95% mean confidence interval for instructions value: -8.93 -8.64 95% mean confidence interval for instructions %-change: -5.15% -4.79% Instructions are helped. total cycles in shared programs: 177868254 -> 177689740 (-0.10%) cycles in affected programs: 26413132 -> 26234618 (-0.68%) helped: 3927 HURT: 72 helped stats (abs) min: 2 max: 646 x̄: 45.66 x̃: 48 helped stats (rel) min: <.01% max: 94.58% x̄: 2.38% x̃: 0.88% HURT stats (abs) min: 2 max: 406 x̄: 10.75 x̃: 6 HURT stats (rel) min: <.01% max: 2.77% x̄: 0.19% x̃: 0.02% 95% mean confidence interval for cycles value: -45.58 -43.70 95% mean confidence interval for cycles %-change: -2.47% -2.20% Cycles are helped. LOST: 3 GAINED: 35 GM45 total instructions in shared programs: 4760579 -> 4741934 (-0.39%) instructions in affected programs: 643230 -> 624585 (-2.90%) helped: 2165 HURT: 9 helped stats (abs) min: 1 max: 40 x̄: 8.62 x̃: 9 helped stats (rel) min: 0.20% max: 86.96% x̄: 4.74% x̃: 2.87% HURT stats (abs) min: 1 max: 2 x̄: 1.11 x̃: 1 HURT stats (rel) min: 1.06% max: 3.77% x̄: 1.36% x̃: 1.06% 95% mean confidence interval for instructions value: -8.77 -8.38 95% mean confidence interval for instructions %-change: -4.95% -4.48% Instructions are helped. 
total cycles in shared programs: 121648572 -> 121542280 (-0.09%) cycles in affected programs: 16923170 -> 16816878 (-0.63%) helped: 2114 HURT: 51 helped stats (abs) min: 2 max: 646 x̄: 50.61 x̃: 50 helped stats (rel) min: <.01% max: 93.33% x̄: 2.39% x̃: 0.90% HURT stats (abs) min: 4 max: 406 x̄: 13.84 x̃: 6 HURT stats (rel) min: <.01% max: 2.77% x̄: 0.19% x̃: 0.01% 95% mean confidence interval for cycles value: -50.56 -47.63 95% mean confidence interval for cycles %-change: -2.52% -2.14% Cycles are helped. LOST: 38 GAINED: 38 Signed-off-by: Ian Romanick <ian.d.roman...@intel.com> Cc: Marek Olšák <marek.ol...@amd.com> Cc: Rob Clark <robdcl...@gmail.com> Cc: Eric Anholt <e...@anholt.net> Cc: Dave Airlie <airl...@redhat.com> Cc: Timothy Arceri <tarc...@itsqueeze.com> --- src/compiler/nir/nir_lower_flrp.c | 134 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) diff --git a/src/compiler/nir/nir_lower_flrp.c b/src/compiler/nir/nir_lower_flrp.c index 0c7e803b20f..b86f5b5f2df 100644 --- a/src/compiler/nir/nir_lower_flrp.c +++ b/src/compiler/nir/nir_lower_flrp.c @@ -137,6 +137,89 @@ replace_with_fast(struct nir_builder *bld, struct u_vector *dead_flrp, append_flrp_to_dead_list(dead_flrp, alu); } +/** + * Replace flrp(a, b, c) with (b*c ± c) + a + */ +static void +replace_with_expanded_ffma_and_add(struct nir_builder *bld, + struct u_vector *dead_flrp, + struct nir_alu_instr *alu, bool subtract_c) +{ + nir_ssa_def *const a = nir_ssa_for_alu_src(bld, alu, 0); + nir_ssa_def *const b = nir_ssa_for_alu_src(bld, alu, 1); + nir_ssa_def *const c = nir_ssa_for_alu_src(bld, alu, 2); + + nir_ssa_def *const b_times_c = nir_fmul(bld, b, c); + nir_instr_as_alu(b_times_c->parent_instr)->exact = alu->exact; + + nir_ssa_def *inner_sum; + + if (subtract_c) { + nir_ssa_def *const neg_c = nir_fneg(bld, c); + nir_instr_as_alu(neg_c->parent_instr)->exact = alu->exact; + + inner_sum = nir_fadd(bld, b_times_c, neg_c); + } else { + inner_sum = nir_fadd(bld, b_times_c, c); + } 
+ + nir_instr_as_alu(inner_sum->parent_instr)->exact = alu->exact; + + nir_ssa_def *const outer_sum = nir_fadd(bld, inner_sum, a); + nir_instr_as_alu(outer_sum->parent_instr)->exact = alu->exact; + + nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(outer_sum)); + + /* DO NOT REMOVE the original flrp yet. Many of the lowering choices are + * based on other uses of the sources. Removing the flrp may cause the + * last flrp in a sequence to make a different, incorrect choice. + */ + append_flrp_to_dead_list(dead_flrp, alu); +} + +/** + * Determines whether a swizzled source is constant w/ all components the same. + * + * The value of the constant is stored in \c result. + * + * \return + * True if all components of the swizzled source are the same constant. + * Otherwise false is returned. + */ +static bool +all_same_constant(const nir_alu_instr *instr, unsigned src, double *result) +{ + nir_const_value *val = nir_src_as_const_value(instr->src[src].src); + + if (!val) + return false; + + const uint8_t *const swizzle = instr->src[src].swizzle; + const unsigned num_components = nir_dest_num_components(instr->dest.dest); + + if (instr->dest.dest.ssa.bit_size == 32) { + const float first = val->f32[swizzle[0]]; + + for (unsigned i = 1; i < num_components; i++) { + if (val->f32[swizzle[i]] != first) + return false; + } + + *result = first; + } else { + const double first = val->f64[swizzle[0]]; + + for (unsigned i = 1; i < num_components; i++) { + if (val->f64[swizzle[i]] != first) + return false; + } + + *result = first; + } + + return true; +} + static bool sources_are_constants_with_similar_magnitudes(const nir_alu_instr *instr) { @@ -265,6 +348,57 @@ convert_flrp_instruction(nir_builder *bld, return; } + /* + * - If x = 1: + * + * (yt + -t) + 1 + * + * - If x = -1: + * + * (yt + t) - 1 + * + * In both cases, x is used in place of ±1 for simplicity. Both forms
+ */ + double src0_as_constant; + if (all_same_constant(alu, 0, &src0_as_constant)) { + if (src0_as_constant == 1.0) { + replace_with_expanded_ffma_and_add(bld, dead_flrp, alu, + true /* subtract t */); + return; + } else if (src0_as_constant == -1.0) { + replace_with_expanded_ffma_and_add(bld, dead_flrp, alu, + false /* add t */); + return; + } + } + + /* + * - If y = ±1: + * + * x(1 - t) + yt + * + * In this case either the multiply in yt will be eliminated by + * nir_opt_algebraic. If FMA is supported, this results in fma(x, (1 - + * t), ±t) for two instructions. If FMA is not supported, then the cost + * is 3 instructions. We rely on nir_opt_algebraic to generate the FMA + * instructions as well. + * + * Another possible replacement is + * + * -xt + x ± t + * + * Some groupings of this may be better on some platforms in some + * circumstances, bit it is probably dependent on scheduling. Futher + * investigation may be required. + */ + double src1_as_constant; + if ((all_same_constant(alu, 1, &src1_as_constant) && + (src1_as_constant == -1.0 || src1_as_constant == 1.0))) { + replace_with_strict(bld, dead_flrp, alu); + return; + } + if (have_ffma) { if (always_precise) { replace_with_strict_ffma(bld, dead_flrp, alu); -- 2.14.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev