On Wed, 16 Oct 2024, Robin Dapp wrote:
> Hi,
>
> this is probably more of an RFC than a patch, as I'm not sure whether
> reassoc is the right place to fix it.  On top of that, the heuristic
> might be a bit "ad hoc".  Maybe we can also work around it in the
> vectorizer?
>
> The following function is vectorized in a very inefficient way because we
> construct vectors from scalar loads.
>
> uint64_t
> foo (uint8_t *pix, int i_stride)
> {
>   uint32_t sum = 0, sqr = 0;
>   int x, y;
>   for (y = 0; y < 16; y++)
>     {
>       for (x = 0; x < 16; x++)
>         sum += pix[x];
>       pix += i_stride;
>     }
>   return sum;
> }
>
> The reason for this is that reassoc reorders the last three operands of
> the summation sequence by rank, introducing a temporary in the process
> that breaks the "homogeneity" (sum_n = sum_{n-1} + pix[n]) of the sum.
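>
> Schematically (using the SSA names from the example in the patch
> below), the reorder turns the homogeneous chain
>
>   sum_35 = sum_201 + _34;
>   sum_48 = sum_35 + _47;
>
> into something like
>
>   _118 = _34 + _47;
>   sum_48 = _118 + sum_201;
>
> where the first add now combines two loads instead of accumulating
> into sum.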
>
> This patch adds a function likely_vectorizable_p that checks whether an
> operand vector contains only operands of the same rank, except for the
> last one.  In that case the sequence is likely vectorizable, and the
> easier vectorization will outweigh any CSE opportunity we could expose
> by swapping operands.
Hmm, reassoc isn't supposed to apply width > 1 before vectorization;
IIRC I "fixed" that at some point, but this is what I see:
# sum_126 = PHI <sum_145(5), 0(2)>
...
_23 = *pix_134;
_24 = (unsigned int) _23;
_31 = MEM[(uint8_t *)pix_134 + 1B];
_32 = (unsigned int) _31;
_118 = _24 + _32;
sum_33 = _118 + sum_126;
...
I think the place where you add the condition, guarding
swap_ops_for_binary_stmt, simply lacks a !reassoc_insert_powi_p check
like the one we have on the earlier rewrite_expr_tree_parallel call.
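
Roughly like the following untested sketch (the argument list of the
swap_ops_for_binary_stmt call is from memory of the current call site,
so treat it as an illustration only).  The swap would then only fire in
the late reassoc pass, i.e. after vectorization, mirroring the guard on
rewrite_expr_tree_parallel:

  /* Only reorder ops for rank/CSE reasons in the last reassoc pass,
     like the rewrite_expr_tree_parallel path above.  */
  int len = ops.length ();
  if (!reassoc_insert_powi_p
      && len >= 3
      && (!has_fma
          || width > 1))
    swap_ops_for_binary_stmt (ops, len - 3);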
Richard.
> Bootstrapped and regtested on x86, regtested on rv64gcv.
>
> Regards
> Robin
>
> gcc/ChangeLog:
>
> 	* tree-ssa-reassoc.cc (likely_vectorizable_p): New function.
> 	(reassociate_bb): Use new function.
> 	(dump_ops_vector): Change prototype.
> 	(debug_ops_vector): Change prototype.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.dg/tree-ssa/reassoc-52.c: New test.
> ---
>  gcc/testsuite/gcc.dg/tree-ssa/reassoc-52.c |  27 ++++++
>  gcc/tree-ssa-reassoc.cc                    | 102 ++++++++++++++++++++-
>  2 files changed, 124 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/reassoc-52.c
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/reassoc-52.c b/gcc/testsuite/gcc.dg/tree-ssa/reassoc-52.c
> new file mode 100644
> index 00000000000..b117b7519bb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/reassoc-52.c
> @@ -0,0 +1,27 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -fdump-tree-reassoc-details" } */
> +
> +typedef unsigned char uint8_t;
> +typedef unsigned int uint32_t;
> +typedef unsigned long uint64_t;
> +
> +uint64_t
> +foo (uint8_t *pix, int i_stride)
> +{
> +  uint32_t sum = 0, sqr = 0;
> +  int x, y;
> +  for (y = 0; y < 16; y++)
> +    {
> +      for (x = 0; x < 16; x++)
> +        sum += pix[x];
> +      pix += i_stride;
> +    }
> +  return sum;
> +}
> +
> +/* Ensure that we only add to sum variables and don't create a temporary that
> +   does something else.  In doing so we enable a much more efficient
> +   vectorization scheme.  */
> +
> +/* { dg-final { scan-tree-dump-times "\\s+sum_\\d+\\s+=\\s+sum_\\d+\\s+\\\+" 16 "reassoc1" } } */
> +/* { dg-final { scan-tree-dump-times "\\s+sum_\\d+\\s=.*\\\+\\ssum_\\d+;" 1 "reassoc1" } } */
> diff --git a/gcc/tree-ssa-reassoc.cc b/gcc/tree-ssa-reassoc.cc
> index 556ecdebe2d..13f8a1070b6 100644
> --- a/gcc/tree-ssa-reassoc.cc
> +++ b/gcc/tree-ssa-reassoc.cc
> @@ -6992,6 +6992,96 @@ rank_ops_for_fma (vec<operand_entry *> *ops)
>      }
>    return mult_num;
>  }
> +
> +/* Check if the operand vector contains operands that all have the same rank
> +   apart from the last one.  The last one must be a PHI node which links back
> +   to the first operand.
> +   This can indicate an easily vectorizable sequence, in which case we don't
> +   want to reorder the last three elements for rank reasons.
> +
> +   Consider the following example:
> +
> +     for (int y = 0; y < 16; y++)
> +       {
> +         for (int x = 0; x < 16; x++)
> +           {
> +             sum += pix[x];
> +           }
> +         ...
> +       }
> +
> +     # sum_201 = PHI <sum_230(3), 0(2)>
> +     # y_149 = PHI <y_24(3), 0(2)>
> +
> +     _33 = *pix_214;
> +     _34 = (unsigned intD.4) _33;
> +     sum_35 = sum_201 + _34;
> +
> +     _46 = MEM[(uint8_tD.2849 *)pix_214 + 1B];
> +     _47 = (unsigned intD.4) _46;
> +     sum_48 = sum_35 + _47;
> +     _49 = (intD.1) _46;
> +
> +   All loads of pix have the same rank; only the sum PHI has a different
> +   one in {..., _46, _33, sum_201}.
> +   swap_ops_for_binary_stmt will move sum_201 before _46 and _33 so that
> +   the first operation _33 OP _46 is exposed as an optimization opportunity.
> +   In doing so it will trip up the vectorizer, which now needs to work much
> +   harder because it can't recognize the whole sequence as doing the same
> +   operation.  */
> +
> +static bool
> +likely_vectorizable_p (vec<operand_entry *> *ops)
> +{
> +  operand_entry *oe;
> +  unsigned int i;
> +  unsigned int rank = 0;
> +
> +  if (ops->length () < 3)
> +    return false;
> +
> +  /* Check if the last operand is a PHI.  */
> +  operand_entry *oelast = (*ops)[ops->length () - 1];
> +
> +  if (TREE_CODE (oelast->op) != SSA_NAME)
> +    return false;
> +
> +  gimple *glast = SSA_NAME_DEF_STMT (oelast->op);
> +
> +  if (gimple_code (glast) != GIMPLE_PHI
> +      || gimple_phi_num_args (glast) != 2)
> +    return false;
> +
> +  /* If so, check if its first argument is a gimple assign.  */
> +  tree phi_arg = gimple_phi_arg_def (glast, 0);
> +
> +  if (TREE_CODE (phi_arg) != SSA_NAME)
> +    return false;
> +
> +  gimple *garg_def = SSA_NAME_DEF_STMT (phi_arg);
> +  if (!is_gimple_assign (garg_def))
> +    return false;
> +
> +  /* It must link back to the beginning of our sequence.  */
> +  if (gimple_assign_rhs2 (garg_def) != (*ops)[0]->op)
> +    return false;
> +
> +  /* Make sure everything apart from the PHI has the same rank.  */
> +  FOR_EACH_VEC_ELT (*ops, i, oe)
> +    {
> +      if (i < ops->length () - 1)
> +        {
> +          if (rank == 0)
> +            rank = oe->rank;
> +          else if (rank != oe->rank)
> +            return false;
> +        }
> +    }
> +
> +  return true;
> +}
> +
> /* Reassociate expressions in basic block BB and its post-dominator as
> children.
>
> @@ -7182,6 +7272,8 @@ reassociate_bb (basic_block bb)
>                   mult_num = rank_ops_for_fma (&ops);
>                 }
>
> +             bool vectorizable_p = likely_vectorizable_p (&ops);
> +
>               /* Only rewrite the expression tree to parallel in the
>                  last reassoc pass to avoid useless work back-and-forth
>                  with initial linearization.  */
> @@ -7208,7 +7300,7 @@ reassociate_bb (basic_block bb)
>                      binary op are chosen wisely.  */
>                   int len = ops.length ();
>                   if (len >= 3
> -                     && (!has_fma
> +                     && ((!has_fma && !vectorizable_p)
>                           /* width > 1 means ranking ops results in better
>                              parallelism.  Check current value to avoid
>                              calling get_reassociation_width again.  */
> @@ -7346,13 +7438,13 @@ branch_fixup (void)
>    reassoc_branch_fixups.release ();
>  }
>
> -void dump_ops_vector (FILE *file, vec<operand_entry *> ops);
> -void debug_ops_vector (vec<operand_entry *> ops);
> +void dump_ops_vector (FILE *file, const vec<operand_entry *> &ops);
> +void debug_ops_vector (const vec<operand_entry *> &ops);
>
> /* Dump the operand entry vector OPS to FILE. */
>
> void
> -dump_ops_vector (FILE *file, vec<operand_entry *> ops)
> +dump_ops_vector (FILE *file, const vec<operand_entry *> &ops)
> {
>    operand_entry *oe;
>    unsigned int i;
> @@ -7368,7 +7460,7 @@ dump_ops_vector (FILE *file, vec<operand_entry *> ops)
> /* Dump the operand entry vector OPS to STDERR. */
>
> DEBUG_FUNCTION void
> -debug_ops_vector (vec<operand_entry *> ops)
> +debug_ops_vector (const vec<operand_entry *> &ops)
> {
>    dump_ops_vector (stderr, ops);
> }
>
--
Richard Biener <[email protected]>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)