On Thu, 6 Feb 2025, Richard Sandiford wrote:
> In this PR, we used to generate:
>
> .L6:
>         mov     v30.16b, v31.16b
>         fadd    v31.4s, v31.4s, v27.4s
>         fadd    v29.4s, v30.4s, v28.4s
>         stp     q30, q29, [x0]
>         add     x0, x0, 32
>         cmp     x1, x0
>         bne     .L6
>
> for an unrolled induction in:
>
>   for (int i = 0; i < 1024; i++)
>     {
>       arr[i] = freq;
>       freq += step;
>     }
>
> with the problem being the unnecessary MOV.
>
> The main induction IV was incremented by VF * step == 2 * nunits * step,
> and then nunits * step was added for the second store to arr.
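>
> Roughly, in scalar terms (an illustrative sketch only, assuming nunits == 4
> and hence VF == 8 for the unrolled loop; store4 is just a stand-in for a
> vector store of four lanes):
>
>   for (int i = 0; i < 1024; i += 8)
>     {
>       store4 (arr + i, freq);                 /* first store             */
>       store4 (arr + i + 4, freq + 4 * step);  /* second: + nunits * step */
>       freq += 8 * step;                       /* one bump by VF * step   */
>     }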
>
> The original patch for the PR (r14-2367-g224fd59b2dc8) avoided the MOV
> by incrementing the IV by nunits * step twice. The problem with that
> approach is that it doubles the loop-carried latency. This change was
> deliberately not preserved when moving from loop-vect to SLP and so
> the test started failing again after r15-3509-gd34cda720988.
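>
> A rough scalar sketch of the difference (illustrative names only; the
> variables stand in for whole IV vectors, step4/step8 for the splatted
> nunits * step and VF * step constants):
>
>   /* r14-2367 scheme: the next v0 depends on v1, so the loop-carried
>      chain is two fadd latencies per iteration.  */
>   v1 = v0 + step4;
>   v0 = v1 + step4;
>
>   /* Single-increment scheme: v1 is off the carried path, so the
>      chain is one fadd per iteration.  */
>   v1 = v0 + step4;
>   v0 = v0 + step8;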
>
> I think the main problem is that we put the IV increment in the wrong
> place. Normal IVs created by create_iv are placed before the exit
> condition where possible, but vectorizable_induction instead always
> inserted them at the start of the loop body. The only use of the
> incremented IV is by the phi node, so the effect is to make both
> the old and new IV values live for the whole loop body, which is
> why we need the MOV.
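>
> Schematically (not the real GIMPLE; vec_old/vec_new, vf_step and the
> store calls are just stand-ins):
>
>   /* Increment at the start of the body: vec_old and vec_new are both
>      live across the stores, so they need separate registers and the
>      backend has to copy one of them (the MOV).  */
>   vec_new = vec_old + vf_step;
>   store (arr, vec_old);
>   store (arr + nunits, vec_old + nunits_step);
>   /* latch: the phi takes vec_new */
>
>   /* Increment just before the exit condition: vec_old dies at its
>      last use, so vec_new can reuse the same register.  */
>   store (arr, vec_old);
>   store (arr + nunits, vec_old + nunits_step);
>   vec_new = vec_old + vf_step;
>   /* latch: the phi takes vec_new */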
>
> The simplest fix therefore seems to be to reuse the create_iv logic.
>
> Bootstrapped & regression-tested on aarch64-linux-gnu and
> x86_64-linux-gnu. As Richi suggested, I also tested vect.exp
> using unix/-mavx512f/--param=vect-partial-vector-usage=2.
> OK to install?
OK.
Thanks,
Richard.
> Richard
>
>
> gcc/
> PR tree-optimization/110449
> * tree-ssa-loop-manip.h (insert_iv_increment): Declare.
> * tree-ssa-loop-manip.cc (insert_iv_increment): New function,
> split out from...
> (create_iv): ...here and generalized to gimple_seqs.
> * tree-vect-loop.cc (vectorizable_induction): Use
> standard_iv_increment_position and insert_iv_increment
> to insert the IV increment.
>
> gcc/testsuite/
> PR tree-optimization/110449
> * gcc.target/aarch64/pr110449.c: Expect an increment by 8.0,
> but test that there is no MOV.
> ---
> gcc/testsuite/gcc.target/aarch64/pr110449.c | 25 +++++----
> gcc/tree-ssa-loop-manip.cc | 62 ++++++++++++---------
> gcc/tree-ssa-loop-manip.h | 1 +
> gcc/tree-vect-loop.cc | 6 +-
> 4 files changed, 57 insertions(+), 37 deletions(-)
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr110449.c b/gcc/testsuite/gcc.target/aarch64/pr110449.c
> index bb3b6dcfe08..51ca3f4b816 100644
> --- a/gcc/testsuite/gcc.target/aarch64/pr110449.c
> +++ b/gcc/testsuite/gcc.target/aarch64/pr110449.c
> @@ -1,8 +1,10 @@
> /* { dg-do compile } */
> /* { dg-options "-Ofast -mcpu=neoverse-n2 --param aarch64-vect-unroll-limit=2" } */
> -/* { dg-final { scan-assembler-not "8.0e\\+0" } } */
> +/* { dg-final { scan-assembler {, #?8.0e\+0} } } */
> +/* { dg-final { scan-assembler-not {\tmov\tv} } } */
>
> -/* Calcualte the vectorized induction with smaller step for an unrolled loop.
> +/* Insert the induction IV updates before the exit condition, rather than
> + at the start of the loop body.
>
> before (suggested_unroll_factor=2):
> fmov s30, 8.0e+0
> @@ -19,15 +21,16 @@
> bne .L6
>
> after:
> - fmov s31, 4.0e+0
> - dup v29.4s, v31.s[0]
> - .L6:
> - fadd v30.4s, v31.4s, v29.4s
> - stp q31, q30, [x0]
> - add x0, x0, 32
> - fadd v31.4s, v29.4s, v30.4s
> - cmp x0, x1
> - bne .L6 */
> + fmov s31, 8.0e+0
> + fmov s29, 4.0e+0
> + dup v31.4s, v31.s[0]
> + dup v29.4s, v29.s[0]
> + .L2:
> + fadd v30.4s, v0.4s, v29.4s
> + stp q0, q30, [x0], 32
> + fadd v0.4s, v0.4s, v31.4s
> + cmp x1, x0
> + bne .L2 */
>
> void
> foo2 (float *arr, float freq, float step)
> diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
> index 6ceb9df370b..2907fa6532d 100644
> --- a/gcc/tree-ssa-loop-manip.cc
> +++ b/gcc/tree-ssa-loop-manip.cc
> @@ -47,6 +47,39 @@ along with GCC; see the file COPYING3. If not see
> so that we can free them all at once. */
> static bitmap_obstack loop_renamer_obstack;
>
> +/* Insert IV increment statements STMTS before or after INCR_POS;
> + AFTER selects which. INCR_POS and AFTER can be computed using
> + standard_iv_increment_position. */
> +
> +void
> +insert_iv_increment (gimple_stmt_iterator *incr_pos, bool after,
> + gimple_seq stmts)
> +{
> + /* Prevent the increment from inheriting a bogus location if it is not put
> + immediately after a statement whose location is known. */
> + if (after)
> + {
> + gimple_stmt_iterator gsi = *incr_pos;
> + if (!gsi_end_p (gsi))
> + gsi_next_nondebug (&gsi);
> + if (gsi_end_p (gsi))
> + {
> + edge e = single_succ_edge (gsi_bb (*incr_pos));
> + gimple_seq_set_location (stmts, e->goto_locus);
> + }
> + gsi_insert_seq_after (incr_pos, stmts, GSI_NEW_STMT);
> + }
> + else
> + {
> + gimple_stmt_iterator gsi = *incr_pos;
> + if (!gsi_end_p (gsi) && is_gimple_debug (gsi_stmt (gsi)))
> + gsi_next_nondebug (&gsi);
> + if (!gsi_end_p (gsi))
> + gimple_seq_set_location (stmts, gimple_location (gsi_stmt (gsi)));
> + gsi_insert_seq_before (incr_pos, stmts, GSI_NEW_STMT);
> + }
> +}
> +
> /* Creates an induction variable with value BASE (+/-) STEP * iteration in LOOP.
>    If INCR_OP is PLUS_EXPR, the induction variable is BASE + STEP * iteration.
>    If INCR_OP is MINUS_EXPR, the induction variable is BASE - STEP * iteration.
> @@ -63,7 +96,6 @@ create_iv (tree base, tree_code incr_op, tree step, tree var, class loop *loop,
> gimple_stmt_iterator *incr_pos, bool after, tree *var_before,
> tree *var_after)
> {
> - gassign *stmt;
> gphi *phi;
> tree initial, step1;
> gimple_seq stmts;
> @@ -126,30 +158,10 @@ create_iv (tree base, tree_code incr_op, tree step, tree var, class loop *loop,
> if (stmts)
> gsi_insert_seq_on_edge_immediate (pe, stmts);
>
> - stmt = gimple_build_assign (va, incr_op, vb, step);
> - /* Prevent the increment from inheriting a bogus location if it is not put
> - immediately after a statement whose location is known. */
> - if (after)
> - {
> - gimple_stmt_iterator gsi = *incr_pos;
> - if (!gsi_end_p (gsi))
> - gsi_next_nondebug (&gsi);
> - if (gsi_end_p (gsi))
> - {
> - edge e = single_succ_edge (gsi_bb (*incr_pos));
> - gimple_set_location (stmt, e->goto_locus);
> - }
> - gsi_insert_after (incr_pos, stmt, GSI_NEW_STMT);
> - }
> - else
> - {
> - gimple_stmt_iterator gsi = *incr_pos;
> - if (!gsi_end_p (gsi) && is_gimple_debug (gsi_stmt (gsi)))
> - gsi_next_nondebug (&gsi);
> - if (!gsi_end_p (gsi))
> - gimple_set_location (stmt, gimple_location (gsi_stmt (gsi)));
> - gsi_insert_before (incr_pos, stmt, GSI_NEW_STMT);
> - }
> + gimple_seq incr_stmts = nullptr;
> + gimple_seq_add_stmt (&incr_stmts,
> + gimple_build_assign (va, incr_op, vb, step));
> + insert_iv_increment (incr_pos, after, incr_stmts);
>
> initial = force_gimple_operand (base, &stmts, true, var);
> if (stmts)
> diff --git a/gcc/tree-ssa-loop-manip.h b/gcc/tree-ssa-loop-manip.h
> index b1f65e3c071..80f680565c0 100644
> --- a/gcc/tree-ssa-loop-manip.h
> +++ b/gcc/tree-ssa-loop-manip.h
> @@ -22,6 +22,7 @@ along with GCC; see the file COPYING3. If not see
>
> typedef void (*transform_callback)(class loop *, void *);
>
> +extern void insert_iv_increment (gimple_stmt_iterator *, bool, gimple_seq);
> extern void create_iv (tree, tree_code, tree, tree, class loop *,
> gimple_stmt_iterator *, bool, tree *, tree *);
> extern void rewrite_into_loop_closed_ssa (bitmap, unsigned);
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 03426207879..eea0b89db69 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -10580,6 +10580,10 @@ vectorizable_induction (loop_vec_info loop_vinfo,
> [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
> if (slp_node)
> {
> + gimple_stmt_iterator incr_si;
> + bool insert_after;
> + standard_iv_increment_position (iv_loop, &incr_si, &insert_after);
> +
> /* The initial values are vectorized, but any lanes > group_size
> need adjustment. */
> slp_tree init_node
> @@ -10810,7 +10814,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
> vec_def = gimple_build (&stmts,
> PLUS_EXPR, step_vectype, vec_def, up);
> vec_def = gimple_convert (&stmts, vectype, vec_def);
> - gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
> + insert_iv_increment (&incr_si, insert_after, stmts);
> add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
> UNKNOWN_LOCATION);
>
>
--
Richard Biener <[email protected]>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)