On Wed, Jul 5, 2023 at 8:44 AM Hao Liu OS via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi,
>
> If a loop is unrolled by n times during vectoriation, two steps are used to
> calculate the induction variable:
>   - The small step for the unrolled ith-copy: vec_1 = vec_iv + (VF/n * Step)
>   - The large step for the whole loop: vec_loop = vec_iv + (VF * Step)
>
> This patch calculates an extra vec_n to replace vec_loop:
>   vec_n = vec_prev + (VF/n * S) = vec_iv + (VF/n * S) * n = vec_loop.
>
> So that we can save the large step register and related operations.

OK.  It would be nice to avoid the dead stmts created earlier though.

Thanks,
Richard.

> gcc/ChangeLog:
>
>         PR tree-optimization/110449
>         * tree-vect-loop.cc (vectorizable_induction): use vec_n to replace
>         vec_loop for the unrolled loop.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/pr110449.c: New testcase.
> ---
>  gcc/testsuite/gcc.target/aarch64/pr110449.c | 40 +++++++++++++++++++++
>  gcc/tree-vect-loop.cc                       | 21 +++++++++--
>  2 files changed, 58 insertions(+), 3 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/pr110449.c
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr110449.c 
> b/gcc/testsuite/gcc.target/aarch64/pr110449.c
> new file mode 100644
> index 00000000000..bb3b6dcfe08
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr110449.c
> @@ -0,0 +1,40 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -mcpu=neoverse-n2 --param 
> aarch64-vect-unroll-limit=2" } */
> +/* { dg-final { scan-assembler-not "8.0e\\+0" } } */
> +
> +/* Calcualte the vectorized induction with smaller step for an unrolled loop.
> +
> +   before (suggested_unroll_factor=2):
> +         fmov    s30, 8.0e+0
> +         fmov    s31, 4.0e+0
> +         dup     v27.4s, v30.s[0]
> +         dup     v28.4s, v31.s[0]
> +     .L6:
> +         mov     v30.16b, v31.16b
> +         fadd    v31.4s, v31.4s, v27.4s
> +         fadd    v29.4s, v30.4s, v28.4s
> +         stp     q30, q29, [x0]
> +         add     x0, x0, 32
> +         cmp     x1, x0
> +         bne     .L6
> +
> +   after:
> +         fmov    s31, 4.0e+0
> +         dup     v29.4s, v31.s[0]
> +     .L6:
> +         fadd    v30.4s, v31.4s, v29.4s
> +         stp     q31, q30, [x0]
> +         add     x0, x0, 32
> +         fadd    v31.4s, v29.4s, v30.4s
> +         cmp     x0, x1
> +         bne     .L6  */
> +
> +void
> +foo2 (float *arr, float freq, float step)
> +{
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      arr[i] = freq;
> +      freq += step;
> +    }
> +}
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 3b46c58a8d8..706ecbffd0c 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -10114,7 +10114,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>                                    new_vec, step_vectype, NULL);
>
>        vec_def = induc_def;
> -      for (i = 1; i < ncopies; i++)
> +      for (i = 1; i < ncopies + 1; i++)
>         {
>           /* vec_i = vec_prev + vec_step  */
>           gimple_seq stmts = NULL;
> @@ -10124,8 +10124,23 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>           vec_def = gimple_convert (&stmts, vectype, vec_def);
>
>           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
> -         new_stmt = SSA_NAME_DEF_STMT (vec_def);
> -         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> +         if (i < ncopies)
> +           {
> +             new_stmt = SSA_NAME_DEF_STMT (vec_def);
> +             STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> +           }
> +         else
> +           {
> +             /* vec_1 = vec_iv + (VF/n * S)
> +                vec_2 = vec_1 + (VF/n * S)
> +                ...
> +                vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
> +
> +                vec_n is used as vec_loop to save the large step register and
> +                related operations.  */
> +             add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
> +                          UNKNOWN_LOCATION);
> +           }
>         }
>      }
>
> --
> 2.34.1

Reply via email to