On Tue, 15 Apr 2025, Tamar Christina wrote:
> Hi All,
>
> The following example:
>
> #define N 512
> #define START 2
> #define END 505
>
> int x[N] __attribute__((aligned(32)));
>
> int __attribute__((noipa))
> foo (void)
> {
> for (signed int i = START; i < END; ++i)
> {
> if (x[i] == 0)
> return i;
> }
> return -1;
> }
>
> generates incorrect code with fixed-length SVE because, for early break, we
> need to know which value to start the scalar loop with if we take an early
> exit.
>
> Historically this means that we take the first element of every induction.
> This is because there's an assumption in place that, even with masked loops,
> the masks come from a whilel* instruction.
>
> As such we reduce using a BIT_FIELD_REF <, 0>.
>
> When PFA was added this assumption was correct for non-masked loops; however,
> we assumed that PFA for VLA wouldn't work for now and disabled it using the
> alignment requirement checks.  We also expected VLS to peel for alignment
> using scalar loops.
>
> However, as this PR shows, for VLS the vectorizer can, and in some
> circumstances does, choose to peel using masks by masking the first
> iteration of the loop with an additional alignment mask.
>
> When this is done, the first elements of the predicate can be inactive.  In
> this example element 1 is inactive based on the calculated misalignment,
> hence the -1 value in the first vector IV element.
>
> When we reduce using BIT_FIELD_REF we get the wrong value.
>
> This patch fixes this by creating a new scalar PHI that keeps track of
> whether we are in the first iteration of the loop (with the additional
> masking) or whether we have already taken a loop iteration.
>
> The generated sequence:
>
> pre-header:
> bb1:
> i_1 = <number of leading inactive elements>
>
> header:
> bb2:
> i_2 = PHI <i_1(bb1), 0(latch)>
> …
>
> early-exit:
> bb3:
> i_3 = iv_step * i_2 + PHI<vector-iv>
>
> Which eliminates the need to do an expensive mask based reduction.
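>
> In scalar terms, the adjustment done at the early exit amounts to the
> following (an illustrative C sketch of what is computed, not code from the
> patch; the names are made up):
>
>   /* Scalar restart value for the IV when the early exit is taken.
>      iv_lane0 is the value in lane 0 of the vector IV for the exiting
>      iteration; leading_inactive is i_2 above, i.e. the number of leading
>      inactive lanes on the first, alignment-masked iteration and zero on
>      every later iteration.  */
>   static int
>   restart_iv (int iv_lane0, int iv_step, int leading_inactive)
>   {
>     return iv_lane0 + iv_step * leading_inactive;
>   }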
>
> This fixes gromacs with one OpenMP thread, but with more than one thread
> there is still an issue.
>
> Bootstrapped and regtested on aarch64-none-linux-gnu,
> arm-none-linux-gnueabihf, and x86_64-pc-linux-gnu (-m32 and -m64) with no
> issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> PR tree-optimization/119351
> * tree-vect-loop-manip.cc (vect_can_advance_ivs_p): Record non-linear
> inductions.
> * tree-vectorizer.h (LOOP_VINFO_MASK_NITERS_PFA_OFFSET,
> LOOP_VINFO_NON_LINEAR_IV): New.
> (class _loop_vec_info): Add mask_skip_niters_pfa_offset and
> nonlinear_iv.
> * tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Initialize them.
> (vectorizable_induction): If early break and PFA using masking, create a
> new phi which tracks where the scalar code needs to start...
> (vectorizable_live_operation): ...and generate the adjustments here.
> (vect_use_loop_mask_for_alignment_p): Reject non-linear inductions and
> early break needing peeling.
>
> gcc/testsuite/ChangeLog:
>
> PR tree-optimization/119351
> * gcc.target/aarch64/sve/peel_ind_10.c: New test.
> * gcc.target/aarch64/sve/peel_ind_10_run.c: New test.
> * gcc.target/aarch64/sve/peel_ind_5.c: New test.
> * gcc.target/aarch64/sve/peel_ind_5_run.c: New test.
> * gcc.target/aarch64/sve/peel_ind_6.c: New test.
> * gcc.target/aarch64/sve/peel_ind_6_run.c: New test.
> * gcc.target/aarch64/sve/peel_ind_7.c: New test.
> * gcc.target/aarch64/sve/peel_ind_7_run.c: New test.
> * gcc.target/aarch64/sve/peel_ind_8.c: New test.
> * gcc.target/aarch64/sve/peel_ind_8_run.c: New test.
> * gcc.target/aarch64/sve/peel_ind_9.c: New test.
> * gcc.target/aarch64/sve/peel_ind_9_run.c: New test.
>
> ---
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10.c
> b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..b7a7bc5cb0cfdfdb74adb120c54ba15019832cf1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10.c
> @@ -0,0 +1,24 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS. */
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -msve-vector-bits=256 --param
> aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
> +
> +#define N 512
> +#define START 0
> +#define END 505
> +
> +int x[N] __attribute__((aligned(32)));
> +
> +int __attribute__((noipa))
> +foo (int start)
> +{
> + for (unsigned int i = start; i < END; ++i)
> + {
> + if (x[i] == 0)
> + return i;
> + }
> + return -1;
> +}
> +
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
> +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling"
> "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10_run.c
> b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10_run.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..6169aebcc40cc1553f30c1af61ccec91b51cdb42
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10_run.c
> @@ -0,0 +1,17 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS. */
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
> +/* { dg-additional-options "-msve-vector-bits=256" { target
> aarch64_sve256_hw } } */
> +/* { dg-additional-options "-msve-vector-bits=128" { target
> aarch64_sve128_hw } } */
> +
> +#include "peel_ind_10.c"
> +
> +int __attribute__ ((optimize (1)))
> +main (void)
> +{
> + int res = foo (START);
> + asm volatile ("");
> + if (res != START)
> + __builtin_abort ();
> + return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5.c
> b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..a03bb1dec21ef75aa0cbfb22c8bb02b99644239e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5.c
> @@ -0,0 +1,24 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS. */
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -msve-vector-bits=256 --param
> aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
> +
> +#define N 512
> +#define START 2
> +#define END 505
> +
> +int x[N] __attribute__((aligned(32)));
> +
> +int __attribute__((noipa))
> +foo (void)
> +{
> + for (signed int i = START; i < END; ++i)
> + {
> + if (x[i] == 0)
> + return i;
> + }
> + return -1;
> +}
> +
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
> +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling"
> "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5_run.c
> b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5_run.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..f26befeab7e53561f84b037aec857b44cf018456
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5_run.c
> @@ -0,0 +1,17 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS. */
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
> +/* { dg-additional-options "-msve-vector-bits=256" { target
> aarch64_sve256_hw } } */
> +/* { dg-additional-options "-msve-vector-bits=128" { target
> aarch64_sve128_hw } } */
> +
> +#include "peel_ind_5.c"
> +
> +int __attribute__ ((optimize (1)))
> +main (void)
> +{
> + int res = foo ();
> + asm volatile ("");
> + if (res != START)
> + __builtin_abort ();
> + return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6.c
> b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..9bfd1a65c4feb0c140d4abf98508fc8af08042ba
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6.c
> @@ -0,0 +1,24 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS. */
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -msve-vector-bits=256 --param
> aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
> +
> +#define N 512
> +#define START 1
> +#define END 505
> +
> +int x[N] __attribute__((aligned(32)));
> +
> +int __attribute__((noipa))
> +foo (int start)
> +{
> + for (unsigned int i = start; i < END; ++i)
> + {
> + if (x[i] == 0)
> + return i;
> + }
> + return -1;
> +}
> +
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
> +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling"
> "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6_run.c
> b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6_run.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..4fdf3e4e7cac70dc48bad487db37e1e5838b87ab
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6_run.c
> @@ -0,0 +1,17 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS. */
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
> +/* { dg-additional-options "-msve-vector-bits=256" { target
> aarch64_sve256_hw } } */
> +/* { dg-additional-options "-msve-vector-bits=128" { target
> aarch64_sve128_hw } } */
> +
> +#include "peel_ind_6.c"
> +
> +int __attribute__ ((optimize (1)))
> +main (void)
> +{
> + int res = foo (START);
> + asm volatile ("");
> + if (res != START)
> + __builtin_abort ();
> + return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7.c
> b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..0182e131a173b7b05e88c3393ba854b2da25c6b2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7.c
> @@ -0,0 +1,24 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS. */
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -msve-vector-bits=256 --param
> aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
> +
> +#define N 512
> +#define START 1
> +#define END 505
> +
> +int x[N] __attribute__((aligned(32)));
> +
> +int __attribute__((noipa))
> +foo (void)
> +{
> + for (unsigned int i = START; i < END; ++i)
> + {
> + if (x[i] == 0)
> + return i;
> + }
> + return -1;
> +}
> +
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
> +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling"
> "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7_run.c
> b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7_run.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..05608dd85f13912f8555ac3f39284f6894875998
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7_run.c
> @@ -0,0 +1,17 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS. */
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
> +/* { dg-additional-options "-msve-vector-bits=256" { target
> aarch64_sve256_hw } } */
> +/* { dg-additional-options "-msve-vector-bits=128" { target
> aarch64_sve128_hw } } */
> +
> +#include "peel_ind_7.c"
> +
> +int __attribute__ ((optimize (1)))
> +main (void)
> +{
> + int res = foo ();
> + asm volatile ("");
> + if (res != START)
> + __builtin_abort ();
> + return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8.c
> b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..043348b55d0e8e5e5a5c461b4a4f22b45dfba8e8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8.c
> @@ -0,0 +1,24 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS. */
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -msve-vector-bits=256 --param
> aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
> +
> +#define N 512
> +#define START 1
> +#define END 505
> +
> +int x[N] __attribute__((aligned(32)));
> +
> +int __attribute__((noipa))
> +foo (void)
> +{
> + for (unsigned int i = START; i < END; i*=2)
> + {
> + if (x[i] == 0)
> + return i;
> + }
> + return -1;
> +}
> +
> +/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump-not "pfa_iv_offset" "vect" } } */
> +/* { dg-final { scan-tree-dump-not "Alignment of access forced using
> peeling" "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8_run.c
> b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8_run.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..aa8612248bffdc9f4367b8f6699d395ab2726dec
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8_run.c
> @@ -0,0 +1,17 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS. */
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
> +/* { dg-additional-options "-msve-vector-bits=256" { target
> aarch64_sve256_hw } } */
> +/* { dg-additional-options "-msve-vector-bits=128" { target
> aarch64_sve128_hw } } */
> +
> +#include "peel_ind_8.c"
> +
> +int __attribute__ ((optimize (1)))
> +main (void)
> +{
> + int res = foo ();
> + asm volatile ("");
> + if (res != START)
> + __builtin_abort ();
> + return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9.c
> b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..cc904e88170f072e1d3c6be86643d99a7cd5cb12
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9.c
> @@ -0,0 +1,25 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS. */
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -msve-vector-bits=256 --param
> aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
> +
> +#define N 512
> +#define START 1
> +#define END 505
> +
> +int x[N] __attribute__((aligned(32)));
> +
> +int __attribute__((noipa))
> +foo (void)
> +{
> + for (int *p = x + START; p < x + END; p++)
> + {
> + if (*p == 0)
> + return START;
> + }
> + return -1;
> +}
> +
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* Peels using a scalar loop. */
> +/* { dg-final { scan-tree-dump-not "pfa_iv_offset" "vect" } } */
> +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling"
> "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9_run.c
> b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9_run.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..767f8bd284ca7c3b9f595c5428c20175ed176d96
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9_run.c
> @@ -0,0 +1,17 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS. */
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
> +/* { dg-additional-options "-msve-vector-bits=256" { target
> aarch64_sve256_hw } } */
> +/* { dg-additional-options "-msve-vector-bits=128" { target
> aarch64_sve128_hw } } */
> +
> +#include "peel_ind_9.c"
> +
> +int __attribute__ ((optimize (1)))
> +main (void)
> +{
> + int res = foo ();
> + asm volatile ("");
> + if (res != START)
> + __builtin_abort ();
> + return 0;
> +}
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index
> 56a4e9a8b63f3cae0bf596bf5d22893887dc80e8..ea22c1c6050bd2867ee2ecf28379b342b89fddc9
> 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -2244,6 +2244,8 @@ vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
> induction_type = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
> if (induction_type != vect_step_op_add)
> {
> + /* Mark if we have a non-linear IV. */
> + LOOP_VINFO_NON_LINEAR_IV (loop_vinfo) = true;
Please move this to vect_analyze_scalar_cycles_1 at

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;

where you can check STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_vinfo)
!= vect_step_op_add.  Doing it in vect_can_advance_ivs_p makes it
non-obvious that the flag will be set reliably before you query it in
vect_use_loop_mask_for_alignment_p.
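
Something along these lines (an untested sketch; assumes stmt_vinfo and
loop_vinfo are in scope at that point, as in the snippet above):

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
      /* Record non-linear inductions while classifying them so the flag
         is known to be set before vect_use_loop_mask_for_alignment_p
         queries it.  */
      if (STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_vinfo)
          != vect_step_op_add)
        LOOP_VINFO_NON_LINEAR_IV (loop_vinfo) = true;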
OK with that change.
Thanks,
Richard.
> if (!vect_can_peel_nonlinear_iv_p (loop_vinfo, phi_info))
> return false;
>
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index
> 9413dcef702597ab27165e676546b190e2bd36ba..efb870e8f60315c47c4e5ea18940988ed9986306
> 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -1046,12 +1046,14 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in,
> vec_info_shared *shared)
> suggested_unroll_factor (1),
> max_vectorization_factor (0),
> mask_skip_niters (NULL_TREE),
> + mask_skip_niters_pfa_offset (NULL_TREE),
> rgroup_compare_type (NULL_TREE),
> simd_if_cond (NULL_TREE),
> partial_vector_style (vect_partial_vectors_none),
> unaligned_dr (NULL),
> peeling_for_alignment (0),
> ptr_mask (0),
> + nonlinear_iv (false),
> ivexpr_map (NULL),
> scan_map (NULL),
> slp_unrolling_factor (1),
> @@ -10678,6 +10680,54 @@ vectorizable_induction (loop_vec_info loop_vinfo,
> LOOP_VINFO_MASK_SKIP_NITERS
> (loop_vinfo));
> peel_mul = gimple_build_vector_from_val (&init_stmts,
> step_vectype, peel_mul);
> +
> + /* If early break then we have to create a new PHI which we can use as
> + an offset to adjust the induction reduction in early exits.
> +
> + This is because when peeling for alignment using masking, the first
> + few elements of the vector can be inactive. As such if we find the
> +     few elements of the vector can be inactive.  As such, if we find the
> +     entry in the first iteration we have to adjust the starting point of
> +
> + We do this by creating a new scalar PHI that keeps track of whether
> +     we are in the first iteration of the loop (with the additional masking)
> + or whether we have taken a loop iteration already.
> +
> + The generated sequence:
> +
> + pre-header:
> + bb1:
> + i_1 = <number of leading inactive elements>
> +
> + header:
> + bb2:
> + i_2 = PHI <i_1(bb1), 0(latch)>
> + …
> +
> + early-exit:
> + bb3:
> + i_3 = iv_step * i_2 + PHI<vector-iv>
> +
> +     The first part of the adjustment, creating i_1 and i_2, is done here
> +     and the last part, creating i_3, is done in
> +     vectorizable_live_operation when the induction extraction is
> + materialized. */
> + if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> + && !LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo))
> + {
> + auto skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
> + tree ty_skip_niters = TREE_TYPE (skip_niters);
> + tree break_lhs_phi = vect_get_new_vect_var (ty_skip_niters,
> + vect_scalar_var,
> + "pfa_iv_offset");
> + gphi *nphi = create_phi_node (break_lhs_phi, bb);
> + add_phi_arg (nphi, skip_niters, pe, UNKNOWN_LOCATION);
> + add_phi_arg (nphi, build_zero_cst (ty_skip_niters),
> + loop_latch_edge (iv_loop), UNKNOWN_LOCATION);
> +
> + LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo)
> + = PHI_RESULT (nphi);
> + }
> }
> tree step_mul = NULL_TREE;
> unsigned ivn;
> @@ -11565,8 +11615,10 @@ vectorizable_live_operation (vec_info *vinfo,
> stmt_vec_info stmt_info,
> /* For early exit where the exit is not in the BB that leads
> to the latch then we're restarting the iteration in the
> scalar loop. So get the first live value. */
> - if ((all_exits_as_early_p || !main_exit_edge)
> - && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
> + bool early_break_first_element_p
> + = (all_exits_as_early_p || !main_exit_edge)
> + && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def;
> + if (early_break_first_element_p)
> {
> tmp_vec_lhs = vec_lhs0;
> tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
> @@ -11581,6 +11633,41 @@ vectorizable_live_operation (vec_info *vinfo,
> stmt_vec_info stmt_info,
> lhs_type, &exit_gsi);
>
> auto gsi = gsi_for_stmt (use_stmt);
> + if (early_break_first_element_p
> + && LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo))
> + {
> + tree step_expr
> + = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
> + tree break_lhs_phi
> + = LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo);
> + tree ty_skip_niters = TREE_TYPE (break_lhs_phi);
> + gimple_seq iv_stmts = NULL;
> +
> + /* Now create the PHI for the outside loop usage to
> + retrieve the value for the offset counter. */
> + tree rphi_step
> + = gimple_convert (&iv_stmts, ty_skip_niters, step_expr);
> + tree tmp2
> + = gimple_build (&iv_stmts, MULT_EXPR,
> + ty_skip_niters, rphi_step,
> + break_lhs_phi);
> +
> + if (POINTER_TYPE_P (TREE_TYPE (new_tree)))
> + tmp2 = gimple_build (&iv_stmts, POINTER_PLUS_EXPR,
> + TREE_TYPE (new_tree), new_tree, tmp2);
> + else
> + {
> + tmp2 = gimple_convert (&iv_stmts, TREE_TYPE (new_tree),
> + tmp2);
> + tmp2 = gimple_build (&iv_stmts, PLUS_EXPR,
> + TREE_TYPE (new_tree), new_tree,
> + tmp2);
> + }
> +
> + new_tree = tmp2;
> + gsi_insert_seq_before (&exit_gsi, iv_stmts, GSI_SAME_STMT);
> + }
> +
> tree lhs_phi = gimple_phi_result (use_stmt);
> remove_phi_node (&gsi, false);
> gimple *copy = gimple_build_assign (lhs_phi, new_tree);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index
> 97caf61b345735d297ec49fd6ca64797435b46fc..01d19c77656198f92f06619f73752598edd47fab
> 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -818,6 +818,11 @@ public:
> elements that should be false in the first mask). */
> tree mask_skip_niters;
>
> + /* If we are using a loop mask to align memory addresses and we're in an
> + early break loop then this variable contains the number of elements that
> + were skipped during the initial iteration of the loop. */
> + tree mask_skip_niters_pfa_offset;
> +
> /* The type that the loop control IV should be converted to before
> testing which of the VF scalars are active and inactive.
> Only meaningful if LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
> @@ -854,6 +859,9 @@ public:
> /* The mask used to check the alignment of pointers or arrays. */
> int ptr_mask;
>
> + /* Indicates whether the loop has any non-linear IV. */
> + bool nonlinear_iv;
> +
> /* Data Dependence Relations defining address ranges that are candidates
> for a run-time aliasing check. */
> auto_vec<ddr_p> may_alias_ddrs;
> @@ -1064,6 +1072,7 @@ public:
> #define LOOP_VINFO_MASKS(L) (L)->masks
> #define LOOP_VINFO_LENS(L) (L)->lens
> #define LOOP_VINFO_MASK_SKIP_NITERS(L) (L)->mask_skip_niters
> +#define LOOP_VINFO_MASK_NITERS_PFA_OFFSET(L) (L)->mask_skip_niters_pfa_offset
> #define LOOP_VINFO_RGROUP_COMPARE_TYPE(L) (L)->rgroup_compare_type
> #define LOOP_VINFO_RGROUP_IV_TYPE(L) (L)->rgroup_iv_type
> #define LOOP_VINFO_PARTIAL_VECTORS_STYLE(L) (L)->partial_vector_style
> @@ -1073,6 +1082,7 @@ public:
> #define LOOP_VINFO_DDRS(L) (L)->shared->ddrs
> #define LOOP_VINFO_INT_NITERS(L) (TREE_INT_CST_LOW
> ((L)->num_iters))
> #define LOOP_VINFO_PEELING_FOR_ALIGNMENT(L) (L)->peeling_for_alignment
> +#define LOOP_VINFO_NON_LINEAR_IV(L) (L)->nonlinear_iv
> #define LOOP_VINFO_UNALIGNED_DR(L) (L)->unaligned_dr
> #define LOOP_VINFO_MAY_MISALIGN_STMTS(L) (L)->may_misalign_stmts
> #define LOOP_VINFO_MAY_ALIAS_DDRS(L) (L)->may_alias_ddrs
> @@ -2138,8 +2148,14 @@ unlimited_cost_model (loop_p loop)
> inline bool
> vect_use_loop_mask_for_alignment_p (loop_vec_info loop_vinfo)
> {
> +  /* With early break vectorization we don't know whether the accesses will stay
> +     inside the loop or not.  TODO: The early break adjustment code can be
> +     implemented the same way as vectorizable_linear_induction.  However we
> +     can't test this today so reject it. */
> return (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> - && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
> + && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
> + && !(LOOP_VINFO_NON_LINEAR_IV (loop_vinfo)
> + && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)));
> }
>
> /* Return the number of vectors of type VECTYPE that are needed to get
>
>
>
--
Richard Biener <[email protected]>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)