Richard Biener <[email protected]> writes:
> The following ensures that peeling a single iteration for gaps is
> sufficient by enforcing niter masking (partial vector use) given
> we cannot (always) statically decide when the vector size isn't known.
> The condition guarding this and thus statically giving a pass in
> some cases for VL vectors is questionable, the patch doesn't address
> this.
>
> This fixes a set of known fallout from enabling
> --param vect-force-slp=1 by default.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu.
>
> PR tree-optimization/117558
> * tree-vectorizer.h (_loop_vec_info::must_use_partial_vectors_p): New.
> (LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P): Likewise.
> * tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Initialize
> must_use_partial_vectors_p.
> (vect_determine_partial_vectors_and_peeling): Enforce it.
> (vect_analyze_loop_2): Reset before restarting.
> * tree-vect-stmts.cc (get_group_load_store_type): When peeling
> a single gap iteration cannot be determined safe statically
> enforce the use of partial vectors.
LGTM. Just to make sure I understand...
> ---
> gcc/tree-vect-loop.cc | 13 ++++++++++++-
> gcc/tree-vect-stmts.cc | 24 +++++++++++++++++++-----
> gcc/tree-vectorizer.h | 4 ++++
> 3 files changed, 35 insertions(+), 6 deletions(-)
>
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index c67248e851d..18c4fa1d000 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -1059,6 +1059,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in,
> vec_info_shared *shared)
> inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
> vectorizable (false),
> can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
> + must_use_partial_vectors_p (false),
> using_partial_vectors_p (false),
> using_decrementing_iv_p (false),
> using_select_vl_p (false),
> @@ -2679,7 +2680,10 @@ vect_determine_partial_vectors_and_peeling
> (loop_vec_info loop_vinfo)
> LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
> LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
> if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
> - && need_peeling_or_partial_vectors_p)
> + && LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo))
> + LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
> + else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
> + && need_peeling_or_partial_vectors_p)
> {
> /* For partial-vector-usage=1, try to push the handling of partial
> vectors to the epilogue, with the main loop continuing to operate
> @@ -2702,6 +2706,12 @@ vect_determine_partial_vectors_and_peeling
> (loop_vec_info loop_vinfo)
> LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
> }
>
> + if (LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo)
> + && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
> + return opt_result::failure_at (vect_location,
> + "not vectorized: loop needs but cannot "
> + "use partial vectors\n");
> +
> if (dump_enabled_p ())
> dump_printf_loc (MSG_NOTE, vect_location,
> "operating on %s vectors%s.\n",
> @@ -3387,6 +3397,7 @@ again:
> LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
> LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
> = saved_can_use_partial_vectors_p;
> + LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
> if (loop_vinfo->scan_map)
> loop_vinfo->scan_map->empty ();
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 458056dd13d..f4a4d5a554c 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -2202,11 +2202,25 @@ get_group_load_store_type (vec_info *vinfo,
> stmt_vec_info stmt_info,
> (vectype, cnunits / cpart_size,
> &half_vtype) == NULL_TREE)))
> {
> - if (dump_enabled_p ())
> - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> - "peeling for gaps insufficient for "
> - "access\n");
> - return false;
> + /* If all fails we can still resort to niter masking, so
> + enforce the use of partial vectors. */
> + if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> + {
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> + "peeling for gaps insufficient for "
> + "access unless using partial "
> + "vectors\n");
> + LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
> + }
> + else
> + {
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> + "peeling for gaps insufficient for "
> + "access\n");
> + return false;
> + }
...is this a compile-time optimisation? I.e. CAN_USE_PARTIAL_VECTORS_P
mustn't ever go from false to true, so if it's already false, there's no
point continuing?
Richard
> }
> }
> }
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 273e8c644e7..d85dd594094 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -913,6 +913,9 @@ public:
> fewer than VF scalars. */
> bool can_use_partial_vectors_p;
>
> + /* Records whether we must use niter masking for correctness reasons. */
> + bool must_use_partial_vectors_p;
> +
> /* True if we've decided to use partially-populated vectors, so that
> the vector loop can handle fewer than VF scalars. */
> bool using_partial_vectors_p;
> @@ -1051,6 +1054,7 @@ public:
> #define LOOP_VINFO_VERSIONING_THRESHOLD(L) (L)->versioning_threshold
> #define LOOP_VINFO_VECTORIZABLE_P(L) (L)->vectorizable
> #define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L)
> (L)->can_use_partial_vectors_p
> +#define LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P(L)
> (L)->must_use_partial_vectors_p
> #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
> #define LOOP_VINFO_USING_DECREMENTING_IV_P(L) (L)->using_decrementing_iv_p
> #define LOOP_VINFO_USING_SELECT_VL_P(L) (L)->using_select_vl_p