Hi all,
This patch adds the ability to define a target hook to unroll the main
vectorized loop. It also introduces the --params vect-unroll and
vect-unroll-reductions to control this from the command line. I found
them useful for experimenting and believe they can help when tuning, so
I decided to leave them in.
We only unroll the main loop and have disabled unrolling epilogues for
now. We also do not support unrolling of any loop that has a negative
step and we do not support unrolling a loop with any reduction other
than a TREE_CODE_REDUCTION.
Bootstrapped and regression tested on aarch64-linux-gnu as part of the
series.
gcc/ChangeLog:
* doc/tm.texi: Document TARGET_VECTORIZE_UNROLL_FACTOR
and TARGET_VECTORIZE_ADD_STMT_COST_FOR_UNROLL.
* doc/tm.texi.in: Add entries for target hooks above.
* params.opt: Add vect-unroll and vect-unroll-reductions
parameters.
* target.def: Define hooks TARGET_VECTORIZE_UNROLL_FACTOR
and TARGET_VECTORIZE_ADD_STMT_COST_FOR_UNROLL.
* targhooks.c (default_add_stmt_cost_for_unroll): New.
(default_unroll_factor): Likewise.
* targhooks.h (default_add_stmt_cost_for_unroll): Likewise.
(default_unroll_factor): Likewise.
* tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize
par_unrolling_factor.
(vect_update_vf_for_slp): Use unrolling factor to update
vectorization factor.
(vect_determine_partial_vectors_and_peeling): Account for
unrolling.
(vect_determine_unroll_factor): New.  Determine how much to unroll
the vectorized main loop.
(vect_analyze_loop_2): Call vect_determine_unroll_factor.
(vect_analyze_loop): Allow for epilogue vectorization when
unrolling and rewalk vector_mode array for the epilogues.
(vectorizable_reduction): Disable single_defuse_cycle when
unrolling.
* tree-vectorizer.h (_loop_vec_info::par_unrolling_factor): New
member.
(vect_unroll_value): New function.
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index
f68f42638a112bed8396fd634bd3fd3c44ce848a..3bc9694d2162055d3db165ef888f35deb676548b
100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6283,6 +6283,19 @@ allocated by TARGET_VECTORIZE_INIT_COST. The default
releases the
accumulator.
@end deftypefn
+@deftypefn {Target Hook} void TARGET_VECTORIZE_ADD_STMT_COST_FOR_UNROLL (class
vec_info *@var{vinfo}, class _stmt_vec_info *@var{stmt_info}, void *@var{data})
+This hook should update the target-specific @var{data} relative
+to the statement represented by @var{stmt_info} to be used
+later to determine the unrolling factor for this loop using the current
+vectorization factor.
+@end deftypefn
+
+@deftypefn {Target Hook} unsigned TARGET_VECTORIZE_UNROLL_FACTOR (class
vec_info *@var{vinfo}, void *@var{data})
+This hook should return the desired vector unrolling factor for a loop with
+@var{vinfo} based on the target-specific @var{data}. The default returns one,
+which means no unrolling will be performed.
+@end deftypefn
+
@deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_GATHER (const_tree
@var{mem_vectype}, const_tree @var{index_type}, int @var{scale})
Target builtin that implements vector gather operation. @var{mem_vectype}
is the vector type of the load and @var{index_type} is scalar type of
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index
fdf16b901c537e6a02f630a80a2213d2dcb6d5d6..40f4cb02c34f575439f35070301855ddaf82a21a
100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4195,6 +4195,10 @@ address; but often a machine-dependent strategy can
generate better code.
@hook TARGET_VECTORIZE_DESTROY_COST_DATA
+@hook TARGET_VECTORIZE_ADD_STMT_COST_FOR_UNROLL
+
+@hook TARGET_VECTORIZE_UNROLL_FACTOR
+
@hook TARGET_VECTORIZE_BUILTIN_GATHER
@hook TARGET_VECTORIZE_BUILTIN_SCATTER
diff --git a/gcc/params.opt b/gcc/params.opt
index
f414dc1a61cfa9d5b9ded75e96560fc1f73041a5..00f92d4484797df0dbbad052f45205469cbb2c49
100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -1117,4 +1117,12 @@ Controls how loop vectorizer uses partial vectors. 0
means never, 1 means only
Common Joined UInteger Var(param_vect_inner_loop_cost_factor) Init(50)
IntegerRange(1, 10000) Param Optimization
The maximum factor which the loop vectorizer applies to the cost of statements
in an inner loop relative to the loop being vectorized.
+-param=vect-unroll=
+Common Joined UInteger Var(param_vect_unroll) Init(0) IntegerRange(0, 32)
Param Optimization
+Controls how many times the vectorizer tries to unroll loops. Also see
vect-unroll-reductions.
+
+-param=vect-unroll-reductions=
+Common Joined UInteger Var(param_vect_unroll_reductions) Init(0)
IntegerRange(0, 32) Param Optimization
+Controls how many times the vectorizer tries to unroll loops that contain
associative reductions. 0 means that such loops should be unrolled vect-unroll
times.
+
; This comment is to ensure we retain the blank line above.
diff --git a/gcc/target.def b/gcc/target.def
index
28a34f1d51b5abb41c537b9cd327ca59f1f9260f..0eac529f17bd981b6494fe613117f28803a02390
100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -2095,6 +2095,28 @@ accumulator.",
(void *data),
default_destroy_cost_data)
+/* Target function to record cost approximation to be used by
+   TARGET_VECTORIZE_UNROLL_FACTOR.  */
+DEFHOOK
+(add_stmt_cost_for_unroll,
+ "This hook should update the target-specific @var{data} relative\n\
+to the statement represented by @var{stmt_info} to be used\n\
+later to determine the unrolling factor for this loop using the current\n\
+vectorization factor.",
+ void,
+ (class vec_info *vinfo, class _stmt_vec_info *stmt_info, void *data),
+ default_add_stmt_cost_for_unroll)
+
+/* Target function to choose the unrolling factor for a vectorized loop.  */
+DEFHOOK
+(unroll_factor,
+ "This hook should return the desired vector unrolling factor for a loop
with\n\
+@var{vinfo} based on the target-specific @var{data}. The default returns
one,\n\
+which means no unrolling will be performed.",
+ unsigned,
+ (class vec_info *vinfo, void *data),
+ default_unroll_factor)
+
HOOK_VECTOR_END (vectorize)
#undef HOOK_PREFIX
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index
92d51992e625c2497aa8496b1e2e3d916e5706fd..d285c24d6d398cfabb58c291fd2dcbfa6e1bd8f6
100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -125,6 +125,9 @@ extern unsigned default_add_stmt_cost (class vec_info *,
void *, int,
enum vect_cost_model_location);
extern void default_finish_cost (void *, unsigned *, unsigned *, unsigned *);
extern void default_destroy_cost_data (void *);
+extern void default_add_stmt_cost_for_unroll (class vec_info *,
+ class _stmt_vec_info *, void *);
+extern unsigned default_unroll_factor (class vec_info *, void *);
/* OpenACC hooks. */
extern bool default_goacc_validate_dims (tree, int [], int, unsigned);
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index
c9b5208853dbc15706a65d1eb335e28e0564325e..9bc7e80e5a67129633dab99a871b6babff65de97
100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -1535,6 +1535,26 @@ default_destroy_cost_data (void *data)
free (data);
}
+/* Default implementation of TARGET_VECTORIZE_ADD_STMT_COST_FOR_UNROLL.
+   Nothing is recorded here: the default TARGET_VECTORIZE_UNROLL_FACTOR
+   never asks for unrolling, so there is no data to accumulate.  */
+void
+default_add_stmt_cost_for_unroll (class vec_info *,
+                                  class _stmt_vec_info *,
+                                  void *)
+{
+}
+
+
+/* Default implementation of TARGET_VECTORIZE_UNROLL_FACTOR.  A factor of
+   one leaves the vectorized loop as it is, i.e. not unrolled.  */
+unsigned
+default_unroll_factor (class vec_info *, void *)
+{
+  const unsigned no_unrolling = 1;
+  return no_unrolling;
+}
+
/* Determine whether or not a pointer mode is valid. Assume defaults
of ptr_mode or Pmode - can be overridden. */
bool
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index
0c8d992624b59ddd056aff594738305d6be5afa8..14f8150d7c262b9422784e0e997ca4387664a20a
100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -828,6 +828,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in,
vec_info_shared *shared)
skip_main_loop_edge (nullptr),
skip_this_loop_edge (nullptr),
reusable_accumulators (),
+ par_unrolling_factor (1),
max_vectorization_factor (0),
mask_skip_niters (NULL_TREE),
rgroup_compare_type (NULL_TREE),
@@ -1594,6 +1595,7 @@ vect_update_vf_for_slp (loop_vec_info loop_vinfo)
dump_printf_loc (MSG_NOTE, vect_location,
"Loop contains only SLP stmts\n");
vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
+ vectorization_factor *= loop_vinfo->par_unrolling_factor;
}
else
{
@@ -2131,7 +2133,8 @@ vect_determine_partial_vectors_and_peeling (loop_vec_info
loop_vinfo,
??? We could then end up failing to use partial vectors if we
decide to peel iterations into a prologue, and if the main loop
then ends up processing fewer than VF iterations. */
- if (param_vect_partial_vector_usage == 1
+ if ((param_vect_partial_vector_usage == 1
+ || loop_vinfo->par_unrolling_factor > 1)
&& !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
&& !vect_known_niters_smaller_than_vf (loop_vinfo))
LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
@@ -2192,6 +2195,101 @@ vect_determine_partial_vectors_and_peeling
(loop_vec_info loop_vinfo,
return opt_result::success ();
}
+
+/* Pick an unrolling factor for the main vectorized loop LOOP_VINFO, fold it
+   into the vectorization factor and return the latter.  Epilogues and loops
+   with negative steps or non-TREE_CODE reductions are left as they are.  */
+static poly_uint64
+vect_determine_unroll_factor (loop_vec_info loop_vinfo)
+{
+  stmt_vec_info stmt_info;
+  unsigned i;
+  bool seen_reduction_p = false;
+  bool can_unroll_p = !LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
+  poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+
+  if (!can_unroll_p)
+    return vectorization_factor;
+
+  DUMP_VECT_SCOPE ("vect_determine_unroll_factor");
+
+  void *target_cost_data = init_cost (loop_vinfo->loop, true);
+
+  FOR_EACH_VEC_ELT (loop_vinfo->stmt_vec_infos, i, stmt_info)
+    {
+      if (STMT_VINFO_IN_PATTERN_P (stmt_info)
+          || !STMT_VINFO_RELEVANT_P (stmt_info)
+          || stmt_info->vectype == NULL_TREE)
+        continue;
+      /* Do not unroll loops with negative steps as it is unlikely that
+         vectorization will succeed due to the way we deal with negative steps
+         in loads and stores in 'get_load_store_type'.  */
+      if (stmt_info->dr_aux.dr
+          && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+        {
+          dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
+          tree step = vect_dr_behavior (loop_vinfo, dr_info)->step;
+          if (TREE_CODE (step) == INTEGER_CST
+              && tree_int_cst_compare (step, size_zero_node) < 0)
+            {
+              can_unroll_p = false;
+              if (dump_enabled_p ())
+                dump_printf_loc (MSG_NOTE, vect_location,
+                                 "could not unroll due to negative step\n");
+              break;
+            }
+        }
+
+      if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
+        {
+          auto red_info = info_for_reduction (loop_vinfo, stmt_info);
+          if (STMT_VINFO_REDUC_TYPE (red_info) == TREE_CODE_REDUCTION)
+            seen_reduction_p = true;
+          else
+            {
+              can_unroll_p = false;
+              if (dump_enabled_p ())
+                dump_printf_loc (MSG_NOTE, vect_location,
+                                 "could not unroll due to unsupported "
+                                 "reduction\n");
+              break;
+            }
+        }
+
+      targetm.vectorize.add_stmt_cost_for_unroll (loop_vinfo, stmt_info,
+                                                  target_cost_data);
+    }
+
+  unsigned int unrolling_factor = 1;
+  if (can_unroll_p && maybe_gt (vectorization_factor, 1U))
+    unrolling_factor = vect_unroll_value (loop_vinfo, seen_reduction_p,
+                                          target_cost_data);
+
+  /* The cost data is no longer needed; release it on every path out.  */
+  destroy_cost_data (target_cost_data);
+  if (!can_unroll_p)
+    return vectorization_factor;
+
+  while (unrolling_factor > 1)
+    {
+      poly_uint64 candidate_factor = vectorization_factor * unrolling_factor;
+      if (estimated_poly_value (candidate_factor, POLY_VALUE_MAX)
+          <= (HOST_WIDE_INT) LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo))
+        {
+          vectorization_factor = candidate_factor;
+          break;
+        }
+      unrolling_factor /= 2;
+    }
+  loop_vinfo->par_unrolling_factor = unrolling_factor;
+  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location, "unrolling factor = %u\n",
+                     unrolling_factor);
+
+  return vectorization_factor;
+}
+
/* Function vect_analyze_loop_2.
Apply a set of analyses on LOOP, and create a loop_vec_info struct
@@ -2320,6 +2418,8 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool
&fatal, unsigned *n_stmts)
"can't determine vectorization factor.\n");
return ok;
}
+
+ vect_determine_unroll_factor (loop_vinfo);
if (max_vf != MAX_VECTORIZATION_FACTOR
&& maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
return opt_result::failure_at (vect_location, "bad data dependence.\n");
@@ -3062,7 +3162,14 @@ vect_analyze_loop (class loop *loop, vec_info_shared
*shared)
gcc_assert (vect_epilogues);
delete vinfos.pop ();
}
+ /* Check if we may want to replace the current first_loop_vinfo
+ with the new loop, but only if they have different vector
+ modes. If they have the same vector mode this means the main
+ loop is an unrolled loop and we are trying to vectorize the
+ epilogue using the same vector mode but with a lower
+ vectorization factor. */
if (vinfos.is_empty ()
+ && loop_vinfo->vector_mode != first_loop_vinfo->vector_mode
&& vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
{
loop_vec_info main_loop_vinfo
@@ -3156,10 +3263,26 @@ vect_analyze_loop (class loop *loop, vec_info_shared
*shared)
The retry should be in the same mode as original. */
if (vect_epilogues
&& loop_vinfo
- && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
+ && (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo)
+ || loop_vinfo->par_unrolling_factor > 1))
{
- gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+ gcc_assert ((LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+ || loop_vinfo->par_unrolling_factor > 1)
&& !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
+ /* If we are unrolling, try all VECTOR_MODES for the epilogue. */
+ if (loop_vinfo->par_unrolling_factor > 1)
+ {
+ next_vector_mode = vector_modes[0];
+ mode_i = 1;
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "***** Re-trying analysis with vector mode"
+ " %s for epilogue with partial vectors.\n",
+ GET_MODE_NAME (next_vector_mode));
+ continue;
+ }
+
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"***** Re-trying analysis with same vector mode"
@@ -7212,7 +7335,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
participating. */
if (ncopies > 1
&& (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
- && reduc_chain_length == 1)
+ && reduc_chain_length == 1
+ && loop_vinfo->par_unrolling_factor == 1)
single_defuse_cycle = true;
if (single_defuse_cycle || lane_reduc_code_p)
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index
9c2c29d61fae5e651a112b103482131e3d646fb6..b51e82a0663a391a096480bff03a2191bc11dcf4
100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -620,6 +620,11 @@ public:
about the reductions that generated them. */
hash_map<tree, vect_reusable_accumulator> reusable_accumulators;
+ /* The number of times that we've unrolled the vector loop in order
+ to promote more ILP. This value is folded into vectorization_factor
+ (and therefore exactly divides vectorization_factor). */
+ unsigned int par_unrolling_factor;
+
/* Maximum runtime vectorization factor, or MAX_VECTORIZATION_FACTOR
if there is no particular limit. */
unsigned HOST_WIDE_INT max_vectorization_factor;
@@ -1810,6 +1815,20 @@ vect_apply_runtime_profitability_check_p (loop_vec_info
loop_vinfo)
&& th >= vect_vf_for_cost (loop_vinfo));
}
+/* Return how many times the vectorized main loop of LOOP_VINFO should be
+   unrolled.  --param vect-unroll-reductions (reduction loops only) and
+   --param vect-unroll override the target hook, which is passed DATA.  */
+
+inline unsigned int
+vect_unroll_value (loop_vec_info loop_vinfo, bool seen_reduction_p, void *data)
+{
+  if (seen_reduction_p && param_vect_unroll_reductions > 0)
+    return param_vect_unroll_reductions;
+  return (param_vect_unroll > 0
+          ? param_vect_unroll
+          : targetm.vectorize.unroll_factor (loop_vinfo, data));
+}
+
/* Source location + hotness information. */
extern dump_user_location_t vect_location;