On 15/10/2021 09:48, Richard Biener wrote:
On Tue, 12 Oct 2021, Andre Vieira (lists) wrote:

Hi Richi,

I think this is what you meant: I now hide all the unrolling cost calculations
in the existing target hooks for costs. I did need to adjust 'finish_cost' to
take the loop_vinfo so the target's implementations are able to set the newly
renamed 'suggested_unroll_factor'.

Also added the checks for the epilogue's VF.

Is this more like what you had in mind?
Not exactly (sorry..).  For the target hook I think we don't want to
pass vec_info but instead another output parameter like the existing
ones.

vect_estimate_min_profitable_iters should then via
vect_analyze_loop_costing and vect_analyze_loop_2 report the unroll
suggestion to vect_analyze_loop which should then, if the suggestion
was > 1, instead of iterating to the next vector mode run again
with a fixed VF (old VF times suggested unroll factor - there's
min_vf in vect_analyze_loop_2 which we should adjust to
the old VF times two for example and maybe store the suggested
factor as hint) - if it succeeds the result will end up in the
list of considered modes (where we now may have more than one
entry for the same mode but a different VF), we probably want to
only consider more unrolling once.

For simplicity I'd probably set min_vf = max_vf = old VF * suggested
factor, thus take the targets request literally.

Richard.

Hi,

I now pass an output parameter to finish_cost and route it through the various calls up to vect_analyze_loop.  I tried to rework vect_determine_vectorization_factor and noticed that merely setting min_vf and max_vf is not enough: we only use these to check whether the vectorization factor is within range (actually, we only use max_vf at that stage), and we only seem to use 'min_vf' to make sure the data_references are valid.  I am not sure my changes are the most appropriate here.  For instance, I am pretty sure the checks for max and min vf that I added in vect_determine_vectorization_factor are currently superfluous, as they will pass by design, but I thought they might be good future-proofing?

Also, I changed how we compare against max_vf: rather than relying on MAX_VECTORIZATION_FACTOR, I decided to use estimated_poly_value with POLY_VALUE_MAX, to be able to bound it further in case we have knowledge of the VL. I am not entirely sure about the validity of this change; maybe we are better off keeping MAX_VECTORIZATION_FACTOR in place and not making any changes to max_vf for unrolling.

What do you think?
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 
36519ccc5a58abab483c38d0a6c5f039592bfc7f..9b1e01e9b62050d7e34bc55454771e40bdbdb4cb
 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -15972,8 +15972,8 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, 
unsigned int body_cost)
 
 /* Implement TARGET_VECTORIZE_FINISH_COST.  */
 static void
-aarch64_finish_cost (void *data, unsigned *prologue_cost,
-                    unsigned *body_cost, unsigned *epilogue_cost)
+aarch64_finish_cost (void *data, unsigned *prologue_cost, unsigned *body_cost,
+                    unsigned *epilogue_cost, unsigned *suggested_unroll_factor)
 {
   auto *costs = static_cast<aarch64_vector_costs *> (data);
   *prologue_cost = costs->region[vect_prologue];
@@ -15984,6 +15984,9 @@ aarch64_finish_cost (void *data, unsigned 
*prologue_cost,
       && costs->vec_flags
       && aarch64_use_new_vector_costs_p ())
     *body_cost = aarch64_adjust_body_cost (costs, *body_cost);
+
+  if(suggested_unroll_factor)
+    *suggested_unroll_factor = 1;
 }
 
 /* Implement TARGET_VECTORIZE_DESTROY_COST_DATA.  */
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 
afc2674d49da370ae0f5ef277df7e9954f303b8e..a48e43879512793907fef946c1575c3ed7f68092
 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -23048,13 +23048,15 @@ ix86_add_stmt_cost (class vec_info *vinfo, void 
*data, int count,
 /* Implement targetm.vectorize.finish_cost.  */
 
 static void
-ix86_finish_cost (void *data, unsigned *prologue_cost,
-                 unsigned *body_cost, unsigned *epilogue_cost)
+ix86_finish_cost (void *data, unsigned *prologue_cost, unsigned *body_cost,
+                 unsigned *epilogue_cost, unsigned *suggested_unroll_factor)
 {
   unsigned *cost = (unsigned *) data;
   *prologue_cost = cost[vect_prologue];
   *body_cost     = cost[vect_body];
   *epilogue_cost = cost[vect_epilogue];
+  if (suggested_unroll_factor)
+    *suggested_unroll_factor = 1;
 }
 
 /* Implement targetm.vectorize.destroy_cost_data.  */
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 
ad81dfb316dff00cde810d6b1edd31fa49d5c1e8..59d30ad6fcd1758383c52e34a0f90a126c501ec3
 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -5551,8 +5551,8 @@ rs6000_adjust_vect_cost_per_loop (rs6000_cost_data *data)
 /* Implement targetm.vectorize.finish_cost.  */
 
 static void
-rs6000_finish_cost (void *data, unsigned *prologue_cost,
-                   unsigned *body_cost, unsigned *epilogue_cost)
+rs6000_finish_cost (void *data, unsigned *prologue_cost, unsigned *body_cost,
+                   unsigned *epilogue_cost, unsigned *suggested_unroll_factor)
 {
   rs6000_cost_data *cost_data = (rs6000_cost_data*) data;
 
@@ -5578,6 +5578,8 @@ rs6000_finish_cost (void *data, unsigned *prologue_cost,
   *prologue_cost = cost_data->cost[vect_prologue];
   *body_cost     = cost_data->cost[vect_body];
   *epilogue_cost = cost_data->cost[vect_epilogue];
+  if (suggested_unroll_factor)
+    *suggested_unroll_factor = 1;
 }
 
 /* Implement targetm.vectorize.destroy_cost_data.  */
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 
be8148583d8571b0d035b1938db9d056bfd213a8..c584260b02c3e8d4fcd7b31c38321d5f81a71428
 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6276,11 +6276,12 @@ return value should be viewed as a tentative cost that 
may later be
 revised.
 @end deftypefn
 
-@deftypefn {Target Hook} void TARGET_VECTORIZE_FINISH_COST (void *@var{data}, 
unsigned *@var{prologue_cost}, unsigned *@var{body_cost}, unsigned 
*@var{epilogue_cost})
+@deftypefn {Target Hook} void TARGET_VECTORIZE_FINISH_COST (void *@var{data}, 
unsigned *@var{prologue_cost}, unsigned *@var{body_cost}, unsigned 
*@var{epilogue_cost}, unsigned *@var{suggested_unroll_factor})
 This hook should complete calculations of the cost of vectorizing a loop
 or basic block based on @var{data}, and return the prologue, body, and
-epilogue costs as unsigned integers.  The default returns the value of
-the three accumulators.
+epilogue costs as unsigned integers.  It also asks the backend whether it
+has a @var{suggested_unroll_factor}.  The default returns the value of
+the three cost accumulators.
 @end deftypefn
 
 @deftypefn {Target Hook} void TARGET_VECTORIZE_DESTROY_COST_DATA (void 
*@var{data})
diff --git a/gcc/target.def b/gcc/target.def
index 
bfa819609c21bd71c0cc585c01dba42534453f47..df0f170ff3378671e802d82a8bce8e153d8cf8fe
 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -2078,11 +2078,12 @@ DEFHOOK
 (finish_cost,
  "This hook should complete calculations of the cost of vectorizing a loop\n\
 or basic block based on @var{data}, and return the prologue, body, and\n\
-epilogue costs as unsigned integers.  The default returns the value of\n\
-the three accumulators.",
+epilogue costs as unsigned integers.  It also asks the backend whether it\n\
+has a @var{suggested_unroll_factor}.  The default returns the value of\n\
+the three cost accumulators.",
  void,
  (void *data, unsigned *prologue_cost, unsigned *body_cost,
-  unsigned *epilogue_cost),
+  unsigned *epilogue_cost, unsigned *suggested_unroll_factor),
  default_finish_cost)
 
 /* Function to delete target-specific cost modeling data.  */
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 
92d51992e625c2497aa8496b1e2e3d916e5706fd..b9697c366876fe5a8c444ffcf58bdc6b5c33b0ad
 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -123,7 +123,8 @@ extern unsigned default_add_stmt_cost (class vec_info *, 
void *, int,
                                       enum vect_cost_for_stmt,
                                       class _stmt_vec_info *, tree, int,
                                       enum vect_cost_model_location);
-extern void default_finish_cost (void *, unsigned *, unsigned *, unsigned *);
+extern void default_finish_cost (void *, unsigned *, unsigned *, unsigned *,
+                                unsigned *);
 extern void default_destroy_cost_data (void *);
 
 /* OpenACC hooks.  */
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index 
c9b5208853dbc15706a65d1eb335e28e0564325e..8552d9a0f144e7bcee3f2653f2ea84ea677f80a2
 100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -1518,13 +1518,18 @@ default_add_stmt_cost (class vec_info *vinfo, void 
*data, int count,
 /* By default, the cost model just returns the accumulated costs.  */
 
 void
-default_finish_cost (void *data, unsigned *prologue_cost,
-                    unsigned *body_cost, unsigned *epilogue_cost)
+default_finish_cost (void *data,
+                    unsigned *prologue_cost, unsigned *body_cost,
+                    unsigned *epilogue_cost,
+                    unsigned *suggested_unroll_factor)
 {
   unsigned *cost = (unsigned *) data;
   *prologue_cost = cost[vect_prologue];
   *body_cost     = cost[vect_body];
   *epilogue_cost = cost[vect_epilogue];
+  /* Do not unroll.  */
+  if (suggested_unroll_factor)
+    *suggested_unroll_factor = 1;
 }
 
 /* Free the cost data.  */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 
5a5b8da2e771a1dd204f22a6447eba96bb3b352c..1bfe2e4f989143f4415c6c5b4a0b902ef1e00d66
 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -153,7 +153,8 @@ along with GCC; see the file COPYING3.  If not see
    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 */
 
-static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
+static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
+                                               unsigned *);
 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
                                               bool *, bool *);
 
@@ -277,7 +278,8 @@ vect_determine_vf_for_stmt (vec_info *vinfo,
 */
 
 static opt_result
-vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
+vect_determine_vectorization_factor (loop_vec_info loop_vinfo,
+                                    poly_uint64 min_vf)
 {
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
@@ -354,6 +356,28 @@ vect_determine_vectorization_factor (loop_vec_info 
loop_vinfo)
         }
     }
 
+  /* Apply the suggested unrolling factor, this was determined by the backend
+     during finish_cost the first time we ran the analysis for this
+     vector mode.  */
+  if (loop_vinfo->suggested_unroll_factor > 1)
+    {
+      poly_uint64 unrolled_vf
+       = vectorization_factor * loop_vinfo->suggested_unroll_factor;
+      unsigned HOST_WIDE_INT max_vf = estimated_poly_value (unrolled_vf,
+                                                           POLY_VALUE_MAX);
+      /* Make sure the unrolled vectorization factor fits the min and max
+         vectorization factor.  */
+       if (max_vf <= LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)
+         && known_le (unrolled_vf, min_vf))
+       vectorization_factor = unrolled_vf;
+      else if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "Unrolling failed due to unroll factor not fitting in"
+                        " range of min and max vectorization factor:"
+                        " [%d, %d]\n",
+                        min_vf, max_vf);
+    }
+
   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
   if (dump_enabled_p ())
     {
@@ -828,6 +852,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, 
vec_info_shared *shared)
     skip_main_loop_edge (nullptr),
     skip_this_loop_edge (nullptr),
     reusable_accumulators (),
+    suggested_unroll_factor (1),
     max_vectorization_factor (0),
     mask_skip_niters (NULL_TREE),
     rgroup_compare_type (NULL_TREE),
@@ -1829,7 +1854,8 @@ vect_known_niters_smaller_than_vf (loop_vec_info 
loop_vinfo)
    definitely no, or -1 if it's worth retrying.  */
 
 static int
-vect_analyze_loop_costing (loop_vec_info loop_vinfo)
+vect_analyze_loop_costing (loop_vec_info loop_vinfo,
+                          unsigned *suggested_unroll_factor)
 {
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
@@ -1863,7 +1889,8 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
 
   int min_profitable_iters, min_profitable_estimate;
   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
-                                     &min_profitable_estimate);
+                                     &min_profitable_estimate,
+                                     suggested_unroll_factor);
 
   if (min_profitable_iters < 0)
     {
@@ -2128,10 +2155,16 @@ vect_determine_partial_vectors_and_peeling 
(loop_vec_info loop_vinfo,
         vectors to the epilogue, with the main loop continuing to operate
         on full vectors.
 
+        If we are unrolling we also do not want to use partial vectors. This
+        is to avoid the overhead of generating multiple masks and also to
+        avoid having to execute entire iterations of FALSE masked instructions
+        when dealing with one or less full iterations.
+
         ??? We could then end up failing to use partial vectors if we
         decide to peel iterations into a prologue, and if the main loop
         then ends up processing fewer than VF iterations.  */
-      if (param_vect_partial_vector_usage == 1
+      if ((param_vect_partial_vector_usage == 1
+          || loop_vinfo->suggested_unroll_factor > 1)
          && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
          && !vect_known_niters_smaller_than_vf (loop_vinfo))
        LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
@@ -2198,13 +2231,16 @@ vect_determine_partial_vectors_and_peeling 
(loop_vec_info loop_vinfo,
    for it.  The different analyses will record information in the
    loop_vec_info struct.  */
 static opt_result
-vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
+vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts,
+                    unsigned *suggested_unroll_factor,
+                    poly_uint64 min_vf = 2)
 {
   opt_result ok = opt_result::success ();
   int res;
   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
-  poly_uint64 min_vf = 2;
   loop_vec_info orig_loop_vinfo = NULL;
+  if (*suggested_unroll_factor > 1)
+    max_vf = estimated_poly_value (min_vf, POLY_VALUE_MAX);
 
   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
      loop_vec_info of the first vectorized loop.  */
@@ -2308,11 +2344,12 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool 
&fatal, unsigned *n_stmts)
       return ok;
     }
   if (max_vf != MAX_VECTORIZATION_FACTOR
-      && maybe_lt (max_vf, min_vf))
+      && loop_vinfo->suggested_unroll_factor == 1
+      && max_vf < estimated_poly_value (min_vf, POLY_VALUE_MAX))
     return opt_result::failure_at (vect_location, "bad data dependence.\n");
   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
 
-  ok = vect_determine_vectorization_factor (loop_vinfo);
+  ok = vect_determine_vectorization_factor (loop_vinfo, min_vf);
   if (!ok)
     {
       if (dump_enabled_p ())
@@ -2321,7 +2358,9 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool 
&fatal, unsigned *n_stmts)
       return ok;
     }
   if (max_vf != MAX_VECTORIZATION_FACTOR
-      && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
+      && loop_vinfo->suggested_unroll_factor == 1
+      && max_vf < estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+                                       POLY_VALUE_MAX))
     return opt_result::failure_at (vect_location, "bad data dependence.\n");
 
   /* Compute the scalar iteration cost.  */
@@ -2547,7 +2586,7 @@ start_over:
     return ok;
 
   /* Check the costings of the loop make vectorizing worthwhile.  */
-  res = vect_analyze_loop_costing (loop_vinfo);
+  res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
   if (res < 0)
     {
       ok = opt_result::failure_at (vect_location,
@@ -2879,6 +2918,122 @@ vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
   return true;
 }
 
+/* Determine whether we can unroll this loop.  */
+
+static bool
+vect_can_unroll (loop_vec_info loop_vinfo)
+{
+  stmt_vec_info stmt_info;
+  unsigned i;
+  poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+
+  if (known_le (vectorization_factor, 1U))
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "will not unroll loop with a VF of 1 or less\n");
+      return false;
+    }
+
+  FOR_EACH_VEC_ELT (loop_vinfo->stmt_vec_infos, i, stmt_info)
+    {
+      if (STMT_VINFO_IN_PATTERN_P (stmt_info)
+         || !STMT_VINFO_RELEVANT_P (stmt_info)
+         || stmt_info->vectype == NULL_TREE)
+       continue;
+      /* Do not unroll loops with negative steps as it is unlikely that
+        vectorization will succeed due to the way we deal with negative steps
+        in loads and stores in 'get_load_store_type'.  */
+      if (stmt_info->dr_aux.dr
+         && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+       {
+         dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
+         tree step = vect_dr_behavior (loop_vinfo, dr_info)->step;
+         if (TREE_CODE (step) == INTEGER_CST
+             && tree_int_cst_compare (step, size_zero_node) < 0)
+           {
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_NOTE, vect_location,
+                                "could not unroll due to negative step\n");
+             return false;
+           }
+       }
+
+      if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
+       {
+         auto red_info = info_for_reduction (loop_vinfo, stmt_info);
+         if (STMT_VINFO_REDUC_TYPE (red_info) != TREE_CODE_REDUCTION)
+           {
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_NOTE, vect_location,
+                                "could not unroll loop with reduction due to "
+                               "non TREE_CODE_REDUCTION\n");
+             return false;
+           }
+       }
+    }
+
+  return true;
+}
+
+
+/* Try to unroll the current loop.  First determine the unrolling factor using
+   the analysis done for the current vector mode.  Then re-analyze the loop for
+   the given unrolling factor and the current vector mode.  */
+
+static opt_loop_vec_info
+vect_try_unrolling (loop_vec_info loop_vinfo, unsigned *n_stmts,
+                   unsigned suggested_unroll_factor)
+{
+  DUMP_VECT_SCOPE ("vect_try_unrolling");
+
+  if (suggested_unroll_factor == 1)
+    return opt_loop_vec_info::failure_at (vect_location,
+                                         "*** Target determined unrolling is"
+                                         " not profitable.\n");
+
+  if (!vect_can_unroll (loop_vinfo))
+    return opt_loop_vec_info::failure_at (vect_location,
+                                         "*** Can not unroll this loop.\n");
+
+  loop_vec_info unrolled_vinfo
+    = opt_loop_vec_info::success (vect_analyze_loop_form (loop_vinfo->loop,
+                                                         loop_vinfo->shared));
+  unrolled_vinfo->vector_mode = loop_vinfo->vector_mode;
+
+  /* Use the suggested_unroll_factor that was returned by the target's
+     TARGET_VECTORIZE_FINISH_COST hook.  */
+  unrolled_vinfo->suggested_unroll_factor = suggested_unroll_factor;
+  poly_uint64 unrolled_vf
+    = LOOP_VINFO_VECT_FACTOR (loop_vinfo) * suggested_unroll_factor;
+
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                    "***** unrolling factor %d chosen for vector mode %s,"
+                    "re-trying analyzis...\n",
+                    suggested_unroll_factor,
+                    GET_MODE_NAME (unrolled_vinfo->vector_mode));
+  bool unrolling_fatal = false;
+  if (vect_analyze_loop_2 (unrolled_vinfo, unrolling_fatal, n_stmts,
+                          &suggested_unroll_factor,
+                          unrolled_vf)
+      && known_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+                  LOOP_VINFO_VECT_FACTOR (unrolled_vinfo)))
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "unrolling succeeded with factor = %d\n",
+                        suggested_unroll_factor);
+      unrolled_vinfo->loop->aux = NULL;
+      return opt_loop_vec_info::success (unrolled_vinfo);
+    }
+
+  loop_vinfo->loop->aux = NULL;
+  return opt_loop_vec_info::failure_at (vect_location,
+                                       "unrolling failed with factor = %d\n",
+                                       suggested_unroll_factor);
+}
+
 /* If LOOP_VINFO is already a main loop, return it unmodified.  Otherwise
    try to reanalyze it as a main loop.  Return the loop_vinfo on success
    and null on failure.  */
@@ -2902,8 +3057,16 @@ vect_reanalyze_as_main_loop (loop_vec_info loop_vinfo, 
unsigned int *n_stmts)
   main_loop_vinfo->vector_mode = loop_vinfo->vector_mode;
 
   bool fatal = false;
-  bool res = vect_analyze_loop_2 (main_loop_vinfo, fatal, n_stmts);
+  unsigned suggested_unroll_factor = 1;
+  bool res = vect_analyze_loop_2 (main_loop_vinfo, fatal, n_stmts,
+                                 &suggested_unroll_factor);
   loop->aux = NULL;
+  opt_loop_vec_info unrolled_vinfo
+    = opt_loop_vec_info::success (vect_try_unrolling (main_loop_vinfo, n_stmts,
+                                                     suggested_unroll_factor));
+  if (unrolled_vinfo)
+    main_loop_vinfo = unrolled_vinfo;
+
   if (!res)
     {
       if (dump_enabled_p ())
@@ -2960,6 +3123,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared 
*shared)
   bool vect_epilogues = false;
   opt_result res = opt_result::success ();
   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
+  unsigned suggested_unroll_factor = 1;
   while (1)
     {
       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
@@ -3007,7 +3171,8 @@ vect_analyze_loop (class loop *loop, vec_info_shared 
*shared)
       if (vect_epilogues)
        LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
 
-      res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
+      res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts,
+                                &suggested_unroll_factor);
       if (mode_i == 0)
        autodetected_vector_mode = loop_vinfo->vector_mode;
       if (dump_enabled_p ())
@@ -3038,6 +3203,18 @@ vect_analyze_loop (class loop *loop, vec_info_shared 
*shared)
 
       if (res)
        {
+         /* Only try unrolling main loops.  */
+         if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
+           {
+             opt_loop_vec_info unrolled_vinfo =
+               vect_try_unrolling (loop_vinfo, &n_stmts,
+                                   suggested_unroll_factor);
+             if (unrolled_vinfo)
+               loop_vinfo = unrolled_vinfo;
+             /* Reset suggested_unroll_factor for next loop_vinfo.  */
+             suggested_unroll_factor = 1;
+           }
+
          LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
          vectorized_loops++;
 
@@ -3056,13 +3233,26 @@ vect_analyze_loop (class loop *loop, vec_info_shared 
*shared)
              /* Keep trying to roll back vectorization attempts while the
                 loop_vec_infos they produced were worse than this one.  */
              vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
+             poly_uint64 vinfo_vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+             poly_uint64 first_vinfo_vf
+               = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
              while (!vinfos.is_empty ()
+                    && (known_lt (vinfo_vf, first_vinfo_vf)
+                        || (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
+                            && maybe_eq (vinfo_vf, first_vinfo_vf)))
                     && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
                {
                  gcc_assert (vect_epilogues);
                  delete vinfos.pop ();
                }
+             /* Check if we may want to replace the current first_loop_vinfo
+                with the new loop, but only if they have different vector
+                modes.  If they have the same vector mode this means the main
+                loop is an unrolled loop and we are trying to vectorize the
+                epilogue using the same vector mode but with a lower
+                vectorization factor.  */
              if (vinfos.is_empty ()
+                 && loop_vinfo->vector_mode != first_loop_vinfo->vector_mode
                  && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
                {
                  loop_vec_info main_loop_vinfo
@@ -3105,14 +3295,34 @@ vect_analyze_loop (class loop *loop, vec_info_shared 
*shared)
                   /* For now only allow one epilogue loop.  */
                   && first_loop_vinfo->epilogue_vinfos.is_empty ())
            {
-             first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
-             poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
-             gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
-                         || maybe_ne (lowest_th, 0U));
-             /* Keep track of the known smallest versioning
-                threshold.  */
-             if (ordered_p (lowest_th, th))
-               lowest_th = ordered_min (lowest_th, th);
+             /* Ensure the epilogue has a smaller VF than the main loop or
+                uses predication and has the same VF.  */
+             if (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+                           LOOP_VINFO_VECT_FACTOR (first_loop_vinfo))
+                 || (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
+                     && maybe_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+                                  LOOP_VINFO_VECT_FACTOR (first_loop_vinfo))))
+               {
+                 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
+                 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
+                 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
+                             || maybe_ne (lowest_th, 0U));
+                 /* Keep track of the known smallest versioning
+                    threshold.  */
+                 if (ordered_p (lowest_th, th))
+                   lowest_th = ordered_min (lowest_th, th);
+               }
+             else
+               {
+                 if (dump_enabled_p ())
+                   dump_printf_loc (MSG_NOTE, vect_location,
+                                    "***** Will not use %s mode as an"
+                                    " epilogue, since it leads to an higher"
+                                    " vectorization factor than main loop\n",
+                                    GET_MODE_NAME (loop_vinfo->vector_mode));
+                 delete loop_vinfo;
+                 loop_vinfo = opt_loop_vec_info::success (NULL);
+               }
            }
          else
            {
@@ -3153,13 +3363,32 @@ vect_analyze_loop (class loop *loop, vec_info_shared 
*shared)
 
       /* Handle the case that the original loop can use partial
         vectorization, but want to only adopt it for the epilogue.
-        The retry should be in the same mode as original.  */
+        The retry should be in the same mode as original.
+        Also handle the case where we have unrolled the main loop and want to
+        retry all vector modes again for the epilogues, since the VF is now
+        at least twice as high as the current vector mode.  */
       if (vect_epilogues
          && loop_vinfo
-         && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
+         && (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo)
+             || loop_vinfo->suggested_unroll_factor > 1))
        {
-         gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+         gcc_assert ((LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+                      || loop_vinfo->suggested_unroll_factor > 1)
                      && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
+         /* If we are unrolling, try all VECTOR_MODES for the epilogue.  */
+         if (loop_vinfo->suggested_unroll_factor > 1)
+           {
+             next_vector_mode = vector_modes[0];
+             mode_i = 1;
+
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_NOTE, vect_location,
+                                "***** Re-trying analysis with vector mode"
+                                " %s for epilogues after unrolling.\n",
+                                GET_MODE_NAME (next_vector_mode));
+             continue;
+           }
+
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "***** Re-trying analysis with same vector mode"
@@ -3862,7 +4091,8 @@ vect_get_known_peeling_cost (loop_vec_info loop_vinfo, 
int peel_iters_prologue,
 static void
 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
                                    int *ret_min_profitable_niters,
-                                   int *ret_min_profitable_estimate)
+                                   int *ret_min_profitable_estimate,
+                                   unsigned *suggested_unroll_factor)
 {
   int min_profitable_iters;
   int min_profitable_estimate;
@@ -4222,8 +4452,9 @@ vect_estimate_min_profitable_iters (loop_vec_info 
loop_vinfo,
     }
 
   /* Complete the target-specific cost calculations.  */
-  finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
-              &vec_inside_cost, &vec_epilogue_cost);
+  finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo),
+              &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
+              suggested_unroll_factor);
 
   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
 
@@ -7212,7 +7443,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
    participating.  */
   if (ncopies > 1
       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
-      && reduc_chain_length == 1)
+      && reduc_chain_length == 1
+      && loop_vinfo->suggested_unroll_factor == 1)
     single_defuse_cycle = true;
 
   if (single_defuse_cycle || lane_reduc_code_p)
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 
024a1c38a2342246d7891db1de5f1d6e6458d5dd..a8a6c6a19ed4c98144f9097467c59386fdbe8233
 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -5418,8 +5418,8 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
        }
       while (vi < li_vector_costs.length ()
             && li_vector_costs[vi].first == vl);
-      finish_cost (vect_target_cost_data, &vec_prologue_cost,
-                  &vec_inside_cost, &vec_epilogue_cost);
+      finish_cost (vect_target_cost_data, &vec_prologue_cost, &vec_inside_cost,
+                  &vec_epilogue_cost);
       destroy_cost_data (vect_target_cost_data);
 
       vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 
c4c5678e7f1abafc25c465319dbacf3ef50f0ae9..8b182cd34e7d6a8d9e55a9c1003900b8216a952f
 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -621,6 +621,13 @@ public:
      about the reductions that generated them.  */
   hash_map<tree, vect_reusable_accumulator> reusable_accumulators;
 
+  /* The number of times that the target suggested we unroll the vector loop
+     in order to promote more ILP.  This value will be used to re-analyze the
+     loop for vectorization and if successful the value will be folded into
+     vectorization_factor (and therefore exactly divides
+     vectorization_factor).  */
+  unsigned int suggested_unroll_factor;
+
   /* Maximum runtime vectorization factor, or MAX_VECTORIZATION_FACTOR
      if there is no particular limit.  */
   unsigned HOST_WIDE_INT max_vectorization_factor;
@@ -1571,9 +1578,11 @@ add_stmt_cost (vec_info *vinfo, void *data, 
stmt_info_for_cost *i)
 
 static inline void
 finish_cost (void *data, unsigned *prologue_cost,
-            unsigned *body_cost, unsigned *epilogue_cost)
+            unsigned *body_cost, unsigned *epilogue_cost,
+            unsigned *suggested_unroll_factor = NULL)
 {
-  targetm.vectorize.finish_cost (data, prologue_cost, body_cost, 
epilogue_cost);
+  targetm.vectorize.finish_cost (data, prologue_cost, body_cost, epilogue_cost,
+                                suggested_unroll_factor);
 }
 
 /* Alias targetm.vectorize.destroy_cost_data.  */

Reply via email to