Hi Richard,

As I mentioned in the IRC channel, this is my current work in progress patch. It currently ICE's when vectorizing gcc/testsuite/gcc.c-torture/execute/nestfunc-2.c with '-O3' and '--param vect-epilogues-nomask=1'.

It ICE's because the epilogue loop (after if conversion) and main loop (before vectorization) are not the same, there are a bunch of extra BBs and the epilogue loop seems to need some cleaning up too.

Let me know if you see a way around this issue.

Cheers,
Andre
diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h
index 0b0154ffd7bf031a005de993b101d9db6dd98c43..d01512ea46467f1cf77793bdc75b48e71b0b9641 100644
--- a/gcc/cfgloop.h
+++ b/gcc/cfgloop.h
@@ -21,6 +21,7 @@ along with GCC; see the file COPYING3.  If not see
 #define GCC_CFGLOOP_H
 
 #include "cfgloopmanip.h"
+#include "target.h"
 
 /* Structure to hold decision about unrolling/peeling.  */
 enum lpt_dec
@@ -268,6 +269,9 @@ public:
      the basic-block from being collected but its index can still be
      reused.  */
   basic_block former_header;
+
+  /* Keep track of vector sizes we know we can vectorize the epilogue with.  */
+  vector_sizes epilogue_vsizes;
 };
 
 /* Set if the loop is known to be infinite.  */
diff --git a/gcc/cfgloop.c b/gcc/cfgloop.c
index 4ad1f658708f83dbd8789666c26d4bd056837bc6..f3e81bcd00b3f125389aa15b12dc5201b3578d20 100644
--- a/gcc/cfgloop.c
+++ b/gcc/cfgloop.c
@@ -198,6 +198,7 @@ flow_loop_free (class loop *loop)
       exit->prev = exit;
     }
 
+  loop->epilogue_vsizes.release();
   ggc_free (loop->exits);
   ggc_free (loop);
 }
@@ -355,6 +356,7 @@ alloc_loop (void)
   loop->nb_iterations_upper_bound = 0;
   loop->nb_iterations_likely_upper_bound = 0;
   loop->nb_iterations_estimate = 0;
+  loop->epilogue_vsizes.create(8);
   return loop;
 }
 
diff --git a/gcc/gengtype.c b/gcc/gengtype.c
index 53317337cf8c8e8caefd6b819d28b3bba301e755..80fb6ef71465b24e034fa45d69fec56be6b2e7f8 100644
--- a/gcc/gengtype.c
+++ b/gcc/gengtype.c
@@ -5197,6 +5197,7 @@ main (int argc, char **argv)
       POS_HERE (do_scalar_typedef ("widest_int", &pos));
       POS_HERE (do_scalar_typedef ("int64_t", &pos));
       POS_HERE (do_scalar_typedef ("poly_int64", &pos));
+      POS_HERE (do_scalar_typedef ("poly_uint64", &pos));
       POS_HERE (do_scalar_typedef ("uint64_t", &pos));
       POS_HERE (do_scalar_typedef ("uint8", &pos));
       POS_HERE (do_scalar_typedef ("uintptr_t", &pos));
@@ -5206,6 +5207,7 @@ main (int argc, char **argv)
       POS_HERE (do_scalar_typedef ("machine_mode", &pos));
       POS_HERE (do_scalar_typedef ("fixed_size_mode", &pos));
       POS_HERE (do_scalar_typedef ("CONSTEXPR", &pos));
+      POS_HERE (do_scalar_typedef ("vector_sizes", &pos));
       POS_HERE (do_typedef ("PTR", 
 			    create_pointer (resolve_typedef ("void", &pos)),
 			    &pos));
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index 5c25441c70a271f04730486e513437fffa75b7e3..b1c13dafdeeec8d95f00c232822d3ab9b11f4046 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -26,6 +26,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree.h"
 #include "gimple.h"
 #include "cfghooks.h"
+#include "tree-if-conv.h"
 #include "tree-pass.h"
 #include "ssa.h"
 #include "fold-const.h"
@@ -1730,6 +1731,7 @@ vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
 {
   unsigned int i;
   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
+  vec<data_reference> datarefs_copy = loop_vinfo->shared->datarefs_copy;
   struct data_reference *dr;
 
   DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
@@ -1756,6 +1758,12 @@ vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
       if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
 	vect_update_init_of_dr (dr, niters, code);
     }
+  FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
+    {
+      dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
+      if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
+	vect_update_init_of_dr (dr, niters, code);
+    }
 }
 
 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
@@ -2409,6 +2417,8 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
   profile_probability prob_prolog, prob_vector, prob_epilog;
   int estimated_vf;
   int prolog_peeling = 0;
+  bool vect_epilogues
+    = loop_vinfo->epilogue_vinfos.length () > 0;
   /* We currently do not support prolog peeling if the target alignment is not
      known at compile time.  'vect_gen_prolog_loop_niters' depends on the
      target alignment being constant.  */
@@ -2469,12 +2479,12 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
   /* Prolog loop may be skipped.  */
   bool skip_prolog = (prolog_peeling != 0);
   /* Skip to epilog if scalar loop may be preferred.  It's only needed
-     when we peel for epilog loop and when it hasn't been checked with
-     loop versioning.  */
+     when we peel for epilog loop or when we loop version.  */
   bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
 		      ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
 				  bound_prolog + bound_epilog)
-		      : !LOOP_REQUIRES_VERSIONING (loop_vinfo));
+		      : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
+			 || vect_epilogues));
   /* Epilog loop must be executed if the number of iterations for epilog
      loop is known at compile time, otherwise we need to add a check at
      the end of vector loop and skip to the end of epilog loop.  */
@@ -2586,6 +2596,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 	}
       /* Peel epilog and put it on exit edge of loop.  */
       epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
+
       if (!epilog)
 	{
 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
@@ -2966,9 +2977,7 @@ vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
    *COND_EXPR_STMT_LIST.  */
 
 class loop *
-vect_loop_versioning (loop_vec_info loop_vinfo,
-		      unsigned int th, bool check_profitability,
-		      poly_uint64 versioning_threshold)
+vect_loop_versioning (loop_vec_info loop_vinfo)
 {
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
   class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
@@ -2988,10 +2997,15 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
   bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
   bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
   bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
+  poly_uint64 versioning_threshold
+    = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
   tree version_simd_if_cond
     = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
+  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
 
-  if (check_profitability)
+  if (th >= vect_vf_for_cost (loop_vinfo)
+      && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+      && !ordered_p (th, versioning_threshold))
     cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
 			     build_int_cst (TREE_TYPE (scalar_loop_iters),
 					    th - 1));
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index b0cbbac0cb5ba1ffce706715d3dbb9139063803d..3c355eccc5bef6668456fddf485b4996f2d2fb38 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -885,6 +885,8 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
 	    }
 	}
     }
+
+  epilogue_vinfos.create (6);
 }
 
 /* Free all levels of MASKS.  */
@@ -960,6 +962,7 @@ _loop_vec_info::~_loop_vec_info ()
   release_vec_loop_masks (&masks);
   delete ivexpr_map;
   delete scan_map;
+  epilogue_vinfos.release ();
 
   loop->aux = NULL;
 }
@@ -1726,7 +1729,13 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
       return 0;
     }
 
-  HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
+  HOST_WIDE_INT estimated_niter = -1;
+
+  if (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
+    estimated_niter
+      = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
+  if (estimated_niter == -1)
+    estimated_niter = estimated_stmt_executions_int (loop);
   if (estimated_niter == -1)
     estimated_niter = likely_max_stmt_executions_int (loop);
   if (estimated_niter != -1
@@ -1864,6 +1873,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
   int res;
   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
   poly_uint64 min_vf = 2;
+  loop_vec_info orig_loop_vinfo = NULL;
 
   /* The first group of checks is independent of the vector size.  */
   fatal = true;
@@ -2183,9 +2193,12 @@ start_over:
      enough for both peeled prolog loop and vector loop.  This check
      can be merged along with threshold check of loop versioning, so
      increase threshold for this case if necessary.  */
-  if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
+  if (LOOP_REQUIRES_VERSIONING (loop_vinfo)
+      || ((orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
+	  && LOOP_REQUIRES_VERSIONING (orig_loop_vinfo)))
     {
       poly_uint64 niters_th = 0;
+      unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
 
       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
 	{
@@ -2206,6 +2219,14 @@ start_over:
       /* One additional iteration because of peeling for gap.  */
       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
 	niters_th += 1;
+
+      /*  Use the same condition as vect_transform_loop to decide when to use
+	  the cost to determine a versioning threshold.  */
+      if (th >= vect_vf_for_cost (loop_vinfo)
+	  && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+	  && ordered_p (th, niters_th))
+	niters_th = ordered_max (poly_uint64 (th), niters_th);
+
       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
     }
 
@@ -2329,14 +2350,8 @@ again:
    be vectorized.  */
 opt_loop_vec_info
 vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
-		   vec_info_shared *shared)
+		   vec_info_shared *shared, vector_sizes vector_sizes)
 {
-  auto_vector_sizes vector_sizes;
-
-  /* Autodetect first vector size we try.  */
-  current_vector_size = 0;
-  targetm.vectorize.autovectorize_vector_sizes (&vector_sizes,
-						loop->simdlen != 0);
   unsigned int next_size = 0;
 
   DUMP_VECT_SCOPE ("analyze_loop_nest");
@@ -2357,6 +2372,9 @@ vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
   poly_uint64 autodetected_vector_size = 0;
   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
   poly_uint64 first_vector_size = 0;
+  poly_uint64 lowest_th = 0;
+  unsigned vectorized_loops = 0;
+  bool vect_epilogues = !loop->simdlen && PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK);
   while (1)
     {
       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
@@ -2375,24 +2393,53 @@ vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
 
       if (orig_loop_vinfo)
 	LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
+      else if (vect_epilogues && first_loop_vinfo)
+	{
+	  LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
+	}
 
       opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
       if (res)
 	{
 	  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
+	  vectorized_loops++;
 
-	  if (loop->simdlen
-	      && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
-			   (unsigned HOST_WIDE_INT) loop->simdlen))
+	  if ((loop->simdlen
+	       && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+			    (unsigned HOST_WIDE_INT) loop->simdlen))
+	      || vect_epilogues)
 	    {
 	      if (first_loop_vinfo == NULL)
 		{
 		  first_loop_vinfo = loop_vinfo;
+		  lowest_th
+		    = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
 		  first_vector_size = current_vector_size;
 		  loop->aux = NULL;
 		}
 	      else
-		delete loop_vinfo;
+		{
+		  /* Keep track of vector sizes that we know we can vectorize
+		     the epilogue with.  */
+		  if (vect_epilogues)
+		    {
+		      loop->epilogue_vsizes.reserve (1);
+		      loop->epilogue_vsizes.quick_push (current_vector_size);
+		      first_loop_vinfo->epilogue_vinfos.reserve (1);
+		      first_loop_vinfo->epilogue_vinfos.quick_push (loop_vinfo);
+		      LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
+		      poly_uint64 th
+			= LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
+		      gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
+				  || maybe_ne (lowest_th, 0U));
+		      /* Keep track of the known smallest versioning threshold.
+		       */
+		      if (ordered_p (lowest_th, th))
+			lowest_th = ordered_min (lowest_th, th);
+		    }
+		  else
+		    delete loop_vinfo;
+		}
 	    }
 	  else
 	    {
@@ -2430,6 +2477,8 @@ vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
 		  dump_dec (MSG_NOTE, current_vector_size);
 		  dump_printf (MSG_NOTE, "\n");
 		}
+	      LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
+
 	      return first_loop_vinfo;
 	    }
 	  else
@@ -8483,6 +8532,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
   gimple *stmt;
   bool check_profitability = false;
   unsigned int th;
+  auto_vec<gimple *> orig_stmts;
 
   DUMP_VECT_SCOPE ("vec_transform_loop");
 
@@ -8497,11 +8547,11 @@ vect_transform_loop (loop_vec_info loop_vinfo)
   if (th >= vect_vf_for_cost (loop_vinfo)
       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
     {
-      if (dump_enabled_p ())
-	dump_printf_loc (MSG_NOTE, vect_location,
-			 "Profitability threshold is %d loop iterations.\n",
-                         th);
-      check_profitability = true;
+	if (dump_enabled_p ())
+	  dump_printf_loc (MSG_NOTE, vect_location,
+			   "Profitability threshold is %d loop iterations.\n",
+			   th);
+	check_profitability = true;
     }
 
   /* Make sure there exists a single-predecessor exit bb.  Do this before 
@@ -8519,18 +8569,8 @@ vect_transform_loop (loop_vec_info loop_vinfo)
 
   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
     {
-      poly_uint64 versioning_threshold
-	= LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
-      if (check_profitability
-	  && ordered_p (poly_uint64 (th), versioning_threshold))
-	{
-	  versioning_threshold = ordered_max (poly_uint64 (th),
-					      versioning_threshold);
-	  check_profitability = false;
-	}
       class loop *sloop
-	= vect_loop_versioning (loop_vinfo, th, check_profitability,
-				versioning_threshold);
+	= vect_loop_versioning (loop_vinfo);
       sloop->force_vectorize = false;
       check_profitability = false;
     }
@@ -8558,6 +8598,66 @@ vect_transform_loop (loop_vec_info loop_vinfo)
   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
 			      &step_vector, &niters_vector_mult_vf, th,
 			      check_profitability, niters_no_overflow);
+  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
+    epilogue = NULL;
+
+  if (loop_vinfo->epilogue_vinfos.length () == 0)
+    epilogue = NULL;
+
+  /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
+     on niters already ajusted for the iterations of the prologue.  */
+  if (epilogue && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+      && known_eq (vf, lowest_vf))
+    {
+      vector_sizes vector_sizes = loop->epilogue_vsizes;
+      unsigned int next_size = 0;
+      unsigned HOST_WIDE_INT eiters
+	= (LOOP_VINFO_INT_NITERS (loop_vinfo)
+	   - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
+      eiters
+	= eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
+      epilogue->nb_iterations_upper_bound = eiters - 1;
+      epilogue->any_upper_bound = true;
+
+      unsigned int ratio;
+      while (next_size < vector_sizes.length ()
+	     && !(constant_multiple_p (current_vector_size,
+				       vector_sizes[next_size], &ratio)
+		  && eiters >= lowest_vf / ratio))
+	next_size += 1;
+
+      if (next_size == vector_sizes.length ())
+	epilogue = NULL;
+    }
+
+  if (epilogue)
+    {
+      loop_vec_info epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
+      loop_vinfo->epilogue_vinfos.ordered_remove (0);
+      epilogue->aux = epilogue_vinfo;
+      LOOP_VINFO_LOOP (epilogue_vinfo) = epilogue;
+      epilogue->simduid = loop->simduid;
+
+      epilogue->force_vectorize = loop->force_vectorize;
+      epilogue->safelen = loop->safelen;
+      epilogue->dont_vectorize = false;
+
+      /* update stmts in stmt_vec_info for epilog */
+      gimple_stmt_iterator gsi;
+      gphi_iterator phi_gsi;
+      basic_block *bbs = get_loop_body (loop);
+
+      for (unsigned i = 0; i < loop->num_nodes; ++i)
+	{
+	  for (phi_gsi = gsi_start_phis (bbs[i]); !gsi_end_p (phi_gsi);
+	       gsi_next (&phi_gsi))
+	    orig_stmts.safe_push (phi_gsi.phi ());
+
+	  for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
+	    orig_stmts.safe_push (gsi_stmt (gsi));
+	}
+    }
+
   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
     scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
@@ -8814,57 +8914,86 @@ vect_transform_loop (loop_vec_info loop_vinfo)
      since vectorized loop can have loop-carried dependencies.  */
   loop->safelen = 0;
 
-  /* Don't vectorize epilogue for epilogue.  */
-  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
-    epilogue = NULL;
-
-  if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
-    epilogue = NULL;
 
   if (epilogue)
     {
-      auto_vector_sizes vector_sizes;
-      targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
-      unsigned int next_size = 0;
+      if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
+	tree_if_conversion (epilogue);
 
-      /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
-         on niters already ajusted for the iterations of the prologue.  */
-      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-	  && known_eq (vf, lowest_vf))
+      loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
+      hash_map<tree,tree> mapping;
+      auto_vec<stmt_vec_info> worklist;
+      basic_block *bbs = get_loop_body (epilogue);
+      gimple_stmt_iterator gsi;
+      gphi_iterator phi_gsi;
+      gimple * orig_stmt, * new_stmt;
+      stmt_vec_info stmt_vinfo = NULL;
+
+      LOOP_VINFO_BBS (epilogue_vinfo) = bbs;
+      for (unsigned i = 0; i < epilogue->num_nodes; ++i)
 	{
-	  unsigned HOST_WIDE_INT eiters
-	    = (LOOP_VINFO_INT_NITERS (loop_vinfo)
-	       - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
-	  eiters
-	    = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
-	  epilogue->nb_iterations_upper_bound = eiters - 1;
-	  epilogue->any_upper_bound = true;
-
-	  unsigned int ratio;
-	  while (next_size < vector_sizes.length ()
-		 && !(constant_multiple_p (current_vector_size,
-					   vector_sizes[next_size], &ratio)
-		      && eiters >= lowest_vf / ratio))
-	    next_size += 1;
+	  for (phi_gsi = gsi_start_phis (bbs[i]); !gsi_end_p (phi_gsi);
+	       gsi_next (&phi_gsi))
+	    {
+	      gcc_assert (orig_stmts.length () > 0);
+	      orig_stmt = orig_stmts[0];
+	      orig_stmts.ordered_remove (0);
+	      new_stmt = phi_gsi.phi ();
+
+	      stmt_vinfo
+		= epilogue_vinfo->lookup_stmt (orig_stmt);
+
+	      STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
+	      gimple_set_uid (new_stmt, gimple_uid (orig_stmt));
+
+	      mapping.put (gimple_phi_result (orig_stmt),
+			    gimple_phi_result (new_stmt));
+
+	      if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
+		worklist.safe_push (stmt_vinfo);
+	    }
+
+	  for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
+	    {
+	      gcc_assert (orig_stmts.length () > 0);
+	      orig_stmt = orig_stmts[0];
+	      orig_stmts.ordered_remove (0);
+	      new_stmt = gsi_stmt (gsi);
+
+	      stmt_vinfo
+		= epilogue_vinfo->lookup_stmt (orig_stmt);
+
+	      STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
+	      gimple_set_uid (new_stmt, gimple_uid (orig_stmt));
+
+	      if (is_gimple_assign (orig_stmt))
+		{
+		  gcc_assert (is_gimple_assign (new_stmt));
+		  mapping.put (gimple_assign_lhs (orig_stmt),
+			      gimple_assign_lhs (new_stmt));
+		}
+
+	      if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
+		worklist.safe_push (stmt_vinfo);
+	    }
+	  gcc_assert (orig_stmts.is_empty ());
 	}
-      else
-	while (next_size < vector_sizes.length ()
-	       && maybe_lt (current_vector_size, vector_sizes[next_size]))
-	  next_size += 1;
 
-      if (next_size == vector_sizes.length ())
-	epilogue = NULL;
-    }
+      for (unsigned i = 0; i < worklist.length (); ++i)
+	{
+	  tree *new_t;
+	  gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (worklist[i]);
 
-  if (epilogue)
-    {
-      epilogue->force_vectorize = loop->force_vectorize;
-      epilogue->safelen = loop->safelen;
-      epilogue->dont_vectorize = false;
+	  while (seq)
+	    {
+	      for (unsigned j = 1; j < gimple_num_ops (seq); ++j)
+		if ((new_t = mapping.get(gimple_op (seq, j))))
+		  gimple_set_op (seq, j, *new_t);
+	      seq = seq->next;
+	    }
+	}
 
-      /* We may need to if-convert epilogue to vectorize it.  */
-      if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
-	tree_if_conversion (epilogue);
+      vect_analyze_scalar_cycles (epilogue_vinfo);
     }
 
   return epilogue;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 1456cde4c2c2dec7244c504d2c496248894a4f1e..6e453d39190b362b6d5a01bc2167e10a617f91f9 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -564,6 +564,8 @@ public:
      this points to the original vectorized loop.  Otherwise NULL.  */
   _loop_vec_info *orig_loop_info;
 
+  vec<_loop_vec_info *> epilogue_vinfos;
+
 } *loop_vec_info;
 
 /* Access Functions.  */
@@ -1480,10 +1482,9 @@ extern void vect_set_loop_condition (class loop *, loop_vec_info,
 extern bool slpeel_can_duplicate_loop_p (const class loop *, const_edge);
 class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class loop *,
 						     class loop *, edge);
-class loop *vect_loop_versioning (loop_vec_info, unsigned int, bool,
-				   poly_uint64);
+class loop *vect_loop_versioning (loop_vec_info);
 extern class loop *vect_do_peeling (loop_vec_info, tree, tree,
-				     tree *, tree *, tree *, int, bool, bool);
+				    tree *, tree *, tree *, int, bool, bool);
 extern void vect_prepare_for_masked_peels (loop_vec_info);
 extern dump_user_location_t find_loop_location (class loop *);
 extern bool vect_can_advance_ivs_p (loop_vec_info);
@@ -1610,7 +1611,8 @@ extern bool check_reduction_path (dump_user_location_t, loop_p, gphi *, tree,
 /* Drive for loop analysis stage.  */
 extern opt_loop_vec_info vect_analyze_loop (class loop *,
 					    loop_vec_info,
-					    vec_info_shared *);
+					    vec_info_shared *,
+					    vector_sizes);
 extern tree vect_build_loop_niters (loop_vec_info, bool * = NULL);
 extern void vect_gen_vector_loop_niters (loop_vec_info, tree, tree *,
 					 tree *, bool);
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 173e6b51652fd023893b38da786ff28f827553b5..71bbf4fdf8dc7588c45a0e8feef9272b52c0c04c 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -875,6 +875,10 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
   vec_info_shared shared;
   auto_purge_vect_location sentinel;
   vect_location = find_loop_location (loop);
+  auto_vector_sizes auto_vector_sizes;
+  vector_sizes vector_sizes;
+  bool assert_versioning = false;
+
   if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
       && dump_enabled_p ())
     dump_printf (MSG_NOTE | MSG_PRIORITY_INTERNALS,
@@ -882,10 +886,35 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
 		 LOCATION_FILE (vect_location.get_location_t ()),
 		 LOCATION_LINE (vect_location.get_location_t ()));
 
+  /* If this is an epilogue, we already know what vector sizes we will use for
+     vectorization as the analyzis was part of the main vectorized loop.  Use
+     these instead of going through all vector sizes again.  */
+  if (orig_loop_vinfo
+      && !LOOP_VINFO_LOOP (orig_loop_vinfo)->epilogue_vsizes.is_empty ())
+    {
+      vector_sizes = LOOP_VINFO_LOOP (orig_loop_vinfo)->epilogue_vsizes;
+      assert_versioning = LOOP_REQUIRES_VERSIONING (orig_loop_vinfo);
+      current_vector_size = vector_sizes[0];
+    }
+  else
+    {
+      /* Autodetect first vector size we try.  */
+      current_vector_size = 0;
+
+      targetm.vectorize.autovectorize_vector_sizes (&auto_vector_sizes,
+						    loop->simdlen != 0);
+      vector_sizes = auto_vector_sizes;
+    }
+
   /* Try to analyze the loop, retaining an opt_problem if dump_enabled_p.  */
-  opt_loop_vec_info loop_vinfo
-    = vect_analyze_loop (loop, orig_loop_vinfo, &shared);
-  loop->aux = loop_vinfo;
+  opt_loop_vec_info loop_vinfo = opt_loop_vec_info::success (NULL);
+  if (loop_vec_info_for_loop (loop))
+    loop_vinfo = opt_loop_vec_info::success (loop_vec_info_for_loop (loop));
+  else
+    {
+      loop_vinfo = vect_analyze_loop (loop, orig_loop_vinfo, &shared, vector_sizes);
+      loop->aux = loop_vinfo;
+    }
 
   if (!loop_vinfo)
     if (dump_enabled_p ())
@@ -898,6 +927,10 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
 
   if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
     {
+      /* If this loops requires versioning, make sure the analyzis done on the
+	 epilogue loops succeeds.  */
+      gcc_assert (!assert_versioning);
+
       /* Free existing information if loop is analyzed with some
 	 assumptions.  */
       if (loop_constraint_set_p (loop, LOOP_C_FINITE))
@@ -1013,8 +1046,13 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
 
   /* Epilogue of vectorized loop must be vectorized too.  */
   if (new_loop)
-    ret |= try_vectorize_loop_1 (simduid_to_vf_htab, num_vectorized_loops,
-				 new_loop, loop_vinfo, NULL, NULL);
+    {
+      /* Don't include vectorized epilogues in the "vectorized loops" count.
+       */
+      unsigned dont_count = *num_vectorized_loops;
+      ret |= try_vectorize_loop_1 (simduid_to_vf_htab, &dont_count,
+				   new_loop, loop_vinfo, NULL, NULL);
+    }
 
   return ret;
 }

Reply via email to