[google][4.6][i386]Support autocloning for corei7 with -mvarch= option to remove LCP stalls in loops (issue5865043)

2012-03-20 Thread Sriraman Tallam
This patch adds support to version for corei7 with -mvarch option. The 
versioning supported is in the case where a loop generates a LCP stalling 
instruction in corei7. In such cases, on corei7, limiting the unroll factor to 
try to keep the unrolled loop body small enough to fit in the Corei7's loop 
stream detector can hide LCP stalls in loops. With mvarch, the function 
containing the loop is multi-versioned and one version is tagged with 
tune=corei7 so that the unroll factor can be limited on this version.

Please see: http://gcc.gnu.org/ml/gcc-patches/2011-12/msg01230.html for 
discussion on mvarch option.
Please see: http://gcc.gnu.org/ml/gcc-patches/2011-12/msg00123.html for 
discussion on LCP stalls in corei7.


The autocloning framework is only available in google/gcc-4_6 branch. I am 
working on porting this to trunk.

* config/i386/i386.c (find_himode_assigns): New function.
(mversionable_for_core2_p): Add new param version_number.
(mversionable_for_corei7_p): New function.
(ix86_mversion_function): Check for corei7 versioning.
* params.def (PARAM_MAX_FUNCTION_SIZE_FOR_AUTO_CLONING): Bump
allowed limit to 5000.
*  mversn-dispatch.c (do_auto_clone): Reverse fn_ver_addr_chain.

Index: config/i386/i386.c
===
--- config/i386/i386.c  (revision 185514)
+++ config/i386/i386.c  (working copy)
@@ -26507,6 +26507,132 @@ any_loops_vectorizable_with_load_store (void)
   return vectorizable_loop_found;
 }
 
+/* Returns true if this function finds a loop that contains a possible LCP
+   stalling instruction on corei7.   This is used to multiversion functions
+   for corei7.  
+
+   This function looks for instructions that store a constant into
+   HImode (16-bit) memory. These require a length-changing prefix and on
+   corei7 are prone to LCP stalls. These stalls can be avoided if the loop
+   is streamed from the loop stream detector.  */
+
+static bool
+find_himode_assigns (void)
+{
+  gimple_stmt_iterator gsi;
+  gimple stmt;
+  enum gimple_code code;
+  tree lhs/*, rhs*/;
+  enum machine_mode mode;
+  basic_block *body;
+  unsigned i;
+  loop_iterator li;
+  struct loop *loop;
+  bool found = false;
+  location_t locus = 0;
+  int stmt_count;
+  unsigned HOST_WIDE_INT n_unroll, max_unroll;
+
+  if (!flag_unroll_loops)
+return false;
+
+  loop_optimizer_init (LOOPS_NORMAL
+   | LOOPS_HAVE_RECORDED_EXITS);
+  if (number_of_loops () <= 1)
+return false;
+
+  scev_initialize();
+
+  if (profile_status == PROFILE_READ)
+max_unroll = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES_FEEDBACK);
+  else
+max_unroll = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
+
+  FOR_EACH_LOOP (li, loop, LI_ONLY_INNERMOST)
+{
+  tree niter;
+
+  /* Will not peel/unroll cold areas.  */
+  if (optimize_loop_for_size_p (loop))
+continue;
+
+  /* Can the loop be manipulated?  */
+  if (!can_duplicate_loop_p (loop))
+continue;
+
+  niter = number_of_latch_executions (loop);
+  if (host_integerp (niter, 1))
+   {
+ n_unroll = tree_low_cst (niter, 1);
+ if (n_unroll >= max_unroll)
+   continue;
+   }
+
+  body = get_loop_body (loop);
+  found = false;
+  stmt_count = 0;
+
+  for (i = 0; i < loop->num_nodes; i++)
+   {
+ for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (gsi))
+   {
+ stmt = gsi_stmt (gsi);
+ stmt_count++;
+ if (found)
+   continue;
+ code = gimple_code (stmt);
+ if (code != GIMPLE_ASSIGN)
+   continue;
+ lhs = gimple_assign_lhs (stmt);
+ if (TREE_CODE (lhs) != MEM_REF &&
+ TREE_CODE (lhs) != COMPONENT_REF &&
+ TREE_CODE (lhs) != ARRAY_REF)
+   continue;
+ if (gimple_assign_rhs_code(stmt) != INTEGER_CST)
+   continue;
+ mode = TYPE_MODE (TREE_TYPE (lhs));
+ if (mode == HImode)
+   {
+ locus = gimple_location (stmt);
+ found = true;
+   }
+  }
+   }
+  /* Don't worry about large loops that won't be unrolled anyway. In fact,
+   * don't worry about unrolling loops that are already over the size of 
the
+   * LSD (28 insts). Since instruction counts may be a little off at this
+   * point, due to downstream transformations, include loops a little 
bigger
+   * than the LSD size.
+   */
+  if (found && stmt_count < 40)
+   {
+ n_unroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS)/stmt_count;
+ /* Check for a simple peel candidate */
+ if (!(loop->header->count
+&& expected_loop_iterations (loop) < 2 * n_unroll))
+   {
+ location_t locus2;
+ edge exit;
+ if ((exit = single_exit(loop)) != 

Re: [google][4.6][i386]Support autocloning for corei7 with -mvarch= option to remove LCP stalls in loops (issue5865043)

2012-03-20 Thread Teresa Johnson
On Tue, Mar 20, 2012 at 2:04 PM, Sriraman Tallam tmsri...@google.com wrote:
 This patch adds support to version for corei7 with -mvarch option. The 
 versioning supported is in the case where a loop generates a LCP stalling 
 instruction in corei7. In such cases, on corei7, limiting the unroll factor 
 to try to keep the unrolled loop body small enough to fit in the Corei7's 
 loop stream detector can hide LCP stalls in loops. With mvarch, the function 
 containing the loop is multi-versioned and one version is tagged with 
 tune=corei7 so that the unroll factor can be limited on this version.

 Please see: http://gcc.gnu.org/ml/gcc-patches/2011-12/msg01230.html for 
 discussion on mvarch option.
 Please see: http://gcc.gnu.org/ml/gcc-patches/2011-12/msg00123.html for 
 discussion  on LCP stalls in corei7.


 The autocloning framework is only available in google/gcc-4_6 branch. I am 
 working on porting this to trunk.

        * config/i386/i386.c (find_himode_assigns): New function.
        (mversionable_for_core2_p): Add new param version_number.
        (mversionable_for_corei7_p): New function.
        (ix86_mversion_function): Check for corei7 versioning.
        * params.def (PARAM_MAX_FUNCTION_SIZE_FOR_AUTO_CLONING): Bump
        allowed limit to 5000.
        *  mversn-dispatch.c (do_auto_clone): Reverse fn_ver_addr_chain.

 Index: config/i386/i386.c
 ===
 --- config/i386/i386.c  (revision 185514)
 +++ config/i386/i386.c  (working copy)
 @@ -26507,6 +26507,132 @@ any_loops_vectorizable_with_load_store (void)
   return vectorizable_loop_found;
  }

 +/* Returns true if this function finds a loop that contains a possible LCP
 +   stalling instruction on corei7.   This is used to multiversion functions
 +   for corei7.
 +
 +   This function looks for instructions that store a constant into
 +   HImode (16-bit) memory. These require a length-changing prefix and on
 +   corei7 are prone to LCP stalls. These stalls can be avoided if the loop
 +   is streamed from the loop stream detector.  */
 +
 +static bool
 +find_himode_assigns (void)
 +{
 +  gimple_stmt_iterator gsi;
 +  gimple stmt;
 +  enum gimple_code code;
 +  tree lhs/*, rhs*/;

Can rhs be removed?

 +  enum machine_mode mode;
 +  basic_block *body;
 +  unsigned i;
 +  loop_iterator li;
 +  struct loop *loop;
 +  bool found = false;
 +  location_t locus = 0;

locus is dead (assigned but not read).

 +  int stmt_count;
 +  unsigned HOST_WIDE_INT n_unroll, max_unroll;
 +
 +  if (!flag_unroll_loops)
 +    return false;
 +
 +  loop_optimizer_init (LOOPS_NORMAL
 +                       | LOOPS_HAVE_RECORDED_EXITS);
 +  if (number_of_loops () <= 1)
 +    return false;
 +
 +  scev_initialize();
 +
 +  if (profile_status == PROFILE_READ)
 +    max_unroll = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES_FEEDBACK);
 +  else
 +    max_unroll = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);

It might be clearer to rename max_unroll to max_peel_times or
something like that to be clearer.

 +
 +  FOR_EACH_LOOP (li, loop, LI_ONLY_INNERMOST)
 +    {
 +      tree niter;
 +
 +      /* Will not peel/unroll cold areas.  */
 +      if (optimize_loop_for_size_p (loop))
 +        continue;
 +
 +      /* Can the loop be manipulated?  */
 +      if (!can_duplicate_loop_p (loop))
 +        continue;
 +
 +      niter = number_of_latch_executions (loop);
 +      if (host_integerp (niter, 1))
 +       {
 +         n_unroll = tree_low_cst (niter, 1);
 +         if (n_unroll >= max_unroll)
 +           continue;
 +       }
 +
 +      body = get_loop_body (loop);
 +      found = false;
 +      stmt_count = 0;
 +
 +      for (i = 0; i < loop->num_nodes; i++)
 +       {
 +         for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next 
 (gsi))
 +           {
 +             stmt = gsi_stmt (gsi);
 +             stmt_count++;
 +             if (found)
 +               continue;
 +             code = gimple_code (stmt);
 +             if (code != GIMPLE_ASSIGN)
 +               continue;
 +             lhs = gimple_assign_lhs (stmt);
 +             if (TREE_CODE (lhs) != MEM_REF &&
 +                 TREE_CODE (lhs) != COMPONENT_REF &&
 +                 TREE_CODE (lhs) != ARRAY_REF)
 +               continue;
 +             if (gimple_assign_rhs_code(stmt) != INTEGER_CST)
 +               continue;
 +             mode = TYPE_MODE (TREE_TYPE (lhs));
 +             if (mode == HImode)
 +               {
 +                 locus = gimple_location (stmt);
 +                 found = true;
 +               }
 +          }
 +       }
 +      /* Don't worry about large loops that won't be unrolled anyway. In 
 fact,
 +       * don't worry about unrolling loops that are already over the size of 
 the
 +       * LSD (28 insts). Since instruction counts may be a little off at this
 +       * point, due to downstream transformations, include loops a little 
 bigger
 +       * than the LSD size.
 +       */
 +      if (found