The attached patch detects loops containing instructions that tend to
incur high LCP (length-changing prefix) stalls on Core i7, and limits
their unroll factor to try to keep the unrolled loop body small enough
to fit in the Core i7's loop stream detector, which can hide LCP stalls
in loops.

To do this I leveraged the existing TARGET_LOOP_UNROLL_ADJUST target
hook, which was previously only defined for s390. I added one
additional call to this target hook, when unrolling for constant trip
count loops. Previously it was only called for runtime computed trip
counts. Andreas, can you comment on the effect for s390 of this
additional call of the target hook, since I can't measure that?

Bootstrapped and regression checked on x86_64-unknown-linux-gnu.
Could someone please review?

Thanks,
Teresa

2011-12-01  Teresa Johnson  <tejohn...@google.com>

        * loop-unroll.c (loop_has_FP_comp): New function.
        (decide_unroll_constant_iterations): Call loop unroll target hook.
        * cfgloop.h (loop_has_FP_comp): Declare.
        * config/i386/i386.c (ix86_loop_unroll_adjust): New function.
        (TARGET_LOOP_UNROLL_ADJUST): Define hook for x86.

Index: loop-unroll.c
===================================================================
--- loop-unroll.c       (revision 181902)
+++ loop-unroll.c       (working copy)
@@ -152,6 +152,38 @@ static void combine_var_copies_in_loop_e
                                             basic_block);
 static rtx get_expansion (struct var_to_expand *);

+/* Determine whether LOOP contains floating-point computation. */
+bool
+loop_has_FP_comp(struct loop *loop)
+{
+  rtx set, dest;
+  basic_block *body, bb;
+  unsigned i;
+  rtx insn;
+
+  body = get_loop_body (loop);
+  for (i = 0; i < loop->num_nodes; i++)
+    {
+      bb = body[i];
+
+      FOR_BB_INSNS (bb, insn)
+      {
+        set = single_set (insn);
+        if (!set)
+          continue;
+
+        dest = SET_DEST (set);
+        if (FLOAT_MODE_P (GET_MODE (dest)))
+        {
+          free (body);
+          return true;
+        }
+      }
+    }
+  free (body);
+  return false;
+}
+
 /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
 void
 unroll_and_peel_loops (int flags)
@@ -547,6 +579,9 @@ decide_unroll_constant_iterations (struc
   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

+  if (targetm.loop_unroll_adjust)
+    nunroll = targetm.loop_unroll_adjust (nunroll, loop);
+
   /* Skip big loops.  */
   if (nunroll <= 1)
     {
Index: cfgloop.h
===================================================================
--- cfgloop.h   (revision 181902)
+++ cfgloop.h   (working copy)
@@ -693,5 +693,6 @@ extern void unroll_and_peel_loops (int);
 extern void doloop_optimize_loops (void);
 extern void move_loop_invariants (void);
 extern bool finite_loop_p (struct loop *);
+extern bool loop_has_FP_comp(struct loop *loop);

 #endif /* GCC_CFGLOOP_H */
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c  (revision 181902)
+++ config/i386/i386.c  (working copy)
@@ -60,6 +60,7 @@ along with GCC; see the file COPYING3.
 #include "fibheap.h"
 #include "opts.h"
 #include "diagnostic.h"
+#include "cfgloop.h"

 enum upper_128bits_state
 {
@@ -38370,6 +38371,75 @@ ix86_autovectorize_vector_sizes (void)
   return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
 }

+/* If LOOP contains a possible LCP stalling instruction on corei7,
+   calculate new number of times to unroll instead of NUNROLL so that
+   the unrolled loop will still likely fit into the loop stream detector. */
+static unsigned
+ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
+{
+  basic_block *body, bb;
+  unsigned i;
+  rtx insn;
+  bool has_FP;
+  bool found = false;
+  unsigned newunroll;
+
+  if (ix86_tune != PROCESSOR_COREI7_64 &&
+      ix86_tune != PROCESSOR_COREI7_32)
+    return nunroll;
+
+  /* Look for instructions that store a constant into HImode (16-bit)
+     memory. These require a length-changing prefix and on corei7 are
+     prone to LCP stalls. These stalls can be avoided if the loop
+     is streamed from the loop stream detector. */
+  body = get_loop_body (loop);
+  for (i = 0; i < loop->num_nodes && !found; i++)
+    {
+      bb = body[i];
+
+      FOR_BB_INSNS (bb, insn)
+        {
+          rtx set_expr;
+          set_expr = single_set (insn);
+          if (set_expr != NULL_RTX
+              && GET_MODE (SET_DEST (set_expr)) == HImode
+              && CONST_INT_P (SET_SRC (set_expr))
+              && MEM_P (SET_DEST (set_expr)))
+            {
+              found = true;
+              break;
+            }
+        }
+    }
+  free (body);
+
+  if (!found)
+    return nunroll;
+
+  /* Don't reduce unroll factor in loops with floating point
+     computation, which tend to benefit more heavily from
+     larger unroll factors and are less likely to bottleneck
+     at the decoder. */
+  has_FP = loop_has_FP_comp(loop);
+  if (has_FP)
+    return nunroll;
+
+  if (dump_file)
+    {
+      fprintf (dump_file,
+               ";; Loop contains HImode store of const (possible LCP stalls),\n");
+      fprintf (dump_file,
+               "   reduce unroll factor to fit into Loop Stream Detector\n");
+    }
+
+  /* On corei7 the loop stream detector can hold about 28 instructions, so
+     don't allow unrolling to exceed that. */
+  newunroll = 28 / loop->av_ninsns;
+  if (newunroll < nunroll)
+    return newunroll;
+
+  return nunroll;
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_RETURN_IN_MEMORY
 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
@@ -38685,6 +38755,9 @@ ix86_autovectorize_vector_sizes (void)
 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
 #endif

+#undef TARGET_LOOP_UNROLL_ADJUST
+#define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 ^L
 #include "gt-i386.h"

-- 
Teresa Johnson | Software Engineer | tejohn...@google.com | 408-460-2413

Reply via email to