The following patch changes the inliner callsite filter with FDO/LIPO.
 Previously, cold callsites were unconditionally rejected.  Now the
callsite may still be inlined if the _caller_ is sufficiently hot (max
count of any bb in the function is above hot threshold).  This gives
about 0.5 - 1% geomean performance on x86-64 (depending on microarch)
on internal benchmarks with < 1% average code size increase.

Bootstrapped and reg tested.  Ok for google/gcc-4_6?

Mark

2011-08-23  Mark Heffernan  <meh...@google.com>

        * basic-block.h (maybe_hot_count_p): Add prototype.
        * cgraph.c (dump_cgraph_node): Add field to dump.
        (cgraph_clone_node): Handle new field.
        * cgraph.h (cgraph_node): New field max_bb_count.
        * cgraphbuild.c (rebuild_cgraph_edges): Compute max_bb_count.
        * cgraphunit.c (cgraph_copy_node_for_versioning): Handle new field.
        * common.opt (finline-hot-caller): New option.
        * ipa-inline.c (cgraph_mark_inline_edge): Update max_bb_count.
        (edge_hot_enough_p): New function.
        (cgraph_decide_inlining_of_small_functions): Call edge_hot_enough_p.
        * predict.c (maybe_hot_count_p): Remove static keyword and
        guard with profile_info check.
        * testsuite/gcc.dg/tree-prof/inliner-1.c: Add flag.
        * testsuite/gcc.dg/tree-prof/lipo/inliner-1_0.c: Add flag.
Index: cgraphbuild.c
===================================================================
--- cgraphbuild.c	(revision 177964)
+++ cgraphbuild.c	(working copy)
@@ -591,9 +591,12 @@ rebuild_cgraph_edges (void)
   ipa_remove_all_references (&node->ref_list);
 
   node->count = ENTRY_BLOCK_PTR->count;
+  node->max_bb_count = 0;
 
   FOR_EACH_BB (bb)
     {
+      if (bb->count > node->max_bb_count)
+	node->max_bb_count = bb->count;
       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 	{
 	  gimple stmt = gsi_stmt (gsi);
Index: cgraph.c
===================================================================
--- cgraph.c	(revision 177964)
+++ cgraph.c	(working copy)
@@ -1904,6 +1904,9 @@ dump_cgraph_node (FILE *f, struct cgraph
   if (node->count)
     fprintf (f, " executed "HOST_WIDEST_INT_PRINT_DEC"x",
 	     (HOST_WIDEST_INT)node->count);
+  if (node->max_bb_count)
+    fprintf (f, " hottest bb executed "HOST_WIDEST_INT_PRINT_DEC"x",
+	     (HOST_WIDEST_INT)node->max_bb_count);
   if (node->local.inline_summary.self_time)
     fprintf (f, " %i time, %i benefit", node->local.inline_summary.self_time,
     					node->local.inline_summary.time_inlining_benefit);
@@ -2234,6 +2237,9 @@ cgraph_clone_node (struct cgraph_node *n
   new_node->global = n->global;
   new_node->rtl = n->rtl;
   new_node->count = count;
+  new_node->max_bb_count = count;
+  if (n->count)
+    new_node->max_bb_count = count * n->max_bb_count / n->count;
   new_node->is_versioned_clone = n->is_versioned_clone;
   new_node->frequency = n->frequency;
   new_node->clone = n->clone;
@@ -2252,6 +2258,9 @@ cgraph_clone_node (struct cgraph_node *n
       n->count -= count;
       if (n->count < 0)
 	n->count = 0;
+      n->max_bb_count -= new_node->max_bb_count;
+      if (n->max_bb_count < 0)
+	n->max_bb_count = 0;
     }
 
   FOR_EACH_VEC_ELT (cgraph_edge_p, redirect_callers, i, e)
Index: cgraph.h
===================================================================
--- cgraph.h	(revision 177964)
+++ cgraph.h	(working copy)
@@ -235,6 +235,8 @@ struct GTY((chain_next ("%h.next"), chai
 
   /* Expected number of executions: calculated in profile.c.  */
   gcov_type count;
+  /* Maximum count of any basic block in the function.  */
+  gcov_type max_bb_count;
   /* How to scale counts at materialization time; used to merge
      LTO units with different number of profile runs.  */
   int count_materialization_scale;
Index: cgraphunit.c
===================================================================
--- cgraphunit.c	(revision 177964)
+++ cgraphunit.c	(working copy)
@@ -2187,6 +2187,7 @@ cgraph_copy_node_for_versioning (struct 
    new_version->rtl = old_version->rtl;
    new_version->reachable = true;
    new_version->count = old_version->count;
+   new_version->max_bb_count = old_version->max_bb_count;
    new_version->is_versioned_clone = true;
 
    for (e = old_version->callees; e; e=e->next_callee)
Index: testsuite/gcc.dg/tree-prof/inliner-1.c
===================================================================
--- testsuite/gcc.dg/tree-prof/inliner-1.c	(revision 177964)
+++ testsuite/gcc.dg/tree-prof/inliner-1.c	(working copy)
@@ -1,4 +1,4 @@
-/* { dg-options "-O2 -fdump-tree-optimized" } */
+/* { dg-options "-O2 -fno-inline-hot-caller -fdump-tree-optimized" } */
 int a;
 int b[100];
 void abort (void);
@@ -34,7 +34,7 @@ main ()
   return 0;
 }
 
-/* cold function should be inlined, while hot function should not.  
+/* cold function should not be inlined, while hot function should be.
    Look for "cold_function () [tail call];" call statement not for the
    declaration or other apperances of the string in dump.  */
 /* { dg-final-use { scan-tree-dump "cold_function ..;" "optimized"} } */
Index: testsuite/gcc.dg/tree-prof/lipo/inliner-1_0.c
===================================================================
--- testsuite/gcc.dg/tree-prof/lipo/inliner-1_0.c	(revision 177964)
+++ testsuite/gcc.dg/tree-prof/lipo/inliner-1_0.c	(working copy)
@@ -1,4 +1,4 @@
-/* { dg-options "-O2 -fdump-tree-optimized" } */
+/* { dg-options "-O2 -fno-inline-hot-caller -fdump-tree-optimized" } */
 int a;
 int b[100];
 void abort (void);
@@ -34,7 +34,7 @@ main ()
   return 0;
 }
 
-/* cold function should be inlined, while hot function should not.  
+/* cold function should not be inlined, while hot function should be.
    Look for "cold_function () [tail call];" call statement not for the
    declaration or other apperances of the string in dump.  */
 /* { dg-final-use { scan-tree-dump "cold_function ..;" "optimized"} } */
Index: ipa-inline.c
===================================================================
--- ipa-inline.c	(revision 177964)
+++ ipa-inline.c	(working copy)
@@ -332,6 +332,9 @@ cgraph_mark_inline_edge (struct cgraph_e
       new_size = cgraph_estimate_size_after_inlining (to, what);
       to->global.size = new_size;
       to->global.time = cgraph_estimate_time_after_inlining (freq, to, what);
+
+      if (to->max_bb_count < e->callee->max_bb_count)
+	to->max_bb_count = e->callee->max_bb_count;
     }
   gcc_assert (what->global.inlined_to == to);
   if (new_size > old_size)
@@ -1057,6 +1060,19 @@ add_new_edges_to_heap (fibheap_t heap, V
     }
 }
 
+/* Returns true if an edge or its caller are hot enough to
+   be considered for inlining.  */
+
+static bool
+edge_hot_enough_p (struct cgraph_edge *edge)
+{
+  if (cgraph_maybe_hot_edge_p (edge))
+    return true;
+  if (flag_inline_hot_caller && maybe_hot_count_p (edge->caller->max_bb_count))
+    return true;
+  return false;
+}
+
 
 /* We use greedy algorithm for inlining of small functions:
    All inline candidates are put into prioritized heap based on estimated
@@ -1201,7 +1217,7 @@ cgraph_decide_inlining_of_small_function
 
       if (edge->callee->local.disregard_inline_limits)
 	;
-      else if (!cgraph_maybe_hot_edge_p (edge))
+      else if (!edge_hot_enough_p (edge))
  	not_good = CIF_UNLIKELY_CALL;
       else if (!flag_inline_functions
 	  && !DECL_DECLARED_INLINE_P (edge->callee->decl))
Index: predict.c
===================================================================
--- predict.c	(revision 177964)
+++ predict.c	(working copy)
@@ -131,13 +131,13 @@ maybe_hot_frequency_p (int freq)
   return true;
 }
 
-/* Return TRUE if frequency FREQ is considered to be hot.  */
+/* Return TRUE if count COUNT is considered to be hot.  */
 
-static inline bool
+bool
 maybe_hot_count_p (gcov_type count)
 {
-  if (profile_status != PROFILE_READ)
-    return true;
+  if (!profile_info)
+    return false;
   /* Code executed at most once is not hot.  */
   if (profile_info->runs >= count)
     return false;
Index: common.opt
===================================================================
--- common.opt	(revision 177964)
+++ common.opt	(working copy)
@@ -1327,6 +1327,10 @@ finline-limit=
 Common RejectNegative Joined UInteger
 -finline-limit=<number>	Limit the size of inlined functions to <number>
 
+finline-hot-caller
+Common Report Var(flag_inline_hot_caller) Init(1) Optimization
+Consider inlining cold callsites if the caller includes hot code
+
 finstrument-functions
 Common Report Var(flag_instrument_function_entry_exit)
 Instrument function entry and exit with profiling calls
Index: basic-block.h
===================================================================
--- basic-block.h	(revision 177964)
+++ basic-block.h	(working copy)
@@ -744,6 +744,7 @@ extern struct edge_list *pre_edge_rev_lc
 extern void compute_available (sbitmap *, sbitmap *, sbitmap *, sbitmap *);
 
 /* In predict.c */
+extern bool maybe_hot_count_p (gcov_type);
 extern bool maybe_hot_bb_p (const_basic_block);
 extern bool maybe_hot_edge_p (edge);
 extern bool probably_never_executed_bb_p (const_basic_block);

Reply via email to