The following patch changes the inliner callsite filter with FDO/LIPO. Previously, cold callsites were unconditionally rejected. Now the callsite may still be inlined if the _caller_ is sufficiently hot (max count of any bb in the function is above hot threshold). This gives about 0.5 - 1% geomean performance on x86-64 (depending on microarch) on internal benchmarks with < 1% average code size increase.
Bootstrapped and reg tested. Ok for google/gcc-4_6? Mark 2011-08-23 Mark Heffernan <meh...@google.com> * basic-block.h (maybe_hot_count_p): Add prototype. * cgraph.c (dump_cgraph_node): Add field to dump. (cgraph_clone_node): Handle new field. * cgraph.h (cgraph_node): New field max_bb_count. * cgraphbuild.c (rebuild_cgraph_edges): Compute max_bb_count. * cgraphunit.c (cgraph_copy_node_for_versioning): Handle new field. * common.opt (finline-hot-caller): New option. * ipa-inline.c (cgraph_mark_inline_edge): Update max_bb_count. (edge_hot_enough_p): New function. (cgraph_decide_inlining_of_small_functions): Call edge_hot_enough_p. * predict.c (maybe_hot_count_p): Remove static keyword and guard with profile_info check. * testsuite/gcc.dg/tree-prof/inliner-1.c: Add flag. * testsuite/gcc.dg/tree-prof/lipo/inliner-1_0.c: Add flag.
Index: cgraphbuild.c =================================================================== --- cgraphbuild.c (revision 177964) +++ cgraphbuild.c (working copy) @@ -591,9 +591,12 @@ rebuild_cgraph_edges (void) ipa_remove_all_references (&node->ref_list); node->count = ENTRY_BLOCK_PTR->count; + node->max_bb_count = 0; FOR_EACH_BB (bb) { + if (bb->count > node->max_bb_count) + node->max_bb_count = bb->count; for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) { gimple stmt = gsi_stmt (gsi); Index: cgraph.c =================================================================== --- cgraph.c (revision 177964) +++ cgraph.c (working copy) @@ -1904,6 +1904,9 @@ dump_cgraph_node (FILE *f, struct cgraph if (node->count) fprintf (f, " executed "HOST_WIDEST_INT_PRINT_DEC"x", (HOST_WIDEST_INT)node->count); + if (node->max_bb_count) + fprintf (f, " hottest bb executed "HOST_WIDEST_INT_PRINT_DEC"x", + (HOST_WIDEST_INT)node->max_bb_count); if (node->local.inline_summary.self_time) fprintf (f, " %i time, %i benefit", node->local.inline_summary.self_time, node->local.inline_summary.time_inlining_benefit); @@ -2234,6 +2237,9 @@ cgraph_clone_node (struct cgraph_node *n new_node->global = n->global; new_node->rtl = n->rtl; new_node->count = count; + new_node->max_bb_count = count; + if (n->count) + new_node->max_bb_count = count * n->max_bb_count / n->count; new_node->is_versioned_clone = n->is_versioned_clone; new_node->frequency = n->frequency; new_node->clone = n->clone; @@ -2252,6 +2258,9 @@ cgraph_clone_node (struct cgraph_node *n n->count -= count; if (n->count < 0) n->count = 0; + n->max_bb_count -= new_node->max_bb_count; + if (n->max_bb_count < 0) + n->max_bb_count = 0; } FOR_EACH_VEC_ELT (cgraph_edge_p, redirect_callers, i, e) Index: cgraph.h =================================================================== --- cgraph.h (revision 177964) +++ cgraph.h (working copy) @@ -235,6 +235,8 @@ struct GTY((chain_next ("%h.next"), chai /* Expected number of executions: 
calculated in profile.c. */ gcov_type count; + /* Maximum count of any basic block in the function. */ + gcov_type max_bb_count; /* How to scale counts at materialization time; used to merge LTO units with different number of profile runs. */ int count_materialization_scale; Index: cgraphunit.c =================================================================== --- cgraphunit.c (revision 177964) +++ cgraphunit.c (working copy) @@ -2187,6 +2187,7 @@ cgraph_copy_node_for_versioning (struct new_version->rtl = old_version->rtl; new_version->reachable = true; new_version->count = old_version->count; + new_version->max_bb_count = old_version->max_bb_count; new_version->is_versioned_clone = true; for (e = old_version->callees; e; e=e->next_callee) Index: testsuite/gcc.dg/tree-prof/inliner-1.c =================================================================== --- testsuite/gcc.dg/tree-prof/inliner-1.c (revision 177964) +++ testsuite/gcc.dg/tree-prof/inliner-1.c (working copy) @@ -1,4 +1,4 @@ -/* { dg-options "-O2 -fdump-tree-optimized" } */ +/* { dg-options "-O2 -fno-inline-hot-caller -fdump-tree-optimized" } */ int a; int b[100]; void abort (void); @@ -34,7 +34,7 @@ main () return 0; } -/* cold function should be inlined, while hot function should not. +/* cold function should not be inlined, while hot function should be. Look for "cold_function () [tail call];" call statement not for the declaration or other apperances of the string in dump.
*/ /* { dg-final-use { scan-tree-dump "cold_function ..;" "optimized"} } */ Index: testsuite/gcc.dg/tree-prof/lipo/inliner-1_0.c =================================================================== --- testsuite/gcc.dg/tree-prof/lipo/inliner-1_0.c (revision 177964) +++ testsuite/gcc.dg/tree-prof/lipo/inliner-1_0.c (working copy) @@ -1,4 +1,4 @@ -/* { dg-options "-O2 -fdump-tree-optimized" } */ +/* { dg-options "-O2 -fno-inline-hot-caller -fdump-tree-optimized" } */ int a; int b[100]; void abort (void); @@ -34,7 +34,7 @@ main () return 0; } -/* cold function should be inlined, while hot function should not. +/* cold function should not be inlined, while hot function should be. Look for "cold_function () [tail call];" call statement not for the declaration or other apperances of the string in dump. */ /* { dg-final-use { scan-tree-dump "cold_function ..;" "optimized"} } */ Index: ipa-inline.c =================================================================== --- ipa-inline.c (revision 177964) +++ ipa-inline.c (working copy) @@ -332,6 +332,9 @@ cgraph_mark_inline_edge (struct cgraph_e new_size = cgraph_estimate_size_after_inlining (to, what); to->global.size = new_size; to->global.time = cgraph_estimate_time_after_inlining (freq, to, what); + + if (to->max_bb_count < e->callee->max_bb_count) + to->max_bb_count = e->callee->max_bb_count; } gcc_assert (what->global.inlined_to == to); if (new_size > old_size) @@ -1057,6 +1060,19 @@ add_new_edges_to_heap (fibheap_t heap, V } } +/* Returns true if an edge or its caller is hot enough to + be considered for inlining.
*/ + +static bool +edge_hot_enough_p (struct cgraph_edge *edge) +{ + if (cgraph_maybe_hot_edge_p (edge)) + return true; + if (flag_inline_hot_caller && maybe_hot_count_p (edge->caller->max_bb_count)) + return true; + return false; +} + /* We use greedy algorithm for inlining of small functions: All inline candidates are put into prioritized heap based on estimated @@ -1201,7 +1217,7 @@ cgraph_decide_inlining_of_small_function if (edge->callee->local.disregard_inline_limits) ; - else if (!cgraph_maybe_hot_edge_p (edge)) + else if (!edge_hot_enough_p (edge)) not_good = CIF_UNLIKELY_CALL; else if (!flag_inline_functions && !DECL_DECLARED_INLINE_P (edge->callee->decl)) Index: predict.c =================================================================== --- predict.c (revision 177964) +++ predict.c (working copy) @@ -131,13 +131,13 @@ maybe_hot_frequency_p (int freq) return true; } -/* Return TRUE if frequency FREQ is considered to be hot. */ +/* Return TRUE if count COUNT is considered to be hot. */ -static inline bool +bool maybe_hot_count_p (gcov_type count) { - if (profile_status != PROFILE_READ) - return true; + if (!profile_info) + return false; /* Code executed at most once is not hot.
*/ if (profile_info->runs >= count) return false; Index: common.opt =================================================================== --- common.opt (revision 177964) +++ common.opt (working copy) @@ -1327,6 +1327,10 @@ finline-limit= Common RejectNegative Joined UInteger -finline-limit=<number> Limit the size of inlined functions to <number> +finline-hot-caller +Common Report Var(flag_inline_hot_caller) Init(1) Optimization +Consider inlining cold callsites if the caller includes hot code + finstrument-functions Common Report Var(flag_instrument_function_entry_exit) Instrument function entry and exit with profiling calls Index: basic-block.h =================================================================== --- basic-block.h (revision 177964) +++ basic-block.h (working copy) @@ -744,6 +744,7 @@ extern struct edge_list *pre_edge_rev_lc extern void compute_available (sbitmap *, sbitmap *, sbitmap *, sbitmap *); /* In predict.c */ +extern bool maybe_hot_count_p (gcov_type); extern bool maybe_hot_bb_p (const_basic_block); extern bool maybe_hot_edge_p (edge); extern bool probably_never_executed_bb_p (const_basic_block);