Dear Jan, On 16 November 2013 12:24, Jan Hubicka <hubi...@ucw.cz> wrote: >> diff --git a/gcc/ChangeLog b/gcc/ChangeLog >> index c566a85..1562098 100644 >> --- a/gcc/ChangeLog >> +++ b/gcc/ChangeLog >> @@ -1,3 +1,15 @@ >> +2013-11-13 Martin Liska <marxin.li...@gmail.com> >> + Jan Hubicka <j...@suse.cz> >> + >> + * cgraphunit.c (node_cmp): New function. >> + (expand_all_functions): Function ordering added. >> + * common.opt: New profile based function reordering flag introduced. >> + * coverage.c (get_coverage_counts): Wrong profile handled. >> + * ipa.c (cgraph_externally_visible_p): New late flag introduced. >> + * lto-partition.c: Support for time profile added. >> + * lto.c: Likewise. >> + * value-prof.c: Histogram instrumentation switch added. >> + >> 2013-11-13 Vladimir Makarov <vmaka...@redhat.com> >> >> PR rtl-optimization/59036 >> diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c >> index 4765e6a..7cdd9a4 100644 >> --- a/gcc/cgraphunit.c >> +++ b/gcc/cgraphunit.c >> @@ -1821,6 +1821,17 @@ expand_function (struct cgraph_node *node) >> ipa_remove_all_references (&node->ref_list); >> } >> >> +/* Node comparer that is responsible for the order that corresponds >> + to time when a function was launched for the first time. */ >> + >> +static int >> +node_cmp (const void *pa, const void *pb) >> +{ >> + const struct cgraph_node *a = *(const struct cgraph_node * const *) pa; >> + const struct cgraph_node *b = *(const struct cgraph_node * const *) pb; >> + >> + return b->tp_first_run - a->tp_first_run; > > Please stabilize this by using node->order when tp_first_run is equivalent. > Later we ought to use better heuristic here, but order may be good enough to
Done. > start with. >> diff --git a/gcc/ipa.c b/gcc/ipa.c >> index a11b1c7..d92a332 100644 >> --- a/gcc/ipa.c >> +++ b/gcc/ipa.c >> @@ -761,10 +761,14 @@ cgraph_externally_visible_p (struct cgraph_node *node, >> This improves code quality and we know we will duplicate them at most >> twice >> (in the case that we are not using plugin and link with object file >> implementing same COMDAT) */ >> - if ((in_lto_p || whole_program) >> - && DECL_COMDAT (node->decl) >> - && comdat_can_be_unshared_p (node)) >> - return false; >> + if ((in_lto_p || whole_program || profile_arc_flag) >> + && DECL_COMDAT (node->decl) >> + && comdat_can_be_unshared_p (node)) >> + { >> + gcc_checking_assert (cgraph_function_body_availability (node) >> + > AVAIL_OVERWRITABLE); >> + return false; >> + } >> >> /* When doing link time optimizations, hidden symbols become local. */ >> if (in_lto_p >> @@ -932,7 +936,7 @@ function_and_variable_visibility (bool whole_program) >> } >> gcc_assert ((!DECL_WEAK (node->decl) >> && !DECL_COMDAT (node->decl)) >> - || TREE_PUBLIC (node->decl) >> + || TREE_PUBLIC (node->decl) >> || node->weakref >> || DECL_EXTERNAL (node->decl)); >> if (cgraph_externally_visible_p (node, whole_program)) >> @@ -949,7 +953,7 @@ function_and_variable_visibility (bool whole_program) >> && node->definition && !node->weakref >> && !DECL_EXTERNAL (node->decl)) >> { >> - gcc_assert (whole_program || in_lto_p >> + gcc_assert (whole_program || in_lto_p || profile_arc_flag >> || !TREE_PUBLIC (node->decl)); >> node->unique_name = ((node->resolution == LDPR_PREVAILING_DEF_IRONLY >> || node->resolution == >> LDPR_PREVAILING_DEF_IRONLY_EXP) > > These changes are unrelated, please remove them. >> @@ -395,6 +397,20 @@ node_cmp (const void *pa, const void *pb) >> { >> const struct cgraph_node *a = *(const struct cgraph_node * const *) pa; >> const struct cgraph_node *b = *(const struct cgraph_node * const *) pb; >> + >> + /* Profile reorder flag enables function reordering based on first >> execution >> + of a function. All functions with profile are placed in ascending >> + order at the beginning. */ >> + >> + if (flag_profile_reorder_functions) > && a->tp_first_run != b->tp_first_run >> + { >> + if (a->tp_first_run && b->tp_first_run) >> + return a->tp_first_run - b->tp_first_run; >> + >> + if (a->tp_first_run || b->tp_first_run) >> + return b->tp_first_run - a->tp_first_run; > > Drop a comment explaining the logic here ;) >> @@ -449,7 +465,7 @@ void >> lto_balanced_map (void) >> { >> int n_nodes = 0; >> - int n_varpool_nodes = 0, varpool_pos = 0, best_varpool_pos = 0; >> + int n_varpool_nodes = 0, varpool_pos = 0; >> struct cgraph_node **order = XNEWVEC (struct cgraph_node *, >> cgraph_max_uid); >> struct varpool_node **varpool_order = NULL; >> int i; >> @@ -481,10 +497,13 @@ lto_balanced_map (void) >> get better about minimizing the function bounday, but until that >> things works smoother if we order in source order. */ >> qsort (order, n_nodes, sizeof (struct cgraph_node *), node_cmp); >> + >> + if (cgraph_dump_file) >> + for(i = 0; i < n_nodes; i++) >> + fprintf (cgraph_dump_file, "Balanced map symbol order:%s:%u\n", >> cgraph_node_asm_name (order[i]), order[i]->tp_first_run); >> + >> if (!flag_toplevel_reorder) >> { >> - qsort (order, n_nodes, sizeof (struct cgraph_node *), node_cmp); >> - >> FOR_EACH_VARIABLE (vnode) >> if (get_symbol_class (vnode) == SYMBOL_PARTITION) >> n_varpool_nodes++; > > Good catch (the redundant qsort) > >> @@ -683,7 +702,6 @@ lto_balanced_map (void) >> best_i = i; >> best_n_nodes = lto_symtab_encoder_size (partition->encoder); >> best_total_size = total_size; >> - best_varpool_pos = varpool_pos; >> } >> if (cgraph_dump_file) >> fprintf (cgraph_dump_file, "Step %i: added %s/%i, size %i, cost %i/%i " >> @@ -701,7 +719,6 @@ lto_balanced_map (void) >> fprintf (cgraph_dump_file, "Unwinding %i insertions to step >> %i\n", >> i - best_i, best_i); >> undo_partition (partition, best_n_nodes); >> - varpool_pos = best_varpool_pos; >> } >> i = best_i; >> /* When we are finished, avoid creating empty partition. */ > > This actually reverts a bugfix, so please remove the two changes above. >> diff --git a/gcc/lto/lto.c b/gcc/lto/lto.c >> index 62856d0..2401c76 100644 >> --- a/gcc/lto/lto.c >> +++ b/gcc/lto/lto.c >> @@ -2453,9 +2453,12 @@ lto_wpa_write_files (void) >> /* Sort partitions by size so small ones are compiled last. >> FIXME: Even when not reordering we may want to output one list for >> parallel make >> and other for final link command. */ >> - ltrans_partitions.qsort (flag_toplevel_reorder >> + >> + if (!flag_profile_reorder_functions || !flag_profile_use) >> + ltrans_partitions.qsort (flag_toplevel_reorder >> ? cmp_partitions_size >> : cmp_partitions_order); >> + > > Add a FIXME explaining that we ought to produce two list - one for Make > parallelizm and one for final link time. > lets work on this incrementally. >> diff --git a/gcc/varasm.c b/gcc/varasm.c >> index 2226912..758367f 100644 >> --- a/gcc/varasm.c >> +++ b/gcc/varasm.c >> @@ -548,7 +548,14 @@ default_function_section (tree decl, enum >> node_frequency freq, >> unlikely executed (this happens especially with function splitting >> where we can split away unnecessary parts of static constructors. */ >> if (startup && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED) >> - return get_named_text_section (decl, ".text.startup", NULL); >> + { >> + /* If we do have a profile or(and) LTO phase is executed, we do not >> need >> + these ELF section. */ >> + if (!in_lto_p || !flag_profile_values) >> + return get_named_text_section (decl, ".text.startup", NULL); >> + else >> + return NULL; >> + } >> >> /* Similarly for exit. */ >> if (exit && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED) >> @@ -560,7 +567,10 @@ default_function_section (tree decl, enum >> node_frequency freq, >> case NODE_FREQUENCY_UNLIKELY_EXECUTED: >> return get_named_text_section (decl, ".text.unlikely", NULL); >> case NODE_FREQUENCY_HOT: >> - return get_named_text_section (decl, ".text.hot", NULL); >> + /* If we do have a profile or(and) LTO phase is executed, we do not >> need >> + these ELF section. */ >> + if (!in_lto_p || !flag_profile_values) >> + return get_named_text_section (decl, ".text.hot", NULL); >> default: >> return NULL; >> } > Lets handle this in separate patch. > > The patch is missing documentation in doc/invoke.texi. You need to document > the new command line option and update documentation of -fprofile-use. Please > add it and send an update patch. I added documentation to invoke.texi. > Also Teresa recently sent patch for resetting profiles of functions where we > clearly missed the profile. > Please add there a code filling in tp_first_run from largest first_run of the > callers with non-0 count. Good idea, I implemented filling of time-profile according to callers. New version of patch is submitted as attachment. Martin > > Thanks, > Honza
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 5cb07b7..754f882 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,13 @@ +2013-11-17 Martin Liska <marxin.li...@gmail.com> + Jan Hubicka <j...@suse.cz> + + * cgraphunit.c (node_cmp): New function. + (expand_all_functions): Function ordering added. + * common.opt: New profile based function reordering flag introduced. + * lto-partition.c: Support for time profile added. + * lto.c: Likewise. + * predict.c (handle_missing_profiles): Time profile handled in + missing profiles. 2013-11-16 Joern Rennecke <joern.renne...@embecosm.com> * config/arc/arc.c (arc_predicate_delay_insns): New function. diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c index 8ab274b..ea722b8 100644 --- a/gcc/cgraphunit.c +++ b/gcc/cgraphunit.c @@ -1824,6 +1824,19 @@ expand_function (struct cgraph_node *node) ipa_remove_all_references (&node->ref_list); } +/* Node comparer that is responsible for the order that corresponds + to time when a function was launched for the first time. */ + +static int +node_cmp (const void *pa, const void *pb) +{ + const struct cgraph_node *a = *(const struct cgraph_node * const *) pa; + const struct cgraph_node *b = *(const struct cgraph_node * const *) pb; + + return a->tp_first_run != b->tp_first_run + ? b->tp_first_run - a->tp_first_run + : b->order - a->order; +} /* Expand all functions that must be output. @@ -1835,11 +1848,14 @@ expand_function (struct cgraph_node *node) to use subsections to make the output functions appear in top-down order). */ + static void expand_all_functions (void) { struct cgraph_node *node; struct cgraph_node **order = XCNEWVEC (struct cgraph_node *, cgraph_n_nodes); + + unsigned int expanded_func_count = 0, profiled_func_count = 0; int order_pos, new_order_pos = 0; int i; @@ -1852,19 +1868,35 @@ expand_all_functions (void) if (order[i]->process) order[new_order_pos++] = order[i]; + if (flag_profile_reorder_functions) + qsort (order, new_order_pos, sizeof (struct cgraph_node *), node_cmp); + for (i = new_order_pos - 1; i >= 0; i--) { node = order[i]; + if (node->process) { + expanded_func_count++; + if(node->tp_first_run) + profiled_func_count++; + node->process = 0; expand_function (node); } } + + if (in_lto_p && dump_file) + fprintf (dump_file, "Expanded functions with time profile (%s):%u/%u\n", + main_input_filename, profiled_func_count, expanded_func_count); + + if (cgraph_dump_file && flag_profile_reorder_functions && in_lto_p) + fprintf (cgraph_dump_file, "Expanded functions with time profile:%u/%u\n", + profiled_func_count, expanded_func_count); + cgraph_process_new_functions (); free (order); - } /* This is used to sort the node types by the cgraph order number. */ @@ -2189,6 +2221,7 @@ compile (void) #endif cgraph_state = CGRAPH_STATE_EXPANSION; + if (!flag_toplevel_reorder) output_in_order (); else diff --git a/gcc/common.opt b/gcc/common.opt index d5971df..85d5c74 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -1708,6 +1708,10 @@ fprofile-report Common Report Var(profile_report) Report on consistency of profile +fprofile-reorder-functions +Common Report Var(flag_profile_reorder_functions) +Enable function reordering that improves code placement + frandom-seed Common Var(common_deferred_options) Defer diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index ff4c2ee..eb101ed 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -393,7 +393,7 @@ Objective-C and Objective-C++ Dialects}. -fprefetch-loop-arrays -fprofile-report @gol -fprofile-correction -fprofile-dir=@var{path} -fprofile-generate @gol -fprofile-generate=@var{path} @gol --fprofile-use -fprofile-use=@var{path} -fprofile-values @gol +-fprofile-use -fprofile-use=@var{path} -fprofile-values -fprofile-reorder-functions @gol -freciprocal-math -free -frename-registers -freorder-blocks @gol -freorder-blocks-and-partition -freorder-functions @gol -frerun-cse-after-loop -freschedule-modulo-scheduled-loops @gol @@ -8645,7 +8645,7 @@ profile useful for later recompilation with profile feedback based optimization. You must use @option{-fprofile-generate} both when compiling and when linking your program. -The following options are enabled: @code{-fprofile-arcs}, @code{-fprofile-values}, @code{-fvpt}. +The following options are enabled: @code{-fprofile-arcs}, @code{-fprofile-values}, @code{-fprofile-reorder-functions}, @code{-fvpt}. If @var{path} is specified, GCC looks at the @var{path} to find the profile feedback data files. See @option{-fprofile-dir}. @@ -8933,6 +8933,14 @@ from profiling values of expressions for usage in optimizations. Enabled with @option{-fprofile-generate} and @option{-fprofile-use}. +@item -fprofile-reoder-functions +@opindex fprofile-reorder-functions +Function reordering based on profile instrumentation collects +first time of execution of a function and orders these functions +in ascending order. + +Enabled with @option{-fprofile-generate} and @option{-fprofile-use}. + @item -fvpt @opindex fvpt If combined with @option{-fprofile-arcs}, this option instructs the compiler diff --git a/gcc/lto/lto-partition.c b/gcc/lto/lto-partition.c index 6a3d881..42e4aef 100644 --- a/gcc/lto/lto-partition.c +++ b/gcc/lto/lto-partition.c @@ -280,9 +280,11 @@ add_symbol_to_partition (ltrans_partition part, symtab_node *node) Be lax about comdats; they may or may not be duplicated and we may end up in need to duplicate keyed comdat because it has unkeyed alias. */ + gcc_assert (get_symbol_class (node) == SYMBOL_DUPLICATE || DECL_COMDAT (node->decl) || !symbol_partitioned_p (node)); + add_symbol_to_partition_1 (part, node); } @@ -395,6 +397,25 @@ node_cmp (const void *pa, const void *pb) { const struct cgraph_node *a = *(const struct cgraph_node * const *) pa; const struct cgraph_node *b = *(const struct cgraph_node * const *) pb; + + /* Profile reorder flag enables function reordering based on first execution + of a function. All functions with profile are placed in ascending + order at the beginning. */ + + if (flag_profile_reorder_functions) + { + /* Functions with time profile are sorted in ascending order. */ + if (a->tp_first_run && b->tp_first_run) + return a->tp_first_run != b->tp_first_run + ? a->tp_first_run - b->tp_first_run + : a->order - b->order; + + /* Functions with time profile are sorted before the functions + that do not have the profile. */ + if (a->tp_first_run || b->tp_first_run) + return b->tp_first_run - a->tp_first_run; + } + return b->order - a->order; } @@ -449,7 +470,7 @@ void lto_balanced_map (void) { int n_nodes = 0; - int n_varpool_nodes = 0, varpool_pos = 0, best_varpool_pos = 0; + int n_varpool_nodes = 0, varpool_pos = 0; struct cgraph_node **order = XNEWVEC (struct cgraph_node *, cgraph_max_uid); struct varpool_node **varpool_order = NULL; int i; @@ -481,10 +502,13 @@ lto_balanced_map (void) get better about minimizing the function bounday, but until that things works smoother if we order in source order. */ qsort (order, n_nodes, sizeof (struct cgraph_node *), node_cmp); + + if (cgraph_dump_file) + for(i = 0; i < n_nodes; i++) + fprintf (cgraph_dump_file, "Balanced map symbol order:%s:%u\n", cgraph_node_asm_name (order[i]), order[i]->tp_first_run); + if (!flag_toplevel_reorder) { - qsort (order, n_nodes, sizeof (struct cgraph_node *), node_cmp); - FOR_EACH_VARIABLE (vnode) if (get_symbol_class (vnode) == SYMBOL_PARTITION) n_varpool_nodes++; @@ -683,7 +707,6 @@ lto_balanced_map (void) best_i = i; best_n_nodes = lto_symtab_encoder_size (partition->encoder); best_total_size = total_size; - best_varpool_pos = varpool_pos; } if (cgraph_dump_file) fprintf (cgraph_dump_file, "Step %i: added %s/%i, size %i, cost %i/%i " @@ -701,7 +724,6 @@ lto_balanced_map (void) fprintf (cgraph_dump_file, "Unwinding %i insertions to step %i\n", i - best_i, best_i); undo_partition (partition, best_n_nodes); - varpool_pos = best_varpool_pos; } i = best_i; /* When we are finished, avoid creating empty partition. */ @@ -849,7 +871,7 @@ may_need_named_section_p (lto_symtab_encoder_t encoder, symtab_node *node) of the same name in partition ENCODER (or in whole compilation unit if ENCODER is NULL) and if so, mangle the statics. Always mangle all conflicting statics, so we reduce changes of silently miscompiling - asm statemnets referring to them by symbol name. */ + asm statements referring to them by symbol name. */ static void rename_statics (lto_symtab_encoder_t encoder, symtab_node *node) diff --git a/gcc/lto/lto.c b/gcc/lto/lto.c index 62856d0..2401c76 100644 --- a/gcc/lto/lto.c +++ b/gcc/lto/lto.c @@ -2453,9 +2453,12 @@ lto_wpa_write_files (void) /* Sort partitions by size so small ones are compiled last. FIXME: Even when not reordering we may want to output one list for parallel make and other for final link command. */ - ltrans_partitions.qsort (flag_toplevel_reorder + + if (!flag_profile_reorder_functions || !flag_profile_use) + ltrans_partitions.qsort (flag_toplevel_reorder ? cmp_partitions_size : cmp_partitions_order); + for (i = 0; i < n_sets; i++) { size_t len; diff --git a/gcc/opts.c b/gcc/opts.c index 3a939ac..d3a7735 100644 --- a/gcc/opts.c +++ b/gcc/opts.c @@ -1690,6 +1690,8 @@ common_handle_option (struct gcc_options *opts, opts->x_flag_vect_cost_model = VECT_COST_MODEL_DYNAMIC; if (!opts_set->x_flag_tree_loop_distribute_patterns) opts->x_flag_tree_loop_distribute_patterns = value; + if (!opts_set->x_flag_profile_reorder_functions) + opts->x_flag_profile_reorder_functions = value; /* Indirect call profiling should do all useful transformations speculative devirutalization does. */ if (!opts_set->x_flag_devirtualize_speculatively @@ -1708,6 +1710,8 @@ common_handle_option (struct gcc_options *opts, opts->x_flag_profile_values = value; if (!opts_set->x_flag_inline_functions) opts->x_flag_inline_functions = value; + if (!opts_set->x_flag_profile_reorder_functions) + opts->x_flag_profile_reorder_functions = value; /* FIXME: Instrumentation we insert makes ipa-reference bitmaps quadratic. Disable the pass until better memory representation is done. */ diff --git a/gcc/predict.c b/gcc/predict.c index 61cac52..dbfcfdd 100644 --- a/gcc/predict.c +++ b/gcc/predict.c @@ -2840,12 +2840,24 @@ handle_missing_profiles (void) { struct cgraph_edge *e; gcov_type call_count = 0; + gcov_type max_tp_first_run = 0; struct function *fn = DECL_STRUCT_FUNCTION (node->decl); if (node->count) continue; for (e = node->callers; e; e = e->next_caller) + { call_count += e->count; + + if (e->caller->tp_first_run > max_tp_first_run) + max_tp_first_run = e->caller->tp_first_run; + } + + /* If time profile is missing, let assign the maximum that comes from + caller functions. */ + if (!node->tp_first_run) + node->tp_first_run = max_tp_first_run; + if (call_count && fn && fn->cfg && (call_count * unlikely_count_fraction >= profile_info->runs))