+ Honza This patch may be a candidate for trunk as well. This feature allows profile collection with much less overhead (for multi-threaded programs with hot regions, the slowdown can be significant due to the cache ping-pong effect of counter updates) without sacrificing too much performance.
Another use for this support is that it allows profile collection to be turned on/off asynchronously for long-running server programs, where profile data from the warm-up period is sometimes not important and should be excluded. A known limitation is that value profiling is not yet sampled, but it does not seem to cause problems. David On Thu, Apr 28, 2011 at 4:42 PM, Easwaran Raman <era...@google.com> wrote: > This patch from Silvius Rus adds support for sampled edge profile collection > to reduce instrumentation run overhead. Bootstraps and no test regressions. > Ok for google/main? > > 2011-04-28 Silvius Rus <silvius....@gmail.com> > > * doc/invoke.texi: Document -fprofile-generate-sampling option. > * gcov-io.h (__gcov_set_sampling_rate): New declaration. > * profile.c (branch_prob): Add support for sampled profile > collection. > * profile.h (add_sampling_to_edge_counters): New declaration. > * common.opt (fprofile-generate-sampling): New option. > * tree-profile: Include header files; define EDGE_COUNTER_STMT_COUNT. > (instrumentation_to_be_sampled, gcov_sample_counter_decl) > (gcov_sampling_rate_decl): New globals. > (insert_if_then, add_sampling_wrapper, > is_instrumentation_to_be_sampled) > (add_sampling_to_edge_counters, gimple_init_instrumentation_sampling): > New functions. > (gimple_init_edge_profiler): Call gimple_init_instrumentation_sampling. > (gimple_gen_edge_profiler): Mark start of instrumentation block. > * libgcov.c (__gcov_sampling_rate): New extern declaration. > (gcov_sampling_rate_initialized, __gcov_sample_counter): New globals. > (gcov_exit): Set sampling rate; minor coding style fixes. > * params.def (PARAM_PROFILE_GENERATE_SAMPLING_RATE): New parameter. > > Index: gcc/doc/invoke.texi > =================================================================== > --- gcc/doc/invoke.texi (revision 173136) > +++ gcc/doc/invoke.texi (working copy) > @@ -375,7 +375,7 @@ Objective-C and Objective-C++ Dialects}. 
> -fpartial-inlining -fpeel-loops -fpredictive-commoning @gol > -fprefetch-loop-arrays @gol > -fprofile-correction -fprofile-dir=@var{path} -fprofile-generate @gol > --fprofile-generate=@var{path} @gol > +-fprofile-generate=@var{path} -fprofile-generate-sampling @gol > -fprofile-use -fprofile-use=@var{path} -fprofile-values @gol > -freciprocal-math -fregmove -frename-registers -freorder-blocks @gol > -freorder-blocks-and-partition -freorder-functions @gol > @@ -7923,6 +7923,20 @@ The following options are enabled: @code{-fprofile > If @var{path} is specified, GCC will look at the @var{path} to find > the profile feedback data files. See @option{-fprofile-dir}. > > +@item -fprofile-generate-sampling > +@opindex -fprofile-generate-sampling > + > +Enable sampling for instrumented binaries. Instead of recording every event, > +record only every N-th event, where N (the sampling rate) can be set either > +at compile time using > +@option{--param profile-generate-sampling-rate=@var{value}}, or > +at execution start time through environment variable > @samp{GCOV_SAMPLING_RATE}. > + > +At this time sampling applies only to branch counters. A sampling rate of > 100 > +decreases instrumentated binary slowdown from up to 20x for heavily threaded > +applications down to around 2x. @option{-fprofile-correction} is always > +needed with sampling. > + > @item -fprofile-use > @itemx -fprofile-use=@var{path} > @opindex fprofile-use > @@ -9138,6 +9152,9 @@ recognize. > If you want to pass an option that takes an argument, you must use > @option{-Xassembler} twice, once for the option and once for the argument. > > +@item profile-generate-sampling-rate > +Set the sampling rate with @option{-fprofile-generate-sampling}. 
> + > @end table > > @node Link Options > Index: gcc/gcov-io.h > =================================================================== > --- gcc/gcov-io.h (revision 173136) > +++ gcc/gcov-io.h (working copy) > @@ -544,6 +544,9 @@ struct dyn_imp_mod > /* Register a new object file module. */ > extern void __gcov_init (struct gcov_info *) ATTRIBUTE_HIDDEN; > > +/* Set sampling rate to RATE. */ > +extern void __gcov_set_sampling_rate (unsigned int rate); > + > /* Called before fork, to avoid double counting. */ > extern void __gcov_flush (void) ATTRIBUTE_HIDDEN; > > Index: gcc/profile.c > =================================================================== > --- gcc/profile.c (revision 173136) > +++ gcc/profile.c (working copy) > @@ -1210,6 +1210,9 @@ branch_prob (void) > > /* Commit changes done by instrumentation. */ > gsi_commit_edge_inserts (); > + > + if (flag_profile_generate_sampling) > + add_sampling_to_edge_counters (); > } > > free_aux_for_edges (); > Index: gcc/profile.h > =================================================================== > --- gcc/profile.h (revision 173136) > +++ gcc/profile.h (working copy) > @@ -47,4 +47,10 @@ extern gcov_type sum_edge_counts (VEC (edge, gc) * > extern void init_node_map (void); > extern void del_node_map (void); > > +/* Implement sampling to avoid writing to edge counters very often. > + Many concurrent writes to the same counters, or to counters that share > + the same cache line leads to up to 30x slowdown on an application running > + on 8 CPUs. With sampling, the slowdown reduced to 2x. 
*/ > +extern void add_sampling_to_edge_counters (void); > + > #endif /* PROFILE_H */ > Index: gcc/common.opt > =================================================================== > --- gcc/common.opt (revision 173136) > +++ gcc/common.opt (working copy) > @@ -1605,6 +1605,10 @@ fprofile-generate= > Common Joined RejectNegative > Enable common options for generating profile info for profile feedback > directed optimizations, and set -fprofile-dir= > > +fprofile-generate-sampling > +Common Var(flag_profile_generate_sampling) > +Turn on instrumentation sampling with -fprofile-generate with rate set by > --param profile-generate-sampling-rate or environment variable > GCOV_SAMPLING_RATE > + > fprofile-use > Common Var(flag_profile_use) > Enable common options for performing profile feedback directed optimizations > Index: gcc/tree-profile.c > =================================================================== > --- gcc/tree-profile.c (revision 173136) > +++ gcc/tree-profile.c (working copy) > @@ -31,6 +31,8 @@ along with GCC; see the file COPYING3. If not see > #include "coretypes.h" > #include "tm.h" > #include "flags.h" > +#include "target.h" > +#include "output.h" > #include "regs.h" > #include "function.h" > #include "basic-block.h" > @@ -44,9 +46,14 @@ along with GCC; see the file COPYING3. If not see > #include "value-prof.h" > #include "cgraph.h" > #include "output.h" > +#include "params.h" > +#include "profile.h" > #include "l-ipo.h" > #include "profile.h" > > +/* Number of statements inserted for each edge counter increment. */ > +#define EDGE_COUNTER_STMT_COUNT 3 > + > static GTY(()) tree gcov_type_node; > static GTY(()) tree gcov_type_tmp_var; > static GTY(()) tree tree_interval_profiler_fn; > @@ -136,7 +143,179 @@ init_ic_make_global_vars (void) > } > } > > +/* A set of the first statement in each block of statements that need to > + be applied a sampling wrapper. 
*/ > +static htab_t instrumentation_to_be_sampled = NULL; > + > +/* extern __thread gcov_unsigned_t __gcov_sample_counter */ > +static tree gcov_sample_counter_decl = NULL_TREE; > + > +/* extern gcov_unsigned_t __gcov_sampling_rate */ > +static tree gcov_sampling_rate_decl = NULL_TREE; > + > +/* Insert STMT_IF around given sequence of consecutive statements in the > + same basic block starting with STMT_START, ending with STMT_END. */ > + > +static void > +insert_if_then (gimple stmt_start, gimple stmt_end, gimple stmt_if) > +{ > + gimple_stmt_iterator gsi; > + basic_block bb_original, bb_before_if, bb_after_if; > + edge e_if_taken, e_then_join; > + > + gsi = gsi_for_stmt (stmt_start); > + gsi_insert_before (&gsi, stmt_if, GSI_SAME_STMT); > + bb_original = gsi_bb (gsi); > + e_if_taken = split_block (bb_original, stmt_if); > + e_if_taken->flags &= ~EDGE_FALLTHRU; > + e_if_taken->flags |= EDGE_TRUE_VALUE; > + e_then_join = split_block (e_if_taken->dest, stmt_end); > + bb_before_if = e_if_taken->src; > + bb_after_if = e_then_join->dest; > + make_edge (bb_before_if, bb_after_if, EDGE_FALSE_VALUE); > +} > + > +/* Transform: > + > + ORIGINAL CODE > + > + Into: > + > + __gcov_sample_counter++; > + if (__gcov_sample_counter >= __gcov_sampling_rate) > + { > + __gcov_sample_counter = 0; > + ORIGINAL CODE > + } > + > + The original code block starts with STMT_START, is made of STMT_COUNT > + consecutive statements in the same basic block. */ > + > +static void > +add_sampling_wrapper (gimple stmt_start, int stmt_count) > +{ > + int i; > + tree zero, one, tmp_var, tmp1, tmp2, tmp3; > + gimple stmt_block_end; > + gimple stmt_inc_counter1, stmt_inc_counter2, stmt_inc_counter3; > + gimple stmt_reset_counter, stmt_assign_rate, stmt_if; > + gimple_stmt_iterator gsi; > + > + tmp_var = create_tmp_var (get_gcov_unsigned_t (), "PROF_sample_counter"); > + tmp1 = make_ssa_name (tmp_var, NULL); > + tmp2 = make_ssa_name (tmp_var, NULL); > + > + /* Create all the new statements needed. 
*/ > + stmt_inc_counter1 = gimple_build_assign (tmp1, gcov_sample_counter_decl); > + one = build_int_cst (get_gcov_unsigned_t (), 1); > + stmt_inc_counter2 = gimple_build_assign_with_ops ( > + PLUS_EXPR, tmp2, tmp1, one); > + stmt_inc_counter3 = gimple_build_assign (gcov_sample_counter_decl, tmp2); > + zero = build_int_cst (get_gcov_unsigned_t (), 0); > + stmt_reset_counter = gimple_build_assign (gcov_sample_counter_decl, zero); > + tmp_var = create_tmp_var (get_gcov_unsigned_t (), "PROF_sample_counter"); > + tmp3 = make_ssa_name (tmp_var, NULL); > + stmt_assign_rate = gimple_build_assign (tmp3, gcov_sampling_rate_decl); > + stmt_if = gimple_build_cond (GE_EXPR, tmp2, tmp3, NULL_TREE, NULL_TREE); > + > + /* Insert them for now in the original basic block. */ > + gsi = gsi_for_stmt (stmt_start); > + gsi_insert_before (&gsi, stmt_inc_counter1, GSI_SAME_STMT); > + gsi_insert_before (&gsi, stmt_inc_counter2, GSI_SAME_STMT); > + gsi_insert_before (&gsi, stmt_inc_counter3, GSI_SAME_STMT); > + gsi_insert_before (&gsi, stmt_assign_rate, GSI_SAME_STMT); > + gsi_insert_before (&gsi, stmt_reset_counter, GSI_SAME_STMT); > + > + /* Move to last statement. */ > + for (i = 0; i < stmt_count - 1; i++) > + gsi_next (&gsi); > + > + stmt_block_end = gsi_stmt (gsi); > + gcc_assert (stmt_block_end); > + > + /* Insert IF block. */ > + insert_if_then (stmt_reset_counter, stmt_block_end, stmt_if); > +} > + > +/* Return whether STMT is the beginning of an instrumentation block to be > + applied sampling. */ > + > +static bool > +is_instrumentation_to_be_sampled (gimple stmt) > +{ > + return (htab_find_slot_with_hash (instrumentation_to_be_sampled, stmt, > + htab_hash_pointer (stmt), NO_INSERT) > + != NULL); > +} > + > +/* Add sampling wrappers around edge counter code in current function. 
*/ > + > void > +add_sampling_to_edge_counters (void) > +{ > + gimple_stmt_iterator gsi; > + basic_block bb; > + > + FOR_EACH_BB_REVERSE (bb) > + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) > + { > + gimple stmt = gsi_stmt (gsi); > + if (is_instrumentation_to_be_sampled (stmt)) > + { > + add_sampling_wrapper (stmt, EDGE_COUNTER_STMT_COUNT); > + break; > + } > + } > + > + /* Empty the set of statements performing the edge counter increment. */ > + if (instrumentation_to_be_sampled) > + htab_empty (instrumentation_to_be_sampled); > +} > + > +static void > +gimple_init_instrumentation_sampling (void) > +{ > + if (!gcov_sampling_rate_decl) > + { > + /* Define __gcov_sampling_rate regardless of > -fprofile-generate-sampling. > + Otherwise the extern reference to it from libgcov becomes unmatched. > + */ > + gcov_sampling_rate_decl = build_decl ( > + UNKNOWN_LOCATION, > + VAR_DECL, > + get_identifier ("__gcov_sampling_rate"), > + get_gcov_unsigned_t ()); > + TREE_PUBLIC (gcov_sampling_rate_decl) = 1; > + DECL_ARTIFICIAL (gcov_sampling_rate_decl) = 1; > + DECL_COMDAT_GROUP (gcov_sampling_rate_decl) > + = DECL_ASSEMBLER_NAME (gcov_sampling_rate_decl); > + TREE_STATIC (gcov_sampling_rate_decl) = 1; > + DECL_INITIAL (gcov_sampling_rate_decl) = build_int_cst ( > + get_gcov_unsigned_t (), > + PARAM_VALUE (PARAM_PROFILE_GENERATE_SAMPLING_RATE)); > + assemble_variable (gcov_sampling_rate_decl, 0, 0, 0); > + } > + > + if (flag_profile_generate_sampling && !instrumentation_to_be_sampled) > + { > + instrumentation_to_be_sampled = htab_create (100, htab_hash_pointer, > + htab_eq_pointer, NULL); > + gcov_sample_counter_decl = build_decl ( > + UNKNOWN_LOCATION, > + VAR_DECL, > + get_identifier ("__gcov_sample_counter"), > + get_gcov_unsigned_t ()); > + TREE_PUBLIC (gcov_sample_counter_decl) = 1; > + DECL_EXTERNAL (gcov_sample_counter_decl) = 1; > + DECL_ARTIFICIAL (gcov_sample_counter_decl) = 1; > + if (targetm.have_tls) > + DECL_TLS_MODEL 
(gcov_sample_counter_decl) = > + decl_default_tls_model (gcov_sample_counter_decl); > + assemble_variable (gcov_sample_counter_decl, 0, 0, 0); > + } > +} > + > +void > gimple_init_edge_profiler (void) > { > tree interval_profiler_fn_type; > @@ -148,6 +327,8 @@ gimple_init_edge_profiler (void) > tree dc_profiler_fn_type; > tree average_profiler_fn_type; > > + gimple_init_instrumentation_sampling (); > + > if (!gcov_type_node) > { > char name_buf[32]; > @@ -277,6 +458,7 @@ gimple_init_edge_profiler (void) > void > gimple_gen_edge_profiler (int edgeno, edge e) > { > + void** slot; > tree ref, one; > gimple stmt1, stmt2, stmt3; > > @@ -292,6 +474,15 @@ gimple_gen_edge_profiler (int edgeno, edge e) > gimple_assign_lhs (stmt1), one); > gimple_assign_set_lhs (stmt2, make_ssa_name (gcov_type_tmp_var, stmt2)); > stmt3 = gimple_build_assign (unshare_expr (ref), gimple_assign_lhs (stmt2)); > + > + if (flag_profile_generate_sampling) > + { > + slot = htab_find_slot_with_hash (instrumentation_to_be_sampled, stmt1, > + htab_hash_pointer (stmt1), INSERT); > + gcc_assert (!*slot); > + *slot = stmt1; > + } > + > gsi_insert_on_edge (e, stmt1); > gsi_insert_on_edge (e, stmt2); > gsi_insert_on_edge (e, stmt3); > Index: gcc/libgcov.c > =================================================================== > --- gcc/libgcov.c (revision 173136) > +++ gcc/libgcov.c (working copy) > @@ -83,6 +83,20 @@ void __gcov_merge_delta (gcov_type *counters __at > #ifdef L_gcov > #include "gcov-io.c" > > +/* Sampling rate. */ > +extern gcov_unsigned_t __gcov_sampling_rate; > +static int gcov_sampling_rate_initialized = 0; > + > +/* Set sampling rate to RATE. */ > + > +void __gcov_set_sampling_rate (unsigned int rate) > +{ > + __gcov_sampling_rate = rate; > +} > + > +/* Per thread sample counter. */ > +THREAD_PREFIX gcov_unsigned_t __gcov_sample_counter = 0; > + > /* Chain of per-object gcov structures. 
*/ > extern struct gcov_info *__gcov_list; > > @@ -365,7 +379,7 @@ gcov_exit (void) > > { > /* Check if the level of dirs to strip off specified. */ > - char *tmp = getenv("GCOV_PREFIX_STRIP"); > + char *tmp = getenv ("GCOV_PREFIX_STRIP"); > if (tmp) > { > gcov_prefix_strip = atoi (tmp); > @@ -375,7 +389,7 @@ gcov_exit (void) > } > } > /* Get file name relocation prefix. Non-absolute values are ignored. */ > - gcov_prefix = getenv("GCOV_PREFIX"); > + gcov_prefix = getenv ("GCOV_PREFIX"); > if (gcov_prefix) > { > prefix_length = strlen(gcov_prefix); > @@ -757,6 +771,17 @@ gcov_exit (void) > void > __gcov_init (struct gcov_info *info) > { > + if (!gcov_sampling_rate_initialized) > + { > + const char* env_value_str = getenv ("GCOV_SAMPLING_RATE"); > + if (env_value_str) > + { > + int env_value_int = atoi(env_value_str); > + if (env_value_int >= 1) > + __gcov_sampling_rate = env_value_int; > + } > + gcov_sampling_rate_initialized = 1; > + } > if (!info->version) > return; > if (gcov_version (info, info->version, 0)) > Index: gcc/params.def > =================================================================== > --- gcc/params.def (revision 173136) > +++ gcc/params.def (working copy) > @@ -929,6 +929,11 @@ DEFPARAM (CXX_MAX_NAMESPACES_FOR_DIAGNOSTIC_HELP, > "name lookup fails", > 1000, 0, 0) > > +DEFPARAM (PARAM_PROFILE_GENERATE_SAMPLING_RATE, > + "profile-generate-sampling-rate", > + "sampling rate with -fprofile-generate-sampling", > + 100, 0, 2000000000) > + > /* > Local variables: > mode:c > > -- > This patch is available for review at http://codereview.appspot.com/4438083 >