On Fri, Apr 29, 2011 at 1:42 AM, Easwaran Raman <[email protected]> wrote:
> This patch from Silvius Rus adds support for sampled edge profile collection
> to reduce instrumentation run overhead. Bootstraps and no test regressions.
> Ok for google/main?
>
> 2011-04-28 Silvius Rus <[email protected]>
>
> * doc/invoke.texi: Document -fprofile-generate-sampling option.
> * gcov-io.h (__gcov_set_sampling_rate): New declaration.
> * profile.c (branch_prob): Add support for sampled profile
> collection.
> * profile.h (add_sampling_to_edge_counters): New declaration.
> * common.opt (fprofile-generate-sampling): New option.
> * tree-profile: Include header files; define EDGE_COUNTER_STMT_COUNT.
> (instrumentation_to_be_sampled, gcov_sample_counter_decl)
> (gcov_sampling_rate_decl): New globals.
> (insert_if_then, add_sampling_wrapper,
> is_instrumentation_to_be_sampled)
> (add_sampling_to_edge_counters, gimple_init_instrumentation_sampling):
> New functions.
> (gimple_init_edge_profiler): Call gimple_init_instrumentation_sampling.
> (gimple_gen_edge_profiler): Mark start of instrumentation block.
> * libgcov.c (__gcov_sampling_rate): New extern declaration.
> (gcov_sampling_rate_initialized, __gcov_sample_counter): New globals.
> (gcov_exit): Set sampling rate; minor coding style fixes.
> * params.def (PARAM_PROFILE_GENERATE_SAMPLING_RATE): New parameter.
>
> Index: gcc/doc/invoke.texi
> ===================================================================
> --- gcc/doc/invoke.texi (revision 173136)
> +++ gcc/doc/invoke.texi (working copy)
> @@ -375,7 +375,7 @@ Objective-C and Objective-C++ Dialects}.
> -fpartial-inlining -fpeel-loops -fpredictive-commoning @gol
> -fprefetch-loop-arrays @gol
> -fprofile-correction -fprofile-dir=@var{path} -fprofile-generate @gol
> --fprofile-generate=@var{path} @gol
> +-fprofile-generate=@var{path} -fprofile-generate-sampling @gol
> -fprofile-use -fprofile-use=@var{path} -fprofile-values @gol
> -freciprocal-math -fregmove -frename-registers -freorder-blocks @gol
> -freorder-blocks-and-partition -freorder-functions @gol
> @@ -7923,6 +7923,20 @@ The following options are enabled: @code{-fprofile
> If @var{path} is specified, GCC will look at the @var{path} to find
> the profile feedback data files. See @option{-fprofile-dir}.
>
> +@item -fprofile-generate-sampling
> +@opindex -fprofile-generate-sampling
> +
> +Enable sampling for instrumented binaries. Instead of recording every event,
> +record only every N-th event, where N (the sampling rate) can be set either
> +at compile time using
> +@option{--param profile-generate-sampling-rate=@var{value}}, or
> +at execution start time through environment variable
> @samp{GCOV_SAMPLING_RATE}.
> +
> +At this time sampling applies only to branch counters. A sampling rate of
> 100
> +decreases instrumentated binary slowdown from up to 20x for heavily threaded
> +applications down to around 2x. @option{-fprofile-correction} is always
> +needed with sampling.
> +
> @item -fprofile-use
> @itemx -fprofile-use=@var{path}
> @opindex fprofile-use
> @@ -9138,6 +9152,9 @@ recognize.
> If you want to pass an option that takes an argument, you must use
> @option{-Xassembler} twice, once for the option and once for the argument.
>
> +@item profile-generate-sampling-rate
> +Set the sampling rate with @option{-fprofile-generate-sampling}.
> +
> @end table
>
> @node Link Options
> Index: gcc/gcov-io.h
> ===================================================================
> --- gcc/gcov-io.h (revision 173136)
> +++ gcc/gcov-io.h (working copy)
> @@ -544,6 +544,9 @@ struct dyn_imp_mod
> /* Register a new object file module. */
> extern void __gcov_init (struct gcov_info *) ATTRIBUTE_HIDDEN;
>
> +/* Set sampling rate to RATE. */
> +extern void __gcov_set_sampling_rate (unsigned int rate);
> +
> /* Called before fork, to avoid double counting. */
> extern void __gcov_flush (void) ATTRIBUTE_HIDDEN;
>
> Index: gcc/profile.c
> ===================================================================
> --- gcc/profile.c (revision 173136)
> +++ gcc/profile.c (working copy)
> @@ -1210,6 +1210,9 @@ branch_prob (void)
>
> /* Commit changes done by instrumentation. */
> gsi_commit_edge_inserts ();
> +
> + if (flag_profile_generate_sampling)
> + add_sampling_to_edge_counters ();
> }
>
> free_aux_for_edges ();
> Index: gcc/profile.h
> ===================================================================
> --- gcc/profile.h (revision 173136)
> +++ gcc/profile.h (working copy)
> @@ -47,4 +47,10 @@ extern gcov_type sum_edge_counts (VEC (edge, gc) *
> extern void init_node_map (void);
> extern void del_node_map (void);
>
> +/* Implement sampling to avoid writing to edge counters very often.
> + Many concurrent writes to the same counters, or to counters that share
> + the same cache line leads to up to 30x slowdown on an application running
> + on 8 CPUs. With sampling, the slowdown reduced to 2x. */
> +extern void add_sampling_to_edge_counters (void);
> +
> #endif /* PROFILE_H */
> Index: gcc/common.opt
> ===================================================================
> --- gcc/common.opt (revision 173136)
> +++ gcc/common.opt (working copy)
> @@ -1605,6 +1605,10 @@ fprofile-generate=
> Common Joined RejectNegative
> Enable common options for generating profile info for profile feedback
> directed optimizations, and set -fprofile-dir=
>
> +fprofile-generate-sampling
> +Common Var(flag_profile_generate_sampling)
> +Turn on instrumentation sampling with -fprofile-generate with rate set by
> --param profile-generate-sampling-rate or environment variable
> GCOV_SAMPLING_RATE
> +
> fprofile-use
> Common Var(flag_profile_use)
> Enable common options for performing profile feedback directed optimizations
> Index: gcc/tree-profile.c
> ===================================================================
> --- gcc/tree-profile.c (revision 173136)
> +++ gcc/tree-profile.c (working copy)
> @@ -31,6 +31,8 @@ along with GCC; see the file COPYING3. If not see
> #include "coretypes.h"
> #include "tm.h"
> #include "flags.h"
> +#include "target.h"
> +#include "output.h"
> #include "regs.h"
> #include "function.h"
> #include "basic-block.h"
> @@ -44,9 +46,14 @@ along with GCC; see the file COPYING3. If not see
> #include "value-prof.h"
> #include "cgraph.h"
> #include "output.h"
> +#include "params.h"
> +#include "profile.h"
> #include "l-ipo.h"
> #include "profile.h"
>
> +/* Number of statements inserted for each edge counter increment. */
> +#define EDGE_COUNTER_STMT_COUNT 3
> +
> static GTY(()) tree gcov_type_node;
> static GTY(()) tree gcov_type_tmp_var;
> static GTY(()) tree tree_interval_profiler_fn;
> @@ -136,7 +143,179 @@ init_ic_make_global_vars (void)
> }
> }
>
> +/* A set of the first statement in each block of statements that need to
> + be applied a sampling wrapper. */
> +static htab_t instrumentation_to_be_sampled = NULL;
> +
> +/* extern __thread gcov_unsigned_t __gcov_sample_counter */
> +static tree gcov_sample_counter_decl = NULL_TREE;
> +
> +/* extern gcov_unsigned_t __gcov_sampling_rate */
> +static tree gcov_sampling_rate_decl = NULL_TREE;
> +
> +/* Insert STMT_IF around given sequence of consecutive statements in the
> + same basic block starting with STMT_START, ending with STMT_END. */
> +
> +static void
> +insert_if_then (gimple stmt_start, gimple stmt_end, gimple stmt_if)
> +{
> + gimple_stmt_iterator gsi;
> + basic_block bb_original, bb_before_if, bb_after_if;
> + edge e_if_taken, e_then_join;
> +
> + gsi = gsi_for_stmt (stmt_start);
> + gsi_insert_before (&gsi, stmt_if, GSI_SAME_STMT);
> + bb_original = gsi_bb (gsi);
> + e_if_taken = split_block (bb_original, stmt_if);
> + e_if_taken->flags &= ~EDGE_FALLTHRU;
> + e_if_taken->flags |= EDGE_TRUE_VALUE;
> + e_then_join = split_block (e_if_taken->dest, stmt_end);
> + bb_before_if = e_if_taken->src;
> + bb_after_if = e_then_join->dest;
> + make_edge (bb_before_if, bb_after_if, EDGE_FALSE_VALUE);
> +}
> +
> +/* Transform:
> +
> + ORIGINAL CODE
> +
> + Into:
> +
> + __gcov_sample_counter++;
> + if (__gcov_sample_counter >= __gcov_sampling_rate)
> + {
> + __gcov_sample_counter = 0;
> + ORIGINAL CODE
> + }
> +
> + The original code block starts with STMT_START, is made of STMT_COUNT
> + consecutive statements in the same basic block. */
> +
> +static void
> +add_sampling_wrapper (gimple stmt_start, int stmt_count)
> +{
> + int i;
> + tree zero, one, tmp_var, tmp1, tmp2, tmp3;
> + gimple stmt_block_end;
> + gimple stmt_inc_counter1, stmt_inc_counter2, stmt_inc_counter3;
> + gimple stmt_reset_counter, stmt_assign_rate, stmt_if;
> + gimple_stmt_iterator gsi;
> +
> + tmp_var = create_tmp_var (get_gcov_unsigned_t (), "PROF_sample_counter");
create_tmp_reg
> + tmp1 = make_ssa_name (tmp_var, NULL);
> + tmp2 = make_ssa_name (tmp_var, NULL);
> +
> + /* Create all the new statements needed. */
> + stmt_inc_counter1 = gimple_build_assign (tmp1, gcov_sample_counter_decl);
> + one = build_int_cst (get_gcov_unsigned_t (), 1);
> + stmt_inc_counter2 = gimple_build_assign_with_ops (
> + PLUS_EXPR, tmp2, tmp1, one);
> + stmt_inc_counter3 = gimple_build_assign (gcov_sample_counter_decl, tmp2);
> + zero = build_int_cst (get_gcov_unsigned_t (), 0);
> + stmt_reset_counter = gimple_build_assign (gcov_sample_counter_decl, zero);
> + tmp_var = create_tmp_var (get_gcov_unsigned_t (), "PROF_sample_counter");
please re-use the var from above and only create new ssa names. In fact,
please try to re-use a single variable per function.
As far as I understand the idea up to this point, wouldn't it be better
(or also useful) to simply not instrument some edges? For example
at some GCC summit a loooong time ago I presented work to instrument
loops to record min/avg/max number of iterations, if that is done
instrumenting the back-edge could be avoided for example. This may
be complementary to this patch of course.
How is code-size affected with this patch, non-instrumented vs.
regular-instrumented vs. sample-instrumented?
> + tmp3 = make_ssa_name (tmp_var, NULL);
> + stmt_assign_rate = gimple_build_assign (tmp3, gcov_sampling_rate_decl);
> + stmt_if = gimple_build_cond (GE_EXPR, tmp2, tmp3, NULL_TREE, NULL_TREE);
> +
> + /* Insert them for now in the original basic block. */
> + gsi = gsi_for_stmt (stmt_start);
> + gsi_insert_before (&gsi, stmt_inc_counter1, GSI_SAME_STMT);
> + gsi_insert_before (&gsi, stmt_inc_counter2, GSI_SAME_STMT);
> + gsi_insert_before (&gsi, stmt_inc_counter3, GSI_SAME_STMT);
> + gsi_insert_before (&gsi, stmt_assign_rate, GSI_SAME_STMT);
> + gsi_insert_before (&gsi, stmt_reset_counter, GSI_SAME_STMT);
> +
> + /* Move to last statement. */
> + for (i = 0; i < stmt_count - 1; i++)
> + gsi_next (&gsi);
Ugh. This function interface sucks ;) Why not pass a start and end
gsi instead?
> + stmt_block_end = gsi_stmt (gsi);
> + gcc_assert (stmt_block_end);
> +
> + /* Insert IF block. */
> + insert_if_then (stmt_reset_counter, stmt_block_end, stmt_if);
> +}
> +
> +/* Return whether STMT is the beginning of an instrumentation block to be
> + applied sampling. */
> +
> +static bool
> +is_instrumentation_to_be_sampled (gimple stmt)
> +{
> + return (htab_find_slot_with_hash (instrumentation_to_be_sampled, stmt,
> + htab_hash_pointer (stmt), NO_INSERT)
Use a pointer-set instead, or a bitmap of stmt uids.
> + != NULL);
> +}
> +
> +/* Add sampling wrappers around edge counter code in current function. */
> +
> void
> +add_sampling_to_edge_counters (void)
> +{
> + gimple_stmt_iterator gsi;
> + basic_block bb;
> +
> + FOR_EACH_BB_REVERSE (bb)
> + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> + {
> + gimple stmt = gsi_stmt (gsi);
> + if (is_instrumentation_to_be_sampled (stmt))
> + {
> + add_sampling_wrapper (stmt, EDGE_COUNTER_STMT_COUNT);
> + break;
> + }
> + }
> +
> + /* Empty the set of statements performing the edge counter increment. */
> + if (instrumentation_to_be_sampled)
> + htab_empty (instrumentation_to_be_sampled);
> +}
> +
> +static void
> +gimple_init_instrumentation_sampling (void)
> +{
> + if (!gcov_sampling_rate_decl)
> + {
> + /* Define __gcov_sampling_rate regardless of
> -fprofile-generate-sampling.
> + Otherwise the extern reference to it from libgcov becomes unmatched.
> + */
> + gcov_sampling_rate_decl = build_decl (
> + UNKNOWN_LOCATION,
> + VAR_DECL,
> + get_identifier ("__gcov_sampling_rate"),
> + get_gcov_unsigned_t ());
> + TREE_PUBLIC (gcov_sampling_rate_decl) = 1;
> + DECL_ARTIFICIAL (gcov_sampling_rate_decl) = 1;
> + DECL_COMDAT_GROUP (gcov_sampling_rate_decl)
> + = DECL_ASSEMBLER_NAME (gcov_sampling_rate_decl);
I don't think this works across all architectures we support.
> + TREE_STATIC (gcov_sampling_rate_decl) = 1;
> + DECL_INITIAL (gcov_sampling_rate_decl) = build_int_cst (
> + get_gcov_unsigned_t (),
> + PARAM_VALUE (PARAM_PROFILE_GENERATE_SAMPLING_RATE));
> + assemble_variable (gcov_sampling_rate_decl, 0, 0, 0);
> + }
> +
> + if (flag_profile_generate_sampling && !instrumentation_to_be_sampled)
> + {
> + instrumentation_to_be_sampled = htab_create (100, htab_hash_pointer,
> + htab_eq_pointer, NULL);
> + gcov_sample_counter_decl = build_decl (
> + UNKNOWN_LOCATION,
> + VAR_DECL,
> + get_identifier ("__gcov_sample_counter"),
> + get_gcov_unsigned_t ());
> + TREE_PUBLIC (gcov_sample_counter_decl) = 1;
> + DECL_EXTERNAL (gcov_sample_counter_decl) = 1;
> + DECL_ARTIFICIAL (gcov_sample_counter_decl) = 1;
> + if (targetm.have_tls)
> + DECL_TLS_MODEL (gcov_sample_counter_decl) =
> + decl_default_tls_model (gcov_sample_counter_decl);
> + assemble_variable (gcov_sample_counter_decl, 0, 0, 0);
> + }
> +}
> +
> +void
> gimple_init_edge_profiler (void)
> {
> tree interval_profiler_fn_type;
> @@ -148,6 +327,8 @@ gimple_init_edge_profiler (void)
> tree dc_profiler_fn_type;
> tree average_profiler_fn_type;
>
> + gimple_init_instrumentation_sampling ();
> +
> if (!gcov_type_node)
> {
> char name_buf[32];
> @@ -277,6 +458,7 @@ gimple_init_edge_profiler (void)
> void
> gimple_gen_edge_profiler (int edgeno, edge e)
> {
> + void** slot;
> tree ref, one;
> gimple stmt1, stmt2, stmt3;
>
> @@ -292,6 +474,15 @@ gimple_gen_edge_profiler (int edgeno, edge e)
> gimple_assign_lhs (stmt1), one);
> gimple_assign_set_lhs (stmt2, make_ssa_name (gcov_type_tmp_var, stmt2));
> stmt3 = gimple_build_assign (unshare_expr (ref), gimple_assign_lhs (stmt2));
> +
> + if (flag_profile_generate_sampling)
> + {
> + slot = htab_find_slot_with_hash (instrumentation_to_be_sampled, stmt1,
> + htab_hash_pointer (stmt1), INSERT);
> + gcc_assert (!*slot);
> + *slot = stmt1;
> + }
> +
What's the reason to delay the sampling transform and not apply it here?
At least a comment should be added for this.
> gsi_insert_on_edge (e, stmt1);
> gsi_insert_on_edge (e, stmt2);
> gsi_insert_on_edge (e, stmt3);
> Index: gcc/libgcov.c
> ===================================================================
> --- gcc/libgcov.c (revision 173136)
> +++ gcc/libgcov.c (working copy)
> @@ -83,6 +83,20 @@ void __gcov_merge_delta (gcov_type *counters __at
> #ifdef L_gcov
> #include "gcov-io.c"
>
> +/* Sampling rate. */
> +extern gcov_unsigned_t __gcov_sampling_rate;
> +static int gcov_sampling_rate_initialized = 0;
> +
> +/* Set sampling rate to RATE. */
> +
> +void __gcov_set_sampling_rate (unsigned int rate)
> +{
> + __gcov_sampling_rate = rate;
> +}
> +
> +/* Per thread sample counter. */
> +THREAD_PREFIX gcov_unsigned_t __gcov_sample_counter = 0;
> +
> /* Chain of per-object gcov structures. */
> extern struct gcov_info *__gcov_list;
>
> @@ -365,7 +379,7 @@ gcov_exit (void)
>
> {
> /* Check if the level of dirs to strip off specified. */
> - char *tmp = getenv("GCOV_PREFIX_STRIP");
> + char *tmp = getenv ("GCOV_PREFIX_STRIP");
> if (tmp)
> {
> gcov_prefix_strip = atoi (tmp);
> @@ -375,7 +389,7 @@ gcov_exit (void)
> }
> }
> /* Get file name relocation prefix. Non-absolute values are ignored. */
> - gcov_prefix = getenv("GCOV_PREFIX");
> + gcov_prefix = getenv ("GCOV_PREFIX");
> if (gcov_prefix)
> {
> prefix_length = strlen(gcov_prefix);
> @@ -757,6 +771,17 @@ gcov_exit (void)
> void
> __gcov_init (struct gcov_info *info)
> {
> + if (!gcov_sampling_rate_initialized)
> + {
> + const char* env_value_str = getenv ("GCOV_SAMPLING_RATE");
> + if (env_value_str)
> + {
> + int env_value_int = atoi(env_value_str);
> + if (env_value_int >= 1)
> + __gcov_sampling_rate = env_value_int;
> + }
> + gcov_sampling_rate_initialized = 1;
> + }
> if (!info->version)
> return;
> if (gcov_version (info, info->version, 0))
> Index: gcc/params.def
> ===================================================================
> --- gcc/params.def (revision 173136)
> +++ gcc/params.def (working copy)
> @@ -929,6 +929,11 @@ DEFPARAM (CXX_MAX_NAMESPACES_FOR_DIAGNOSTIC_HELP,
> "name lookup fails",
> 1000, 0, 0)
>
> +DEFPARAM (PARAM_PROFILE_GENERATE_SAMPLING_RATE,
> + "profile-generate-sampling-rate",
> + "sampling rate with -fprofile-generate-sampling",
> + 100, 0, 2000000000)
Zero is really ok?
Thanks,
Richard.
> +
> /*
> Local variables:
> mode:c
>
> --
> This patch is available for review at http://codereview.appspot.com/4438083
>