Re: [PATCH] Conditional count update for fast coverage test in multi-threaded programs

2014-04-17 Thread Richard Biener
On Fri, Dec 20, 2013 at 11:45 PM, Rong Xu  wrote:
> Here are the results using our internal benchmarks which are a mix of
> multi-threaded and single-threaded programs.
> This was collected about a month ago but I did not get time to send
> it due to an unexpected trip.
>
> cmpxchg gives the worst performance due to the memory barriers it incurs.
> I'll send a patch that supports conditional_1 and unconditional_1.

Bah, too bad.  Ok, so another idea is to use a non-temporal unconditional
store of 1.  According to docs this is weakly ordered and side-steps the
cache (movnti/movntq on x86).

Btw,

+@item coverage-exec_once
+Set to 1 to update each arc counter only once. This avoids false sharing
+and speeds up profile-generate run for multi-threaded programs.
+

for -fprofile-generate this certainly is a bad choice but I can see that
it is useful for -coverage.  Also avoid mixing - and _ here.  Using a
--param here is not the proper thing for a non-developer feature,
thus I'd suggest to add -fcoverage-exec-once instead.  It is supposed
to help for -coverage, right?  Not for -fprofile-generate?

Instead of using a pointer-set to record stmts to "instrument" just
set one of the pass-local flags on the stmts (gimple_set_plf/gimple_plf,
you have to clear flags before using them).

Thanks,
Richard.

> - result -
>
> base: original_coverage
> (1): using_conditional_1 -- using branch (my original implementation)
> (2): using_unconditional_1 -- write straight 1
> (3): using_cmpxchg -- using cmpxchg to write 1
>
> Values are performance ratios where 100.0 equals the performance of
> O2. Larger numbers are faster.
> "--" means the test failed due to running too slowly.
>
> arch: westmere
>   Benchmark   Base  (1)(2)  (3)
> -
> benchmark_126.4  +176.62%  +17.20%--
> benchmark_2  --[78.4]   [12.3]--
> benchmark_386.3+6.15%  +10.52%   -61.28%
> benchmark_488.4+6.59%  +14.26%   -68.76%
> benchmark_589.6+6.26%  +13.00%   -68.74%
> benchmark_676.7   +22.28%  +29.15%   -75.31%
> benchmark_789.0-0.62%   +3.36%   -71.37%
> benchmark_884.5-1.45%   +5.27%   -74.04%
> benchmark_981.3   +10.64%  +13.32%   -72.82%
> benchmark_10   59.1   +44.71%  +14.77%   -73.24%
> benchmark_11   90.3-1.74%   +4.22%   -61.95%
> benchmark_12   98.9+0.07%   +0.48%-6.37%
> benchmark_13   74.0-4.69%   +4.35%   -77.02%
> benchmark_14   21.4  +309.92%  +63.41%   -35.82%
> benchmark_15   21.4  +282.33%  +58.15%   -57.98%
> benchmark_16   85.1-7.71%   +1.65%   -60.72%
> benchmark_17   81.7+2.47%   +8.20%   -72.08%
> benchmark_18   83.7+1.59%   +3.83%   -69.33%
> geometric mean +30.30%  +14.41%  -65.66% (incomplete)
>
> arch: sandybridge
>   Benchmark   Base(1)   (2)  (3)
> -
> benchmark_1 --[70.1]   [26.1]   --
> benchmark_2 --[79.1]   --   --
> benchmark_3   84.3   +10.82%  +15.84%  -68.98%
> benchmark_4   88.5   +10.28%  +11.35%  -75.10%
> benchmark_5   89.4   +10.46%  +11.40%  -74.41%
> benchmark_6   65.5   +38.52%  +44.46%  -77.97%
> benchmark_7   87.7-0.16%   +1.74%  -76.19%
> benchmark_8   89.6-4.52%   +6.29%  -78.10%
> benchmark_9   79.9   +13.43%  +19.44%  -75.99%
> benchmark_10  52.6   +61.53%   +8.23%  -78.41%
> benchmark_11  89.9-1.40%   +3.37%  -68.16%
> benchmark_12  99.0+1.51%   +0.63%  -10.37%
> benchmark_13  74.3-6.75%   +3.89%  -81.84%
> benchmark_14  21.8  +295.76%  +19.48%  -51.58%
> benchmark_15  23.5  +257.20%  +29.33%  -83.53%
> benchmark_16  84.4   -10.04%   +2.39%  -68.25%
> benchmark_17  81.6+0.60%   +8.82%  -78.02%
> benchmark_18  87.4-1.14%   +9.69%  -75.88%
> geometric mean   +25.64%  +11.76%  -72.96% (incomplete)
>
> arch: clovertown
>   Benchmark Base   (1)   (2)(3)
> --
> benchmark_1 -- [83.4]-- --
> benchmark_2 -- [82.3]-- --
> benchmark_3   86.2 +7.58%   +13.10%-81.74%
> benchmark_4   89.4 +5.69%   +11.70%-82.97%
> benchmark_5   92.8 +4.67%+7.48%-80.02%
> benchmark_6   78.1+13.28%   +22.21%-86.92%
> benchmark_7   96.8 +0.25%+5.44%-84.94%
> benchmark_8   89.1 +0.66%+3.60%-85.89%
> benchmark_9   86.4 +8.42%+9.95%-82.30%
> benchmark_10  59.7+44.95%   +

Re: [PATCH] Conditional count update for fast coverage test in multi-threaded programs

2013-12-20 Thread Rong Xu
Here are the results using our internal benchmarks which are a mix of
multi-threaded and single-threaded programs.
This was collected about a month ago but I did not get time to send
it due to an unexpected trip.

cmpxchg gives the worst performance due to the memory barriers it incurs.
I'll send a patch that supports conditional_1 and unconditional_1.

- result -

base: original_coverage
(1): using_conditional_1 -- using branch (my original implementation)
(2): using_unconditional_1 -- write straight 1
(3): using_cmpxchg -- using cmpxchg to write 1

Values are performance ratios where 100.0 equals the performance of
O2. Larger numbers are faster.
"--" means the test failed due to running too slowly.

arch: westmere
  Benchmark   Base  (1)(2)  (3)
-
benchmark_126.4  +176.62%  +17.20%--
benchmark_2  --[78.4]   [12.3]--
benchmark_386.3+6.15%  +10.52%   -61.28%
benchmark_488.4+6.59%  +14.26%   -68.76%
benchmark_589.6+6.26%  +13.00%   -68.74%
benchmark_676.7   +22.28%  +29.15%   -75.31%
benchmark_789.0-0.62%   +3.36%   -71.37%
benchmark_884.5-1.45%   +5.27%   -74.04%
benchmark_981.3   +10.64%  +13.32%   -72.82%
benchmark_10   59.1   +44.71%  +14.77%   -73.24%
benchmark_11   90.3-1.74%   +4.22%   -61.95%
benchmark_12   98.9+0.07%   +0.48%-6.37%
benchmark_13   74.0-4.69%   +4.35%   -77.02%
benchmark_14   21.4  +309.92%  +63.41%   -35.82%
benchmark_15   21.4  +282.33%  +58.15%   -57.98%
benchmark_16   85.1-7.71%   +1.65%   -60.72%
benchmark_17   81.7+2.47%   +8.20%   -72.08%
benchmark_18   83.7+1.59%   +3.83%   -69.33%
geometric mean +30.30%  +14.41%  -65.66% (incomplete)

arch: sandybridge
  Benchmark   Base(1)   (2)  (3)
-
benchmark_1 --[70.1]   [26.1]   --
benchmark_2 --[79.1]   --   --
benchmark_3   84.3   +10.82%  +15.84%  -68.98%
benchmark_4   88.5   +10.28%  +11.35%  -75.10%
benchmark_5   89.4   +10.46%  +11.40%  -74.41%
benchmark_6   65.5   +38.52%  +44.46%  -77.97%
benchmark_7   87.7-0.16%   +1.74%  -76.19%
benchmark_8   89.6-4.52%   +6.29%  -78.10%
benchmark_9   79.9   +13.43%  +19.44%  -75.99%
benchmark_10  52.6   +61.53%   +8.23%  -78.41%
benchmark_11  89.9-1.40%   +3.37%  -68.16%
benchmark_12  99.0+1.51%   +0.63%  -10.37%
benchmark_13  74.3-6.75%   +3.89%  -81.84%
benchmark_14  21.8  +295.76%  +19.48%  -51.58%
benchmark_15  23.5  +257.20%  +29.33%  -83.53%
benchmark_16  84.4   -10.04%   +2.39%  -68.25%
benchmark_17  81.6+0.60%   +8.82%  -78.02%
benchmark_18  87.4-1.14%   +9.69%  -75.88%
geometric mean   +25.64%  +11.76%  -72.96% (incomplete)

arch: clovertown
  Benchmark Base   (1)   (2)(3)
--
benchmark_1 -- [83.4]-- --
benchmark_2 -- [82.3]-- --
benchmark_3   86.2 +7.58%   +13.10%-81.74%
benchmark_4   89.4 +5.69%   +11.70%-82.97%
benchmark_5   92.8 +4.67%+7.48%-80.02%
benchmark_6   78.1+13.28%   +22.21%-86.92%
benchmark_7   96.8 +0.25%+5.44%-84.94%
benchmark_8   89.1 +0.66%+3.60%-85.89%
benchmark_9   86.4 +8.42%+9.95%-82.30%
benchmark_10  59.7+44.95%   +21.79% --
benchmark_11  91.2 -0.29%+4.35%-76.05%
benchmark_12  99.0 +0.31%-0.05%-25.19%
benchmark_14   8.2  +1011.27%  +104.15% +5.56%
benchmark_15  11.7   +669.25%  +108.54%-29.83%
benchmark_16  85.7 -7.51%+4.43% --
benchmark_17  87.7 +2.84%+7.45% --
benchmark_18  87.9 +1.59%+3.82%-81.11%
geometric mean +37.89%   +17.54%  -74.47% (incomplete)

arch: istanbul

 Benchmark   Base(1)   (2)   (3)
--
benchmark_1 --[73.2]   -- --
benchmark_2 --[82.9]   -- --
benchmark_3   86.1+4.56%  +11.68%-61.04%
benchmark_4   92.0+3.47%   +4.63%-64.84%
benchmark_5   91.9+4.18%   +4.90%-64.77%
benchmark_6   73.6   +23.36%  +27.13%-72.64%
benchmark_7   93.6-3.57%   +4.76%-68.54%
benchmark_8   88.9-3.01%   +2.87%-75

Re: [PATCH] Conditional count update for fast coverage test in multi-threaded programs

2013-11-25 Thread Rong Xu
On Mon, Nov 25, 2013 at 2:11 AM, Richard Biener
 wrote:
> On Fri, Nov 22, 2013 at 10:49 PM, Rong Xu  wrote:
>> On Fri, Nov 22, 2013 at 4:03 AM, Richard Biener
>>  wrote:
>>> On Fri, Nov 22, 2013 at 4:51 AM, Rong Xu  wrote:
 Hi,

 This patch injects a condition into the instrumented code for edge
 counter update. The counter value will not be updated after reaching
 value 1.

 The feature is under a new parameter --param=coverage-exec_once.
 Default is disabled and setting to 1 to enable.

 This extra check usually slows the program down. For SPEC 2006
 benchmarks (all single thread programs), we usually see around 20%-35%
 slow down in -O2 coverage build. This feature, however, is expected to
 improve the coverage run speed for multi-threaded programs, because
 there is virtually no data race and false sharing in updating counters.
 The improvement can be significant for highly threaded programs -- we
 are seeing 7x speedup in coverage test run for some non-trivial google
 applications.

 Tested with bootstrap.
>>>
>>> Err - why not simply emit
>>>
>>>   counter = 1
>>>
>>> for the counter update itself with that --param (I don't like a --param
>>> for this either).
>>>
>>> I assume that CPUs can avoid data-races and false sharing for
>>> non-changing accesses?
>>>
>>
>> I'm not aware of any CPU having this feature. I think a write to the
>> shared cache line to invalidate all the shared copies. I cannot find
>> any reference on checking the value of the write. Do you have any
>> pointer to the feature?
>
> I don't have any pointer - but I remember seeing this in the context
> of atomics thus it may be only in the context of using a xchg
> or cmpxchg instruction.  Which would make it non-portable to
> some extent (if you don't want to use atomic builtins here).
>

cmpxchg should work here -- it's a conditional write so the data race
/false sharing can be avoided.
I'm comparing the performance b/w explicit branch vs cmpxchg and will
report back.

-Rong


> Richard.
>
>> I just tested this implementation vs. simply setting to 1, using
>> google search as the benchmark.
>> This one is 4.5x faster. The test was done on Intel Westmere systems.
>>
>> I can change the parameter to an option.
>>
>> -Rong
>>
>>> Richard.
>>>
 Thanks,

 -Rong


Re: [PATCH] Conditional count update for fast coverage test in multi-threaded programs

2013-11-25 Thread Richard Biener
On Fri, Nov 22, 2013 at 10:49 PM, Rong Xu  wrote:
> On Fri, Nov 22, 2013 at 4:03 AM, Richard Biener
>  wrote:
>> On Fri, Nov 22, 2013 at 4:51 AM, Rong Xu  wrote:
>>> Hi,
>>>
>>> This patch injects a condition into the instrumented code for edge
>>> counter update. The counter value will not be updated after reaching
>>> value 1.
>>>
>>> The feature is under a new parameter --param=coverage-exec_once.
>>> Default is disabled and setting to 1 to enable.
>>>
>>> This extra check usually slows the program down. For SPEC 2006
>>> benchmarks (all single thread programs), we usually see around 20%-35%
>>> slow down in -O2 coverage build. This feature, however, is expected to
>>> improve the coverage run speed for multi-threaded programs, because
>>> there is virtually no data race and false sharing in updating counters.
>>> The improvement can be significant for highly threaded programs -- we
>>> are seeing 7x speedup in coverage test run for some non-trivial google
>>> applications.
>>>
>>> Tested with bootstrap.
>>
>> Err - why not simply emit
>>
>>   counter = 1
>>
>> for the counter update itself with that --param (I don't like a --param
>> for this either).
>>
>> I assume that CPUs can avoid data-races and false sharing for
>> non-changing accesses?
>>
>
> I'm not aware of any CPU having this feature. I think a write to the
> shared cache line to invalidate all the shared copies. I cannot find
> any reference on checking the value of the write. Do you have any
> pointer to the feature?

I don't have any pointer - but I remember seeing this in the context
of atomics thus it may be only in the context of using a xchg
or cmpxchg instruction.  Which would make it non-portable to
some extent (if you don't want to use atomic builtins here).

Richard.

> I just tested this implementation vs. simply setting to 1, using
> google search as the benchmark.
> This one is 4.5x faster. The test was done on Intel Westmere systems.
>
> I can change the parameter to an option.
>
> -Rong
>
>> Richard.
>>
>>> Thanks,
>>>
>>> -Rong


Re: [PATCH] Conditional count update for fast coverage test in multi-threaded programs

2013-11-22 Thread Rong Xu
On Fri, Nov 22, 2013 at 4:03 AM, Richard Biener
 wrote:
> On Fri, Nov 22, 2013 at 4:51 AM, Rong Xu  wrote:
>> Hi,
>>
>> This patch injects a condition into the instrumented code for edge
>> counter update. The counter value will not be updated after reaching
>> value 1.
>>
>> The feature is under a new parameter --param=coverage-exec_once.
>> Default is disabled and setting to 1 to enable.
>>
>> This extra check usually slows the program down. For SPEC 2006
>> benchmarks (all single thread programs), we usually see around 20%-35%
>> slow down in -O2 coverage build. This feature, however, is expected to
>> improve the coverage run speed for multi-threaded programs, because
>> there is virtually no data race and false sharing in updating counters.
>> The improvement can be significant for highly threaded programs -- we
>> are seeing 7x speedup in coverage test run for some non-trivial google
>> applications.
>>
>> Tested with bootstrap.
>
> Err - why not simply emit
>
>   counter = 1
>
> for the counter update itself with that --param (I don't like a --param
> for this either).
>
> I assume that CPUs can avoid data-races and false sharing for
> non-changing accesses?
>

I'm not aware of any CPU having this feature. I think a write to the
shared cache line to invalidate all the shared copies. I cannot find
any reference on checking the value of the write. Do you have any
pointer to the feature?

I just tested this implementation vs. simply setting to 1, using
google search as the benchmark.
This one is 4.5x faster. The test was done on Intel Westmere systems.

I can change the parameter to an option.

-Rong

> Richard.
>
>> Thanks,
>>
>> -Rong


Re: [PATCH] Conditional count update for fast coverage test in multi-threaded programs

2013-11-22 Thread Richard Biener
On Fri, Nov 22, 2013 at 4:51 AM, Rong Xu  wrote:
> Hi,
>
> This patch injects a condition into the instrumented code for edge
> counter update. The counter value will not be updated after reaching
> value 1.
>
> The feature is under a new parameter --param=coverage-exec_once.
> Default is disabled and setting to 1 to enable.
>
> This extra check usually slows the program down. For SPEC 2006
> benchmarks (all single thread programs), we usually see around 20%-35%
> slow down in -O2 coverage build. This feature, however, is expected to
> improve the coverage run speed for multi-threaded programs, because
> there is virtually no data race and false sharing in updating counters.
> The improvement can be significant for highly threaded programs -- we
> are seeing 7x speedup in coverage test run for some non-trivial google
> applications.
>
> Tested with bootstrap.

Err - why not simply emit

  counter = 1

for the counter update itself with that --param (I don't like a --param
for this either).

I assume that CPUs can avoid data-races and false sharing for
non-changing accesses?

Richard.

> Thanks,
>
> -Rong


[PATCH] Conditional count update for fast coverage test in multi-threaded programs

2013-11-21 Thread Rong Xu
Hi,

This patch injects a condition into the instrumented code for edge
counter update. The counter value will not be updated after reaching
value 1.

The feature is under a new parameter --param=coverage-exec_once.
Default is disabled and setting to 1 to enable.

This extra check usually slows the program down. For SPEC 2006
benchmarks (all single thread programs), we usually see around 20%-35%
slow down in -O2 coverage build. This feature, however, is expected to
improve the coverage run speed for multi-threaded programs, because
there is virtually no data race and false sharing in updating counters.
The improvement can be significant for highly threaded programs -- we
are seeing 7x speedup in coverage test run for some non-trivial google
applications.

Tested with bootstrap.

Thanks,

-Rong
2013-11-21  Rong Xu  

* gcc/doc/invoke.texi (coverage-exec_once): Add document.
* gcc/params.def (coverage-exec_once): New param.
* gcc/profile.h (gcov_type sum_edge_counts): Add extern decl.
* gcc/profile.c (branch_prob): Add support for coverage-exec_once.
* gcc/tree-profile.c (gimple_gen_edge_profiler): Ditto.
(gimple_gen_ior_profiler): Ditto.
(insert_if_then): Ditto.
(add_execonce_wrapper): Ditto.
(is_pred_instrumentation_candidate): Ditto.
(add_predicate_to_edge_counters): Ditto.
(cleanup_pred_instrumentation): Ditto.
(init_pred_instrumentation): Ditto.
(tree_profiling): Ditto.

Index: gcc/doc/invoke.texi
===
--- gcc/doc/invoke.texi (revision 205226)
+++ gcc/doc/invoke.texi (working copy)
@@ -9937,6 +9937,10 @@ The default choice depends on the target.
 Set the maximum number of existing candidates that will be considered when
 seeking a basis for a new straight-line strength reduction candidate.
 
+@item coverage-exec_once
+Set to 1 to update each arc counter only once. This avoids false sharing
+and speeds up profile-generate run for multi-threaded programs.
+
 @end table
 @end table
 
Index: gcc/params.def
===
--- gcc/params.def  (revision 205226)
+++ gcc/params.def  (working copy)
@@ -441,6 +441,14 @@ DEFPARAM(TRACER_MIN_BRANCH_PROBABILITY,
 "Stop forward growth if the probability of best edge is less than this 
threshold (in percent). Used when profile feedback is not available",
 50, 0, 100)
 
+/* This parameter enables fast coverage test. Once the counter flips from 0
+   to 1, we no longer updating the value . This avoids false sharing and speeds
+   up profile-generate run for multi-threaded programs.  */
+DEFPARAM (PARAM_COVERAGE_EXEC_ONCE,
+ "coverage-exec_once",
+ "Stop incrementing edge counts once they become 1.",
+ 0, 0, 1)
+
 /* The maximum number of incoming edges to consider for crossjumping.  */
 DEFPARAM(PARAM_MAX_CROSSJUMP_EDGES,
 "max-crossjump-edges",
Index: gcc/profile.c
===
--- gcc/profile.c   (revision 205226)
+++ gcc/profile.c   (working copy)
@@ -67,6 +67,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "cfgloop.h"
 #include "dumpfile.h"
 #include "cgraph.h"
+#include "params.h"
 
 #include "profile.h"
 
@@ -1327,6 +1328,9 @@ branch_prob (void)
 
   /* Commit changes done by instrumentation.  */
   gsi_commit_edge_inserts ();
+
+  if (PARAM_VALUE (PARAM_COVERAGE_EXEC_ONCE))
+add_predicate_to_edge_counters ();
 }
 
   free_aux_for_edges ();
Index: gcc/profile.h
===
--- gcc/profile.h   (revision 205226)
+++ gcc/profile.h   (working copy)
@@ -46,6 +46,9 @@ extern gcov_type sum_edge_counts (vec
 extern void init_node_map (bool);
 extern void del_node_map (void);
 
+/* Insert a predicate to edge counter update stmt.  */
+extern void add_predicate_to_edge_counters (void);
+
 extern void get_working_sets (void);
 
 /* In predict.c.  */
Index: gcc/tree-profile.c
===
--- gcc/tree-profile.c  (revision 205226)
+++ gcc/tree-profile.c  (working copy)
@@ -52,6 +52,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "target.h"
 #include "tree-cfgcleanup.h"
 #include "tree-nested.h"
+#include "params.h"
 
 static GTY(()) tree gcov_type_node;
 static GTY(()) tree tree_interval_profiler_fn;
@@ -67,6 +68,11 @@ static GTY(()) tree ic_void_ptr_var;
 static GTY(()) tree ic_gcov_type_ptr_var;
 static GTY(()) tree ptr_void;
 
+/* A pointer-set of the last statement in each block of statements that need to
+   be applied a predicate .  */
+static struct pointer_set_t *pred_instrumentation_candidate = NULL;
+
+
 /* Do initialization work for the edge profiler.  */
 
 /* Add code:
@@ -295,6 +301,10 @@ gimple_gen_edge_profiler (int edgeno, edge e)
   stmt2 = gimple_build_as