On Wed, Oct 16, 2013 at 03:31:25PM +0200, Peter Zijlstra wrote:
> On Wed, Oct 16, 2013 at 08:46:49AM -0400, Don Zickus wrote:
> > On Wed, Oct 16, 2013 at 12:57:55PM +0200, Peter Zijlstra wrote:
> > > A prettier patch below. The main difference is on-demand allocation of
> > > the scratch buffer.
> > 
> > I'll see if I can sanity test this in the next couple hours.
> > 
> > Further testing yesterday showed that intel_pmu_drain_pebs_nhm still
> > has long latencies somewhere.  With 15 minute reboots, isolation goes
> > slooow.
> 
> Pick a smaller box? I seem to be able to reproduce on my wsm-ep, which
> boots inside a minute :-)
> 
> root@westmere:~# cd /debug/tracing/
> root@westmere:/debug/tracing# echo function > current_tracer
> root@westmere:/debug/tracing# cat available_filter_functions | grep ^inat > set_ftrace_notrace
> root@westmere:/debug/tracing# cat available_filter_functions | grep ^insn | grep -v get_length >> set_ftrace_notrace
> 
> Run: perf top --stdio -e 'cycles:pp' in another window and when the
> console output shows:
> 
> [  610.319486] perf samples too long (19310 > 19230), lowering kernel.perf_event_max_sample_rate to 7000
> 
> quickly press enter here:

BTW, you can also replace this bit of manual intervention with
something like the patch below:

There are 3 changes:
 - changed atomic_t into a regular int; there's nothing atomic about
   atomic_set() vs atomic_read(), so atomic_t is pointless (see the
   sketch right after this list)
 - made perf_proc_update_handler() clear the running_sample_length
   state
 - added if (avg_local_sample_len > 30000) tracing_off()
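To illustrate the first change, here's a minimal sketch (hypothetical
names, not part of the patch): a variable that is only ever written
with atomic_set() and read with atomic_read() has no read-modify-write
to protect, so a plain int with ACCESS_ONCE() provides the same
single-copy atomicity:

  /* hypothetical example; limit_ns is made up for illustration */
  static int limit_ns __read_mostly;

  static void limit_update(int val)
  {
          /* plain store; ACCESS_ONCE() only keeps the compiler from
           * tearing or caching the access, which is all that
           * atomic_set() on an atomic_t bought us here */
          ACCESS_ONCE(limit_ns) = val;
  }

  static int limit_read(void)
  {
          /* plain load, re-read from memory on each call */
          return ACCESS_ONCE(limit_ns);
  }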

Of course you should tweak the 30000 to match whatever value you're
interested in. tracing_off() does the same as:

  echo 0 > tracing_on

And it avoids being too late and having already lost the trace buffer
content you were after.
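
As a sketch of that pattern (hypothetical helper name, same 30000 ns
average-sample-time threshold as the patch; tracing_off() itself is
real and declared in <linux/kernel.h> when CONFIG_TRACING=y):

  /* hypothetical helper, for illustration only */
  static void check_sample_latency(u64 len_ns)
  {
          /* freeze the ftrace ring buffer right where the latency
           * shows up, instead of racing a shell prompt to write
           * 0 into tracing_on */
          if (len_ns > 30000)
                  tracing_off();
  }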

---
 kernel/events/core.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index c716385f6483..ea787d0d0e78 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -175,8 +175,10 @@ int sysctl_perf_event_sample_rate __read_mostly    = DEFAULT_MAX_SAMPLE_RATE;
 static int max_samples_per_tick __read_mostly  = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
 static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
 
-static atomic_t perf_sample_allowed_ns __read_mostly =
-       ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100);
+static int perf_sample_allowed_ns __read_mostly =
+       DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
+
+static DEFINE_PER_CPU(u64, running_sample_length);
 
 void update_perf_cpu_limits(void)
 {
@@ -184,7 +186,7 @@ void update_perf_cpu_limits(void)
 
        tmp *= sysctl_perf_cpu_time_max_percent;
        do_div(tmp, 100);
-       atomic_set(&perf_sample_allowed_ns, tmp);
+       ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
 }
 
 static int perf_rotate_context(struct perf_cpu_context *cpuctx);
@@ -194,6 +196,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
                loff_t *ppos)
 {
        int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+       int cpu;
 
        if (ret || !write)
                return ret;
@@ -202,6 +205,9 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
        perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
        update_perf_cpu_limits();
 
+       for_each_possible_cpu(cpu)
+               per_cpu(running_sample_length, cpu) = 0;
+
        return 0;
 }
 
@@ -228,14 +234,13 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
  * we detect that events are taking too long.
  */
 #define NR_ACCUMULATED_SAMPLES 128
-DEFINE_PER_CPU(u64, running_sample_length);
 
 void perf_sample_event_took(u64 sample_len_ns)
 {
        u64 avg_local_sample_len;
        u64 local_samples_len;
 
-       if (atomic_read(&perf_sample_allowed_ns) == 0)
+       if (ACCESS_ONCE(perf_sample_allowed_ns) == 0)
                return;
 
        /* decay the counter by 1 average sample */
@@ -251,12 +256,15 @@ void perf_sample_event_took(u64 sample_len_ns)
         */
        avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
 
-       if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns))
+       if (avg_local_sample_len <= ACCESS_ONCE(perf_sample_allowed_ns))
                return;
 
        if (max_samples_per_tick <= 1)
                return;
 
+       if (avg_local_sample_len > 30000)
+               tracing_off();
+
        max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
        sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
        perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
@@ -265,7 +273,7 @@ void perf_sample_event_took(u64 sample_len_ns)
                        "perf samples too long (%lld > %d), lowering "
                        "kernel.perf_event_max_sample_rate to %d\n",
                        avg_local_sample_len,
-                       atomic_read(&perf_sample_allowed_ns),
+                       ACCESS_ONCE(perf_sample_allowed_ns),
                        sysctl_perf_event_sample_rate);
 
        update_perf_cpu_limits();
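
For reference, the running average the patch resets works like this
(reconstructed from the context lines above; this is the existing
perf_sample_event_took() bookkeeping, not new code):

  /* the per-cpu sum holds ~NR_ACCUMULATED_SAMPLES worth of history:
   * decay the sum by one average sample, then fold in the new one */
  local_samples_len = __get_cpu_var(running_sample_length);
  local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
  local_samples_len += sample_len_ns;
  __get_cpu_var(running_sample_length) = local_samples_len;

  avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;

That retained history is why the second change zeroes
running_sample_length when the sysctl is rewritten: otherwise the stale
average could trip the limit again right after the user raised the
sample rate.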