From: Peter Zijlstra <pet...@infradead.org>

There were two problems with the dynamic interrupt throttle mechanism,
both triggered by the same action.

When you (or perf_fuzzer) write a huge value into
/proc/sys/kernel/perf_event_max_sample_rate, the computed
perf_sample_allowed_ns becomes 0. This effectively disables the whole
dynamic throttle.
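
To illustrate, here is a rough standalone userspace sketch of the old
limit arithmetic (not part of the patch; the value 25 is an assumption
matching the default perf_cpu_time_max_percent):

	/* why a huge sample rate zeroes perf_sample_allowed_ns */
	#include <stdio.h>
	#include <stdint.h>

	#define NSEC_PER_SEC 1000000000ULL

	int main(void)
	{
		uint64_t max_sample_rate = 1000000000ULL; /* huge sysctl value */
		uint64_t percent = 25;  /* assumed default cpu_time_max_percent */

		uint64_t period_ns = NSEC_PER_SEC / max_sample_rate;     /* 1 ns */
		uint64_t allowed_ns = period_ns * percent / 100;   /* 25/100 == 0 */

		/* 0 is treated as "throttle disabled" by perf_sample_event_took() */
		printf("perf_sample_allowed_ns would be %llu\n",
		       (unsigned long long)allowed_ns);
		return 0;
	}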

This is fixed by ensuring update_perf_cpu_limits() never sets the
value to 0. However, we still allow disabling the dynamic throttle by
writing 100 to /proc/sys/kernel/perf_cpu_time_max_percent. Doing so
generates a warning in dmesg.

The second problem is that after setting max_sample_rate to a huge
number, the adaptive process can take a number of tries to converge,
since it only halves the limit each time. Change it to directly compute
a new limit based on the observed duration.
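
As a rough standalone sketch of the new one-shot computation (again not
part of the patch; HZ=1000 and perf_cpu_time_max_percent=25 are assumed
for the worked numbers):

	/* recompute the limit directly from the observed duration */
	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t tick_nsec = 1000000ULL; /* assumed 1 ms tick (HZ=1000) */
		uint32_t percent = 25;           /* assumed cpu_time_max_percent */
		uint64_t avg_len = 100000ULL;    /* observed avg NMI duration: 100 us */
		uint64_t budget;
		uint32_t max_samples;

		/* Aim 25% below the observed cost, then fit it in the per-tick budget. */
		avg_len += avg_len / 4;                 /* 125000 ns */
		budget = (tick_nsec / 100) * percent;   /* 250000 ns per tick */
		max_samples = avg_len < budget ? (uint32_t)(budget / avg_len) : 1;

		/* One step: 2 samples/tick -> new max_sample_rate = 2 * HZ = 2000,
		 * instead of repeatedly halving the old limit until it fits. */
		printf("max_samples_per_tick = %u\n", max_samples);
		return 0;
	}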

Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
Cc: Alexander Shishkin <alexander.shish...@linux.intel.com>
Cc: Andy Lutomirski <l...@amacapital.net>
Cc: Arnaldo Carvalho de Melo <a...@redhat.com>
Cc: Borislav Petkov <b...@alien8.de>
Cc: Brian Gerst <brge...@gmail.com>
Cc: David Ahern <dsah...@gmail.com>
Cc: Denys Vlasenko <dvlas...@redhat.com>
Cc: H. Peter Anvin <h...@zytor.com>
Cc: Jiri Olsa <jo...@redhat.com>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Namhyung Kim <namhy...@kernel.org>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Stephane Eranian <eran...@google.com>
Cc: Thomas Gleixner <t...@linutronix.de>
Cc: Vince Weaver <vincent.wea...@maine.edu>
Signed-off-by: Ingo Molnar <mi...@kernel.org>
Signed-off-by: Tan Xiaojun <tanxiao...@huawei.com>
---
 kernel/events/core.c | 87 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 51 insertions(+), 36 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6da64f0..3089a004 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -198,8 +198,11 @@ void update_perf_cpu_limits(void)
        u64 tmp = perf_sample_period_ns;
 
        tmp *= sysctl_perf_cpu_time_max_percent;
-       do_div(tmp, 100);
-       ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
+       tmp = div_u64(tmp, 100);
+       if (!tmp)
+               tmp = 1;
+
+       WRITE_ONCE(perf_sample_allowed_ns, tmp);
 }
 
 static int perf_rotate_context(struct perf_cpu_context *cpuctx);
@@ -231,7 +234,13 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
        if (ret || !write)
                return ret;
 
-       update_perf_cpu_limits();
+       if (sysctl_perf_cpu_time_max_percent == 100) {
+               printk(KERN_WARNING
+                      "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
+               WRITE_ONCE(perf_sample_allowed_ns, 0);
+       } else {
+               update_perf_cpu_limits();
+       }
 
        return 0;
 }
@@ -245,62 +254,68 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 #define NR_ACCUMULATED_SAMPLES 128
 static DEFINE_PER_CPU(u64, running_sample_length);
 
+static u64 __report_avg;
+static u64 __report_allowed;
+
 static void perf_duration_warn(struct irq_work *w)
 {
-       u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
-       u64 avg_local_sample_len;
-       u64 local_samples_len;
-
-       local_samples_len = __this_cpu_read(running_sample_length);
-       avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
-
        printk_ratelimited(KERN_WARNING
-                       "perf interrupt took too long (%lld > %lld), lowering "
-                       "kernel.perf_event_max_sample_rate to %d\n",
-                       avg_local_sample_len, allowed_ns >> 1,
-                       sysctl_perf_event_sample_rate);
+               "perf: interrupt took too long (%lld > %lld), lowering "
+               "kernel.perf_event_max_sample_rate to %d\n",
+               __report_avg, __report_allowed,
+               sysctl_perf_event_sample_rate);
 }
 
 static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
 
 void perf_sample_event_took(u64 sample_len_ns)
 {
-       u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
-       u64 avg_local_sample_len;
-       u64 local_samples_len;
+       u64 max_len = READ_ONCE(perf_sample_allowed_ns);
+       u64 running_len;
+       u64 avg_len;
+       u32 max;
 
-       if (allowed_ns == 0)
+       if (max_len == 0)
                return;
 
-       /* decay the counter by 1 average sample */
-       local_samples_len = __this_cpu_read(running_sample_length);
-       local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
-       local_samples_len += sample_len_ns;
-       __this_cpu_write(running_sample_length, local_samples_len);
+       /* Decay the counter by 1 average sample. */
+       running_len = __this_cpu_read(running_sample_length);
+       running_len -= running_len/NR_ACCUMULATED_SAMPLES;
+       running_len += sample_len_ns;
+       __this_cpu_write(running_sample_length, running_len);
 
        /*
-        * note: this will be biased artifically low until we have
-        * seen NR_ACCUMULATED_SAMPLES.  Doing it this way keeps us
+        * Note: this will be biased artifically low until we have
+        * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
         * from having to maintain a count.
         */
-       avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
-
-       if (avg_local_sample_len <= allowed_ns)
+       avg_len = running_len/NR_ACCUMULATED_SAMPLES;
+       if (avg_len <= max_len)
                return;
 
-       if (max_samples_per_tick <= 1)
-               return;
+       __report_avg = avg_len;
+       __report_allowed = max_len;
 
-       max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
-       sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
-       perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+       /*
+        * Compute a throttle threshold 25% below the current duration.
+        */
+       avg_len += avg_len / 4;
+       max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
+       if (avg_len < max)
+               max /= (u32)avg_len;
+       else
+               max = 1;
 
-       update_perf_cpu_limits();
+       WRITE_ONCE(perf_sample_allowed_ns, avg_len);
+       WRITE_ONCE(max_samples_per_tick, max);
+
+       sysctl_perf_event_sample_rate = max * HZ;
+       perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 
        if (!irq_work_queue(&perf_duration_work)) {
-               early_printk("perf interrupt took too long (%lld > %lld), lowering "
+               early_printk("perf: interrupt took too long (%lld > %lld), lowering "
                             "kernel.perf_event_max_sample_rate to %d\n",
-                            avg_local_sample_len, allowed_ns >> 1,
+                            __report_avg, __report_allowed,
                             sysctl_perf_event_sample_rate);
        }
 }
-- 
1.9.1
