On Thu, Jan 11, 2018 at 10:13:53AM +0100, Peter Zijlstra wrote:
> On Tue, Jan 09, 2018 at 08:44:23AM -0500, Vince Weaver wrote:
> > On Tue, 9 Jan 2018, Peter Zijlstra wrote:
> > 
> > > So remind me again, how are you running that fuzzer? I'm running
> > > ./fast_repro99.sh as root.
> > 
> > I'm running ./fast_repro98.sh on a regular haswell machine with paranoid 
> > set to "0".
> 
> I'm seeing things like:
> 
> Cannot open /sys/kernel/tracing/kprobe_events
> 
> this is likely caused by me not having anything mounted there. Rostedt
> provided the magic incantation to make that work, I'll go try now.
> 
> At least I seem to have cured all lockdep splats.. of course, the moment
> I hit send on this something bad is bound to happen.

So while the IVB-EP (6/62/4) goes wobbly after a while, I also started
fuzzing on my HSW-EX (6/63/4) and there it runs without issue for over
30 minutes.

(I get an unchecked MSR splat from the RAPL pmu, but that is non-fatal)

But so far no weird hangs.. running tip/master + below.

Sadly the only other HSW I have is a laptop, and those are absolutely
useless for debugging (no serial :/) but I'll go see if I can reproduce.

FWIW these test-boxes are running debian/testing

---
 Documentation/admin-guide/kernel-parameters.txt |   5 +
 arch/x86/events/intel/ds.c                      |  33 ++++---
 kernel/events/core.c                            |  47 +++++++++-
 kernel/locking/lockdep.c                        |  10 ++
 kernel/printk/printk.c                          | 119 +++++++++++++++++-------
 kernel/smp.c                                    |  10 ++
 6 files changed, 171 insertions(+), 53 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 8bc512782bd2..7fa1a59461f7 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1169,6 +1169,11 @@
                        parameter will force ia64_sal_cache_flush to call
                        ia64_pal_cache_flush instead of SAL_CACHE_FLUSH.
 
+       force_early_printk
+                       Forcefully use the early console (as set up by
+                       earlyprintk=) for regular printk, bypassing everything,
+                       including the syslog (dmesg will be empty).
+
        forcepae [X86-32]
                        Forcefully enable Physical Address Extension (PAE).
                        Many Pentium M systems disable PAE but may have a
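
To actually use the new parameter, boot with something along the lines of
the below; the earlyprintk= part is only an example and needs to point at
whatever console the machine really has (force_early_printk does nothing
without an early console):

  earlyprintk=serial,ttyS0,115200 force_early_printk
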
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 8156e47da7ba..18c25ab28557 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -372,10 +372,9 @@ static int alloc_pebs_buffer(int cpu)
 static void release_pebs_buffer(int cpu)
 {
        struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
-       struct debug_store *ds = hwev->ds;
        void *cea;
 
-       if (!ds || !x86_pmu.pebs)
+       if (!x86_pmu.pebs)
                return;
 
        kfree(per_cpu(insn_buffer, cpu));
@@ -384,7 +383,6 @@ static void release_pebs_buffer(int cpu)
        /* Clear the fixmap */
        cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
        ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
-       ds->pebs_buffer_base = 0;
        dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
        hwev->ds_pebs_vaddr = NULL;
 }
@@ -419,16 +417,14 @@ static int alloc_bts_buffer(int cpu)
 static void release_bts_buffer(int cpu)
 {
        struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
-       struct debug_store *ds = hwev->ds;
        void *cea;
 
-       if (!ds || !x86_pmu.bts)
+       if (!x86_pmu.bts)
                return;
 
        /* Clear the fixmap */
        cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
        ds_clear_cea(cea, BTS_BUFFER_SIZE);
-       ds->bts_buffer_base = 0;
        dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
        hwev->ds_bts_vaddr = NULL;
 }
@@ -454,16 +450,22 @@ void release_ds_buffers(void)
        if (!x86_pmu.bts && !x86_pmu.pebs)
                return;
 
-       get_online_cpus();
-       for_each_online_cpu(cpu)
+       for_each_possible_cpu(cpu)
+               release_ds_buffer(cpu);
+
+       for_each_possible_cpu(cpu) {
+               /*
+                * Again, ignore errors from offline CPUs; they no longer
+                * observe cpu_hw_events.ds and will not program the DS_AREA
+                * when they come up.
+                */
                fini_debug_store_on_cpu(cpu);
+       }
 
        for_each_possible_cpu(cpu) {
                release_pebs_buffer(cpu);
                release_bts_buffer(cpu);
-               release_ds_buffer(cpu);
        }
-       put_online_cpus();
 }
 
 void reserve_ds_buffers(void)
@@ -483,8 +485,6 @@ void reserve_ds_buffers(void)
        if (!x86_pmu.pebs)
                pebs_err = 1;
 
-       get_online_cpus();
-
        for_each_possible_cpu(cpu) {
                if (alloc_ds_buffer(cpu)) {
                        bts_err = 1;
@@ -521,11 +521,14 @@ void reserve_ds_buffers(void)
                if (x86_pmu.pebs && !pebs_err)
                        x86_pmu.pebs_active = 1;
 
-               for_each_online_cpu(cpu)
+               for_each_possible_cpu(cpu) {
+                       /*
+                        * Ignore wrmsr_on_cpu() errors for offline CPUs; they
+                        * will get this call through intel_pmu_cpu_starting().
+                        */
                        init_debug_store_on_cpu(cpu);
+               }
        }
-
-       put_online_cpus();
 }
 
 /*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4e1a1bf8d867..fb079be8971a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1231,6 +1231,10 @@ static void put_ctx(struct perf_event_context *ctx)
  *           perf_event_context::lock
  *         perf_event::mmap_mutex
  *         mmap_sem
+ *
+ *    cpu_hotplug_lock
+ *      pmus_lock
+ *       cpuctx->mutex / perf_event_context::mutex
  */
 static struct perf_event_context *
 perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
@@ -4196,6 +4200,7 @@ int perf_event_release_kernel(struct perf_event *event)
 {
        struct perf_event_context *ctx = event->ctx;
        struct perf_event *child, *tmp;
+       LIST_HEAD(free_list);
 
        /*
         * If we got here through err_file: fput(event_file); we will not have
@@ -4268,8 +4273,7 @@ int perf_event_release_kernel(struct perf_event *event)
                                               struct perf_event, child_list);
                if (tmp == child) {
                        perf_remove_from_context(child, DETACH_GROUP);
-                       list_del(&child->child_list);
-                       free_event(child);
+                       list_move(&child->child_list, &free_list);
                        /*
                         * This matches the refcount bump in inherit_event();
                         * this can't be the last reference.
@@ -4284,6 +4288,11 @@ int perf_event_release_kernel(struct perf_event *event)
        }
        mutex_unlock(&event->child_mutex);
 
+       list_for_each_entry_safe(child, tmp, &free_list, child_list) {
+               list_del(&child->child_list);
+               free_event(child);
+       }
+
 no_ctx:
        put_event(event); /* Must be the 'last' reference */
        return 0;
@@ -8526,6 +8535,29 @@ perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
        return ret;
 }
 
+static int
+perf_tracepoint_set_filter(struct perf_event *event, char *filter_str)
+{
+       struct perf_event_context *ctx = event->ctx;
+       int ret;
+
+       /*
+        * Beware, here be dragons!!
+        *
+        * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint
+        * stuff does not actually need it. So temporarily drop ctx->mutex. As per
+        * perf_event_ctx_lock() we already have a reference on ctx.
+        *
+        * This can result in event getting moved to a different ctx, but that
+        * does not affect the tracepoint state.
+        */
+       mutex_unlock(&ctx->mutex);
+       ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
+       mutex_lock(&ctx->mutex);
+
+       return ret;
+}
+
 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
 {
        char *filter_str;
@@ -8542,8 +8574,7 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg)
 
        if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
            event->attr.type == PERF_TYPE_TRACEPOINT)
-               ret = ftrace_profile_set_filter(event, event->attr.config,
-                                               filter_str);
+               ret = perf_tracepoint_set_filter(event, filter_str);
        else if (has_addr_filter(event))
                ret = perf_event_set_addr_filter(event, filter_str);
 
@@ -9178,7 +9209,13 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
        if (!try_module_get(pmu->module))
                return -ENODEV;
 
-       if (event->group_leader != event) {
+       /*
+        * A number of pmu->event_init() methods iterate the sibling_list to,
+        * for example, validate if the group fits on the PMU. Therefore,
+        * if this is a sibling event, acquire the ctx->mutex to protect
+        * the sibling_list.
+        */
+       if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
                /*
                 * This ctx->mutex can nest when we're called through
                 * inheritance. See the perf_event_ctx_lock_nested() comment.
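
Aside: the free_list bit in perf_event_release_kernel() above is the usual
"unlink under the lock, free after dropping it" pattern, so the actual
free_event() calls no longer happen under child_mutex. A minimal userspace
sketch of the same idea (hypothetical node type, a pthread mutex standing in
for the kernel locking; illustration only, not kernel code):

#include <pthread.h>
#include <stdlib.h>

struct node {
	struct node *next;
	/* payload ... */
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *children;

static void release_all(void)
{
	struct node *free_list = NULL, *n;

	pthread_mutex_lock(&list_lock);
	while ((n = children)) {
		children = n->next;	/* unlink under the lock */
		n->next = free_list;	/* move to the private list */
		free_list = n;
	}
	pthread_mutex_unlock(&list_lock);

	while ((n = free_list)) {	/* free without holding the lock */
		free_list = n->next;
		free(n);
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct node *n = calloc(1, sizeof(*n));

		if (!n)
			break;
		n->next = children;
		children = n;
	}
	release_all();
	return 0;
}
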
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 5fa1324a4f29..fb81dc92aaa6 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2668,8 +2668,12 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
                                        this, 0, irqclass);
 }
 
+DECLARE_PER_CPU(struct cpumask *, smp_call_mask);
+
 void print_irqtrace_events(struct task_struct *curr)
 {
+       struct cpumask *__mask = this_cpu_read(smp_call_mask);
+
        printk("irq event stamp: %u\n", curr->irq_events);
        printk("hardirqs last  enabled at (%u): [<%p>] %pS\n",
                curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip,
@@ -2683,6 +2687,12 @@ void print_irqtrace_events(struct task_struct *curr)
        printk("softirqs last disabled at (%u): [<%p>] %pS\n",
                curr->softirq_disable_event, (void *)curr->softirq_disable_ip,
                (void *)curr->softirq_disable_ip);
+
+       if (__mask) {
+               printk("smp_function_call mask: %*pbl\n", cpumask_pr_args(__mask));
+       } else {
+               printk("smp_function_call mask: (NULL)\n");
+       }
 }
 
 static int HARDIRQ_verbose(struct lock_class *class)
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index b9006617710f..c1a4ed17c22c 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -365,6 +365,75 @@ __packed __aligned(4)
 #endif
 ;
 
+#ifdef CONFIG_EARLY_PRINTK
+struct console *early_console;
+
+static bool __read_mostly force_early_printk;
+
+static int __init force_early_printk_setup(char *str)
+{
+       force_early_printk = true;
+       return 0;
+}
+early_param("force_early_printk", force_early_printk_setup);
+
+static int early_printk_cpu = -1;
+
+static int early_vprintk(const char *fmt, va_list args)
+{
+       int n, cpu, old;
+       char buf[512];
+
+       cpu = get_cpu();
+       /*
+        * Test-and-Set inter-cpu spinlock with recursion.
+        */
+       for (;;) {
+               /*
+                * c-cas to avoid the exclusive bouncing on spin.
+                * Depends on the memory barrier implied by cmpxchg
+                * for ACQUIRE semantics.
+                */
+               old = READ_ONCE(early_printk_cpu);
+               if (old == -1) {
+                       old = cmpxchg(&early_printk_cpu, -1, cpu);
+                       if (old == -1)
+                               break;
+               }
+               /*
+                * Allow recursion for interrupts and the like.
+                */
+               if (old == cpu)
+                       break;
+
+               cpu_relax();
+       }
+
+       n = vscnprintf(buf, sizeof(buf), fmt, args);
+       early_console->write(early_console, buf, n);
+
+       /*
+        * Unlock -- in case @old == @cpu, this is a no-op.
+        */
+       smp_store_release(&early_printk_cpu, old);
+       put_cpu();
+
+       return n;
+}
+
+asmlinkage __visible void early_printk(const char *fmt, ...)
+{
+       va_list ap;
+
+       if (!early_console)
+               return;
+
+       va_start(ap, fmt);
+       early_vprintk(fmt, ap);
+       va_end(ap);
+}
+#endif
+
 /*
  * The logbuf_lock protects kmsg buffer, indices, counters.  This can be taken
  * within the scheduler's rq lock. It must be released before calling
@@ -1692,6 +1761,16 @@ asmlinkage int vprintk_emit(int facility, int level,
        int printed_len;
        bool in_sched = false;
 
+#ifdef CONFIG_KGDB_KDB
+       if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0))
+               return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
+#endif
+
+#ifdef CONFIG_EARLY_PRINTK
+       if (force_early_printk && early_console)
+               return early_vprintk(fmt, args);
+#endif
+
        if (level == LOGLEVEL_SCHED) {
                level = LOGLEVEL_DEFAULT;
                in_sched = true;
@@ -1784,18 +1863,7 @@ EXPORT_SYMBOL(printk_emit);
 
 int vprintk_default(const char *fmt, va_list args)
 {
-       int r;
-
-#ifdef CONFIG_KGDB_KDB
-       /* Allow to pass printk() to kdb but avoid a recursion. */
-       if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) {
-               r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
-               return r;
-       }
-#endif
-       r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
-
-       return r;
+       return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
 }
 EXPORT_SYMBOL_GPL(vprintk_default);
 
@@ -1826,7 +1894,12 @@ asmlinkage __visible int printk(const char *fmt, ...)
        int r;
 
        va_start(args, fmt);
-       r = vprintk_func(fmt, args);
+#ifdef CONFIG_EARLY_PRINTK
+       if (force_early_printk && early_console)
+               r = vprintk_default(fmt, args);
+       else
+#endif
+               r = vprintk_func(fmt, args);
        va_end(args);
 
        return r;
@@ -1863,26 +1936,6 @@ static bool suppress_message_printing(int level) { return false; }
 
 #endif /* CONFIG_PRINTK */
 
-#ifdef CONFIG_EARLY_PRINTK
-struct console *early_console;
-
-asmlinkage __visible void early_printk(const char *fmt, ...)
-{
-       va_list ap;
-       char buf[512];
-       int n;
-
-       if (!early_console)
-               return;
-
-       va_start(ap, fmt);
-       n = vscnprintf(buf, sizeof(buf), fmt, ap);
-       va_end(ap);
-
-       early_console->write(early_console, buf, n);
-}
-#endif
-
 static int __add_preferred_console(char *name, int idx, char *options,
                                   char *brl_options)
 {
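
For reference, the locking in early_vprintk() above boils down to the pattern
below; this is only a userspace sketch with C11 atomics, where a caller
supplied id stands in for smp_processor_id(), not kernel code:

#include <stdatomic.h>
#include <stdio.h>

/* -1 means unlocked; otherwise the id of the current owner */
static atomic_int owner = -1;

static void emit(int self, const char *msg)
{
	int old;

	for (;;) {
		/* check before the cmpxchg to avoid bouncing the line */
		old = atomic_load_explicit(&owner, memory_order_relaxed);
		if (old == -1 &&
		    atomic_compare_exchange_strong(&owner, &old, self))
			break;		/* acquired; old == -1 */
		if (old == self)
			break;		/* recursion; we already own it */
	}

	fprintf(stderr, "[%d] %s\n", self, msg);

	/* restore the previous owner; a no-op when old == self */
	atomic_store_explicit(&owner, old, memory_order_release);
}

int main(void)
{
	emit(0, "hello");
	return 0;
}
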
diff --git a/kernel/smp.c b/kernel/smp.c
index 084c8b3a2681..d014602e13f7 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -257,6 +257,8 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
        irq_work_run();
 }
 
+DEFINE_PER_CPU(const struct cpumask *, smp_call_mask);
+
 /*
  * smp_call_function_single - Run a function on a specific CPU
  * @func: The function to run. This must be fast and non-blocking.
@@ -296,11 +298,15 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
                csd_lock(csd);
        }
 
+       this_cpu_write(smp_call_mask, cpumask_of(cpu));
+
        err = generic_exec_single(cpu, csd, func, info);
 
        if (wait)
                csd_lock_wait(csd);
 
+       this_cpu_write(smp_call_mask, NULL);
+
        put_cpu();
 
        return err;
@@ -457,6 +463,8 @@ void smp_call_function_many(const struct cpumask *mask,
                        __cpumask_set_cpu(cpu, cfd->cpumask_ipi);
        }
 
+       this_cpu_write(smp_call_mask, cfd->cpumask_ipi);
+
        /* Send a message to all CPUs in the map */
        arch_send_call_function_ipi_mask(cfd->cpumask_ipi);
 
@@ -468,6 +476,8 @@ void smp_call_function_many(const struct cpumask *mask,
                        csd_lock_wait(csd);
                }
        }
+
+       this_cpu_write(smp_call_mask, NULL);
 }
 EXPORT_SYMBOL(smp_call_function_many);
 
