On Fri, Jan 16, 2015 at 11:46:44AM +0100, Peter Zijlstra wrote:
> It's a band-aid at best :/ The problem is (again) that we change
> event->ctx without any kind of serialization.
> 
> The issue came up before:
> 
>   https://lkml.org/lkml/2014/9/5/397
> 
> and I've not been able to come up with anything much saner.

A little something like the below is the best I could come up with; I
know Linus hated it, but I figure we ought to do something to stop
crashing.
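
For reference, the intended pattern is roughly the following (an
illustrative sketch only, not part of the patch below; the example_*
helpers are made up, and the ctx->mutex handling of the real paths is
mostly elided):

/* set up once at init time; the patch does percpu_init_rwsem(&perf_rwsem) */
static struct percpu_rw_semaphore perf_rwsem;

/* read side: any syscall-context path that dereferences event->ctx */
static void example_reader(struct perf_event *event)
{
	struct perf_event_context *ctx;

	percpu_down_read(&perf_rwsem);
	ctx = event->ctx;	/* stable until percpu_up_read() */
	/* ... take ctx->lock, look at ctx->task, etc. ... */
	percpu_up_read(&perf_rwsem);
}

/* write side: the move_group and perf_pmu_migrate_context paths */
static void example_migrate(struct perf_event *event,
			    struct perf_event_context *dst_ctx)
{
	percpu_down_write(&perf_rwsem);
	perf_remove_from_context(event, false);
	mutex_lock(&dst_ctx->mutex);
	/* perf_install_in_context() is where event->ctx gets rewritten */
	perf_install_in_context(dst_ctx, event, event->cpu);
	mutex_unlock(&dst_ctx->mutex);
	percpu_up_write(&perf_rwsem);
}

The ugly part, as the XXX comment in perf_event_for_each() below notes, is
that ctx->mutex then has to nest inside perf_rwsem while the ioctl child
iteration wants to take perf_rwsem again under ctx->mutex.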



---
 init/Kconfig         |    1 +
 kernel/events/core.c |  126 ++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 103 insertions(+), 24 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index 9afb971..ebc8522 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1595,6 +1595,7 @@ config PERF_EVENTS
        depends on HAVE_PERF_EVENTS
        select ANON_INODES
        select IRQ_WORK
+       select PERCPU_RWSEM
        help
          Enable kernel support for various performance events provided
          by software and hardware.
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c10124b..fb3971d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -42,6 +42,7 @@
 #include <linux/module.h>
 #include <linux/mman.h>
 #include <linux/compat.h>
+#include <linux/percpu-rwsem.h>
 
 #include "internal.h"
 
@@ -122,6 +123,42 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
        return data.ret;
 }
 
+/*
+ * Required to migrate events between contexts.
+ *
+ * Migrating events between contexts is rather tricky; there is no real
+ * serialization around the perf_event::ctx pointer.
+ *
+ * So what we do is hold this rwsem for writing across the
+ * remove_from_context and install_in_context calls. The remove ensures the
+ * event is inactive and will no longer be used from IRQ/NMI context, while
+ * all remaining sites that dereference event->ctx take the rwsem read side.
+ */
+static struct percpu_rw_semaphore perf_rwsem;
+
+static inline struct perf_event_context *perf_event_ctx(struct perf_event *event)
+{
+#ifdef CONFIG_LOCKDEP
+       /*
+        * Assert the locking rules outlined above; in order to dereference
+        * event->ctx we must either be attached to the context or hold
+        * perf_rwsem.
+        *
+        * XXX not usable from IPIs because the lockdep held lock context
+        * will be wrong; maybe add trylock variants to the percpu_rw_semaphore
+        */
+       WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_CONTEXT) &&
+                    (debug_locks && !lockdep_is_held(&perf_rwsem.rw_sem)));
+#endif
+
+       return event->ctx;
+}
+
+static inline struct perf_event_context *__perf_event_ctx(struct perf_event *event)
+{
+       return event->ctx;
+}
+
 #define EVENT_OWNER_KERNEL ((void *) -1)
 
 static bool is_kernel_event(struct perf_event *event)
@@ -380,7 +417,7 @@ perf_cgroup_from_task(struct task_struct *task)
 static inline bool
 perf_cgroup_match(struct perf_event *event)
 {
-       struct perf_event_context *ctx = event->ctx;
+       struct perf_event_context *ctx = __perf_event_ctx(event);
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 
        /* @event doesn't care about cgroup */
@@ -1054,7 +1091,7 @@ static void update_context_time(struct perf_event_context *ctx)
 
 static u64 perf_event_time(struct perf_event *event)
 {
-       struct perf_event_context *ctx = event->ctx;
+       struct perf_event_context *ctx = __perf_event_ctx(event);
 
        if (is_cgroup_event(event))
                return perf_cgroup_event_time(event);
@@ -1068,7 +1105,7 @@ static u64 perf_event_time(struct perf_event *event)
  */
 static void update_event_times(struct perf_event *event)
 {
-       struct perf_event_context *ctx = event->ctx;
+       struct perf_event_context *ctx = __perf_event_ctx(event);
        u64 run_end;
 
        if (event->state < PERF_EVENT_STATE_INACTIVE ||
@@ -1518,7 +1555,7 @@ static int __perf_remove_from_context(void *info)
 {
        struct remove_event *re = info;
        struct perf_event *event = re->event;
-       struct perf_event_context *ctx = event->ctx;
+       struct perf_event_context *ctx = __perf_event_ctx(event);
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 
        raw_spin_lock(&ctx->lock);
@@ -1551,7 +1588,7 @@ static int __perf_remove_from_context(void *info)
  */
 static void perf_remove_from_context(struct perf_event *event, bool detach_group)
 {
-       struct perf_event_context *ctx = event->ctx;
+       struct perf_event_context *ctx = perf_event_ctx(event);
        struct task_struct *task = ctx->task;
        struct remove_event re = {
                .event = event,
@@ -1606,7 +1643,7 @@ retry:
 int __perf_event_disable(void *info)
 {
        struct perf_event *event = info;
-       struct perf_event_context *ctx = event->ctx;
+       struct perf_event_context *ctx = __perf_event_ctx(event);
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 
        /*
@@ -1656,20 +1693,24 @@ int __perf_event_disable(void *info)
  */
 void perf_event_disable(struct perf_event *event)
 {
-       struct perf_event_context *ctx = event->ctx;
-       struct task_struct *task = ctx->task;
+       struct perf_event_context *ctx;
+       struct task_struct *task;
+
+       percpu_down_read(&perf_rwsem);
+       ctx = perf_event_ctx(event);
+       task = ctx->task;
 
        if (!task) {
                /*
                 * Disable the event on the cpu that it's on
                 */
                cpu_function_call(event->cpu, __perf_event_disable, event);
-               return;
+               goto unlock;
        }
 
 retry:
        if (!task_function_call(task, __perf_event_disable, event))
-               return;
+               goto unlock;
 
        raw_spin_lock_irq(&ctx->lock);
        /*
@@ -1694,6 +1735,8 @@ retry:
                event->state = PERF_EVENT_STATE_OFF;
        }
        raw_spin_unlock_irq(&ctx->lock);
+unlock:
+       percpu_up_read(&perf_rwsem);
 }
 EXPORT_SYMBOL_GPL(perf_event_disable);
 
@@ -1937,7 +1980,7 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
 static int  __perf_install_in_context(void *info)
 {
        struct perf_event *event = info;
-       struct perf_event_context *ctx = event->ctx;
+       struct perf_event_context *ctx = __perf_event_ctx(event);
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        struct perf_event_context *task_ctx = cpuctx->task_ctx;
        struct task_struct *task = current;
@@ -2076,7 +2119,7 @@ static void __perf_event_mark_enabled(struct perf_event *event)
 static int __perf_event_enable(void *info)
 {
        struct perf_event *event = info;
-       struct perf_event_context *ctx = event->ctx;
+       struct perf_event_context *ctx = __perf_event_ctx(event);
        struct perf_event *leader = event->group_leader;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        int err;
@@ -2160,15 +2203,19 @@ unlock:
  */
 void perf_event_enable(struct perf_event *event)
 {
-       struct perf_event_context *ctx = event->ctx;
-       struct task_struct *task = ctx->task;
+       struct perf_event_context *ctx;
+       struct task_struct *task;
+
+       percpu_down_read(&perf_rwsem);
+       ctx = perf_event_ctx(event);
+       task = ctx->task;
 
        if (!task) {
                /*
                 * Enable the event on the cpu that it's on
                 */
                cpu_function_call(event->cpu, __perf_event_enable, event);
-               return;
+               goto unlock;
        }
 
        raw_spin_lock_irq(&ctx->lock);
@@ -2194,7 +2241,7 @@ retry:
        raw_spin_unlock_irq(&ctx->lock);
 
        if (!task_function_call(task, __perf_event_enable, event))
-               return;
+               goto unlock;
 
        raw_spin_lock_irq(&ctx->lock);
 
@@ -2213,6 +2260,8 @@ retry:
 
 out:
        raw_spin_unlock_irq(&ctx->lock);
+unlock:
+       percpu_up_read(&perf_rwsem);
 }
 EXPORT_SYMBOL_GPL(perf_event_enable);
 
@@ -3076,7 +3125,7 @@ void perf_event_exec(void)
 static void __perf_event_read(void *info)
 {
        struct perf_event *event = info;
-       struct perf_event_context *ctx = event->ctx;
+       struct perf_event_context *ctx = __perf_event_ctx(event);
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 
        /*
@@ -3115,7 +3164,7 @@ static u64 perf_event_read(struct perf_event *event)
                smp_call_function_single(event->oncpu,
                                         __perf_event_read, event, 1);
        } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
-               struct perf_event_context *ctx = event->ctx;
+               struct perf_event_context *ctx = perf_event_ctx(event);
                unsigned long flags;
 
                raw_spin_lock_irqsave(&ctx->lock, flags);
@@ -3440,7 +3489,7 @@ static void perf_remove_from_owner(struct perf_event *event)
  */
 static void put_event(struct perf_event *event)
 {
-       struct perf_event_context *ctx = event->ctx;
+       struct perf_event_context *ctx;
 
        if (!atomic_long_dec_and_test(&event->refcount))
                return;
@@ -3448,6 +3497,8 @@ static void put_event(struct perf_event *event)
        if (!is_kernel_event(event))
                perf_remove_from_owner(event);
 
+       percpu_down_read(&perf_rwsem);
+       ctx = perf_event_ctx(event);
        WARN_ON_ONCE(ctx->parent_ctx);
        /*
         * There are two ways this annotation is useful:
@@ -3464,6 +3515,7 @@ static void put_event(struct perf_event *event)
        mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
        perf_remove_from_context(event, true);
        mutex_unlock(&ctx->mutex);
+       percpu_up_read(&perf_rwsem);
 
        _free_event(event);
 }
@@ -3647,11 +3699,13 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
        if (count < event->read_size)
                return -ENOSPC;
 
-       WARN_ON_ONCE(event->ctx->parent_ctx);
+       percpu_down_read(&perf_rwsem);
+       WARN_ON_ONCE(perf_event_ctx(event)->parent_ctx);
        if (read_format & PERF_FORMAT_GROUP)
                ret = perf_event_read_group(event, read_format, buf);
        else
                ret = perf_event_read_one(event, read_format, buf);
+       percpu_up_read(&perf_rwsem);
 
        return ret;
 }
@@ -3689,9 +3743,11 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
 
 static void perf_event_reset(struct perf_event *event)
 {
+       percpu_down_read(&perf_rwsem);
        (void)perf_event_read(event);
        local64_set(&event->count, 0);
        perf_event_update_userpage(event);
+       percpu_up_read(&perf_rwsem);
 }
 
 /*
@@ -3705,7 +3761,7 @@ static void perf_event_for_each_child(struct perf_event *event,
 {
        struct perf_event *child;
 
-       WARN_ON_ONCE(event->ctx->parent_ctx);
+       WARN_ON_ONCE(__perf_event_ctx(event)->parent_ctx);
        mutex_lock(&event->child_mutex);
        func(event);
        list_for_each_entry(child, &event->child_list, child_list)
@@ -3716,6 +3772,14 @@ static void perf_event_for_each_child(struct perf_event *event,
 static void perf_event_for_each(struct perf_event *event,
                                  void (*func)(struct perf_event *))
 {
+       /* 
+        * XXX broken 
+        *
+        * lock inversion and recursion issues; ctx->mutex must nest inside
+        * perf_rwsem, but func() will take perf_rwsem again.
+        *
+        * Cure with ugly.
+        */
        struct perf_event_context *ctx = event->ctx;
        struct perf_event *sibling;
 
@@ -3731,7 +3795,7 @@ static void perf_event_for_each(struct perf_event *event,
 
 static int perf_event_period(struct perf_event *event, u64 __user *arg)
 {
-       struct perf_event_context *ctx = event->ctx;
+       struct perf_event_context *ctx;
        int ret = 0, active;
        u64 value;
 
@@ -3744,6 +3808,8 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
        if (!value)
                return -EINVAL;
 
+       percpu_down_read(&perf_rwsem);
+       ctx = perf_event_ctx(event);
        raw_spin_lock_irq(&ctx->lock);
        if (event->attr.freq) {
                if (value > sysctl_perf_event_sample_rate) {
@@ -3772,6 +3838,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
 
 unlock:
        raw_spin_unlock_irq(&ctx->lock);
+       percpu_up_read(&perf_rwsem);
 
        return ret;
 }
@@ -7229,11 +7296,13 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
        if (output_event->cpu != event->cpu)
                goto out;
 
+       percpu_down_read(&perf_rwsem);
        /*
         * If its not a per-cpu rb, it must be the same task.
         */
-       if (output_event->cpu == -1 && output_event->ctx != event->ctx)
-               goto out;
+       if (output_event->cpu == -1 &&
+           perf_event_ctx(output_event) != perf_event_ctx(event))
+               goto unlock_rwsem;
 
 set:
        mutex_lock(&event->mmap_mutex);
@@ -7253,6 +7322,8 @@ set:
        ret = 0;
 unlock:
        mutex_unlock(&event->mmap_mutex);
+unlock_rwsem:
+       percpu_up_read(&perf_rwsem);
 
 out:
        return ret;
@@ -7461,6 +7532,7 @@ SYSCALL_DEFINE5(perf_event_open,
        if (move_group) {
                struct perf_event_context *gctx = group_leader->ctx;
 
+               percpu_down_write(&perf_rwsem);
                mutex_lock(&gctx->mutex);
                perf_remove_from_context(group_leader, false);
 
@@ -7498,6 +7570,9 @@ SYSCALL_DEFINE5(perf_event_open,
        perf_unpin_context(ctx);
        mutex_unlock(&ctx->mutex);
 
+       if (move_group)
+               percpu_up_write(&perf_rwsem);
+
        put_online_cpus();
 
        event->owner = current;
@@ -7600,6 +7675,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
        struct perf_event *event, *tmp;
        LIST_HEAD(events);
 
+       percpu_down_write(&perf_rwsem);
        src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
        dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
 
@@ -7625,6 +7701,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
                get_ctx(dst_ctx);
        }
        mutex_unlock(&dst_ctx->mutex);
+       percpu_up_write(&perf_rwsem);
 }
 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
 
@@ -8261,6 +8338,7 @@ void __init perf_event_init(void)
 
        idr_init(&pmu_idr);
 
+       percpu_init_rwsem(&perf_rwsem);
        perf_event_init_all_cpus();
        init_srcu_struct(&pmus_srcu);
        perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);