On Thu, Dec 17, 2015 at 04:07:32PM +0100, Peter Zijlstra wrote:
> On Thu, Dec 17, 2015 at 04:25:14PM +0200, Alexander Shishkin wrote:
> 
> > That aside, why I brought it up in the first place is because the two
> > functions are asymmetric: one is called with irqs disabled and the
> > other -- with ctx::lock held (and not because I'm into bikeshedding or
> > anything like that). Looking at the pair of them sets off my "that's not
> > right" trigger and sends me to the event_function_call()
> > implementation. So in that sense, prepending an extra underscore kind of
> > made sense. Maybe __perf_remove_from_context_{on,off}()?
> 
> You are quite right, and I think I've found more problems because of
> this. Let me prod at this some more.

So this then...

This fixes, I think, 3 separate bugs:

 - remove_from_context used to clear ->is_active; this is against the
   update rules from ctx_sched_in(), which sets ->is_active even when
   there are no events (!nr_events)

 - install_in_context did bad things to cpuctx->task_ctx; it would not
   validate that ctx->task == current and could therefore end up setting
   cpuctx->task_ctx to a context that does not belong to the current task.

 - cpuctx->task_ctx tracking was iffy

It also unifies a lot of the weird and fragile code we had around all
those IPI calls and adds a bunch of assertions.
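
For reference, the core invariant those assertions enforce (a sketch
condensed from the new event_function() below, not extra code to apply)
is roughly:

        if (task_ctx) {
                WARN_ON_ONCE(task_ctx->task != current);
                if (task_ctx->is_active && task_ctx->nr_events)
                        WARN_ON_ONCE(cpuctx->task_ctx != task_ctx); /* events on -> task_ctx set */
                else
                        WARN_ON_ONCE(cpuctx->task_ctx);             /* otherwise it must be clear */
        }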

It seems to survive a little pounding with 'normal' workloads.

Please have an extra careful look.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
 include/linux/perf_event.h    |    4 
 kernel/events/core.c          |  463 ++++++++++++++++--------------------------
 kernel/events/hw_breakpoint.c |    2 
 3 files changed, 180 insertions(+), 289 deletions(-)
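
For reviewers skimming the diff: the unified interface boils down to a single
callback type plus an opaque argument; the snippet below is condensed from the
typedef and call sites in the patch, purely as a reading aid:

        typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
                                struct perf_event_context *, void *);

        /* callers pass the same function for both the IPI and the locked fallback path, e.g.: */
        event_function_call(event, __perf_event_period, &value);
        event_function_call(event, __perf_remove_from_context,
                            (void *)(unsigned long)detach_group);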

--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1044,7 +1044,7 @@ extern void perf_swevent_put_recursion_c
 extern u64 perf_swevent_set_period(struct perf_event *event);
 extern void perf_event_enable(struct perf_event *event);
 extern void perf_event_disable(struct perf_event *event);
-extern int __perf_event_disable(void *info);
+extern void perf_event_disable_local(struct perf_event *event);
 extern void perf_event_task_tick(void);
 #else /* !CONFIG_PERF_EVENTS: */
 static inline void *
@@ -1106,7 +1106,7 @@ static inline void perf_swevent_put_recu
 static inline u64 perf_swevent_set_period(struct perf_event *event)    { return 0; }
 static inline void perf_event_enable(struct perf_event *event)         { }
 static inline void perf_event_disable(struct perf_event *event)                { }
-static inline int __perf_event_disable(void *info)                     { return -1; }
+static inline void perf_event_disable_local(struct perf_event *event)  { }
 static inline void perf_event_task_tick(void)                          { }
 static inline int perf_event_release_kernel(struct perf_event *event)  { return 0; }
 #endif
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -126,21 +126,127 @@ static int cpu_function_call(int cpu, re
        return data.ret;
 }
 
-static void event_function_call(struct perf_event *event,
-                               int (*active)(void *),
-                               void (*inactive)(void *),
-                               void *data)
+static inline struct perf_cpu_context *
+__get_cpu_context(struct perf_event_context *ctx)
+{
+       return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
+}
+
+static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
+                         struct perf_event_context *ctx)
+{
+       raw_spin_lock(&cpuctx->ctx.lock);
+       if (ctx)
+               raw_spin_lock(&ctx->lock);
+}
+
+static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
+                           struct perf_event_context *ctx)
+{
+       if (ctx)
+               raw_spin_unlock(&ctx->lock);
+       raw_spin_unlock(&cpuctx->ctx.lock);
+}
+
+typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
+                       struct perf_event_context *, void *);
+
+struct event_function_struct {
+       struct perf_event *event;
+       event_f func;
+       void *data;
+};
+
+static int event_function(void *info)
+{
+       struct event_function_struct *efs = info;
+       struct perf_event *event = efs->event;
+       struct perf_event_context *ctx = event->ctx;
+       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+       struct perf_event_context *task_ctx = NULL;
+
+       WARN_ON_ONCE(!irqs_disabled());
+
+       /*
+        * If this is aimed at a task, check that we hit it.
+        *
+        * This being a smp_call_function() we'll have IRQs disabled, therefore
+        * if this is current, we cannot race with context switches swapping
+        * out contexts, see perf_event_context_sched_out().
+        */
+       if (ctx->task) {
+               /* Wrong task, try again. */
+               if (ctx->task != current)
+                       return -EAGAIN;
+
+               task_ctx = ctx;
+       } else
+               WARN_ON_ONCE(&cpuctx->ctx != ctx);
+
+       perf_ctx_lock(cpuctx, task_ctx);
+       /*
+        * Now that we hold locks, double check state. Paranoia pays.
+        */
+       if (task_ctx) {
+               WARN_ON_ONCE(task_ctx->task != current);
+
+               /*
+                * The rules are that:
+                *
+                *  * ctx->is_active is set irrespective of the actual number of
+                *    events, such that we can know it got scheduled in etc.
+                *
+                *  * cpuctx->task_ctx is only set if there were actual events on.
+                *
+                * It can happen that even though we hit the right task,
+                * ->is_active is still false; this is possible when the ctx is
+                * new and the task hasn't scheduled yet, or the ctx is on its
+                * way out.
+                */
+               if (task_ctx->is_active && task_ctx->nr_events) {
+                       WARN_ON_ONCE(cpuctx->task_ctx != task_ctx);
+               } else {
+                       WARN_ON_ONCE(cpuctx->task_ctx);
+               }
+       }
+
+       efs->func(event, cpuctx, ctx, efs->data);
+       perf_ctx_unlock(cpuctx, task_ctx);
+
+       return 0;
+}
+
+static void event_function_local(struct perf_event *event, event_f func, void *data)
+{
+       struct event_function_struct efs = {
+               .event = event,
+               .func = func,
+               .data = data,
+       };
+
+       int ret = event_function(&efs);
+       WARN_ON_ONCE(ret);
+}
+
+static void event_function_call(struct perf_event *event, event_f func, void *data)
 {
        struct perf_event_context *ctx = event->ctx;
        struct task_struct *task = ctx->task;
+       struct perf_cpu_context *cpuctx;
+
+       struct event_function_struct efs = {
+               .event = event,
+               .func = func,
+               .data = data,
+       };
 
        if (!task) {
-               cpu_function_call(event->cpu, active, data);
+               cpu_function_call(event->cpu, event_function, &efs);
                return;
        }
 
 again:
-       if (!task_function_call(task, active, data))
+       if (!task_function_call(task, event_function, &efs))
                return;
 
        raw_spin_lock_irq(&ctx->lock);
@@ -153,7 +259,8 @@ static void event_function_call(struct p
                raw_spin_unlock_irq(&ctx->lock);
                goto again;
        }
-       inactive(data);
+       cpuctx = __get_cpu_context(ctx);
+       func(event, cpuctx, ctx, data);
        raw_spin_unlock_irq(&ctx->lock);
 }
 
@@ -368,28 +475,6 @@ static inline u64 perf_event_clock(struc
        return event->clock();
 }
 
-static inline struct perf_cpu_context *
-__get_cpu_context(struct perf_event_context *ctx)
-{
-       return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
-}
-
-static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
-                         struct perf_event_context *ctx)
-{
-       raw_spin_lock(&cpuctx->ctx.lock);
-       if (ctx)
-               raw_spin_lock(&ctx->lock);
-}
-
-static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
-                           struct perf_event_context *ctx)
-{
-       if (ctx)
-               raw_spin_unlock(&ctx->lock);
-       raw_spin_unlock(&cpuctx->ctx.lock);
-}
-
 #ifdef CONFIG_CGROUP_PERF
 
 static inline bool
@@ -1655,47 +1740,18 @@ group_sched_out(struct perf_event *group
                cpuctx->exclusive = 0;
 }
 
-struct remove_event {
-       struct perf_event *event;
-       bool detach_group;
-};
-
-static void ___perf_remove_from_context(void *info)
-{
-       struct remove_event *re = info;
-       struct perf_event *event = re->event;
-       struct perf_event_context *ctx = event->ctx;
-
-       if (re->detach_group)
-               perf_group_detach(event);
-       list_del_event(event, ctx);
-}
-
-/*
- * Cross CPU call to remove a performance event
- *
- * We disable the event on the hardware level first. After that we
- * remove it from the context list.
- */
-static int __perf_remove_from_context(void *info)
+static void
+__perf_remove_from_context(struct perf_event *event, struct perf_cpu_context *cpuctx,
+                          struct perf_event_context *ctx, void *info)
 {
-       struct remove_event *re = info;
-       struct perf_event *event = re->event;
-       struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+       bool detach_group = (unsigned long)info;
 
-       raw_spin_lock(&ctx->lock);
        event_sched_out(event, cpuctx, ctx);
-       if (re->detach_group)
+       if (detach_group)
                perf_group_detach(event);
        list_del_event(event, ctx);
-       if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
-               ctx->is_active = 0;
+       if (!ctx->nr_events && cpuctx->task_ctx == ctx)
                cpuctx->task_ctx = NULL;
-       }
-       raw_spin_unlock(&ctx->lock);
-
-       return 0;
 }
 
 /*
@@ -1714,70 +1770,33 @@ static int __perf_remove_from_context(vo
 static void perf_remove_from_context(struct perf_event *event, bool detach_group)
 {
        struct perf_event_context *ctx = event->ctx;
-       struct remove_event re = {
-               .event = event,
-               .detach_group = detach_group,
-       };
 
        lockdep_assert_held(&ctx->mutex);
 
        event_function_call(event, __perf_remove_from_context,
-                           ___perf_remove_from_context, &re);
+                           (void *)(unsigned long)detach_group);
 }
 
-/*
- * Cross CPU call to disable a performance event
- */
-int __perf_event_disable(void *info)
+static void
+__perf_event_disable(struct perf_event *event, struct perf_cpu_context *cpuctx,
+                    struct perf_event_context *ctx, void *info)
 {
-       struct perf_event *event = info;
-       struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
-
-       /*
-        * If this is a per-task event, need to check whether this
-        * event's task is the current task on this cpu.
-        *
-        * Can trigger due to concurrent perf_event_context_sched_out()
-        * flipping contexts around.
-        */
-       if (ctx->task && cpuctx->task_ctx != ctx)
-               return -EINVAL;
-
-       raw_spin_lock(&ctx->lock);
-
-       /*
-        * If the event is on, turn it off.
-        * If it is in error state, leave it in error state.
-        */
-       if (event->state >= PERF_EVENT_STATE_INACTIVE) {
-               update_context_time(ctx);
-               update_cgrp_time_from_event(event);
-               update_group_times(event);
-               if (event == event->group_leader)
-                       group_sched_out(event, cpuctx, ctx);
-               else
-                       event_sched_out(event, cpuctx, ctx);
-               event->state = PERF_EVENT_STATE_OFF;
-       }
-
-       raw_spin_unlock(&ctx->lock);
+       if (event->state < PERF_EVENT_STATE_INACTIVE)
+               return;
 
-       return 0;
+       update_context_time(ctx);
+       update_cgrp_time_from_event(event);
+       update_group_times(event);
+       if (event == event->group_leader)
+               group_sched_out(event, cpuctx, ctx);
+       else
+               event_sched_out(event, cpuctx, ctx);
+       event->state = PERF_EVENT_STATE_OFF;
 }
 
-void ___perf_event_disable(void *info)
+void perf_event_disable_local(struct perf_event *event)
 {
-       struct perf_event *event = info;
-
-       /*
-        * Since we have the lock this context can't be scheduled
-        * in, so we can change the state safely.
-        */
-       if (event->state == PERF_EVENT_STATE_INACTIVE) {
-               update_group_times(event);
-               event->state = PERF_EVENT_STATE_OFF;
-       }
+       event_function_local(event, __perf_event_disable, NULL);
 }
 
 /*
@@ -1804,8 +1823,7 @@ static void _perf_event_disable(struct p
        }
        raw_spin_unlock_irq(&ctx->lock);
 
-       event_function_call(event, __perf_event_disable,
-                           ___perf_event_disable, event);
+       event_function_call(event, __perf_event_disable, NULL);
 }
 
 /*
@@ -2054,62 +2072,30 @@ static void perf_event_sched_in(struct p
        if (ctx)
                ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
        cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
-       if (ctx)
+       if (ctx) {
                ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
+               if (ctx->nr_events)
+                       cpuctx->task_ctx = ctx;
+       }
 }
 
-static void ___perf_install_in_context(void *info)
-{
-       struct perf_event *event = info;
-       struct perf_event_context *ctx = event->ctx;
-
-       /*
-        * Since the task isn't running, its safe to add the event, us holding
-        * the ctx->lock ensures the task won't get scheduled in.
-        */
-       add_event_to_ctx(event, ctx);
-}
-
-/*
- * Cross CPU call to install and enable a performance event
- *
- * Must be called with ctx->mutex held
- */
-static int  __perf_install_in_context(void *info)
+static void perf_resched_context(struct perf_cpu_context *cpuctx)
 {
-       struct perf_event *event = info;
-       struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        struct perf_event_context *task_ctx = cpuctx->task_ctx;
-       struct task_struct *task = current;
 
-       perf_ctx_lock(cpuctx, task_ctx);
        perf_pmu_disable(cpuctx->ctx.pmu);
-
-       /*
-        * If there was an active task_ctx schedule it out.
-        */
        if (task_ctx)
                task_ctx_sched_out(task_ctx);
-
-       /*
-        * If the context we're installing events in is not the
-        * active task_ctx, flip them.
-        */
-       if (ctx->task && task_ctx != ctx) {
-               if (task_ctx)
-                       raw_spin_unlock(&task_ctx->lock);
-               raw_spin_lock(&ctx->lock);
-               task_ctx = ctx;
-       }
-
-       if (task_ctx) {
-               cpuctx->task_ctx = task_ctx;
-               task = task_ctx->task;
-       }
-
        cpu_ctx_sched_out(cpuctx, EVENT_ALL);
 
+       perf_event_sched_in(cpuctx, task_ctx, current);
+       perf_pmu_enable(cpuctx->ctx.pmu);
+}
+
+static void
+__perf_install_in_context(struct perf_event *event, struct perf_cpu_context *cpuctx,
+                         struct perf_event_context *ctx, void *info)
+{
        update_context_time(ctx);
        /*
         * update cgrp time only if current cgrp
@@ -2120,15 +2106,8 @@ static int  __perf_install_in_context(vo
 
        add_event_to_ctx(event, ctx);
 
-       /*
-        * Schedule everything back in
-        */
-       perf_event_sched_in(cpuctx, task_ctx, task);
-
-       perf_pmu_enable(cpuctx->ctx.pmu);
-       perf_ctx_unlock(cpuctx, task_ctx);
-
-       return 0;
+       if (ctx->is_active)
+               perf_resched_context(cpuctx);
 }
 
 /*
@@ -2152,8 +2131,7 @@ perf_install_in_context(struct perf_even
        if (event->cpu != -1)
                event->cpu = cpu;
 
-       event_function_call(event, __perf_install_in_context,
-                           ___perf_install_in_context, event);
+       event_function_call(event, __perf_install_in_context, NULL);
 }
 
 /*
@@ -2177,88 +2155,33 @@ static void __perf_event_mark_enabled(st
        }
 }
 
-/*
- * Cross CPU call to enable a performance event
- */
-static int __perf_event_enable(void *info)
+static void
+__perf_event_enable(struct perf_event *event, struct perf_cpu_context *cpuctx,
+                   struct perf_event_context *ctx, void *info)
 {
-       struct perf_event *event = info;
-       struct perf_event_context *ctx = event->ctx;
        struct perf_event *leader = event->group_leader;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
-       int err;
-
-       /*
-        * There's a time window between 'ctx->is_active' check
-        * in perf_event_enable function and this place having:
-        *   - IRQs on
-        *   - ctx->lock unlocked
-        *
-        * where the task could be killed and 'ctx' deactivated
-        * by perf_event_exit_task.
-        */
-       if (!ctx->is_active)
-               return -EINVAL;
-
-       raw_spin_lock(&ctx->lock);
-       update_context_time(ctx);
 
        if (event->state >= PERF_EVENT_STATE_INACTIVE)
-               goto unlock;
+               return;
 
-       /*
-        * set current task's cgroup time reference point
-        */
+       update_context_time(ctx);
        perf_cgroup_set_timestamp(current, ctx);
-
        __perf_event_mark_enabled(event);
 
        if (!event_filter_match(event)) {
                if (is_cgroup_event(event))
                        perf_cgroup_defer_enabled(event);
-               goto unlock;
+               return;
        }
 
        /*
-        * If the event is in a group and isn't the group leader,
-        * then don't put it on unless the group is on.
+        * If we're part of a group and the leader isn't active, we won't be either.
         */
        if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
-               goto unlock;
-
-       if (!group_can_go_on(event, cpuctx, 1)) {
-               err = -EEXIST;
-       } else {
-               if (event == leader)
-                       err = group_sched_in(event, cpuctx, ctx);
-               else
-                       err = event_sched_in(event, cpuctx, ctx);
-       }
-
-       if (err) {
-               /*
-                * If this event can't go on and it's part of a
-                * group, then the whole group has to come off.
-                */
-               if (leader != event) {
-                       group_sched_out(leader, cpuctx, ctx);
-                       perf_mux_hrtimer_restart(cpuctx);
-               }
-               if (leader->attr.pinned) {
-                       update_group_times(leader);
-                       leader->state = PERF_EVENT_STATE_ERROR;
-               }
-       }
-
-unlock:
-       raw_spin_unlock(&ctx->lock);
-
-       return 0;
-}
+               return;
 
-void ___perf_event_enable(void *info)
-{
-       __perf_event_mark_enabled((struct perf_event *)info);
+       if (ctx->is_active)
+               perf_resched_context(cpuctx);
 }
 
 /*
@@ -2291,8 +2214,7 @@ static void _perf_event_enable(struct pe
                event->state = PERF_EVENT_STATE_OFF;
        raw_spin_unlock_irq(&ctx->lock);
 
-       event_function_call(event, __perf_event_enable,
-                           ___perf_event_enable, event);
+       event_function_call(event, __perf_event_enable, NULL);
 }
 
 /*
@@ -2774,9 +2696,6 @@ static void perf_event_context_sched_in(
         */
        cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 
-       if (ctx->nr_events)
-               cpuctx->task_ctx = ctx;
-
        perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
 
        perf_pmu_enable(ctx->pmu);
@@ -4086,36 +4005,13 @@ static void perf_event_for_each(struct p
                perf_event_for_each_child(sibling, func);
 }
 
-struct period_event {
-       struct perf_event *event;
-       u64 value;
-};
-
-static void ___perf_event_period(void *info)
-{
-       struct period_event *pe = info;
-       struct perf_event *event = pe->event;
-       u64 value = pe->value;
-
-       if (event->attr.freq) {
-               event->attr.sample_freq = value;
-       } else {
-               event->attr.sample_period = value;
-               event->hw.sample_period = value;
-       }
-
-       local64_set(&event->hw.period_left, 0);
-}
-
-static int __perf_event_period(void *info)
+static void
+__perf_event_period(struct perf_event *event, struct perf_cpu_context *cpuctx,
+                   struct perf_event_context *ctx, void *info)
 {
-       struct period_event *pe = info;
-       struct perf_event *event = pe->event;
-       struct perf_event_context *ctx = event->ctx;
-       u64 value = pe->value;
+       u64 value = *((u64 *)info);
        bool active;
 
-       raw_spin_lock(&ctx->lock);
        if (event->attr.freq) {
                event->attr.sample_freq = value;
        } else {
@@ -4125,6 +4021,7 @@ static int __perf_event_period(void *inf
 
        active = (event->state == PERF_EVENT_STATE_ACTIVE);
        if (active) {
+               WARN_ON_ONCE(!ctx->is_active);
                perf_pmu_disable(ctx->pmu);
                event->pmu->stop(event, PERF_EF_UPDATE);
        }
@@ -4135,14 +4032,10 @@ static int __perf_event_period(void *inf
                event->pmu->start(event, PERF_EF_RELOAD);
                perf_pmu_enable(ctx->pmu);
        }
-       raw_spin_unlock(&ctx->lock);
-
-       return 0;
 }
 
 static int perf_event_period(struct perf_event *event, u64 __user *arg)
 {
-       struct period_event pe = { .event = event, };
        u64 value;
 
        if (!is_sampling_event(event))
@@ -4157,10 +4050,7 @@ static int perf_event_period(struct perf
        if (event->attr.freq && value > sysctl_perf_event_sample_rate)
                return -EINVAL;
 
-       pe.value = value;
-
-       event_function_call(event, __perf_event_period,
-                           ___perf_event_period, &pe);
+       event_function_call(event, __perf_event_period, &value);
 
        return 0;
 }
@@ -4932,7 +4822,7 @@ static void perf_pending_event(struct ir
 
        if (event->pending_disable) {
                event->pending_disable = 0;
-               __perf_event_disable(event);
+               perf_event_disable_local(event);
        }
 
        if (event->pending_wakeup) {
@@ -9221,13 +9111,14 @@ static void perf_event_init_cpu(int cpu)
 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
 static void __perf_event_exit_context(void *__info)
 {
-       struct remove_event re = { .detach_group = true };
        struct perf_event_context *ctx = __info;
+       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+       struct perf_event *event;
 
-       rcu_read_lock();
-       list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
-               __perf_remove_from_context(&re);
-       rcu_read_unlock();
+       raw_spin_lock(&ctx->lock);
+       list_for_each_entry(event, &ctx->event_list, event_entry)
+               __perf_remove_from_context(event, cpuctx, ctx, (void *)(unsigned long)true);
+       raw_spin_unlock(&ctx->lock);
 }
 
 static void perf_event_exit_cpu_context(int cpu)
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -444,7 +444,7 @@ int modify_user_hw_breakpoint(struct per
         * current task.
         */
        if (irqs_disabled() && bp->ctx && bp->ctx->task == current)
-               __perf_event_disable(bp);
+               perf_event_disable_local(bp);
        else
                perf_event_disable(bp);
 