This patch eliminates RCU violations detected by the RCU
checker (PROVE_RCU). The impacted code paths were all related
to cgroup mode monitoring and involved accessing a task's cgrp.

V2 is updated to include comments from PeterZ to eliminate
some of the warnings without grabbing the rcu_read lock, because
we know we are already holding the ctx->lock, which prevents
the cgroup from disappearing while we are accessing it.
The trick, as suggested by Peter, is to modify
perf_cgroup_from_task() to take an extra boolean parameter
to allow bypassing the lockdep test in the task_subsys_cstate()
macros. This patch uses this approach to update all calls to
perf_cgroup_from_task().

In V3, we replace the boolean parameter with a pointer to a
perf_event_context so we can check the ctx->lock explicitly.
This is more robust than passing a boolean to express that
we know the lock is held. The code can change, and with it the
locking assumption; checking lockdep_is_held() ensures that
the proper locking is in place. Patch relative to tip.git
at commit 57ef9fc.

Signed-off-by: Stephane Eranian <[email protected]>
---
 arch/x86/kernel/cpu/perf_event_intel_cqm.c |  2 +-
 include/linux/perf_event.h                 |  5 +++--
 kernel/events/core.c                       | 25 +++++++++++++++----------
 3 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index 377e8f8..a316ca9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -298,7 +298,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b)
 static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
 {
        if (event->attach_state & PERF_ATTACH_TASK)
-               return perf_cgroup_from_task(event->hw.target);
+               return perf_cgroup_from_task(event->hw.target, event->ctx);
 
        return event->cgrp;
 }
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index d841d33..94107e4 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -697,9 +697,10 @@ struct perf_cgroup {
  * if there is no cgroup event for the current CPU context.
  */
 static inline struct perf_cgroup *
-perf_cgroup_from_task(struct task_struct *task)
+perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx)
 {
-       return container_of(task_css(task, perf_event_cgrp_id),
+       bool safe = ctx ? lockdep_is_held(&ctx->lock) : true;
+       return container_of(task_css_check(task, perf_event_cgrp_id, safe),
                            struct perf_cgroup, css);
 }
 #endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ea02109..f611246 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -435,7 +435,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
        if (!is_cgroup_event(event))
                return;
 
-       cgrp = perf_cgroup_from_task(current);
+       cgrp = perf_cgroup_from_task(current, event->ctx);
        /*
         * Do not update time when cgroup is not active
         */
@@ -458,7 +458,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
        if (!task || !ctx->nr_cgroups)
                return;
 
-       cgrp = perf_cgroup_from_task(task);
+       cgrp = perf_cgroup_from_task(task, ctx);
        info = this_cpu_ptr(cgrp->info);
        info->timestamp = ctx->timestamp;
 }
@@ -489,7 +489,6 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
         * we reschedule only in the presence of cgroup
         * constrained events.
         */
-       rcu_read_lock();
 
        list_for_each_entry_rcu(pmu, &pmus, entry) {
                cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
@@ -523,7 +522,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
                                 * event_filter_match() to not have to pass
                                 * task around
                                 */
-                               cpuctx->cgrp = perf_cgroup_from_task(task);
+                               cpuctx->cgrp = perf_cgroup_from_task(task, NULL);
                                cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
                        }
                        perf_pmu_enable(cpuctx->ctx.pmu);
@@ -531,8 +530,6 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
                }
        }
 
-       rcu_read_unlock();
-
        local_irq_restore(flags);
 }
 
@@ -542,17 +539,18 @@ static inline void perf_cgroup_sched_out(struct task_struct *task,
        struct perf_cgroup *cgrp1;
        struct perf_cgroup *cgrp2 = NULL;
 
+       rcu_read_lock();
        /*
         * we come here when we know perf_cgroup_events > 0
         */
-       cgrp1 = perf_cgroup_from_task(task);
+       cgrp1 = perf_cgroup_from_task(task, NULL);
 
        /*
         * next is NULL when called from perf_event_enable_on_exec()
         * that will systematically cause a cgroup_switch()
         */
        if (next)
-               cgrp2 = perf_cgroup_from_task(next);
+               cgrp2 = perf_cgroup_from_task(next, NULL);
 
        /*
         * only schedule out current cgroup events if we know
@@ -561,6 +559,8 @@ static inline void perf_cgroup_sched_out(struct task_struct *task,
         */
        if (cgrp1 != cgrp2)
                perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
+
+       rcu_read_unlock();
 }
 
 static inline void perf_cgroup_sched_in(struct task_struct *prev,
@@ -569,13 +569,14 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev,
        struct perf_cgroup *cgrp1;
        struct perf_cgroup *cgrp2 = NULL;
 
+       rcu_read_lock();
        /*
         * we come here when we know perf_cgroup_events > 0
         */
-       cgrp1 = perf_cgroup_from_task(task);
+       cgrp1 = perf_cgroup_from_task(task, NULL);
 
        /* prev can never be NULL */
-       cgrp2 = perf_cgroup_from_task(prev);
+       cgrp2 = perf_cgroup_from_task(prev, NULL);
 
        /*
         * only need to schedule in cgroup events if we are changing
@@ -584,6 +585,8 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev,
         */
        if (cgrp1 != cgrp2)
                perf_cgroup_switch(task, PERF_CGROUP_SWIN);
+
+       rcu_read_unlock();
 }
 
 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
@@ -9442,7 +9445,9 @@ static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
 static int __perf_cgroup_move(void *info)
 {
        struct task_struct *task = info;
+       rcu_read_lock();
        perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
+       rcu_read_unlock();
        return 0;
 }
 
-- 
2.5.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to