Steven reported that rcuidle && in_nmi() condition can occur which creates a problem for SRCU usage, since we can't use the SRCU node from both NMI context and other contexts (NMI can come in while the SRCU read lock is in the process of being held).
This patch switches to using a separate SRCU node for tracepoints called from in_nmi(). This is needed to also make tracepoints work while CPU is offline. Fixes: c3bc8fd637a9 ("tracing: Centralize preemptirq tracepoints and unify their usage") Reported-by: Masami Hiramatsu <mhira...@kernel.org> Signed-off-by: Joel Fernandes (Google) <j...@joelfernandes.org> --- Dropped the "CPU offline" changes, and only keeping the SRCU changes. include/linux/tracepoint.h | 19 ++++++++----------- kernel/tracepoint.c | 10 ++++++---- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index d9a084c72541..1ceee17a38dc 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -35,6 +35,7 @@ struct trace_eval_map { #define TRACEPOINT_DEFAULT_PRIO 10 extern struct srcu_struct tracepoint_srcu; +extern struct srcu_struct tracepoint_srcu_nmi; extern int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data); @@ -144,13 +145,11 @@ extern void syscall_unregfunc(void); void *it_func; \ void *__data; \ int __maybe_unused idx = 0; \ + struct srcu_struct *ss; \ \ if (!(cond)) \ return; \ \ - /* srcu can't be used from NMI */ \ - WARN_ON_ONCE(rcuidle && in_nmi()); \ - \ /* keep srcu and sched-rcu usage consistent */ \ preempt_disable_notrace(); \ \ @@ -159,7 +158,11 @@ extern void syscall_unregfunc(void); * doesn't work from the idle path. \ */ \ if (rcuidle) \ - idx = srcu_read_lock_notrace(&tracepoint_srcu); \ + ss = &tracepoint_srcu_nmi; \ + else \ + ss = &tracepoint_srcu; \ + \ + idx = srcu_read_lock_notrace(ss); \ \ it_func_ptr = rcu_dereference_raw((tp)->funcs); \ \ @@ -171,8 +174,7 @@ extern void syscall_unregfunc(void); } while ((++it_func_ptr)->func); \ } \ \ - if (rcuidle) \ - srcu_read_unlock_notrace(&tracepoint_srcu, idx);\ + srcu_read_unlock_notrace(ss, idx); \ \ preempt_enable_notrace(); \ } while (0) @@ -212,11 +214,6 @@ extern void syscall_unregfunc(void); TP_PROTO(data_proto), \ TP_ARGS(data_args), \ TP_CONDITION(cond), 0); \ - if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ - rcu_read_lock_sched_notrace(); \ - rcu_dereference_sched(__tracepoint_##name.funcs);\ - rcu_read_unlock_sched_notrace(); \ - } \ } \ __DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args), \ PARAMS(cond), PARAMS(data_proto), PARAMS(data_args)) \ diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 955148d91b74..769d74b2f90e 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -32,7 +32,9 @@ extern struct tracepoint * const __start___tracepoints_ptrs[]; extern struct tracepoint * const __stop___tracepoints_ptrs[]; DEFINE_SRCU(tracepoint_srcu); +DEFINE_SRCU(tracepoint_srcu_nmi); EXPORT_SYMBOL_GPL(tracepoint_srcu); +EXPORT_SYMBOL_GPL(tracepoint_srcu_nmi); /* Set to 1 to enable tracepoint debug output */ static const int tracepoint_debug; @@ -70,14 +72,14 @@ static inline void *allocate_probes(int count) return p == NULL ? NULL : p->probes; } -static void srcu_free_old_probes(struct rcu_head *head) +static void srcu_free_old_probes_nmi(struct rcu_head *head) { kfree(container_of(head, struct tp_probes, rcu)); } -static void rcu_free_old_probes(struct rcu_head *head) +static void srcu_free_old_probes(struct rcu_head *head) { - call_srcu(&tracepoint_srcu, head, srcu_free_old_probes); + call_srcu(&tracepoint_srcu_nmi, head, srcu_free_old_probes_nmi); } static inline void release_probes(struct tracepoint_func *old) @@ -91,7 +93,7 @@ static inline void release_probes(struct tracepoint_func *old) * cover both cases. So let us chain the SRCU and sched RCU * callbacks to wait for both grace periods. */ - call_rcu_sched(&tp_probes->rcu, rcu_free_old_probes); + call_srcu(&tracepoint_srcu, &tp_probes->rcu, srcu_free_old_probes); } } -- 2.18.0.597.ga71716f1ad-goog