If context tracking is enabled, causing a page fault in preemptirq irq_enable or irq_disable events triggers the following RCU EQS warning.
Reproducer: // CONFIG_PREEMPTIRQ_EVENTS=y // CONFIG_CONTEXT_TRACKING=y // CONFIG_RCU_EQS_DEBUG=y # echo 1 > events/preemptirq/irq_disable/enable # echo 1 > options/userstacktrace WARNING: CPU: 0 PID: 2574 at kernel/rcu/tree.c:262 rcu_dynticks_eqs_exit+0x48/0x50 CPU: 0 PID: 2574 Comm: sh Not tainted 5.3.0-rc1+ #105 RIP: 0010:rcu_dynticks_eqs_exit+0x48/0x50 Call Trace: rcu_eqs_exit+0x4e/0xd0 rcu_user_exit+0x13/0x20 __context_tracking_exit.part.0+0x74/0x120 context_tracking_exit.part.0+0x28/0x50 context_tracking_exit+0x1d/0x20 do_page_fault+0xab/0x1b0 do_async_page_fault+0x35/0xb0 async_page_fault+0x3e/0x50 RIP: 0010:arch_stack_walk_user+0x8e/0x100 stack_trace_save_user+0x7d/0xa9 trace_buffer_unlock_commit_regs+0x178/0x220 trace_event_buffer_commit+0x6b/0x200 trace_event_raw_event_preemptirq_template+0x7b/0xc0 trace_hardirqs_off_caller+0xb3/0xf0 trace_hardirqs_off_thunk+0x1a/0x20 entry_SYSCALL_64_after_hwframe+0x3e/0xbe Details of call trace and RCU EQS/Context: entry_SYSCALL_64_after_hwframe() EQS: IN, CTX: USER trace_irq_disable_rcuidle() rcu_irq_enter_irqson() rcu_dynticks_eqs_exit() EQS: IN => OUT stack_trace_save_user() EQS: OUT, CTX: USER page_fault() do_page_fault() exception_enter() EQS: OUT, CTX: USER context_tracking_exit() rcu_eqs_exit() rcu_dynticks_eqs_exit() EQS: OUT => OUT? (warning) trace_irq_disable/enable_rcuidle() are called from user mode in entry code, and call rcu_irq_enter_irqson() in __DO_TRACE(). This can cause the state "RCU EQS: OUT but CTX: USER", then stack_trace_save_user() can cause a page fault which calls rcu_dynticks_eqs_exit() again, leading to the EQS validation warning being hit if CONFIG_RCU_EQS_DEBUG is enabled. Fix it by calling exception_enter/exit() around trace_irq_disable/enable_rcuidle() to enter CONTEXT_KERNEL before tracing code causes a page fault. Also make the state change to CONTEXT_KERNEL happen earlier to prevent tracing code from calling context_tracking_exit() recursively.
Ideally, the problem can be fixed by calling enter_from_user_mode() before TRACE_IRQS_OFF in entry codes (then we need to tell lockdep that IRQs are off earlier) and calling prepare_exit_to_usermode() after TRACE_IRQS_ON. But this patch will be much simpler and limit most of the changes to tracing code. Fixes: 865e63b04e9b ("tracing: Add back in rcu_irq_enter/exit_irqson() for rcuidle tracepoints") Signed-off-by: Eiichi Tsukata <de...@etsukata.com> --- kernel/context_tracking.c | 6 +++++- kernel/trace/trace_preemptirq.c | 15 +++++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index be01a4d627c9..860eaf9780e5 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -148,6 +148,11 @@ void __context_tracking_exit(enum ctx_state state) return; if (__this_cpu_read(context_tracking.state) == state) { + /* + * Change state before executing code which can trigger + * a page fault leading to unnecessary re-entrance. + */ + __this_cpu_write(context_tracking.state, CONTEXT_KERNEL); if (__this_cpu_read(context_tracking.active)) { /* * We are going to run code that may use RCU. 
Inform @@ -159,7 +164,6 @@ void __context_tracking_exit(enum ctx_state state) trace_user_exit(0); } } - __this_cpu_write(context_tracking.state, CONTEXT_KERNEL); } context_tracking_recursion_exit(); } diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index 4d8e99fdbbbe..031b51cb94d0 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/ftrace.h> #include <linux/kprobes.h> +#include <linux/context_tracking.h> #include "trace.h" #define CREATE_TRACE_POINTS @@ -49,9 +50,14 @@ NOKPROBE_SYMBOL(trace_hardirqs_off); __visible void trace_hardirqs_on_caller(unsigned long caller_addr) { + enum ctx_state prev_state; + if (this_cpu_read(tracing_irq_cpu)) { - if (!in_nmi()) + if (!in_nmi()) { + prev_state = exception_enter(); trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr); + exception_exit(prev_state); + } tracer_hardirqs_on(CALLER_ADDR0, caller_addr); this_cpu_write(tracing_irq_cpu, 0); } @@ -63,11 +69,16 @@ NOKPROBE_SYMBOL(trace_hardirqs_on_caller); __visible void trace_hardirqs_off_caller(unsigned long caller_addr) { + enum ctx_state prev_state; + if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, caller_addr); - if (!in_nmi()) + if (!in_nmi()) { + prev_state = exception_enter(); trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr); + exception_exit(prev_state); + } } lockdep_hardirqs_off(CALLER_ADDR0); -- 2.21.0