This patch implements a feature whereby the tracing_max_latency file,
e.g. /sys/kernel/debug/tracing/tracing_max_latency, will receive
notifications through the fsnotify framework when a new latency is
available.
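For reference, a user space program can consume these notifications
through the ordinary inotify(7) interface. A minimal sketch, assuming
debugfs is mounted at /sys/kernel/debug and with error handling elided
(this example is illustrative and not part of the patch):

/* Minimal watcher sketch; the debugfs mount point is an assumption. */
#include <stdio.h>
#include <sys/inotify.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/tracing/tracing_max_latency";
	char buf[4096];
	int fd = inotify_init();

	if (fd < 0 || inotify_add_watch(fd, path, IN_MODIFY) < 0)
		return 1;
	/* Each read() blocks until the kernel reports FS_MODIFY */
	while (read(fd, buf, sizeof(buf)) > 0)
		printf("new max latency recorded\n");
	return 0;
}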
One particularly interesting use of this facility is when enabling
threshold tracing, through /sys/kernel/debug/tracing/tracing_thresh,
together with the preempt/irqsoff tracers. This makes it possible to
implement a user space program that can, with equal probability, obtain
traces of latencies that occur immediately after each other, in spite
of the fact that the preempt/irqsoff tracers operate in overwrite mode
(a rough sketch of such a program follows the patch below).

This facility works with the hwlat, preempt/irqsoff, and wakeup
tracers.

This patch also adds some unfortunate calls from __schedule() and
do_idle(). These calls to latency_fsnotify_disable/enable() are needed
because we cannot wake up the workqueue from those critical sections
without risking a deadlock. Similar problems would also arise if we
tried to schedule a tasklet, raise a softirq, or wake up a kernel
thread. If a notification event happens in one of the forbidden
sections, we schedule the fsnotify work as soon as we have exited them.

There was a suggestion to remove this latency_fsnotify_enable/disable()
gunk, or at least to combine it with start_critical_timings() and
stop_critical_timings(). I have, however, not been able to come up with
a way to do it. It seems like it would be possible to simply replace
the calls to latency_fsnotify_enable/disable() with calls to
start/stop_critical_timings(), but the main problem is that this would
not work for the wakeup tracer. The wakeup tracer needs a facility that
postpones the notifications, not one that prevents the measurements,
because all of its measurements take place in the middle of
__schedule(). On the other hand, in some places, like in idle and the
console, we need start/stop functions that prevent the measurements
from being made.

Signed-off-by: Viktor Rosendahl <viktor.rosend...@gmail.com>
---
 include/linux/ftrace.h            |  31 +++++++++
 kernel/sched/core.c               |   3 +
 kernel/sched/idle.c               |   3 +
 kernel/sched/sched.h              |   1 +
 kernel/trace/trace.c              | 112 +++++++++++++++++++++++++++++-
 kernel/trace/trace.h              |  22 ++++++
 kernel/trace/trace_hwlat.c        |   4 +-
 kernel/trace/trace_irqsoff.c      |   4 ++
 kernel/trace/trace_sched_wakeup.c |   4 ++
 9 files changed, 181 insertions(+), 3 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 8a8cb3c401b2..b4d9700ef917 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -907,4 +907,35 @@ unsigned long arch_syscall_addr(int nr);
 
 #endif /* CONFIG_FTRACE_SYSCALLS */
 
+#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
+	defined(CONFIG_FSNOTIFY)
+
+DECLARE_PER_CPU(int, latency_notify_disable);
+DECLARE_STATIC_KEY_FALSE(latency_notify_key);
+
+void latency_fsnotify_process(void);
+
+/*
+ * Disable/enable fsnotify while in scheduler and idle code. Trying to wake
+ * anything up from there, such as calling queue_work() is prone to deadlock.
+ */
+static inline void latency_fsnotify_disable(void)
+{
+	this_cpu_inc(latency_notify_disable);
+}
+
+static inline void latency_fsnotify_enable(void)
+{
+	this_cpu_dec(latency_notify_disable);
+	if (static_branch_unlikely(&latency_notify_key))
+		latency_fsnotify_process();
+}
+
+#else
+
+#define latency_fsnotify_disable() do { } while (0)
+#define latency_fsnotify_enable() do { } while (0)
+
+#endif
+
 #endif /* _LINUX_FTRACE_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 010d578118d6..e3c1dc801073 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3198,6 +3198,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
 	 */
 
 	rq = finish_task_switch(prev);
+	latency_fsnotify_enable();
 	balance_callback(rq);
 	preempt_enable();
 
@@ -3820,6 +3821,7 @@ static void __sched notrace __schedule(bool preempt)
 
 	local_irq_disable();
 	rcu_note_context_switch(preempt);
+	latency_fsnotify_disable();
 
 	/*
 	 * Make sure that signal_pending_state()->signal_pending() below
@@ -3883,6 +3885,7 @@ static void __sched notrace __schedule(bool preempt)
 		rq_unlock_irq(rq, &rf);
 	}
 
+	latency_fsnotify_enable();
 	balance_callback(rq);
 }
 
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 80940939b733..5fc87d99a407 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -236,6 +236,7 @@ static void do_idle(void)
 	__current_set_polling();
 	tick_nohz_idle_enter();
 
+	latency_fsnotify_disable();
 	while (!need_resched()) {
 		check_pgt_cache();
 
@@ -265,6 +266,8 @@ static void do_idle(void)
 		arch_cpu_idle_exit();
 	}
 
+	latency_fsnotify_enable();
+
 	/*
 	 * Since we fell out of the loop above, we know TIF_NEED_RESCHED must
 	 * be set, propagate it into PREEMPT_NEED_RESCHED.
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 802b1f3405f2..467d6ad03f16 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -46,6 +46,7 @@
 #include <linux/debugfs.h>
 #include <linux/delayacct.h>
 #include <linux/energy_model.h>
+#include <linux/ftrace.h>
 #include <linux/init_task.h>
 #include <linux/kprobes.h>
 #include <linux/kthread.h>
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 563e80f9006a..a622263a69e4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -44,6 +44,10 @@
 #include <linux/trace.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/rt.h>
+#include <linux/fsnotify.h>
+#include <linux/workqueue.h>
+#include <trace/events/power.h>
+#include <trace/events/sched.h>
 
 #include "trace.h"
 #include "trace_output.h"
@@ -1480,6 +1484,110 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
 
 unsigned long __read_mostly	tracing_thresh;
 
+#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
+	defined(CONFIG_FSNOTIFY)
+
+static const struct file_operations tracing_max_lat_fops;
+static struct workqueue_struct *fsnotify_wq;
+static DEFINE_PER_CPU(struct llist_head, notify_list);
+
+DEFINE_PER_CPU(int, latency_notify_disable);
+DEFINE_STATIC_KEY_FALSE(latency_notify_key);
+
+static void latency_fsnotify_workfn(struct work_struct *work)
+{
+	struct trace_array *tr = container_of(work, struct trace_array,
+					      fsnotify_work);
+	fsnotify(tr->d_max_latency->d_inode, FS_MODIFY,
+		 tr->d_max_latency->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+}
+
+static void trace_create_maxlat_file(struct trace_array *tr,
+				     struct dentry *d_tracer)
+{
+	INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn);
+	atomic_set(&tr->notify_pending, 0);
+	tr->d_max_latency = trace_create_file("tracing_max_latency", 0644,
+					      d_tracer, &tr->max_latency,
+					      &tracing_max_lat_fops);
+}
+
+void latency_fsnotify_stop(void)
+{
+	/* Make sure all CPUs see caller's previous actions to stop tracer */
+	smp_wmb();
+	static_branch_disable(&latency_notify_key);
+	latency_fsnotify_process();
+}
+
+void latency_fsnotify_start(void)
+{
+	static_branch_enable(&latency_notify_key);
+	/* Make sure all CPUs see the key value before the caller continues */
+	smp_wmb();
+}
+
+void latency_fsnotify_process(void)
+{
+	struct trace_array *tr;
+	struct llist_head *list;
+	struct llist_node *node;
+
+	if (this_cpu_read(latency_notify_disable))
+		return;
+
+	list = this_cpu_ptr(&notify_list);
+	for (node = llist_del_first(list); node != NULL;
+	     node = llist_del_first(list)) {
+		tr = llist_entry(node, struct trace_array, notify_ll);
+		atomic_set(&tr->notify_pending, 0);
+		queue_work(fsnotify_wq, &tr->fsnotify_work);
+	}
+}
+
+__init static int latency_fsnotify_init(void)
+{
+	fsnotify_wq = alloc_workqueue("tr_max_lat_wq",
+				      WQ_UNBOUND | WQ_HIGHPRI, 0);
+	if (!fsnotify_wq) {
+		pr_err("Unable to allocate tr_max_lat_wq\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+late_initcall_sync(latency_fsnotify_init);
+
+void latency_fsnotify(struct trace_array *tr)
+{
+	if (!fsnotify_wq)
+		return;
+
+	if (!this_cpu_read(latency_notify_disable))
+		queue_work(fsnotify_wq, &tr->fsnotify_work);
+	else {
+		/*
+		 * notify_pending prevents us from adding the same entry to
+		 * more than one notify_list. It will get queued in
+		 * latency_fsnotify_enable().
+		 */
+		if (!atomic_xchg(&tr->notify_pending, 1))
+			llist_add(&tr->notify_ll, this_cpu_ptr(&notify_list));
+	}
+}
+
+/*
+ * (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
+ * defined(CONFIG_FSNOTIFY)
+ */
+#else
+
+#define trace_create_maxlat_file(tr, d_tracer)				\
+	trace_create_file("tracing_max_latency", 0644, d_tracer,	\
+			  &tr->max_latency, &tracing_max_lat_fops)
+
+#endif
+
 #ifdef CONFIG_TRACER_MAX_TRACE
 /*
  * Copy the new maximum trace into the separate maximum-trace
@@ -1518,6 +1626,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 
 	/* record this tasks comm */
 	tracing_record_cmdline(tsk);
+	latency_fsnotify(tr);
 }
 
 /**
@@ -8550,8 +8659,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 
 	create_trace_options_dir(tr);
 
 #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
-	trace_create_file("tracing_max_latency", 0644, d_tracer,
-			&tr->max_latency, &tracing_max_lat_fops);
+	trace_create_maxlat_file(tr, d_tracer);
 #endif
 
 	if (ftrace_create_function_files(tr, d_tracer))
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 005f08629b8b..d9f83b2aaa71 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -16,6 +16,7 @@
 #include <linux/trace_events.h>
 #include <linux/compiler.h>
 #include <linux/glob.h>
+#include <linux/workqueue.h>
 
 #ifdef CONFIG_FTRACE_SYSCALLS
 #include <asm/unistd.h>		/* For NR_SYSCALLS */
@@ -264,6 +265,12 @@ struct trace_array {
 #endif
 #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
 	unsigned long		max_latency;
+#ifdef CONFIG_FSNOTIFY
+	struct dentry		*d_max_latency;
+	struct work_struct	fsnotify_work;
+	atomic_t		notify_pending;
+	struct llist_node	notify_ll;
+#endif
 #endif
 	struct trace_pid_list	__rcu *filtered_pids;
 	/*
@@ -785,6 +792,21 @@ void update_max_tr_single(struct trace_array *tr,
 			  struct task_struct *tsk, int cpu);
 #endif /* CONFIG_TRACER_MAX_TRACE */
 
+#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
+	defined(CONFIG_FSNOTIFY)
+
+void latency_fsnotify(struct trace_array *tr);
+void latency_fsnotify_start(void);
+void latency_fsnotify_stop(void);
+
+#else
+
+#define latency_fsnotify(tr) do { } while (0)
+#define latency_fsnotify_start() do { } while (0)
+#define latency_fsnotify_stop() do { } while (0)
+
+#endif
+
 #ifdef CONFIG_STACKTRACE
 void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
 		   int pc);
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index fa95139445b2..9c379261ee89 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -254,8 +254,10 @@ static int get_sample(void)
 		trace_hwlat_sample(&s);
 
 		/* Keep a running maximum ever recorded hardware latency */
-		if (sample > tr->max_latency)
+		if (sample > tr->max_latency) {
 			tr->max_latency = sample;
+			latency_fsnotify(tr);
+		}
 	}
 
 out:
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index a745b0cee5d3..29403a83a5f0 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -557,6 +557,7 @@ static int __irqsoff_tracer_init(struct trace_array *tr)
 	if (irqsoff_busy)
 		return -EBUSY;
 
+	latency_fsnotify_start();
 	save_flags = tr->trace_flags;
 
 	/* non overwrite screws up the latency tracers */
@@ -591,16 +592,19 @@ static void __irqsoff_tracer_reset(struct trace_array *tr)
 
 	ftrace_reset_array_ops(tr);
 	irqsoff_busy = false;
+	latency_fsnotify_stop();
 }
 
 static void irqsoff_tracer_start(struct trace_array *tr)
 {
+	latency_fsnotify_start();
 	tracer_enabled = 1;
 }
 
 static void irqsoff_tracer_stop(struct trace_array *tr)
 {
 	tracer_enabled = 0;
+	latency_fsnotify_stop();
 }
 
 #ifdef CONFIG_IRQSOFF_TRACER
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 743b2b520d34..3dc90d9f605b 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -669,6 +669,7 @@ static bool wakeup_busy;
 
 static int __wakeup_tracer_init(struct trace_array *tr)
 {
+	latency_fsnotify_start();
 	save_flags = tr->trace_flags;
 
 	/* non overwrite screws up the latency tracers */
@@ -727,10 +728,12 @@ static void wakeup_tracer_reset(struct trace_array *tr)
 	set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
 	ftrace_reset_array_ops(tr);
 	wakeup_busy = false;
+	latency_fsnotify_stop();
 }
 
 static void wakeup_tracer_start(struct trace_array *tr)
 {
+	latency_fsnotify_start();
 	wakeup_reset(tr);
 	tracer_enabled = 1;
 }
 
@@ -738,6 +741,7 @@ static void wakeup_tracer_start(struct trace_array *tr)
 static void wakeup_tracer_stop(struct trace_array *tr)
 {
 	tracer_enabled = 0;
+	latency_fsnotify_stop();
}
 
 static struct tracer wakeup_tracer __read_mostly =
-- 
2.17.1
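P.S. As mentioned in the commit message, a rough sketch of the kind of
user space consumer this enables. This is a hypothetical illustration,
not part of the patch: it assumes debugfs is mounted at
/sys/kernel/debug, the 100 us threshold is an arbitrary example value,
resetting tracing_max_latency to re-arm is illustrative, and all error
handling is elided.

/* Hypothetical threshold-tracing consumer; not part of this patch. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/inotify.h>
#include <unistd.h>

#define TRACEFS "/sys/kernel/debug/tracing/"

static void write_str(const char *file, const char *val)
{
	int fd = open(file, O_WRONLY);

	if (fd >= 0) {
		write(fd, val, strlen(val));
		close(fd);
	}
}

int main(void)
{
	char buf[4096];
	ssize_t n;
	int tfd, ifd = inotify_init();

	/* Arm threshold tracing; 100 us is an arbitrary example value */
	write_str(TRACEFS "tracing_thresh", "100");
	write_str(TRACEFS "current_tracer", "preemptirqsoff");
	inotify_add_watch(ifd, TRACEFS "tracing_max_latency", IN_MODIFY);

	while (read(ifd, buf, sizeof(buf)) > 0) {	/* wait for a latency */
		tfd = open(TRACEFS "trace", O_RDONLY);	/* save the trace */
		while ((n = read(tfd, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, n, stdout);
		close(tfd);
		write_str(TRACEFS "tracing_max_latency", "0");	/* re-arm */
	}
	return 0;
}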