Workqueue stalls can happen from a variety of usage bugs such as a missing WQ_MEM_RECLAIM flag or a concurrency-managed work item indefinitely staying RUNNING. These stalls can be extremely difficult to hunt down because the usual warning mechanisms can't detect workqueue stalls and the internal state is pretty opaque.
To alleviate the situation, this patch implements a workqueue lockup detector. It monitors all worker_pools periodically and, if any pool fails to make forward progress for longer than the threshold duration, triggers a warning and dumps workqueue state as follows. BUG: workqueue lockup - pool cpus=0 node=0 flags=0x0 nice=0 stuck for 31s! Showing busy workqueues and worker pools: workqueue events: flags=0x0 pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=17/256 pending: monkey_wrench_fn, e1000_watchdog, cache_reap, vmstat_shepherd, release_one_tty, release_one_tty, release_one_tty, release_one_tty, release_one_tty, release_one_tty, release_one_tty, release_one_tty, release_one_tty, release_one_tty, release_one_tty, release_one_tty, cgroup_release_agent workqueue events_power_efficient: flags=0x80 pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=2/256 pending: check_lifetime, neigh_periodic_work workqueue cgroup_pidlist_destroy: flags=0x0 pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/1 pending: cgroup_pidlist_destroy_work_fn ... The detection mechanism is enabled/disabled together with the scheduler softlockup watchdog and uses the same threshold value; however, it currently doesn't trigger a panic. We can do that later once the detection mechanism proves to be reliable. 
Signed-off-by: Tejun Heo <t...@kernel.org> Cc: Ulrich Obergfell <uober...@redhat.com> Cc: Michal Hocko <mho...@suse.com> Cc: Chris Mason <c...@fb.com> Cc: Andrew Morton <a...@linux-foundation.org> --- include/linux/sched.h | 1 include/linux/workqueue.h | 6 + kernel/watchdog.c | 15 +++- kernel/workqueue.c | 146 +++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 162 insertions(+), 6 deletions(-) --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -381,6 +381,7 @@ extern void touch_softlockup_watchdog_sc extern void touch_softlockup_watchdog(void); extern void touch_softlockup_watchdog_sync(void); extern void touch_all_softlockup_watchdogs(void); +extern int get_softlockup_thresh(void); extern int proc_dowatchdog_thresh(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -618,4 +618,10 @@ static inline int workqueue_sysfs_regist { return 0; } #endif /* CONFIG_SYSFS */ +#ifdef CONFIG_LOCKUP_DETECTOR +void enable_workqueue_watchdog(void); +void disable_workqueue_watchdog(void); +void touch_workqueue_watchdog(int cpu); +#endif /* CONFIG_LOCKUP_DETECTOR */ + #endif --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -20,6 +20,7 @@ #include <linux/smpboot.h> #include <linux/sched/rt.h> #include <linux/tick.h> +#include <linux/workqueue.h> #include <asm/irq_regs.h> #include <linux/kvm_para.h> @@ -192,7 +193,7 @@ __setup("hardlockup_all_cpu_backtrace=", * the thresholds with a factor: we make the soft threshold twice the amount of * time the hard threshold is. 
*/ -static int get_softlockup_thresh(void) +int get_softlockup_thresh(void) { return watchdog_thresh * 2; } @@ -245,6 +246,7 @@ void touch_softlockup_watchdog_sched(voi void touch_softlockup_watchdog(void) { touch_softlockup_watchdog_sched(); + touch_workqueue_watchdog(raw_smp_processor_id()); } EXPORT_SYMBOL(touch_softlockup_watchdog); @@ -259,6 +261,7 @@ void touch_all_softlockup_watchdogs(void */ for_each_watchdog_cpu(cpu) per_cpu(watchdog_touch_ts, cpu) = 0; + touch_workqueue_watchdog(-1); } #ifdef CONFIG_HARDLOCKUP_DETECTOR @@ -795,13 +798,18 @@ static int watchdog_enable_all_cpus(void { int err = 0; + disable_workqueue_watchdog(); + if (!watchdog_running) { err = smpboot_register_percpu_thread_cpumask(&watchdog_threads, &watchdog_cpumask); - if (err) + if (err) { pr_err("Failed to create watchdog threads, disabled\n"); - else + } else { + if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) + enable_workqueue_watchdog(); watchdog_running = 1; + } } else { /* * Enable/disable the lockup detectors or @@ -826,6 +834,7 @@ static void watchdog_disable_all_cpus(vo if (watchdog_running) { watchdog_running = 0; smpboot_unregister_percpu_thread(&watchdog_threads); + disable_workqueue_watchdog(); } } --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -148,6 +148,8 @@ struct worker_pool { int id; /* I: pool ID */ unsigned int flags; /* X: flags */ + unsigned long watchdog_ts; /* L: watchdog timestamp */ + struct list_head worklist; /* L: list of pending works */ int nr_workers; /* L: total number of workers */ @@ -1083,6 +1085,8 @@ static void pwq_activate_delayed_work(st struct pool_workqueue *pwq = get_work_pwq(work); trace_workqueue_activate_work(work); + if (list_empty(&pwq->pool->worklist)) + pwq->pool->watchdog_ts = jiffies; move_linked_works(work, &pwq->pool->worklist, NULL); __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); pwq->nr_active++; @@ -1385,6 +1389,8 @@ retry: trace_workqueue_activate_work(work); pwq->nr_active++; worklist = &pwq->pool->worklist; 
+ if (list_empty(worklist)) + pwq->pool->watchdog_ts = jiffies; } else { work_flags |= WORK_STRUCT_DELAYED; worklist = &pwq->delayed_works; @@ -2157,6 +2163,8 @@ recheck: list_first_entry(&pool->worklist, struct work_struct, entry); + pool->watchdog_ts = jiffies; + if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { /* optimization path, not strictly necessary */ process_one_work(worker, work); @@ -2240,6 +2248,7 @@ repeat: struct pool_workqueue, mayday_node); struct worker_pool *pool = pwq->pool; struct work_struct *work, *n; + bool first = true; __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); @@ -2256,9 +2265,14 @@ repeat: * process'em. */ WARN_ON_ONCE(!list_empty(scheduled)); - list_for_each_entry_safe(work, n, &pool->worklist, entry) - if (get_work_pwq(work) == pwq) + list_for_each_entry_safe(work, n, &pool->worklist, entry) { + if (get_work_pwq(work) == pwq) { + if (first) + pool->watchdog_ts = jiffies; move_linked_works(work, scheduled, &n); + } + first = false; + } if (!list_empty(scheduled)) { process_scheduled_works(rescuer); @@ -3069,6 +3083,7 @@ static int init_worker_pool(struct worke pool->cpu = -1; pool->node = NUMA_NO_NODE; pool->flags |= POOL_DISASSOCIATED; + pool->watchdog_ts = jiffies; INIT_LIST_HEAD(&pool->worklist); INIT_LIST_HEAD(&pool->idle_list); hash_init(pool->busy_hash); @@ -4308,7 +4323,9 @@ void show_workqueue_state(void) pr_info("pool %d:", pool->id); pr_cont_pool_info(pool); - pr_cont(" workers=%d", pool->nr_workers); + pr_cont(" hung=%us workers=%d", + jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000, + pool->nr_workers); if (pool->manager) pr_cont(" manager: %d", task_pid_nr(pool->manager->task)); @@ -5167,6 +5184,129 @@ static void workqueue_sysfs_unregister(s static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } #endif /* CONFIG_SYSFS */ +/* + * Workqueue watchdog. 
+ * + * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal + * flush dependency, a concurrency managed work item which stays RUNNING + * indefinitely. Workqueue stalls can be very difficult to debug as the + * usual warning mechanisms don't trigger and internal workqueue state is + * largely opaque. + * + * Workqueue watchdog monitors all worker pools periodically and dumps + * state if some pools failed to make forward progress for a while where + * forward progress is defined as the first item on ->worklist changing. + * + * The mechanism is enabled and disabled together with softlockup watchdog + * and uses the same threshold duration; however, it currently doesn't + * cause panic even if softlockup_panic is set. Also, workqueue watchdog + * assumes that the usual jiffies and timer mechanisms are working as there + * isn't much point in warning about workqueue stalls when timer is broken. + */ +#ifdef CONFIG_LOCKUP_DETECTOR + +static void wq_watchdog_timer_fn(unsigned long data); + +static unsigned long wq_watchdog_thresh; +static struct timer_list wq_watchdog_timer = + TIMER_DEFERRED_INITIALIZER(wq_watchdog_timer_fn, 0, 0); + +static unsigned long wq_watchdog_touched = INITIAL_JIFFIES; +static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES; + +static void wq_watchdog_reset_touched(void) +{ + int cpu; + + wq_watchdog_touched = jiffies; + for_each_possible_cpu(cpu) + per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; +} + +static void wq_watchdog_timer_fn(unsigned long data) +{ + unsigned long thresh = wq_watchdog_thresh; + bool lockup_detected = false; + struct worker_pool *pool; + int pi; + + if (!thresh) + return; + + rcu_read_lock(); + + for_each_pool(pool, pi) { + unsigned long pool_ts, touched, ts; + + if (list_empty(&pool->worklist)) + continue; + + /* get the latest of pool and touched timestamps */ + pool_ts = READ_ONCE(pool->watchdog_ts); + touched = READ_ONCE(wq_watchdog_touched); + + if (time_after(pool_ts, 
touched)) + ts = pool_ts; + else + ts = touched; + + if (pool->cpu >= 0) { + unsigned long cpu_touched = + READ_ONCE(per_cpu(wq_watchdog_touched_cpu, + pool->cpu)); + if (time_after(cpu_touched, ts)) + ts = cpu_touched; + } + + /* did we stall? */ + if (time_after(jiffies, ts + thresh)) { + lockup_detected = true; + pr_emerg("BUG: workqueue lockup - pool"); + pr_cont_pool_info(pool); + pr_cont(" stuck for %us!\n", + jiffies_to_msecs(jiffies - pool_ts) / 1000); + } + } + + rcu_read_unlock(); + + if (lockup_detected) + show_workqueue_state(); + + wq_watchdog_reset_touched(); + mod_timer(&wq_watchdog_timer, jiffies + thresh); +} + +void enable_workqueue_watchdog(void) +{ + wq_watchdog_thresh = get_softlockup_thresh() * HZ; + + wq_watchdog_reset_touched(); + mod_timer(&wq_watchdog_timer, jiffies + wq_watchdog_thresh); +} + +void disable_workqueue_watchdog(void) +{ + wq_watchdog_thresh = 0; + del_timer_sync(&wq_watchdog_timer); +} + +void touch_workqueue_watchdog(int cpu) +{ + /* + * If not explicitly touched, these stamps are never updated, which + * means that they may affect stall detection if jiffies wraps; + * however, it's highly unlikely and the worst that can happen is + * delaying stall detection by up to one threshold duration. + */ + if (cpu >= 0) + per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; + else + wq_watchdog_touched = jiffies; +} + +#endif /* CONFIG_LOCKUP_DETECTOR */ + static void __init wq_numa_init(void) { cpumask_var_t *tbl; -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/