We have a global offloading state and make the offloading decision based on the printing task pointer and a timestamp. If we keep seeing the same task printing for too long (longer than `atomic_print_limit' seconds), we request offloading. Conversely, when we see that printing is now performed by another task, we reset the timestamp counter.
This, however, will not work in the following case: =============================================================================== CPU0 CPU1 //taskA //taskB preempt_disable() preempt_disable() printk() console_trylock() console_unlock() printing_task = taskA up() printk() console_trylock() console_unlock() printing_task = taskB ^^^ reset offloading control up() printk() console_trylock() console_unlock() printing_task = taskA ^^^ reset offloading control up() printk() console_trylock() console_unlock() printing_task = taskB ^^^ reset offloading control up() /* * X seconds later */ printk() console_trylock() console_unlock() printing_task = taskA ^^^ reset offloading control up() printk() console_trylock() console_unlock() printing_task = taskB ^^^ reset offloading control up() lockup! lockup! =============================================================================== So this printk ping-pong confuses our offloading control logic. Move it to per-CPU area and have a separate offloading control on every CPU. Signed-off-by: Sergey Senozhatsky <sergey.senozhat...@gmail.com> --- kernel/printk/printk.c | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ba82152ce5d9..f9799616e9fc 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -558,13 +558,14 @@ static inline void adj_atomic_print_limit(void) #endif } -static inline unsigned long emergency_timeout(unsigned long ts) +static inline unsigned long emergency_timeout(unsigned long now, + unsigned long ts) { #ifdef CONFIG_LOCKUP_DETECTOR if (watchdog_thresh) - return ts + 2 * watchdog_thresh; + return time_after_eq(now, ts + 2 * watchdog_thresh); #endif - return ts + 10 * atomic_print_limit; + return time_after_eq(now, ts + 10 * atomic_print_limit); } /* @@ -574,13 +575,13 @@ static inline unsigned long emergency_timeout(unsigned long ts) * amount of time a process can print from console_unlock(). 
* * This function must be called from 'printk_safe' context under - * console_sem lock. + * console_sem lock with preemption disabled. */ static inline bool console_offload_printing(void) { - static struct task_struct *printing_task; - static unsigned long printing_start_ts; - static unsigned long saved_csw; + static DEFINE_PER_CPU(struct task_struct, *printing_task); + static DEFINE_PER_CPU(unsigned long, printing_start_ts); + static DEFINE_PER_CPU(unsigned long, saved_csw); unsigned long now = local_clock() >> 30LL; /* seconds */ if (printk_kthread_should_stop()) @@ -600,16 +601,17 @@ static inline bool console_offload_printing(void) goto offload; /* A new task - reset the counters. */ - if (printing_task != current) { - printing_start_ts = local_clock() >> 30LL; - saved_csw = current->nvcsw + current->nivcsw; - printing_task = current; + if (this_cpu_read(printing_task) != current) { + this_cpu_write(printing_start_ts, local_clock() >> 30LL); + this_cpu_write(saved_csw, current->nvcsw + current->nivcsw); + this_cpu_write(printing_task, current); return false; } adj_atomic_print_limit(); - if (!time_after_eq(now, printing_start_ts + atomic_print_limit)) + if (!time_after_eq(now, this_cpu_read(printing_start_ts) + + atomic_print_limit)) return false; if (current == printk_kthread) { @@ -626,7 +628,7 @@ static inline bool console_offload_printing(void) * back to console_unlock(), it will have another full * `atomic_print_limit' time slice. */ - printing_start_ts = local_clock() >> 30LL; + this_cpu_write(printing_start_ts, local_clock() >> 30LL); return true; } @@ -634,10 +636,12 @@ static inline bool console_offload_printing(void) * A trivial emergency enforcement - give up on printk_kthread if * we can't wake it up. 
*/ - if (time_after_eq(now, emergency_timeout(printing_start_ts)) && - saved_csw == (current->nvcsw + current->nivcsw)) { + if (this_cpu_read(saved_csw) == (current->nvcsw + current->nivcsw) + && emergency_timeout(now, this_cpu_read(printing_start_ts))) { + printk_enforce_emergency = true; - pr_crit("Declaring printk emergency mode.\n"); + pr_crit("CPU%d declared a printk emergency mode.\n", + smp_processor_id()); return true; } -- 2.14.1