Current idle_worker_rebind() has a bug: it can deadlock against a later
rebind_workers() call.

The race:

An idle worker enters idle_worker_rebind(); the wait_event(gcwq->rebind_hold)
condition tests %WORKER_REBIND, fails, and the worker sleeps.

The first CPU_ONLINE's rebind_workers() is woken up by the idles' ACK,
finishes its remaining work (clearing %WORKER_REBIND and waking the idle
workers via rebind_hold) and is gone.  Our worker has been woken up, but
it has not been scheduled yet; it is still sleeping.

The CPU goes offline again, then comes online again.  The CPU_ONLINE
notification calls rebind_workers() again; call this the second
CPU_ONLINE.  It sets %WORKER_REBIND on the idle workers and then waits
for their ACK.

Our worker is finally scheduled.  It sees that %WORKER_REBIND is not
cleared, so it goes back to sleep, waiting for (another) rebind_workers()
to wake it up.  Meanwhile the second CPU_ONLINE keeps waiting for the
idles' ACK.  The two threads wait for each other: this is the bug.

So this patch adds an "all_done" completion: rebind_workers() cannot
return until every idle_worker_rebind() has successfully finished its
"wait until all other idle workers are done" step, so that wait can no
longer deadlock the way the old code did.

The "wait until all other idle workers are done" code in
idle_worker_rebind() is changed as well: instead of waiting on
gcwq->rebind_hold for %WORKER_REBIND to be cleared, it now waits on
"worker_rebind.idle_done".  With the help of "all_done", the on-stack
"worker_rebind" is guaranteed to still be valid while the workers wait
on "worker_rebind.idle_done".

busy_worker_rebind_fn() also explicitly waits until all idle workers are
done.  This adds a small overhead on a cold path, but it turns
rebind_workers() into a single pass.  Clean code/readability wins.

"all_cnt" is decremented without any lock: the decrement only happens on
the bound CPU, so no lock is needed (and the bound CPU cannot go away
until rebind_workers() is notified through "all_done").

Signed-off-by: Lai Jiangshan <la...@cn.fujitsu.com>
---
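A rough user-space sketch of the idle_done/all_done handshake, for
illustration only: the struct and field names mirror the patch, but the
pthread-based "completion", fake_gcwq_lock and the worker functions are
stand-ins, not kernel code.  In particular, the sketch keeps a lock
around the all_cnt decrement, which the patch avoids because the
decrement is serialized on the bound CPU.

#include <pthread.h>

/* Minimal stand-in for the kernel's struct completion. */
struct completion {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int done;
};

static void init_completion(struct completion *c)
{
	pthread_mutex_init(&c->lock, NULL);
	pthread_cond_init(&c->cond, NULL);
	c->done = 0;
}

static void complete_all(struct completion *c)	/* releases every waiter */
{
	pthread_mutex_lock(&c->lock);
	c->done = 1;
	pthread_cond_broadcast(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

/* Mirrors the on-stack struct worker_rebind in the patch. */
struct worker_rebind {
	int idle_cnt;			/* # idle workers still to check in */
	struct completion idle_done;	/* all idle workers checked in */
	int all_cnt;			/* # workers (idle + busy) still to finish */
	struct completion all_done;	/* no worker references this struct anymore */
};

/*
 * Stand-in for gcwq->lock.  The patch drops the lock for the all_cnt
 * decrement (same bound CPU); this user-space analogue cannot, so the
 * mutex is reused for it.
 */
static pthread_mutex_t fake_gcwq_lock = PTHREAD_MUTEX_INITIALIZER;

static void ack_all(struct worker_rebind *wr)	/* last worker releases "rebind_workers" */
{
	pthread_mutex_lock(&fake_gcwq_lock);
	if (!--wr->all_cnt)
		complete_all(&wr->all_done);
	pthread_mutex_unlock(&fake_gcwq_lock);
}

static void *idle_worker(void *arg)		/* ~ idle_worker_rebind() */
{
	struct worker_rebind *wr = arg;

	pthread_mutex_lock(&fake_gcwq_lock);
	if (!--wr->idle_cnt)			/* last idle releases everyone */
		complete_all(&wr->idle_done);
	pthread_mutex_unlock(&fake_gcwq_lock);

	wait_for_completion(&wr->idle_done);	/* wait for the other idles */
	ack_all(wr);
	return NULL;
}

static void *busy_worker(void *arg)		/* ~ busy_worker_rebind_fn() */
{
	struct worker_rebind *wr = arg;

	wait_for_completion(&wr->idle_done);	/* idles first, then busy ones */
	ack_all(wr);
	return NULL;
}

int main(void)					/* ~ rebind_workers() */
{
	enum { NIDLE = 3, NBUSY = 2, NALL = NIDLE + NBUSY };
	struct worker_rebind wr;		/* lives on this stack frame only */
	pthread_t tid[NALL];
	int i;

	wr.idle_cnt = NIDLE;
	wr.all_cnt = NALL;
	init_completion(&wr.idle_done);
	init_completion(&wr.all_done);

	for (i = 0; i < NALL; i++)
		pthread_create(&tid[i], NULL,
			       i < NIDLE ? idle_worker : busy_worker, &wr);

	/* must not return while any worker still references &wr */
	wait_for_completion(&wr.all_done);

	for (i = 0; i < NALL; i++)
		pthread_join(tid[i], NULL);
	return 0;
}

The point is the lifetime rule the commit message relies on: the
rebind_workers() side keeps worker_rebind on its own stack and must not
return before all_done fires, so the workers' wait on idle_done can never
outlive the structure or deadlock against a later rebind.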
 kernel/workqueue.c |  108 ++++++++++++++++++++++++++-------------------------
 1 files changed, 55 insertions(+), 53 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 692d976..5f63883 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -125,7 +125,7 @@ enum {
 
 struct global_cwq;
 struct worker_pool;
-struct idle_rebind;
+struct worker_rebind;
 
 /*
  * The poor guys doing the actual heavy lifting.  All on-duty workers
@@ -149,7 +149,7 @@ struct worker {
 	int			id;		/* I: worker id */
 
 	/* for rebinding worker to CPU */
-	struct idle_rebind	*idle_rebind;	/* L: for idle worker */
+	struct worker_rebind	*worker_rebind;	/* L: for all worker */
 	struct work_struct	rebind_work;	/* L: for busy worker */
 };
 
@@ -1304,9 +1304,17 @@ __acquires(&gcwq->lock)
 	}
 }
 
-struct idle_rebind {
-	int			cnt;		/* # workers to be rebound */
-	struct completion	done;		/* all workers rebound */
+struct worker_rebind {
+	int			idle_cnt;	/* # idle workers to be rebound */
+	struct completion	idle_done;	/* all idle workers rebound */
+
+	/*
+	 * notify the rebind_workers() that:
+	 * 1. All workers are rebound.
+	 * 2. No worker has ref to this struct
+	 */
+	int			all_cnt;	/* # all workers to be rebound */
+	struct completion	all_done;	/* all workers rebound */
 };
 
 /*
@@ -1316,33 +1324,42 @@ struct idle_rebind {
  */
 static void idle_worker_rebind(struct worker *worker)
 {
-	struct global_cwq *gcwq = worker->pool->gcwq;
-
 	/* CPU must be online at this point */
 	WARN_ON(!worker_maybe_bind_and_lock(worker));
-	if (!--worker->idle_rebind->cnt)
-		complete(&worker->idle_rebind->done);
+	worker_clr_flags(worker, WORKER_REBIND);
+	if (!--worker->worker_rebind->idle_cnt)
+		complete_all(&worker->worker_rebind->idle_done);
 	spin_unlock_irq(&worker->pool->gcwq->lock);
 
-	/* we did our part, wait for rebind_workers() to finish up */
-	wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND));
+	/* It did its part, wait for all other idle to finish up */
+	wait_for_completion(&worker->worker_rebind->idle_done);
+
+	/* all_cnt is only accessed by the bound CPU, don't need any lock */
+	if (!--worker->worker_rebind->all_cnt)
+		complete(&worker->worker_rebind->all_done);
 }
 
 /*
  * Function for @worker->rebind.work used to rebind unbound busy workers to
  * the associated cpu which is coming back online.  This is scheduled by
- * cpu up but can race with other cpu hotplug operations and may be
- * executed twice without intervening cpu down.
+ * cpu up.
  */
 static void busy_worker_rebind_fn(struct work_struct *work)
 {
 	struct worker *worker = container_of(work, struct worker, rebind_work);
 	struct global_cwq *gcwq = worker->pool->gcwq;
 
-	if (worker_maybe_bind_and_lock(worker))
-		worker_clr_flags(worker, WORKER_REBIND);
+	/* Wait for all idle to finish up */
+	wait_for_completion(&worker->worker_rebind->idle_done);
 
+	/* CPU must be online at this point */
+	WARN_ON(!worker_maybe_bind_and_lock(worker));
+	worker_clr_flags(worker, WORKER_REBIND);
 	spin_unlock_irq(&gcwq->lock);
+
+	/* all_cnt is only accessed by the bound CPU, don't need any lock */
+	if (!--worker->worker_rebind->all_cnt)
+		complete(&worker->worker_rebind->all_done);
 }
 
 /**
@@ -1366,13 +1383,13 @@ static void busy_worker_rebind_fn(struct work_struct *work)
  * the head of their scheduled lists is enough.  Note that nr_running will
  * be properbly bumped as busy workers rebind.
  *
- * On return, all workers are guaranteed to either be bound or have rebind
- * work item scheduled.
+ * On return, all workers are guaranteed to be bound.
  */
 static void rebind_workers(struct global_cwq *gcwq)
 	__releases(&gcwq->lock) __acquires(&gcwq->lock)
 {
-	struct idle_rebind idle_rebind;
+	int idle_cnt = 0, all_cnt = 0;
+	struct worker_rebind worker_rebind;
 	struct worker_pool *pool;
 	struct worker *worker;
 	struct hlist_node *pos;
@@ -1383,54 +1400,22 @@ static void rebind_workers(struct global_cwq *gcwq)
 	for_each_worker_pool(pool, gcwq)
 		lockdep_assert_held(&pool->manager_mutex);
 
-	/*
-	 * Rebind idle workers.  Interlocked both ways.  We wait for
-	 * workers to rebind via @idle_rebind.done.  Workers will wait for
-	 * us to finish up by watching %WORKER_REBIND.
-	 */
-	init_completion(&idle_rebind.done);
-retry:
-	idle_rebind.cnt = 1;
-	INIT_COMPLETION(idle_rebind.done);
-
 	/* set REBIND and kick idle ones, we'll wait for these later */
 	for_each_worker_pool(pool, gcwq) {
 		list_for_each_entry(worker, &pool->idle_list, entry) {
-			if (worker->flags & WORKER_REBIND)
-				continue;
-
 			/* morph UNBOUND to REBIND */
 			worker->flags &= ~WORKER_UNBOUND;
 			worker->flags |= WORKER_REBIND;
 
-			idle_rebind.cnt++;
-			worker->idle_rebind = &idle_rebind;
+			idle_cnt++;
+			all_cnt++;
+			worker->worker_rebind = &worker_rebind;
 
 			/* worker_thread() will call idle_worker_rebind() */
 			wake_up_process(worker->task);
 		}
 	}
 
-	if (--idle_rebind.cnt) {
-		spin_unlock_irq(&gcwq->lock);
-		wait_for_completion(&idle_rebind.done);
-		spin_lock_irq(&gcwq->lock);
-		/* busy ones might have become idle while waiting, retry */
-		goto retry;
-	}
-
-	/*
-	 * All idle workers are rebound and waiting for %WORKER_REBIND to
-	 * be cleared inside idle_worker_rebind().  Clear and release.
-	 * Clearing %WORKER_REBIND from this foreign context is safe
-	 * because these workers are still guaranteed to be idle.
-	 */
-	for_each_worker_pool(pool, gcwq)
-		list_for_each_entry(worker, &pool->idle_list, entry)
-			worker->flags &= ~WORKER_REBIND;
-
-	wake_up_all(&gcwq->rebind_hold);
-
 	/* rebind busy workers */
 	for_each_busy_worker(worker, i, pos, gcwq) {
 		struct work_struct *rebind_work = &worker->rebind_work;
@@ -1443,12 +1428,29 @@ retry:
 				     work_data_bits(rebind_work)))
 			continue;
 
+		all_cnt++;
+		worker->worker_rebind = &worker_rebind;
+
 		/* wq doesn't matter, use the default one */
 		debug_work_activate(rebind_work);
 		insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
 			    worker->scheduled.next,
 			    work_color_to_flags(WORK_NO_COLOR));
 	}
+
+	if (all_cnt) {
+		worker_rebind.idle_cnt = idle_cnt;
+		init_completion(&worker_rebind.idle_done);
+		worker_rebind.all_cnt = all_cnt;
+		init_completion(&worker_rebind.all_done);
+
+		if (!idle_cnt)
+			complete_all(&worker_rebind.idle_done);
+
+		spin_unlock_irq(&gcwq->lock);
+		wait_for_completion(&worker_rebind.all_done);
+		spin_lock_irq(&gcwq->lock);
+	}
 }
 
 static struct worker *alloc_worker(void)
-- 
1.7.4.4