If a work item running on a normal (bound, non-CPU-intensive) workqueue
calls cond_resched(), this allows other non-workqueue processes to run,
but does *not* allow other workqueue workers to run.  This is because
workqueue concurrency management will only run one worker at a time on
each CPU, unless the current worker actually sleeps.
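
To make the failure mode concrete, here is a minimal sketch (the
function name and loop are hypothetical, not taken from the kernel) of
the kind of work item that is affected:

    /* Hypothetical example: a bound, non-CPU-intensive work item that
     * politely calls cond_resched() but never sleeps.
     */
    static void long_scan_work_fn(struct work_struct *work)
    {
            int i;

            for (i = 0; i < (1 << 27); i++) {
                    /* ... process one item ... */
                    cond_resched();  /* lets other tasks run, but other
                                      * workers in this pool stay blocked */
            }
    }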

This is already a problem for should_reclaim_retry() in mm/page_alloc.c,
which inserts a small sleep just to convince workqueue to allow other
workers to run.
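
For reference, the workaround in should_reclaim_retry() (removed by the
mm/page_alloc.c hunk below) is essentially:

    if (current->flags & PF_WQ_WORKER)
            schedule_timeout_uninterruptible(1);  /* a real sleep, so other
                                                   * workers can run */
    else
            cond_resched();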

It can also be a problem for NFS when closing a very large file (e.g.
100 million pages in memory).  NFS can call the final iput() from a
workqueue, which can then take long enough to trigger a workqueue-lockup
warning, and long enough for performance problems to be observed.

I added a WARN_ON_ONCE() in cond_resched() to report when it is called from
a workqueue, and got about 20 hits during boot, many on system_wq (aka
"events").  This suggests there is a real chance that workers are being
delayed unnecessarily by tasks which are written to politely relinquish
the CPU.
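
For anyone wanting to reproduce this, the debugging check was roughly
the following (a debug aid only, not part of this patch):

    /* Report cond_resched() calls made from a workqueue worker. */
    WARN_ON_ONCE(current->flags & PF_WQ_WORKER);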

I think that once a worker calls cond_resched(), it should be treated as
though it were running from a WQ_CPU_INTENSIVE workqueue, because only
CPU-intensive work items need to call cond_resched().  This would allow
other workers to be scheduled.

The following patch achieves this, I believe.

Signed-off-by: NeilBrown <ne...@suse.de>
---
 include/linux/sched.h     |  7 ++++++-
 include/linux/workqueue.h |  2 ++
 kernel/sched/core.c       |  4 ++++
 kernel/workqueue.c        | 16 +++++++++++++++-
 mm/page_alloc.c           | 12 +-----------
 5 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4418f5cb8324..728870965df1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1784,7 +1784,12 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
 #ifndef CONFIG_PREEMPTION
 extern int _cond_resched(void);
 #else
-static inline int _cond_resched(void) { return 0; }
+static inline int _cond_resched(void)
+{
+       if (current->flags & PF_WQ_WORKER)
+               workqueue_cond_resched();
+       return 0;
+}
 #endif
 
 #define cond_resched() ({                      \
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 8b505d22fc0e..7bcc5717d80f 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -626,6 +626,8 @@ static inline bool schedule_delayed_work(struct delayed_work *dwork,
        return queue_delayed_work(system_wq, dwork, delay);
 }
 
+void workqueue_cond_resched(void);
+
 #ifndef CONFIG_SMP
 static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
 {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9a2fbf98fd6f..5b2e38567a0c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5620,6 +5620,8 @@ SYSCALL_DEFINE0(sched_yield)
 #ifndef CONFIG_PREEMPTION
 int __sched _cond_resched(void)
 {
+       if (current->flags & PF_WQ_WORKER)
+               workqueue_cond_resched();
        if (should_resched(0)) {
                preempt_schedule_common();
                return 1;
@@ -5643,6 +5645,8 @@ int __cond_resched_lock(spinlock_t *lock)
        int resched = should_resched(PREEMPT_LOCK_OFFSET);
        int ret = 0;
 
+       if (current->flags & PF_WQ_WORKER)
+               workqueue_cond_resched();
        lockdep_assert_held(lock);
 
        if (spin_needbreak(lock) || resched) {
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 891ccad5f271..fd2e9557b1a6 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2296,7 +2296,7 @@ __acquires(&pool->lock)
        spin_lock_irq(&pool->lock);
 
        /* clear cpu intensive status */
-       if (unlikely(cpu_intensive))
+       if (unlikely(worker->flags & WORKER_CPU_INTENSIVE))
                worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
 
        /* tag the worker for identification in schedule() */
@@ -5330,6 +5330,20 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
        return ret;
 }
 
+void workqueue_cond_resched(void)
+{
+       struct worker *worker = current_wq_worker();
+
+       if (worker && !(worker->flags & WORKER_CPU_INTENSIVE)) {
+               struct worker_pool *pool = worker->pool;
+
+               worker_set_flags(worker, WORKER_CPU_INTENSIVE);
+               if (need_more_worker(pool))
+                       wake_up_worker(pool);
+       }
+}
+EXPORT_SYMBOL(workqueue_cond_resched);
+
 #ifdef CONFIG_SYSFS
 /*
  * Workqueues with WQ_SYSFS flag set is visible to userland via
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 13cc653122b7..0d5720ecbfe7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4420,17 +4420,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
        }
 
 out:
-       /*
-        * Memory allocation/reclaim might be called from a WQ context and the
-        * current implementation of the WQ concurrency control doesn't
-        * recognize that a particular WQ is congested if the worker thread is
-        * looping without ever sleeping. Therefore we have to do a short sleep
-        * here rather than calling cond_resched().
-        */
-       if (current->flags & PF_WQ_WORKER)
-               schedule_timeout_uninterruptible(1);
-       else
-               cond_resched();
+       cond_resched();
        return ret;
 }
 
-- 
2.29.2
