[PATCH 06/10 V2] workqueue: convert worker_idr to worker_ida

2014-05-11 Thread Lai Jiangshan
We no longer need to iterate workers via worker_idr; it is used only for
allocating and freeing IDs, so convert it to worker_ida.

By using ida_simple_get()/ida_simple_remove(), worker_ida is protected by
itself, so we don't need manager_mutex to protect it; that coupling is
removed, and we can move the ID removal out of worker_detach_from_pool().
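
For reference, a condensed sketch of the new ID handling (taken from the
hunks below; error paths abbreviated, and ida_simple_*() do their own
locking, so no manager_mutex is involved):

    /* create_worker(): the ID is only used to build the kthread name */
    id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);
    if (id < 0)
            goto fail;
    /* ... on the failure path: ida_simple_remove(&pool->worker_ida, id); */

    /* worker_thread(), on the WORKER_DIE path, before detaching */
    ida_simple_remove(&pool->worker_ida, worker->id);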

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |   20 
 1 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b6cf4d9..9f7f4ef 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -161,10 +161,11 @@ struct worker_pool {
/* see manage_workers() for details on the two manager mutexes */
struct mutexmanager_arb;/* manager arbitration */
struct mutexmanager_mutex;  /* manager exclusion */
-   struct idr  worker_idr; /* M: worker IDs */
struct list_headworkers;/* M: attached workers */
struct completion   *detach_completion; /* all workers detached */
 
+   struct ida  worker_ida; /* worker IDs for task name */
+
struct workqueue_attrs  *attrs; /* I: worker attributes */
struct hlist_node   hash_node;  /* PL: unbound_pool_hash node */
int refcnt; /* PL: refcnt for unbound pools */
@@ -1681,7 +1682,6 @@ static void worker_detach_from_pool(struct worker *worker,
struct completion *detach_completion = NULL;
 
mutex_lock(&pool->manager_mutex);
-   idr_remove(&pool->worker_idr, worker->id);
list_del(&worker->node);
if (list_empty(&pool->workers))
detach_completion = pool->detach_completion;
@@ -1712,11 +1712,8 @@ static struct worker *create_worker(struct worker_pool *pool)
 
lockdep_assert_held(&pool->manager_mutex);
 
-   /*
-* ID is needed to determine kthread name.  Allocate ID first
-* without installing the pointer.
-*/
-   id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_KERNEL);
+   /* ID is needed to determine kthread name. */
+   id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);
if (id < 0)
goto fail;
 
@@ -1757,8 +1754,6 @@ static struct worker *create_worker(struct worker_pool *pool)
if (pool->flags & POOL_DISASSOCIATED)
worker->flags |= WORKER_UNBOUND;
 
-   /* successful, commit the pointer to idr */
-   idr_replace(&pool->worker_idr, worker, worker->id);
/* successful, attach the worker to the pool */
list_add_tail(&worker->node, &pool->workers);
 
@@ -1766,7 +1761,7 @@ static struct worker *create_worker(struct worker_pool *pool)
 
 fail:
if (id >= 0)
-   idr_remove(&pool->worker_idr, id);
+   ida_simple_remove(&pool->worker_ida, id);
kfree(worker);
return NULL;
 }
@@ -2233,6 +2228,7 @@ woke_up:
worker->task->flags &= ~PF_WQ_WORKER;
 
set_task_comm(worker->task, "kworker_dying");
+   ida_simple_remove(&pool->worker_ida, worker->id);
worker_detach_from_pool(worker, pool);
kfree(worker);
return 0;
@@ -3469,9 +3465,9 @@ static int init_worker_pool(struct worker_pool *pool)
 
mutex_init(&pool->manager_arb);
mutex_init(&pool->manager_mutex);
-   idr_init(&pool->worker_idr);
INIT_LIST_HEAD(&pool->workers);
 
+   ida_init(&pool->worker_ida);
INIT_HLIST_NODE(&pool->hash_node);
pool->refcnt = 1;
 
@@ -3486,7 +3482,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
 {
struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
 
-   idr_destroy(&pool->worker_idr);
+   ida_destroy(&pool->worker_ida);
free_workqueue_attrs(pool->attrs);
kfree(pool);
 }
-- 
1.7.4.4



[PATCH 07/10 V2] workqueue: narrow the protection range of manager_mutex

2014-05-11 Thread Lai Jiangshan
In create_worker(), pool->worker_ida is protected by the ida subsystem
itself via ida_simple_get()/ida_simple_remove(), so it doesn't need
manager_mutex.

The struct worker allocation and the kthread creation are not visible to
anyone before the worker is attached, so they don't need manager_mutex
either.

These operations all happen before the attach step which attaches the
worker to the pool. And between attaching and starting the worker, the
worker is already attached to the pool, so CPU hotplug will handle its
cpu-binding correctly. Hence we don't need manager_mutex after the worker
is attached either.

The conclusion is that only the attach step needs manager_mutex, so we
narrow the protection range of manager_mutex in create_worker().

Some comments about manager_mutex are removed, since we will later rename
it to attach_mutex and add worker_attach_to_pool(), which is
self-documenting.
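
The resulting shape of create_worker() is roughly the following (a
condensed sketch showing only the lock placement; allocation, kthread
naming and error handling are omitted):

    static struct worker *create_worker(struct worker_pool *pool)
    {
            int id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);
            struct worker *worker = alloc_worker();  /* not yet visible to anyone */

            /* ... kthread_create_on_node(), PF_NO_SETAFFINITY ... */

            mutex_lock(&pool->manager_mutex);        /* covers only the attach step */
            set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
            if (pool->flags & POOL_DISASSOCIATED)    /* stable under manager_mutex */
                    worker->flags |= WORKER_UNBOUND;
            list_add_tail(&worker->node, &pool->workers);
            mutex_unlock(&pool->manager_mutex);

            return worker;
    }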

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |   35 +--
 1 files changed, 5 insertions(+), 30 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9f7f4ef..663de70 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1710,8 +1710,6 @@ static struct worker *create_worker(struct worker_pool *pool)
int id = -1;
char id_buf[16];
 
-   lockdep_assert_held(&pool->manager_mutex);
-
/* ID is needed to determine kthread name. */
id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);
if (id < 0)
@@ -1740,6 +1738,8 @@ static struct worker *create_worker(struct worker_pool *pool)
/* prevent userland from meddling with cpumask of workqueue workers */
worker->task->flags |= PF_NO_SETAFFINITY;
 
+   mutex_lock(&pool->manager_mutex);
+
/*
 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
 * online CPUs.  It'll be re-applied when any of the CPUs come up.
@@ -1747,7 +1747,7 @@ static struct worker *create_worker(struct worker_pool *pool)
set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
 
/*
-* The caller is responsible for ensuring %POOL_DISASSOCIATED
+* The pool->manager_mutex ensures %POOL_DISASSOCIATED
 * remains stable across this function.  See the comments above the
 * flag definition for details.
 */
@@ -1757,6 +1757,8 @@ static struct worker *create_worker(struct worker_pool *pool)
/* successful, attach the worker to the pool */
list_add_tail(&worker->node, &pool->workers);
 
+   mutex_unlock(&pool->manager_mutex);
+
return worker;
 
 fail:
@@ -1794,8 +1796,6 @@ static int create_and_start_worker(struct worker_pool *pool)
 {
struct worker *worker;
 
-   mutex_lock(&pool->manager_mutex);
-
worker = create_worker(pool);
if (worker) {
spin_lock_irq(&pool->lock);
@@ -1803,8 +1803,6 @@ static int create_and_start_worker(struct worker_pool *pool)
spin_unlock_irq(&pool->lock);
}
 
-   mutex_unlock(&pool->manager_mutex);
-
return worker ? 0 : -ENOMEM;
 }
 
@@ -2002,8 +2000,6 @@ static bool manage_workers(struct worker *worker)
bool ret = false;
 
/*
-* Managership is governed by two mutexes - manager_arb and
-* manager_mutex.  manager_arb handles arbitration of manager role.
 * Anyone who successfully grabs manager_arb wins the arbitration
 * and becomes the manager.  mutex_trylock() on pool->manager_arb
 * failure while holding pool->lock reliably indicates that someone
@@ -2012,33 +2008,12 @@ static bool manage_workers(struct worker *worker)
 * grabbing manager_arb is responsible for actually performing
 * manager duties.  If manager_arb is grabbed and released without
 * actual management, the pool may stall indefinitely.
-*
-* manager_mutex is used for exclusion of actual management
-* operations.  The holder of manager_mutex can be sure that none
-* of management operations, including creation and destruction of
-* workers, won't take place until the mutex is released.  Because
-* manager_mutex doesn't interfere with manager role arbitration,
-* it is guaranteed that the pool's management, while may be
-* delayed, won't be disturbed by someone else grabbing
-* manager_mutex.
 */
if (!mutex_trylock(&pool->manager_arb))
return ret;
 
-   /*
-* With manager arbitration won, manager_mutex would be free in
-* most cases.  trylock first without dropping @pool->lock.
-*/
-   if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
-   spin_unlock_irq(&pool->lock);
-   mutex_lock(&a

[PATCH 03/10 V2] workqueue: async worker destruction

2014-05-11 Thread Lai Jiangshan
Worker destruction consists of these steps:
    adjust the pool's stats
    remove the worker from the idle list
    detach the worker from the pool
    kthread_stop() to wait for the worker's task to exit
    free the worker struct

There is no essential work to do after kthread_stop(), which means
destroy_worker() doesn't need to wait for the worker's task to exit.
So we can remove kthread_stop() and free the worker struct in the
worker's exit path instead.

But put_unbound_pool() still needs to sync with all the workers'
destruction before destroying the pool; otherwise the exiting workers
may access the already-freed pool.

So we also move the "detach the worker" step to the exit path and let
put_unbound_pool() sync with it via detach_completion.

The "detach the worker" code is wrapped in a new function,
worker_detach_from_pool().

The worker ID is freed when detaching, which happens before the worker
is fully dead, so the ID of a dying worker may be reused for a new
worker. The dying worker's task name is therefore changed to
"worker_dying" to avoid several workers sharing the same name.

Since "detach the worker" is moved out of destroy_worker(),
destroy_worker() no longer requires manager_mutex. The
lockdep_assert_held(&pool->manager_mutex) in destroy_worker() is
removed, and destroy_worker() is no longer called under manager_mutex
in put_unbound_pool().
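
In outline, the resulting synchronization looks roughly like this (only a
sketch; the put_unbound_pool() side is not fully visible in the diff
below, so its exact form here is my reading of the description above):

    /* dying worker, at the end of worker_thread() after seeing WORKER_DIE */
    set_task_comm(worker->task, "kworker_dying");
    worker_detach_from_pool(worker, pool);  /* may complete() detach_completion */
    kfree(worker);
    return 0;

    /* pool teardown side, sketched */
    static void put_unbound_pool(struct worker_pool *pool)
    {
            DECLARE_COMPLETION_ONSTACK(detach_completion);
            ...
            /* destroy_worker() only wakes the workers up now */
            if (!idr_is_empty(&pool->worker_idr))
                    pool->detach_completion = &detach_completion;
            mutex_unlock(&pool->manager_mutex);

            if (pool->detach_completion)
                    wait_for_completion(pool->detach_completion);
            ...
    }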

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |   65 +--
 1 files changed, 42 insertions(+), 23 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 752e109..465e751 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -163,6 +163,7 @@ struct worker_pool {
struct mutexmanager_arb;/* manager arbitration */
struct mutexmanager_mutex;  /* manager exclusion */
struct idr  worker_idr; /* M: worker IDs and iteration */
+   struct completion   *detach_completion; /* all workers detached */
 
struct workqueue_attrs  *attrs; /* I: worker attributes */
struct hlist_node   hash_node;  /* PL: unbound_pool_hash node */
@@ -1688,6 +1689,30 @@ static struct worker *alloc_worker(void)
 }
 
 /**
+ * worker_detach_from_pool() - detach the worker from the pool
+ * @worker: worker which is attached to its pool
+ * @pool: attached pool
+ *
+ * Undo the attaching which had been done in create_worker().
+ * The caller worker shouldn't access to the pool after detached
+ * except it has other reference to the pool.
+ */
+static void worker_detach_from_pool(struct worker *worker,
+   struct worker_pool *pool)
+{
+   struct completion *detach_completion = NULL;
+
+   mutex_lock(&pool->manager_mutex);
+   idr_remove(&pool->worker_idr, worker->id);
+   if (idr_is_empty(&pool->worker_idr))
+   detach_completion = pool->detach_completion;
+   mutex_unlock(&pool->manager_mutex);
+
+   if (detach_completion)
+   complete(detach_completion);
+}
+
+/**
  * create_worker - create a new workqueue worker
  * @pool: pool the new worker will belong to
  *
@@ -1815,13 +1840,12 @@ static int create_and_start_worker(struct worker_pool *pool)
  * The worker should be idle.
  *
  * CONTEXT:
- * spin_lock_irq(pool->lock) which is released and regrabbed.
+ * spin_lock_irq(pool->lock).
  */
 static void destroy_worker(struct worker *worker)
 {
struct worker_pool *pool = worker->pool;
 
-   lockdep_assert_held(&pool->manager_mutex);
lockdep_assert_held(&pool->lock);
 
/* sanity check frenzy */
@@ -1833,24 +1857,9 @@ static void destroy_worker(struct worker *worker)
pool->nr_workers--;
pool->nr_idle--;
 
-   /*
-* Once WORKER_DIE is set, the kworker may destroy itself at any
-* point.  Pin to ensure the task stays until we're done with it.
-*/
-   get_task_struct(worker->task);
-
list_del_init(&worker->entry);
worker->flags |= WORKER_DIE;
-
-   idr_remove(&pool->worker_idr, worker->id);
-
-   spin_unlock_irq(&pool->lock);
-
-   kthread_stop(worker->task);
-   put_task_struct(worker->task);
-   kfree(worker);
-
-   spin_lock_irq(&pool->lock);
+   wake_up_process(worker->task);
 }
 
 static void idle_worker_timeout(unsigned long __pool)
@@ -2289,6 +2298,10 @@ woke_up:
spin_unlock_irq(&pool->lock);
WARN_ON_ONCE(!list_empty(&worker->entry));
worker->task->flags &= ~PF_WQ_WORKER;
+
+   set_task_comm(worker->task, "kworker_dying");
+

[PATCH 00/10 V2] workqueue: async worker destruction and worker attaching/detaching

2014-05-11 Thread Lai Jiangshan
Patch1-4: async worker destruction

Patch2 reduces the review burden. It will be easier to review the whole
patchset if we know destroy_worker() is forced to destroy idle workers only.

Patch5-10: worker attaching/detaching and simplified worker management

The code which attaches a worker to the pool and detaches a worker from
the pool is currently open-coded in create_worker()/destroy_worker().
The patchset moves this attaching/detaching code out and wraps it.

Patch3-4 move the detaching code out of destroy_worker(), so that
manager_mutex only protects the detaching code rather than the whole
worker-destruction path.

Patch5-7 make manager_mutex only protect the attaching code rather than
the whole worker-creation path.

Patch8: rename manager_mutex to attach_mutex
Patch9-10: move the attaching code out of create_worker() and use it for
the rescuer as well.


Lai Jiangshan (10):
  workqueue: use manager lock only to protect worker_idr
  workqueue: destroy_worker() should destroy idle workers only
  workqueue: async worker destruction
  workqueue: destroy worker directly in the idle timeout handler
  workqueue: separate iteration role from worker_idr
  workqueue: convert worker_idr to worker_ida
  workqueue: narrow the protection range of manager_mutex
  workqueue: rename manager_mutex to attach_mutex
  workqueue: separate pool-attaching code out from create_worker()
  workqueue: use generic attach/detach routine for rescuers

 kernel/workqueue.c  |  401 ++-
 kernel/workqueue_internal.h |1 +
 2 files changed, 126 insertions(+), 276 deletions(-)

-- 
1.7.4.4



Re: workqueue: WARN at at kernel/workqueue.c:2176

2014-05-12 Thread Lai Jiangshan
On 05/13/2014 04:01 AM, Tejun Heo wrote:
> On Mon, May 12, 2014 at 02:58:55PM -0400, Sasha Levin wrote:
>> Hi all,
>>
>> While fuzzing with trinity inside a KVM tools guest running the latest -next
>> kernel I've stumbled on the following spew:
>>
>> [ 1297.886670] WARNING: CPU: 0 PID: 190 at kernel/workqueue.c:2176 
>> process_one_work+0xb5/0x6f0()
>> [ 1297.889216] Modules linked in:
>> [ 1297.890306] CPU: 0 PID: 190 Comm: kworker/3:0 Not tainted 
>> 3.15.0-rc5-next-20140512-sasha-00019-ga20bc00-dirty #456
>> [ 1297.893258]  0009 88010c5d7ce8 b153e1ec 
>> 0002
>> [ 1297.893258]   88010c5d7d28 ae15fd6c 
>> 88010cdd6c98
>> [ 1297.893258]  8806285d4000 b3cd09e0 88010cdde000 
>> 
>> [ 1297.893258] Call Trace:
>> [ 1297.893258] dump_stack (lib/dump_stack.c:52)
>> [ 1297.893258] warn_slowpath_common (kernel/panic.c:430)
>> [ 1297.893258] warn_slowpath_null (kernel/panic.c:465)
>> [ 1297.893258] process_one_work (kernel/workqueue.c:2174 (discriminator 38))
>> [ 1297.893258] worker_thread (kernel/workqueue.c:2354)
>> [ 1297.893258] kthread (kernel/kthread.c:210)
>> [ 1297.893258] ret_from_fork (arch/x86/kernel/entry_64.S:553)

Hi,

I have been trying to address this bug, but I can't reproduce it.
Is your testing arch x86? If so, could you figure out how to reproduce
the bug?

Thanks,
Lai

> 
> Hmm, this is "percpu worker on the wrong CPU while the current
> workqueue state indicates it should be on the CPU it's bound to"
> warning.  We had a similar and more reproducible report a couple
> months back.
> 
>   http://lkml.kernel.org/g/52f4f01c.1070...@linux.vnet.ibm.com
> 
> We added some debug code back then and it looked like the worker was
> setting the right cpus_allowed mask and the cpu was up but still
> ending up on the wrong CPU.  Peter was looking into it and, ooh, I
> missed his last message and it fell through the crack.  We probably
> should follow up on that thread.
> 
> Thanks.
> 



Re: [patch 1/3] rtmutex: Add missing deadlock check

2014-05-12 Thread Lai Jiangshan
Hi, Thomas,

I think this patch is just a workaround, not the proper fix.
You need an updated deadlock-check mechanism:

- (old) skip the check when top_waiter != task_top_pi_waiter(task)
+ (new) skip the check when top_waiter->prio > task->prio

/*
 * Drop out, when the task has no waiters. Note,
 * top_waiter can be NULL, when we are in the deboosting
 * mode!
 */
if (top_waiter && (!task_has_pi_waiters(task) ||
   top_waiter != task_top_pi_waiter(task)))
goto out_unlock_pi;


(also need to update the code in other places respectively)
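
To make the suggestion concrete, the drop-out test above would become
something like the following (only a sketch of the idea; the
corresponding checks elsewhere would need the same treatment):

    /*
     * Skip only when @top_waiter can no longer influence @task's
     * priority, i.e. it is of lower priority (larger ->prio) than @task.
     */
    if (top_waiter && (!task_has_pi_waiters(task) ||
                       top_waiter->prio > task->prio))
            goto out_unlock_pi;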

On 05/13/2014 04:45 AM, Thomas Gleixner wrote:
>   /*
> +  * Deadlock check for the following scenario:
> +  *
> +  * T holds lock L and has waiters
> +  * T locks L again, but does not end up as it's own top waiter

ABBA problem (TA, TB, TC, TD all have the same priority):

TA holds lock LA and tries to lock LB, on which TC is already waiting.
TB holds lock LB and tries to lock LA, on which TD is already waiting.

I think this check can't detect that case, IIUC.

> +  *
> +  * So we would drop out at the next check without noticing.
> +  *
> +  * Note, we need to check for orig_waiter as it might be NULL
> +  * when deboosting!
> +  */
> + if (orig_waiter && orig_waiter->task == rt_mutex_owner(lock)) {

On any iteration after the first, this is already checked.

> + ret = deadlock_detect ? -EDEADLK : 0;
> + goto out_unlock_pi;
> + }

It felt like you were blaming me indirectly here.
I would feel better if you blamed me directly.

Thanks
Lai


Re: [PATCH 02/10 V2] workqueue: destroy_worker() should destroy idle workers only

2014-05-12 Thread Lai Jiangshan
On 05/13/2014 05:08 AM, Tejun Heo wrote:
> On Mon, May 12, 2014 at 02:56:14PM +0800, Lai Jiangshan wrote:
>> @@ -1692,9 +1691,8 @@ static struct worker *alloc_worker(void)
>>   * create_worker - create a new workqueue worker
>>   * @pool: pool the new worker will belong to
>>   *
>> - * Create a new worker which is bound to @pool.  The returned worker
>> - * can be started by calling start_worker() or destroyed using
>> - * destroy_worker().
>> + * Create a new worker which is attached to @pool.
>> + * The new worker must be started and enter idle via start_worker().
> 
> Please always fill the comment paragarphs to 75 column or so.  Also,

> do we even need start_worker() separate anymore?  Maybe we can just
> fold alloc_and_create_worker() into alloc_worker()?

I think we should do this, but it is just a cleanup; I will do it after
this core patchset is accepted.

> 
>> @@ -1815,6 +1812,7 @@ static int create_and_start_worker(struct worker_pool 
>> *pool)
>>   * @worker: worker to be destroyed
>>   *
>>   * Destroy @worker and adjust @pool stats accordingly.
>> + * The worker should be idle.
> 
> Ditto about filling.
> 
> Looks good otherwise.
> 
> Thanks.
> 



Re: [PATCH 03/10 V2] workqueue: async worker destruction

2014-05-12 Thread Lai Jiangshan
On 05/13/2014 05:20 AM, Tejun Heo wrote:
> On Mon, May 12, 2014 at 02:56:15PM +0800, Lai Jiangshan wrote:
>>  /**
>> + * worker_detach_from_pool() - detach the worker from the pool
>> + * @worker: worker which is attached to its pool
>> + * @pool: attached pool
>> + *
>> + * Undo the attaching which had been done in create_worker().
>> + * The caller worker shouldn't access to the pool after detached
>> + * except it has other reference to the pool.
>> + */
>> +static void worker_detach_from_pool(struct worker *worker,
>> +struct worker_pool *pool)
>> +{
>> +struct completion *detach_completion = NULL;
>> +
>> +mutex_lock(&pool->manager_mutex);
>> +idr_remove(&pool->worker_idr, worker->id);
>> +if (idr_is_empty(&pool->worker_idr))
>> +detach_completion = pool->detach_completion;
>> +mutex_unlock(&pool->manager_mutex);
>> +
>> +if (detach_completion)
>> +complete(detach_completion);
>> +}
> 
> Are we gonna use this function from somewhere else too?

It is called from worker_thread().

I don't want to open-code it in worker_thread(); readability is better
when it is wrapped, and it will also be called for the rescuer in
patch 10.

> 
>> @@ -2289,6 +2298,10 @@ woke_up:
>>  spin_unlock_irq(&pool->lock);
>>  WARN_ON_ONCE(!list_empty(&worker->entry));
>>  worker->task->flags &= ~PF_WQ_WORKER;
>> +
>> +set_task_comm(worker->task, "kworker_dying");
> 
> Given how other kworkers are named, maybe a better name is
> "kworker/dying" or "kworker/detached"?
> 
>> +worker_detach_from_pool(worker, pool);
>> +kfree(worker);
>>  return 0;
>>  }
>>  
>> @@ -3561,6 +3574,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
>>  static void put_unbound_pool(struct worker_pool *pool)
>>  {
>>  struct worker *worker;
>> +DECLARE_COMPLETION_ONSTACK(detach_completion);
> 
> I think it's conventional to put initialized ones (especially the ones
> require initializing macros) before uninitialized vars.
> 
>> @@ -3579,19 +3593,24 @@ static void put_unbound_pool(struct worker_pool 
>> *pool)
>>  
>>  /*
>>   * Become the manager and destroy all workers.  Grabbing
>> - * manager_arb prevents @pool's workers from blocking on
>> - * manager_mutex.
>> + * manager_arb ensures manage_workers() finish and enter idle.
> 
> I don't follow what the above comment update is trying to say.

If a pool is being destroyed, its workers will not call manage_workers(),
but an already running manage_workers() may still be in progress.

mutex_lock(&manager_arb) in put_unbound_pool() has to wait for that
manage_workers() to finish, because the manager worker (a non-idle
worker) can't be destroyed.
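
In other words, the ordering put_unbound_pool() relies on is roughly
(sketched, not the literal code):

    mutex_lock(&pool->manager_arb);     /* waits for any running manage_workers() */
    mutex_lock(&pool->manager_mutex);
    spin_lock_irq(&pool->lock);
    while ((worker = first_worker(pool)))
            destroy_worker(worker);     /* everything left is idle by now */
    spin_unlock_irq(&pool->lock);
    mutex_unlock(&pool->manager_mutex);
    mutex_unlock(&pool->manager_arb);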

> 
> Thanks.
> 



[PATCH 6/6] workqueue: destroy worker directly in idle timeout handler

2014-04-12 Thread Lai Jiangshan
Since destroy_worker() now works atomically, we move maybe_destroy_worker()
out of the manager and destroy workers directly in the idle timeout handler.
We also remove %POOL_MANAGE_WORKERS, which lets us remove a branch in
worker_thread().

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |   71 ---
 1 files changed, 6 insertions(+), 65 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8199e7f..92c9ada 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -64,7 +64,6 @@ enum {
 * %WORKER_UNBOUND set and concurrency management disabled, and may
 * be executing on any CPU.  The pool behaves as an unbound one.
 */
-   POOL_MANAGE_WORKERS = 1 << 0,   /* need to manage workers */
POOL_DISASSOCIATED  = 1 << 2,   /* cpu can't serve workers */
 
/* worker flags */
@@ -754,13 +753,6 @@ static bool need_to_create_worker(struct worker_pool *pool)
return need_more_worker(pool) && !may_start_working(pool);
 }
 
-/* Do I need to be the manager? */
-static bool need_to_manage_workers(struct worker_pool *pool)
-{
-   return need_to_create_worker(pool) ||
-   (pool->flags & POOL_MANAGE_WORKERS);
-}
-
 /* Do we have too many workers and should some go away? */
 static bool too_many_workers(struct worker_pool *pool)
 {
@@ -1789,7 +1781,7 @@ static int create_and_start_worker(struct worker_pool *pool)
  * Destroy @worker and adjust @pool stats accordingly.
  *
  * CONTEXT:
- * spin_lock_irq(pool->lock) which is released and regrabbed.
+ * spin_lock_irq(pool->lock).
  */
 static void destroy_worker(struct worker *worker)
 {
@@ -1819,8 +1811,7 @@ static void idle_worker_timeout(unsigned long __pool)
struct worker_pool *pool = (void *)__pool;
 
spin_lock_irq(&pool->lock);
-
-   if (too_many_workers(pool)) {
+   while (too_many_workers(pool)) {
struct worker *worker;
unsigned long expires;
 
@@ -1828,15 +1819,13 @@ static void idle_worker_timeout(unsigned long __pool)
worker = list_entry(pool->idle_list.prev, struct worker, entry);
expires = worker->last_active + IDLE_WORKER_TIMEOUT;
 
-   if (time_before(jiffies, expires))
+   if (time_before(jiffies, expires)) {
mod_timer(&pool->idle_timer, expires);
-   else {
-   /* it's been idle for too long, wake up manager */
-   pool->flags |= POOL_MANAGE_WORKERS;
-   wake_up_worker(pool);
+   break;
}
-   }
 
+   destroy_worker(worker);
+   }
spin_unlock_irq(&pool->lock);
 }
 
@@ -1960,44 +1949,6 @@ restart:
 }
 
 /**
- * maybe_destroy_worker - destroy workers which have been idle for a while
- * @pool: pool to destroy workers for
- *
- * Destroy @pool workers which have been idle for longer than
- * IDLE_WORKER_TIMEOUT.
- *
- * LOCKING:
- * spin_lock_irq(pool->lock) which may be released and regrabbed
- * multiple times.  Called only from manager.
- *
- * Return:
- * %false if no action was taken and pool->lock stayed locked, %true
- * otherwise.
- */
-static bool maybe_destroy_workers(struct worker_pool *pool)
-{
-   bool ret = false;
-
-   while (too_many_workers(pool)) {
-   struct worker *worker;
-   unsigned long expires;
-
-   worker = list_entry(pool->idle_list.prev, struct worker, entry);
-   expires = worker->last_active + IDLE_WORKER_TIMEOUT;
-
-   if (time_before(jiffies, expires)) {
-   mod_timer(&pool->idle_timer, expires);
-   break;
-   }
-
-   destroy_worker(worker);
-   ret = true;
-   }
-
-   return ret;
-}
-
-/**
  * manage_workers - manage worker pool
  * @worker: self
  *
@@ -2039,13 +1990,6 @@ static bool manage_workers(struct worker *worker)
if (!mutex_trylock(&pool->manager_arb))
return ret;
 
-   pool->flags &= ~POOL_MANAGE_WORKERS;
-
-   /*
-* Destroy and then create so that may_start_working() is true
-* on return.
-*/
-   ret |= maybe_destroy_workers(pool);
ret |= maybe_create_worker(pool);
 
mutex_unlock(&pool->manager_arb);
@@ -2285,9 +2229,6 @@ recheck:
 
worker_set_flags(worker, WORKER_PREP, false);
 sleep:
-   if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker))
-   goto recheck;
-
/*
 * pool->lock is held and there's no work to process and no need to
 * manage, sleep.  Workers are woken up only while holding
-- 
1.7.4.4


[PATCH 1/6] workqueue: generic routine to restore percpu/unbound pools' workers' cpumask

2014-04-12 Thread Lai Jiangshan
The current code uses different routines to restore the cpumask of the
workers of percpu and unbound pools:

unbound pools - restore_unbound_workers_cpumask()
percpu pools  - rebind_workers()

Actually, restore_unbound_workers_cpumask() can also be used for percpu
pools. If percpu_pool->cpu != cpu, restore_unbound_workers_cpumask()
returns at the first if-branch; if percpu_pool->cpu == cpu, it calls
set_cpus_allowed_ptr() for all of the pool's workers.

So we can use restore_unbound_workers_cpumask() for both kinds of pools.

The patch renames restore_unbound_workers_cpumask() to
restore_workers_cpumask() and uses it for percpu pools as well.
rebind_workers() no longer restores the cpumask, so it is renamed to
restore_workers_concurrency().

"pool->flags &= ~POOL_DISASSOCIATED" is also moved into
restore_workers_concurrency(), so concurrency is restored atomically.

wq_unbind_fn() is renamed to disable_workers_concurrency().
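
For reference, the argument above maps onto the function body roughly as
follows (a simplified sketch of today's restore_unbound_workers_cpumask(),
not the full code):

    static void restore_workers_cpumask(struct worker_pool *pool, int cpu)
    {
            struct worker *worker;
            int wi;

            lockdep_assert_held(&pool->manager_mutex);

            /*
             * A percpu pool with pool->cpu != cpu bails out here, since
             * its attrs->cpumask contains only pool->cpu.
             */
            if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
                    return;

            /* ("is @cpu the first CPU of the mask to come online?" check
             * for unbound pools elided) */

            for_each_pool_worker(worker, wi, pool)
                    WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
                                                      pool->attrs->cpumask) < 0);
    }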

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |   46 +++---
 1 files changed, 15 insertions(+), 31 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index d845bdd..0b56730 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2350,7 +2350,7 @@ recheck:
 * worker or that someone else has already assumed the manager
 * role.  This is where @worker starts participating in concurrency
 * management if applicable and concurrency management is restored
-* after being rebound.  See rebind_workers() for details.
+* after being rebound.  See restore_workers_concurrency() for details.
 */
worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);
 
@@ -4586,7 +4586,7 @@ void print_worker_info(const char *log_lvl, struct task_struct *task)
  * cpu comes back online.
  */
 
-static void wq_unbind_fn(struct work_struct *work)
+static void disable_workers_concurrency(struct work_struct *work)
 {
int cpu = smp_processor_id();
struct worker_pool *pool;
@@ -4644,30 +4644,20 @@ static void wq_unbind_fn(struct work_struct *work)
 }
 
 /**
- * rebind_workers - rebind all workers of a pool to the associated CPU
+ * restore_workers_concurrency - restore concurrency management of all workers
  * @pool: pool of interest
  *
- * @pool->cpu is coming online.  Rebind all workers to the CPU.
+ * @pool->cpu is coming online and all workers are already rebound to the CPU.
  */
-static void rebind_workers(struct worker_pool *pool)
+static void restore_workers_concurrency(struct worker_pool *pool)
 {
struct worker *worker;
int wi;
 
lockdep_assert_held(&pool->manager_mutex);
 
-   /*
-* Restore CPU affinity of all workers.  As all idle workers should
-* be on the run-queue of the associated CPU before any local
-* wake-ups for concurrency management happen, restore CPU affinty
-* of all workers first and then clear UNBOUND.  As we're called
-* from CPU_ONLINE, the following shouldn't fail.
-*/
-   for_each_pool_worker(worker, wi, pool)
-   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
- pool->attrs->cpumask) < 0);
-
spin_lock_irq(&pool->lock);
+   pool->flags &= ~POOL_DISASSOCIATED;
 
for_each_pool_worker(worker, wi, pool) {
unsigned int worker_flags = worker->flags;
@@ -4708,16 +4698,16 @@ static void rebind_workers(struct worker_pool *pool)
 }
 
 /**
- * restore_unbound_workers_cpumask - restore cpumask of unbound workers
- * @pool: unbound pool of interest
+ * restore_workers_cpumask - restore cpumask of workers
+ * @pool: pool of interest
  * @cpu: the CPU which is coming up
  *
- * An unbound pool may end up with a cpumask which doesn't have any online
- * CPUs.  When a worker of such pool get scheduled, the scheduler resets
+ * A pool may end up with a cpumask which doesn't have any online CPUS.
+ * When a worker of such pool get scheduled, the scheduler resets
  * its cpus_allowed.  If @cpu is in @pool's cpumask which didn't have any
  * online CPU before, cpus_allowed of all its workers should be restored.
  */
-static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
+static void restore_workers_cpumask(struct worker_pool *pool, int cpu)
 {
static cpumask_t cpumask;
struct worker *worker;
@@ -4769,16 +4759,10 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
 
for_each_pool(pool, pi) {
mutex_lock(&pool->manager_mutex);
+   restore_workers_cpumask(pool, cpu);
 
-   if (pool->cpu == cpu) {
-   spin_lock_irq(&pool->lock);
-   pool->flags &= ~POOL_DISASSOCIATED;
-

[PATCH 4/6] workqueue: commit worker to pool's concurrency setting atomically.

2014-04-12 Thread Lai Jiangshan
The workers' concurrency setting needs to be coordinated with the pool's
concurrency setting in create_worker()/destroy_worker()/cpu_online()/
cpu_offline().

But create_worker() handles it non-atomically (not within a single
pool->lock section). This patch makes the behavior atomic.

Now bind_list is used for coordinating the workers' cpumask with the pool,
and worker_idr is used for coordinating the workers' concurrency with the
pool.

The cpumask is coordinated first and the concurrency afterwards.

We don't want to remove worker_idr and reuse bind_list. If we did:
1) the locking would become much more complex.
2) after removing worker_idr, we would need to add an ida back,
   so we would not save anything.

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |   15 +++
 1 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6c38aed..3a6be02 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -63,10 +63,6 @@ enum {
 * While DISASSOCIATED, the cpu may be offline and all workers have
 * %WORKER_UNBOUND set and concurrency management disabled, and may
 * be executing on any CPU.  The pool behaves as an unbound one.
-*
-* Note that DISASSOCIATED should be flipped only while holding
-* manager_mutex to avoid changing binding state while
-* create_worker() is in progress.
 */
POOL_MANAGE_WORKERS = 1 << 0,   /* need to manage workers */
POOL_DISASSOCIATED  = 1 << 2,   /* cpu can't serve workers */
@@ -1745,16 +1741,10 @@ static struct worker *create_worker(struct worker_pool *pool)
 
bind_worker(worker, pool);
 
-   /*
-* The caller is responsible for ensuring %POOL_DISASSOCIATED
-* remains stable across this function.  See the comments above the
-* flag definition for details.
-*/
+   /* successful, commit the worker to the pool's concurrency setting */
+   spin_lock_irq(&pool->lock);
if (pool->flags & POOL_DISASSOCIATED)
worker->flags |= WORKER_UNBOUND;
-
-   /* successful, commit the pointer to idr */
-   spin_lock_irq(&pool->lock);
idr_replace(&pool->worker_idr, worker, worker->id);
spin_unlock_irq(&pool->lock);
 
@@ -1841,6 +1831,7 @@ static void destroy_worker(struct worker *worker)
 
list_del_init(&worker->entry);
worker->flags |= WORKER_DIE;
+   /* release @id and leave pool's concurrency setting */
idr_remove(&pool->worker_idr, worker->id);
wake_up_process(worker->task);
 }
-- 
1.7.4.4



[PATCH 5/6] workqueue: remove manager_mutex

2014-04-12 Thread Lai Jiangshan
Now bind_mutex is used for coordinating the workers' cpumask with the
pool, and pool->lock is used for coordinating the workers' concurrency
with the pool.

The cpumask is coordinated first and the concurrency afterwards.
manager_mutex is not needed for either the cpumask or the concurrency.

In restore_workers_cpumask(), we don't need manager_mutex;
pool->bind_mutex handles the cpumasks correctly vs. bind_worker()
and unbind_worker().

In restore_workers_concurrency()/disable_workers_concurrency(),
we don't need manager_mutex; pool->lock handles the concurrency
correctly vs. create_worker() and destroy_worker().

put_unbound_pool() does not need manager_mutex even before this
patchset: it has manager_arb vs. manage_workers(), wq_pool_mutex
vs. restore_workers_cpumask(unbound_pool), and wq_pool_mutex
vs. create_and_start_worker(unbound_pool).

For a percpu pool's create_and_start_worker(), the other routines
can't be called at the same time, so create_and_start_worker()
doesn't need manager_mutex either.

All the other routines don't need manager_mutex, so manage_workers()
doesn't need manager_mutex either.

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |   73 +--
 1 files changed, 8 insertions(+), 65 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 3a6be02..8199e7f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -119,9 +119,6 @@ enum {
  *cpu or grabbing pool->lock is enough for read access.  If
  *POOL_DISASSOCIATED is set, it's identical to L.
  *
- * MG: pool->manager_mutex and pool->lock protected.  Writes require both
- * locks.  Reads can happen under either lock.
- *
  * PL: wq_pool_mutex protected.
  *
  * PR: wq_pool_mutex protected for writes.  Sched-RCU protected for reads.
@@ -158,10 +155,8 @@ struct worker_pool {
DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
/* L: hash of busy workers */
 
-   /* see manage_workers() for details on the two manager mutexes */
struct mutexmanager_arb;/* manager arbitration */
-   struct mutexmanager_mutex;  /* manager exclusion */
-   struct idr  worker_idr; /* MG: worker IDs and iteration */
+   struct idr  worker_idr; /* L: worker IDs and iteration */
 
/*
 * A worker is bound to the pool, it means:
@@ -353,16 +348,6 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
   lockdep_is_held(&wq->mutex), \
   "sched RCU or wq->mutex should be held")
 
-#ifdef CONFIG_LOCKDEP
-#define assert_manager_or_pool_lock(pool)  \
-   WARN_ONCE(debug_locks &&\
- !lockdep_is_held(&(pool)->manager_mutex) &&   \
- !lockdep_is_held(&(pool)->lock),  \
- "pool->manager_mutex or ->lock should be held")
-#else
-#define assert_manager_or_pool_lock(pool)  do { } while (0)
-#endif
-
 #define for_each_cpu_worker_pool(pool, cpu)\
for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];   \
 (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
@@ -391,14 +376,14 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
  * @wi: integer used for iteration
  * @pool: worker_pool to iterate workers of
  *
- * This must be called with either @pool->manager_mutex or ->lock held.
+ * This must be called with ->lock held.
  *
  * The if/else clause exists only for the lockdep assertion and can be
  * ignored.
  */
 #define for_each_pool_worker(worker, wi, pool) \
idr_for_each_entry(&(pool)->worker_idr, (worker), (wi)) \
-   if (({ assert_manager_or_pool_lock((pool)); false; })) { } \
+   if (({ lockdep_assert_held(&pool->lock); false; })) { } \
else
 
 /**
@@ -1700,8 +1685,6 @@ static struct worker *create_worker(struct worker_pool *pool)
int id = -1;
char id_buf[16];
 
-   lockdep_assert_held(&pool->manager_mutex);
-
/*
 * ID is needed to determine kthread name.  Allocate ID first
 * without installing the pointer.
@@ -1781,7 +1764,7 @@ static void start_worker(struct worker *worker)
  * create_and_start_worker - create and start a worker for a pool
  * @pool: the target pool
  *
- * Grab the managership of @pool and create and start a new worker for it.
+ * Create and start a new worker for it.
  *
  * Return: 0 on success. A negative error code otherwise.
  */
@@ -1789,8 +1772,6 @@ static int create_and_start_worker(struct worker_pool *pool)
 {
struct worker *w

[PATCH 2/6] workqueue: generic framework to manage normal&rescuer workers' cpumask

2014-04-12 Thread Lai Jiangshan
If a worker's cpumask needs to be kept coordinated with the pool
during cpu-hotplug, we add this worker to a special set which
is used to manage the workers' cpumask.

Currently this special set is worker_idr, and it serves normal workers
only. We can't add rescuers to this set because we can't allocate IDs
from worker_idr for them, so we need to introduce a new set (bind_list).

When a rescuer adds itself to bind_list, it needs some synchronization,
but we can't reuse manager_mutex because manager_mutex has a dependency
on memory allocation. So we introduce a new mutex - bind_mutex.

Now restore_workers_cpumask() also restores the rescuers' cpumask via
bind_list.
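
A minimal sketch of the bind/unbind pair this introduces (the exact
bodies are not visible in the truncated diff below, so this is only how
I read the intent; unbind_worker() matches the pre-patch context shown
later in patch 3/6):

    static void bind_worker(struct worker *worker, struct worker_pool *pool)
    {
            mutex_lock(&pool->bind_mutex);
            /* inherit the pool's cpumask and become visible to
             * restore_workers_cpumask() */
            set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
            list_add_tail(&worker->bind_entry, &pool->bind_list);
            mutex_unlock(&pool->bind_mutex);
    }

    static void unbind_worker(struct worker *worker, struct worker_pool *pool)
    {
            mutex_lock(&pool->bind_mutex);
            list_del(&worker->bind_entry);
            mutex_unlock(&pool->bind_mutex);
    }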

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c  |  129 +++
 kernel/workqueue_internal.h |1 +
 2 files changed, 57 insertions(+), 73 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0b56730..743917d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -135,6 +135,8 @@ enum {
  * WR: wq->mutex protected for writes.  Sched-RCU protected for reads.
  *
  * MD: wq_mayday_lock protected.
+ *
+ * B:  pool->bind_mutex protected.
  */
 
 /* struct worker is defined in workqueue_internal.h */
@@ -165,6 +167,17 @@ struct worker_pool {
struct mutexmanager_mutex;  /* manager exclusion */
struct idr  worker_idr; /* MG: worker IDs and iteration */
 
+   /*
+* A worker is bound to the pool, it means:
+* 1) the worker's cpumask is bound to the pool.
+*
+* bind_mutex is held by the rescuer before processing works,
+* so bind_mutex shouldn't have any direct nor indirect dependency
+* on sleepable memory allocation.
+*/
+   struct mutexbind_mutex; /* workers binding */
+   struct list_headbind_list;  /* B: bound workers*/
+
struct workqueue_attrs  *attrs; /* I: worker attributes */
struct hlist_node   hash_node;  /* PL: unbound_pool_hash node */
int refcnt; /* PL: refcnt for unbound pools */
@@ -1625,70 +1638,6 @@ static void worker_leave_idle(struct worker *worker)
list_del_init(&worker->entry);
 }
 
-/**
- * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it
- * @pool: target worker_pool
- *
- * Bind %current to the cpu of @pool if it is associated and lock @pool.
- *
- * Works which are scheduled while the cpu is online must at least be
- * scheduled to a worker which is bound to the cpu so that if they are
- * flushed from cpu callbacks while cpu is going down, they are
- * guaranteed to execute on the cpu.
- *
- * This function is to be used by unbound workers and rescuers to bind
- * themselves to the target cpu and may race with cpu going down or
- * coming online.  kthread_bind() can't be used because it may put the
- * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
- * verbatim as it's best effort and blocking and pool may be
- * [dis]associated in the meantime.
- *
- * This function tries set_cpus_allowed() and locks pool and verifies the
- * binding against %POOL_DISASSOCIATED which is set during
- * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
- * enters idle state or fetches works without dropping lock, it can
- * guarantee the scheduling requirement described in the first paragraph.
- *
- * CONTEXT:
- * Might sleep.  Called without any lock but returns with pool->lock
- * held.
- *
- * Return:
- * %true if the associated pool is online (@worker is successfully
- * bound), %false if offline.
- */
-static bool worker_maybe_bind_and_lock(struct worker_pool *pool)
-__acquires(&pool->lock)
-{
-   while (true) {
-   /*
-* The following call may fail, succeed or succeed
-* without actually migrating the task to the cpu if
-* it races with cpu hotunplug operation.  Verify
-* against POOL_DISASSOCIATED.
-*/
-   if (!(pool->flags & POOL_DISASSOCIATED))
-   set_cpus_allowed_ptr(current, pool->attrs->cpumask);
-
-   spin_lock_irq(&pool->lock);
-   if (pool->flags & POOL_DISASSOCIATED)
-   return false;
-   if (task_cpu(current) == pool->cpu &&
-   cpumask_equal(¤t->cpus_allowed, pool->attrs->cpumask))
-   return true;
-   spin_unlock_irq(&pool->lock);
-
-   /*
-* We've raced with CPU hot[un]plug.  Give it a breather
-* and retry migration.  cond_resched() is required here;
-* otherwise, we might deadlock against cpu_stop trying to
-* bring down the CPU on non

[PATCH 3/6] workqueue: make destroy_worker() atomically

2014-04-12 Thread Lai Jiangshan
destroy_worker() doesn't need to wait for the worker's task to exit:
there is nothing essential to do after kthread_stop(), so we remove
the kthread_stop() call.

put_unbound_pool() still needs to wait for the workers' tasks to exit,
so we add a new completion to handle that.

The purpose of this patch is not to make the slowpath destroy_worker()
faster, but to:
1) allow destroy_worker() to be called from the timeout handler in a
   future patch.
2) reduce possible latency for create_worker()/cpu-hotplug.

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |   32 +---
 1 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 743917d..6c38aed 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -170,6 +170,10 @@ struct worker_pool {
/*
 * A worker is bound to the pool, it means:
 * 1) the worker's cpumask is bound to the pool.
+* 2) the worker gets a reference to the pool. The worker shouldn't
+*access to the pool after the worker is unbound from the pool,
+*except that the worker has another kinds of reference to
+*the pool.
 *
 * bind_mutex is held in rescuer before processing works,
 * so bind_mutex shouldn't have any directly nor indirecty dependency
@@ -177,6 +181,7 @@ struct worker_pool {
 */
struct mutexbind_mutex; /* workers binding */
struct list_headbind_list;  /* B: bound workers*/
+   struct completion   workers_leave;  /* all workers exit */
 
struct workqueue_attrs  *attrs; /* I: worker attributes */
struct hlist_node   hash_node;  /* PL: unbound_pool_hash node */
@@ -1668,9 +1673,15 @@ static void bind_worker(struct worker *worker, struct worker_pool *pool)
 
 static void unbind_worker(struct worker *worker, struct worker_pool *pool)
 {
+   bool is_last;
+
mutex_lock(&pool->bind_mutex);
list_del(&worker->bind_entry);
+   is_last = list_empty(&worker->bind_entry);
mutex_unlock(&pool->bind_mutex);
+
+   if (is_last)
+   complete(&pool->workers_leave);
 }
 
 /**
@@ -1828,24 +1839,10 @@ static void destroy_worker(struct worker *worker)
if (worker->flags & WORKER_IDLE)
pool->nr_idle--;
 
-   /*
-* Once WORKER_DIE is set, the kworker may destroy itself at any
-* point.  Pin to ensure the task stays until we're done with it.
-*/
-   get_task_struct(worker->task);
-
list_del_init(&worker->entry);
worker->flags |= WORKER_DIE;
-
idr_remove(&pool->worker_idr, worker->id);
-
-   spin_unlock_irq(&pool->lock);
-
-   kthread_stop(worker->task);
-   put_task_struct(worker->task);
-   kfree(worker);
-
-   spin_lock_irq(&pool->lock);
+   wake_up_process(worker->task);
 }
 
 static void idle_worker_timeout(unsigned long __pool)
@@ -2293,6 +2290,7 @@ woke_up:
worker->task->flags &= ~PF_WQ_WORKER;
 
unbind_worker(worker, pool);
+   kfree(worker);
return 0;
}
 
@@ -3529,6 +3527,7 @@ static int init_worker_pool(struct worker_pool *pool)
 
mutex_init(&pool->bind_mutex);
INIT_LIST_HEAD(&pool->bind_list);
+   init_completion(&pool->workers_leave);
 
INIT_HLIST_NODE(&pool->hash_node);
pool->refcnt = 1;
@@ -3588,6 +3587,7 @@ static void put_unbound_pool(struct worker_pool *pool)
mutex_lock(&pool->manager_mutex);
spin_lock_irq(&pool->lock);
 
+   WARN_ON(pool->nr_workers != pool->nr_idle);
while ((worker = first_worker(pool)))
destroy_worker(worker);
WARN_ON(pool->nr_workers || pool->nr_idle);
@@ -3596,6 +3596,8 @@ static void put_unbound_pool(struct worker_pool *pool)
mutex_unlock(&pool->manager_mutex);
mutex_unlock(&pool->manager_arb);
 
+   wait_for_completion(&pool->workers_leave);
+
/* shut down the timers */
del_timer_sync(&pool->idle_timer);
del_timer_sync(&pool->mayday_timer);
-- 
1.7.4.4



Re: [PATCH 0/6] workqueue: simpler&better workers management synchronization

2014-04-12 Thread Lai Jiangshan
Sorry,
I forgot to send the cover letter to LKML.

On 04/12/2014 06:45 PM, Lai Jiangshan wrote:
> Each patches remove codes!
> 
> Patch1&2 are the basic patches. They add a *united* mechanism for managing
> percpu pools' workers' & unbound pools' workers' & rescuers' CPUMASK.
> 
> Patch1&4 make enabling workers' concurrency atomic on cpu_online()
> and in create_worker(). After this, enabling and disabling workers'
> concurrency are both atomic.
> 
> Old manager_mutex protects:
> 1) workers creation&destruction
> 2) workers cpumask managing when cpu_online()/workers-creation
> 3) workers concurrency enabling&disabling.
> 
> Now, the above three things' synchronization are separated.
> 1) We don't need manager_mutex to protect worker creation & destruction
>    1.1) creation only happens when the cpu first comes online, when an
>         unbound pool has just been created, and in manage_workers();
>         each has its own synchronization.
>    1.2) destruction only happens in the idle-timeout handler, protected
>         by pool->lock.
>    1.3) put_unbound_pool() uses manager_arb & workers_leave (new) to
>         synchronize with worker creation & destruction.
> 
> 2) bind_mutex and bind_list handle all workers' (percpu, unbound,
>    normal, rescuer) cpumasks on
>    cpu_online()/worker-creation/worker-destruction;
>    we don't need manager_mutex.
> 
> 3) pool->lock and worker_idr handle workers concurrency enabling&disabling.
> 
> Note, bind_list is always a superset of worker_idr, and on cpu-online or
> worker-creation the cpumask is set first and concurrency is enabled
> afterwards. Concurrency management depends on local wake-ups, which in
> turn depend on the cpumask being properly set for all the workers.
> 
> This patchset depends on previous patch:
> "workqueue: fix possible race condition when rescuer VS pwq-release"
> 
> Thanks,
> Lai
> 
> 
> Lai Jiangshan (6):
>   workqueue: generic routine to restore percpu/unbound pools' workers'
> cpumask
>   workqueue: generic framework to manage normal&rescuer workers'
> cpumask
>   workqueue: make destroy_worker() atomically
>   workqueue: commit worker to pool's concurrency setting atomically.
>   workqueue: remove manager_mutex
>   workqueue: destroy worker directly in idle timeout handler
> 
>  kernel/workqueue.c  |  366 --
>  kernel/workqueue_internal.h |1 +
>  2 files changed, 106 insertions(+), 261 deletions(-)
> 



Re: [PATCH] drivers/net: Use RCU_INIT_POINTER(x, NULL) in tun.c

2014-03-24 Thread Lai Jiangshan
On 03/24/2014 01:25 PM, Eric Dumazet wrote:
> On Mon, 2014-03-24 at 07:09 +0200, Michael S. Tsirkin wrote:
> 
>> Seems an incredibly strict requirement for something that just
>> silences a warning.
>> What exactly should I test?
>> I intended to just verify this produces same code as before
>> d322f45ceed525daa under a recent gcc.
> 
> Thats because many rcu_assign_pointer(X, NULL) were already converted to
> RCU_INIT_POINTER(X, NULL)
> 
> Quite frankly I don't know why you bother at all.
> 
> Adding back the lazy test in rcu_assign_pointer() doesn't help to make
> the API cleaner and easier to understand.
> 
> People are usually using RCU API without really understanding
> all the issues. They tend to add superfluous barriers because they feel
> better. 
> 
> Having separate RCU_INIT_POINTER() and rcu_assign_pointer() serve as
> better documentation of the code, I find it more easier to immediately
> check what is going on while reviewing stuff.
> 
> Presumably, checkpatch.pl could be augmented to suggest to use
> RCU_INIT_POINTER(X, NULL) instead of rcu_assign_pointer(X, NULL)


I prefer rcu_assign_pointer(X, NULL) over RCU_INIT_POINTER(X, NULL);
NULL should not be a special pointer value to the users of RCU.

The RCU implementation should hide the difference if it differentiates
the values for optimization.

RCU_INIT_POINTER() sounds like an initialization-stage API. If we need
something different for a NULL pointer, I would prefer
rcu_assign_*null*_pointer().

Also, rcu_assign_pointer(X, NULL) implies a compiler barrier(), but
RCU_INIT_POINTER(X, NULL) doesn't.
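
For readers following the thread, the semantic difference under
discussion boils down to roughly this (a simplified sketch with
illustrative names; the real macros also carry sparse annotations and
type checks):

    /* publish: order prior initialization of *v before the store, so
     * concurrent readers that see the pointer see initialized contents */
    #define sketch_rcu_assign_pointer(p, v) \
            do { smp_wmb(); ACCESS_ONCE(p) = (v); } while (0)

    /* plain store: no ordering; only safe when readers can't see the
     * pointer yet, or when the stored value (e.g. NULL) is never
     * dereferenced */
    #define sketch_RCU_INIT_POINTER(p, v) \
            do { (p) = (v); } while (0)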

> 
> 



[PATCH] workqueue: add __WQ_FREEZING and remove POOL_FREEZING

2014-03-25 Thread Lai Jiangshan
Freezing has nothing to do with pools, but POOL_FREEZING adds a connection
and makes freeze_workqueues_begin() and thaw_workqueues() complicated.

Since freezing is a workqueue-instance attribute, we introduce __WQ_FREEZING
in wq->flags instead and remove POOL_FREEZING.

We set __WQ_FREEZING only when the workqueue is freezable (to simplify
pwq_adjust_max_active()), and make freeze_workqueues_begin() and
thaw_workqueues() quickly skip non-freezable wqs.

Changes from the previous patches (requested by tj):
1) added the WARN_ON_ONCE() back
2) merged the two patches into one

Signed-off-by: Lai Jiangshan 
---
 include/linux/workqueue.h |1 +
 kernel/workqueue.c|   43 ---
 2 files changed, 13 insertions(+), 31 deletions(-)

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 704f4f6..a45202b 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -335,6 +335,7 @@ enum {
 */
WQ_POWER_EFFICIENT  = 1 << 7,
 
+   __WQ_FREEZING   = 1 << 15, /* internal: workqueue is freezing */
__WQ_DRAINING   = 1 << 16, /* internal: workqueue is draining */
__WQ_ORDERED= 1 << 17, /* internal: workqueue is ordered */
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 193e977..0c74979 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -70,7 +70,6 @@ enum {
 */
POOL_MANAGE_WORKERS = 1 << 0,   /* need to manage workers */
POOL_DISASSOCIATED  = 1 << 2,   /* cpu can't serve workers */
-   POOL_FREEZING   = 1 << 3,   /* freeze in progress */
 
/* worker flags */
WORKER_STARTED  = 1 << 0,   /* started */
@@ -3632,9 +3631,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
if (!pool || init_worker_pool(pool) < 0)
goto fail;
 
-   if (workqueue_freezing)
-   pool->flags |= POOL_FREEZING;
-
lockdep_set_subclass(&pool->lock, 1);   /* see put_pwq() */
copy_workqueue_attrs(pool->attrs, attrs);
 
@@ -3730,18 +3726,13 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
 static void pwq_adjust_max_active(struct pool_workqueue *pwq)
 {
struct workqueue_struct *wq = pwq->wq;
-   bool freezable = wq->flags & WQ_FREEZABLE;
 
-   /* for @wq->saved_max_active */
+   /* for @wq->saved_max_active and @wq->flags */
lockdep_assert_held(&wq->mutex);
 
-   /* fast exit for non-freezable wqs */
-   if (!freezable && pwq->max_active == wq->saved_max_active)
-   return;
-
spin_lock_irq(&pwq->pool->lock);
 
-   if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) {
+   if (!(wq->flags & __WQ_FREEZING)) {
pwq->max_active = wq->saved_max_active;
 
while (!list_empty(&pwq->delayed_works) &&
@@ -4250,6 +4241,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
mutex_lock(&wq_pool_mutex);
 
mutex_lock(&wq->mutex);
+   if ((wq->flags & WQ_FREEZABLE) && workqueue_freezing)
+   wq->flags |= __WQ_FREEZING;
for_each_pwq(pwq, wq)
pwq_adjust_max_active(pwq);
mutex_unlock(&wq->mutex);
@@ -4856,26 +4849,20 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
  */
 void freeze_workqueues_begin(void)
 {
-   struct worker_pool *pool;
struct workqueue_struct *wq;
struct pool_workqueue *pwq;
-   int pi;
 
mutex_lock(&wq_pool_mutex);
 
WARN_ON_ONCE(workqueue_freezing);
workqueue_freezing = true;
 
-   /* set FREEZING */
-   for_each_pool(pool, pi) {
-   spin_lock_irq(&pool->lock);
-   WARN_ON_ONCE(pool->flags & POOL_FREEZING);
-   pool->flags |= POOL_FREEZING;
-   spin_unlock_irq(&pool->lock);
-   }
-
list_for_each_entry(wq, &workqueues, list) {
+   if (!(wq->flags & WQ_FREEZABLE))
+   continue;
mutex_lock(&wq->mutex);
+   WARN_ON_ONCE(wq->flags & __WQ_FREEZING);
+   wq->flags |= __WQ_FREEZING;
for_each_pwq(pwq, wq)
pwq_adjust_max_active(pwq);
mutex_unlock(&wq->mutex);
@@ -4943,25 +4930,19 @@ void thaw_workqueues(void)
 {
struct workqueue_struct *wq;
struct pool_workqueue *pwq;
-   struct worker_pool *pool;
-   int pi;
 
mutex_lock(&wq_pool_mutex);
 
if (!workqueue_freezing)
goto out_unlock;
 
-   /* clear FREEZING */
-   for_each_pool(pool, pi) {
-   spin_lock_irq(&pool->lock);
-  

[PATCH] workqueue: use manager lock only to protect worker_idr

2014-03-26 Thread Lai Jiangshan
worker_idr is currently always accessed in manager-lock context.
worker_idr is closely tied to the managers, and it is unlikely to ever
be accessed under pool->lock alone in the future.

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |   34 ++
 1 files changed, 6 insertions(+), 28 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0c74979..f5b68a3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -123,8 +123,7 @@ enum {
  *cpu or grabbing pool->lock is enough for read access.  If
  *POOL_DISASSOCIATED is set, it's identical to L.
  *
- * MG: pool->manager_mutex and pool->lock protected.  Writes require both
- * locks.  Reads can happen under either lock.
+ * M: pool->manager_mutex protected.
  *
  * PL: wq_pool_mutex protected.
  *
@@ -163,7 +162,7 @@ struct worker_pool {
/* see manage_workers() for details on the two manager mutexes */
struct mutexmanager_arb;/* manager arbitration */
struct mutexmanager_mutex;  /* manager exclusion */
-   struct idr  worker_idr; /* MG: worker IDs and iteration */
+   struct idr  worker_idr; /* M: worker IDs and iteration */
 
struct workqueue_attrs  *attrs; /* I: worker attributes */
struct hlist_node   hash_node;  /* PL: unbound_pool_hash node */
@@ -339,16 +338,6 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
   lockdep_is_held(&wq->mutex), \
   "sched RCU or wq->mutex should be held")
 
-#ifdef CONFIG_LOCKDEP
-#define assert_manager_or_pool_lock(pool)  \
-   WARN_ONCE(debug_locks &&\
- !lockdep_is_held(&(pool)->manager_mutex) &&   \
- !lockdep_is_held(&(pool)->lock),  \
- "pool->manager_mutex or ->lock should be held")
-#else
-#define assert_manager_or_pool_lock(pool)  do { } while (0)
-#endif
-
 #define for_each_cpu_worker_pool(pool, cpu)\
for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];   \
 (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
@@ -377,14 +366,14 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
  * @wi: integer used for iteration
  * @pool: worker_pool to iterate workers of
  *
- * This must be called with either @pool->manager_mutex or ->lock held.
+ * This must be called with either @pool->manager_mutex.
  *
  * The if/else clause exists only for the lockdep assertion and can be
  * ignored.
  */
 #define for_each_pool_worker(worker, wi, pool) \
idr_for_each_entry(&(pool)->worker_idr, (worker), (wi)) \
-   if (({ assert_manager_or_pool_lock((pool)); false; })) { } \
+   if (({ lockdep_assert_held(&pool->manager_mutex); false; })) { } \
else
 
 /**
@@ -1717,13 +1706,7 @@ static struct worker *create_worker(struct worker_pool *pool)
 * ID is needed to determine kthread name.  Allocate ID first
 * without installing the pointer.
 */
-   idr_preload(GFP_KERNEL);
-   spin_lock_irq(&pool->lock);
-
-   id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT);
-
-   spin_unlock_irq(&pool->lock);
-   idr_preload_end();
+   id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_KERNEL);
if (id < 0)
goto fail;
 
@@ -1765,18 +1748,13 @@ static struct worker *create_worker(struct worker_pool *pool)
worker->flags |= WORKER_UNBOUND;
 
/* successful, commit the pointer to idr */
-   spin_lock_irq(&pool->lock);
idr_replace(&pool->worker_idr, worker, worker->id);
-   spin_unlock_irq(&pool->lock);
 
return worker;
 
 fail:
-   if (id >= 0) {
-   spin_lock_irq(&pool->lock);
+   if (id >= 0)
idr_remove(&pool->worker_idr, id);
-   spin_unlock_irq(&pool->lock);
-   }
kfree(worker);
return NULL;
 }
-- 
1.7.4.4



Re: [PATCH] workqueue: use manager lock only to protect worker_idr

2014-03-27 Thread Lai Jiangshan
Please omit this patch and wait for my new patchset.

Thanks,
Lai

On 03/26/2014 10:41 PM, Lai Jiangshan wrote:
> worker_idr is always accessed in manager lock context currently.
> worker_idr is highly related to managers, it will be unlikely
> accessed in pool->lock only in future.
> 
> Signed-off-by: Lai Jiangshan 
> ---
>  kernel/workqueue.c |   34 ++
>  1 files changed, 6 insertions(+), 28 deletions(-)
> 
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index 0c74979..f5b68a3 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -123,8 +123,7 @@ enum {
>   *cpu or grabbing pool->lock is enough for read access.  If
>   *POOL_DISASSOCIATED is set, it's identical to L.
>   *
> - * MG: pool->manager_mutex and pool->lock protected.  Writes require both
> - * locks.  Reads can happen under either lock.
> + * M: pool->manager_mutex protected.
>   *
>   * PL: wq_pool_mutex protected.
>   *
> @@ -163,7 +162,7 @@ struct worker_pool {
>   /* see manage_workers() for details on the two manager mutexes */
>   struct mutexmanager_arb;/* manager arbitration */
>   struct mutexmanager_mutex;  /* manager exclusion */
> - struct idr  worker_idr; /* MG: worker IDs and iteration 
> */
> + struct idr  worker_idr; /* M: worker IDs and iteration 
> */
>  
>   struct workqueue_attrs  *attrs; /* I: worker attributes */
>   struct hlist_node   hash_node;  /* PL: unbound_pool_hash node */
> @@ -339,16 +338,6 @@ static void copy_workqueue_attrs(struct workqueue_attrs 
> *to,
>  lockdep_is_held(&wq->mutex), \
>  "sched RCU or wq->mutex should be held")
>  
> -#ifdef CONFIG_LOCKDEP
> -#define assert_manager_or_pool_lock(pool)\
> - WARN_ONCE(debug_locks &&\
> -   !lockdep_is_held(&(pool)->manager_mutex) &&   \
> -   !lockdep_is_held(&(pool)->lock),  \
> -   "pool->manager_mutex or ->lock should be held")
> -#else
> -#define assert_manager_or_pool_lock(pool)do { } while (0)
> -#endif
> -
>  #define for_each_cpu_worker_pool(pool, cpu)  \
>   for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];   \
>(pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
> @@ -377,14 +366,14 @@ static void copy_workqueue_attrs(struct workqueue_attrs 
> *to,
>   * @wi: integer used for iteration
>   * @pool: worker_pool to iterate workers of
>   *
> - * This must be called with either @pool->manager_mutex or ->lock held.
> + * This must be called with either @pool->manager_mutex.
>   *
>   * The if/else clause exists only for the lockdep assertion and can be
>   * ignored.
>   */
>  #define for_each_pool_worker(worker, wi, pool)   
> \
>   idr_for_each_entry(&(pool)->worker_idr, (worker), (wi)) \
> - if (({ assert_manager_or_pool_lock((pool)); false; })) { } \
> + if (({ lockdep_assert_held(&pool->manager_mutex); false; })) { 
> } \
>   else
>  
>  /**
> @@ -1717,13 +1706,7 @@ static struct worker *create_worker(struct worker_pool 
> *pool)
>* ID is needed to determine kthread name.  Allocate ID first
>* without installing the pointer.
>*/
> - idr_preload(GFP_KERNEL);
> - spin_lock_irq(&pool->lock);
> -
> - id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT);
> -
> - spin_unlock_irq(&pool->lock);
> - idr_preload_end();
> + id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_KERNEL);
>   if (id < 0)
>   goto fail;
>  
> @@ -1765,18 +1748,13 @@ static struct worker *create_worker(struct 
> worker_pool *pool)
>   worker->flags |= WORKER_UNBOUND;
>  
>   /* successful, commit the pointer to idr */
> - spin_lock_irq(&pool->lock);
>   idr_replace(&pool->worker_idr, worker, worker->id);
> - spin_unlock_irq(&pool->lock);
>  
>   return worker;
>  
>  fail:
> - if (id >= 0) {
> - spin_lock_irq(&pool->lock);
> + if (id >= 0)
>   idr_remove(&pool->worker_idr, id);
> - spin_unlock_irq(&pool->lock);
> - }
>   kfree(worker);
>   return NULL;
>  }



Re: [PATCH] workqueue: add __WQ_FREEZING and remove POOL_FREEZING

2014-03-27 Thread Lai Jiangshan
On 03/25/2014 05:56 PM, Lai Jiangshan wrote:
> freezing is nothing related to pools, but POOL_FREEZING adds a connection,
> and causes freeze_workqueues_begin() and thaw_workqueues() complicated.
> 
> Since freezing is workqueue instance attribute, so we introduce __WQ_FREEZING
> to wq->flags instead and remove POOL_FREEZING.
> 
> we set __WQ_FREEZING only when freezable(to simplify pwq_adjust_max_active()),
> make freeze_workqueues_begin() and thaw_workqueues() fast skip non-freezable 
> wq.
> 
> Changed from previous patches(requested by tj):
>   1) added the WARN_ON_ONCE() back
>   2) merged the two patches as one

Ping.

Hi, Tejun

You have reviewed this patch over several rounds.
I have applied all of your requests (the last two are listed above) from your comments.

I'm deeply sorry for responding so late.

Thanks,
Lai


> 
> Signed-off-by: Lai Jiangshan 
> ---
>  include/linux/workqueue.h |1 +
>  kernel/workqueue.c|   43 ---
>  2 files changed, 13 insertions(+), 31 deletions(-)
> 
> diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
> index 704f4f6..a45202b 100644
> --- a/include/linux/workqueue.h
> +++ b/include/linux/workqueue.h
> @@ -335,6 +335,7 @@ enum {
>*/
>   WQ_POWER_EFFICIENT  = 1 << 7,
>  
> + __WQ_FREEZING   = 1 << 15, /* internel: workqueue is freezing */
>   __WQ_DRAINING   = 1 << 16, /* internal: workqueue is draining */
>   __WQ_ORDERED= 1 << 17, /* internal: workqueue is ordered */
>  
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index 193e977..0c74979 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -70,7 +70,6 @@ enum {
>*/
>   POOL_MANAGE_WORKERS = 1 << 0,   /* need to manage workers */
>   POOL_DISASSOCIATED  = 1 << 2,   /* cpu can't serve workers */
> - POOL_FREEZING   = 1 << 3,   /* freeze in progress */
>  
>   /* worker flags */
>   WORKER_STARTED  = 1 << 0,   /* started */
> @@ -3632,9 +3631,6 @@ static struct worker_pool *get_unbound_pool(const 
> struct workqueue_attrs *attrs)
>   if (!pool || init_worker_pool(pool) < 0)
>   goto fail;
>  
> - if (workqueue_freezing)
> - pool->flags |= POOL_FREEZING;
> -
>   lockdep_set_subclass(&pool->lock, 1);   /* see put_pwq() */
>   copy_workqueue_attrs(pool->attrs, attrs);
>  
> @@ -3730,18 +3726,13 @@ static void pwq_unbound_release_workfn(struct 
> work_struct *work)
>  static void pwq_adjust_max_active(struct pool_workqueue *pwq)
>  {
>   struct workqueue_struct *wq = pwq->wq;
> - bool freezable = wq->flags & WQ_FREEZABLE;
>  
> - /* for @wq->saved_max_active */
> + /* for @wq->saved_max_active and @wq->flags */
>   lockdep_assert_held(&wq->mutex);
>  
> - /* fast exit for non-freezable wqs */
> - if (!freezable && pwq->max_active == wq->saved_max_active)
> - return;
> -
>   spin_lock_irq(&pwq->pool->lock);
>  
> - if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) {
> + if (!(wq->flags & __WQ_FREEZING)) {
>   pwq->max_active = wq->saved_max_active;
>  
>   while (!list_empty(&pwq->delayed_works) &&
> @@ -4250,6 +4241,8 @@ struct workqueue_struct *__alloc_workqueue_key(const 
> char *fmt,
>   mutex_lock(&wq_pool_mutex);
>  
>   mutex_lock(&wq->mutex);
> + if ((wq->flags & WQ_FREEZABLE) && workqueue_freezing)
> + wq->flags |= __WQ_FREEZING;
>   for_each_pwq(pwq, wq)
>   pwq_adjust_max_active(pwq);
>   mutex_unlock(&wq->mutex);
> @@ -4856,26 +4849,20 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
>   */
>  void freeze_workqueues_begin(void)
>  {
> - struct worker_pool *pool;
>   struct workqueue_struct *wq;
>   struct pool_workqueue *pwq;
> - int pi;
>  
>   mutex_lock(&wq_pool_mutex);
>  
>   WARN_ON_ONCE(workqueue_freezing);
>   workqueue_freezing = true;
>  
> - /* set FREEZING */
> - for_each_pool(pool, pi) {
> - spin_lock_irq(&pool->lock);
> - WARN_ON_ONCE(pool->flags & POOL_FREEZING);
> - pool->flags |= POOL_FREEZING;
> - spin_unlock_irq(&pool->lock);
> - }
> -
>   list_for_each_entry(wq, &workqueues, list) {
> + if (!(wq->flags & WQ_FREEZABLE))
> + continue;
> 

[PATCH] workqueue: fix possible race condition when rescuer VS pwq-release

2014-03-27 Thread Lai Jiangshan
There is a race condition between rescuer_thread() and
pwq_unbound_release_workfn().

The works of the @pwq may be processed by some other worker,
and @pwq may be scheduled for release before the rescuer starts
to process it.  In this case pwq_unbound_release_workfn() will
corrupt wq->maydays, and rescuer_thread() will access the corrupted
data.

Taking a reference with get_pwq() in send_mayday() keeps @pwq alive
and avoids the race condition.

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |7 +++
 1 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 82ef9f3..7066519 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1902,6 +1902,12 @@ static void send_mayday(struct work_struct *work)
 
/* mayday mayday mayday */
if (list_empty(&pwq->mayday_node)) {
+   /*
+* Keep the pwq and avoid the pwq to be scheduled to release
+* when someone else processes all the works before the rescuer
+* starts to process.
+*/
+   get_pwq(pwq);
list_add_tail(&pwq->mayday_node, &wq->maydays);
wake_up_process(wq->rescuer->task);
}
@@ -2418,6 +2424,7 @@ repeat:
/* migrate to the target cpu if possible */
worker_maybe_bind_and_lock(pool);
rescuer->pool = pool;
+   put_pwq(pwq);
 
/*
 * Slurp in all works issued via this workqueue and
-- 
1.7.7.6



Re: [PATCH] workqueue: fix double unlock bug

2014-04-13 Thread Lai Jiangshan
On 04/14/2014 08:58 AM, Daeseok Youn wrote:
> 
> mutex_unlock() and put_pwq_unlocked() do not need to be called
> when alloc_unbound_pwq() is failed.
> 
> Signed-off-by: Daeseok Youn 
> ---
>  kernel/workqueue.c |2 +-
>  1 files changed, 1 insertions(+), 1 deletions(-)
> 
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index 0ee63af..e6e9f6a 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -4100,7 +4100,7 @@ static void wq_update_unbound_numa(struct 
> workqueue_struct *wq, int cpu,
>   if (!pwq) {
>   pr_warning("workqueue: allocation failed while updating NUMA 
> affinity of \"%s\"\n",
>  wq->name);
> - goto out_unlock;
> + return;
>   }
>  
>   /*


Nice catch!!!

The expected correct behavior is documented at the head of
this function.  We forgot to do it.

 * If NUMA affinity can't be adjusted due to memory allocation failure, it
 * falls back to @wq->dfl_pwq which may not be optimal but is always
 * correct.

Could you use the following code instead of "goto out_unlock":
mutex_lock(&wq->mutex);
if (pwq == wq->dfl_pwq)
goto out_unlock;
else
goto use_dfl_pwq;

Correct, but BAD.  There are two blocks of such ugly code in this function:
if (pwq == wq->dfl_pwq)
goto out_unlock;
else
goto use_dfl_pwq;

You can replace both of these blocks with the following code:
goto use_dfl_pwq;

The result is the same as before, except that it adds a little overhead.
I don't care about the small overhead in this function.
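
Putting the two comments together, the failure path in
wq_update_unbound_numa() would end up looking roughly like this (a
sketch only; the final form is in the V3 patch further down the thread):

	if (!pwq) {
		pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
			   wq->name);
		/* fall back to wq->dfl_pwq; wq->mutex was dropped before the allocation */
		mutex_lock(&wq->mutex);
		goto use_dfl_pwq;
	}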

Thanks
Lai



Re: [PATCH V2] workqueue: fix possible race condition when rescuer VS pwq-release

2014-04-14 Thread Lai Jiangshan
On 04/01/2014 04:06 AM, Tejun Heo wrote:
> On Mon, Mar 31, 2014 at 10:40:47PM +0800, Lai Jiangshan wrote:
>> Ping
> 
> Lai, I'll keep the mail tagged but can you pleaes ping me once -rc1
> drops?  While it is an actual bug, given that it'd be pretty difficult
> to trigger, I don't think it's too urgent.
> 
> Thanks!
> 

Hi, Tejun
Ping.

Very old (resent) patch (1 patch):
workqueue: add __WQ_FREEZING and remove POOL_FREEZING

Possible race-condition patch (1 patch, in this email thread):
workqueue: fix possible race condition when rescuer VS pwq-release

Worker management patchset (6 patches):
workqueue: simpler&better workers management synchronization
  workqueue: generic routine to restore percpu/unbound pools' workers'
cpumask
  workqueue: generic framework to manage normal&rescuer workers'
cpumask
  workqueue: make destroy_worker() atomically
  workqueue: commit worker to pool's concurrency setting atomically.
  workqueue: remove manager_mutex
  workqueue: destroy worker directly in idle timeout handler

Sorry for adding some work to you.

Thanks
Lai


Re: [PATCH V2] workqueue: fix double unlock bug

2014-04-14 Thread Lai Jiangshan
On Mon, Apr 14, 2014 at 6:41 PM, Daeseok Youn  wrote:
>
> mutex_unlock() and put_pwq_unlocked() do not need to be called
> when alloc_unbound_pwq() is failed.
>
> And remove "if" condition for whether "pwq" is same as "wq->dfl_pwq"
> when wq_calc_node_cpumask() returns false and just use "goto use_dfl_pwq"
>
> Signed-off-by: Daeseok Youn 
> ---
> V2: replace "if condition" with "goto" as Lai's comment.
>
>  kernel/workqueue.c |7 ++-
>  1 files changed, 2 insertions(+), 5 deletions(-)
>
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index 0ee63af..9853067 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -4087,10 +4087,7 @@ static void wq_update_unbound_numa(struct 
> workqueue_struct *wq, int cpu,
> if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
> goto out_unlock;
> } else {
> -   if (pwq == wq->dfl_pwq)
> -   goto out_unlock;
> -   else
> -   goto use_dfl_pwq;
> +   goto use_dfl_pwq;
> }
>
> mutex_unlock(&wq->mutex);
> @@ -4100,7 +4097,7 @@ static void wq_update_unbound_numa(struct 
> workqueue_struct *wq, int cpu,
> if (!pwq) {
> pr_warning("workqueue: allocation failed while updating NUMA 
> affinity of \"%s\"\n",
>wq->name);
> -   goto out_unlock;
> +   return;

Please acquire wq->mutex and goto use_dfl_pwq;

> }
>
> /*
> --
> 1.7.4.4
>
>


Re: [PATCH V3] workqueue: fix double unlock bug

2014-04-14 Thread Lai Jiangshan
Hi, Tejun

Acked-by: Lai Jiangshan 
CC: sta...@kernel.org

Thanks,
Lai

On 04/15/2014 07:17 AM, Daeseok Youn wrote:
> 
> Use default pwq when alloc_unbound_pwq() is failed.
> 
> And remove "if" condition for whether "pwq" is same as "wq->dfl_pwq"
> when wq_calc_node_cpumask() returns false and just use "goto use_dfl_pwq"
> 
> Signed-off-by: Daeseok Youn 
> ---
> V2: replace "if condition" with "goto" as Lai's comment.
> V3: Use default pwq when alloc_unbound_pwq() is failed.
> 
>  kernel/workqueue.c |8 +++-
>  1 files changed, 3 insertions(+), 5 deletions(-)
> 
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index 0ee63af..0679854 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -4087,10 +4087,7 @@ static void wq_update_unbound_numa(struct 
> workqueue_struct *wq, int cpu,
>   if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
>   goto out_unlock;
>   } else {
> - if (pwq == wq->dfl_pwq)
> - goto out_unlock;
> - else
> - goto use_dfl_pwq;
> + goto use_dfl_pwq;
>   }
>  
>   mutex_unlock(&wq->mutex);
> @@ -4100,7 +4097,8 @@ static void wq_update_unbound_numa(struct 
> workqueue_struct *wq, int cpu,
>   if (!pwq) {
>   pr_warning("workqueue: allocation failed while updating NUMA 
> affinity of \"%s\"\n",
>  wq->name);
> - goto out_unlock;
> + mutex_lock(&wq->mutex);
> + goto use_dfl_pwq;
>   }
>  
>   /*



[PATCH] workqueue: allow changing attributions of ordered workqueue

2014-04-15 Thread Lai Jiangshan
>From 534f1df8a5a03427b0fc382150fbd34e05648a28 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan 
Date: Tue, 15 Apr 2014 17:52:19 +0800
Subject: [PATCH] workqueue: allow changing attributions of ordered workqueue

Allow changing an ordered workqueue's cpumask.
Allow changing an ordered workqueue's nice value.
Allow registering an ordered workqueue to SYSFS.

Still disallow changing an ordered workqueue's max_active, which would
break the ordering guarantee.
Still disallow changing an ordered workqueue's no_numa, which would
break the ordering guarantee.

Changing attributions introduces new pwqs in a given workqueue, and the
old implementation had no way to handle multiple pwqs on ordered
workqueues, so changing attributions for ordered workqueues was
disallowed.

After investigating several solutions that try to handle multiple pwqs
on ordered workqueues, I found that the solution which reuses
max_active is the simplest.  In this solution, a newly introduced pwq's
max_active is kept at *ZERO* until it becomes the oldest pwq.  Thus
only one (the oldest) pwq is active, and ordering is guaranteed.

This solution enforces ordering at a higher level while non-reentrancy
is also enforced, so we don't need to queue any work to its previous
pool, and we shouldn't do so.  We must disallow any work that
repeatedly requeues itself back-to-back, since that would keep the
oldest pwq alive and starve the newest pwq (it would never become
active).
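
To make the mechanism concrete, the activation rule can be sketched as
follows (pwq_activatable() is an illustrative name only, not necessarily
what the patch below uses):

	/*
	 * Sketch: on an ordered workqueue only the oldest pwq may be
	 * active; every newer pwq keeps max_active == 0 until it becomes
	 * the oldest one.
	 */
	static bool pwq_activatable(struct pool_workqueue *pwq)
	{
		struct workqueue_struct *wq = pwq->wq;

		if (!(wq->flags & __WQ_ORDERED))
			return true;

		/* oldest_pwq() is introduced by the patch below */
		return oldest_pwq(wq) == pwq;
	}

pwq_adjust_max_active() would then give the pwq wq->saved_max_active
only when this returns true and keep it at 0 otherwise; when the oldest
pwq is released, the next-oldest one is activated (see
pwq_unbound_release_workfn() in the diff).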

Build test only.
This patch depends on previous patch:
workqueue: add __WQ_FREEZING and remove POOL_FREEZING

Frederic:
You can pick this patch into your updated patchset before
    TJ applies it.

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |   66 ++-
 1 files changed, 39 insertions(+), 27 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 92c9ada..fadcc4a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1350,8 +1350,14 @@ retry:
 * If @work was previously on a different pool, it might still be
 * running there, in which case the work needs to be queued on that
 * pool to guarantee non-reentrancy.
+*
+* pwqs are guaranteed active orderly for ordered workqueue, and
+* it guarantees non-reentrancy for works. So any work doesn't need
+* to be queued on previous pool. And the works shouldn't be queued
+* on previous pool, since we need to guarantee the prevous pwq
+* releasable to avoid work-stavation on the newest pool.
 */
-   last_pool = get_work_pool(work);
+   last_pool = wq->flags & __WQ_ORDERED ? NULL : get_work_pool(work);
if (last_pool && last_pool != pwq->pool) {
struct worker *worker;
 
@@ -3179,6 +3185,10 @@ static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
struct workqueue_attrs *attrs;
int v, ret;
 
+   /* Creating per-node pwqs breaks ordering guarantee. Keep no_numa = 1 */
+   if (WARN_ON(wq->flags & __WQ_ORDERED))
+   return -EINVAL;
+
attrs = wq_sysfs_prep_attrs(wq);
if (!attrs)
return -ENOMEM;
@@ -3239,14 +3249,6 @@ int workqueue_sysfs_register(struct workqueue_struct *wq)
struct wq_device *wq_dev;
int ret;
 
-   /*
-* Adjusting max_active or creating new pwqs by applyting
-* attributes breaks ordering guarantee.  Disallow exposing ordered
-* workqueues.
-*/
-   if (WARN_ON(wq->flags & __WQ_ORDERED))
-   return -EINVAL;
-
wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
if (!wq_dev)
return -ENOMEM;
@@ -3568,6 +3570,13 @@ static void rcu_free_pwq(struct rcu_head *rcu)
container_of(rcu, struct pool_workqueue, rcu));
 }
 
+static struct pool_workqueue *oldest_pwq(struct workqueue_struct *wq)
+{
+   return list_last_entry(&wq->pwqs, struct pool_workqueue, pwqs_node);
+}
+
+static void pwq_adjust_max_active(struct pool_workqueue *pwq);
+
 /*
  * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt
  * and needs to be destroyed.
@@ -3583,14 +3592,12 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
return;
 
-   /*
-* Unlink @pwq.  Synchronization against wq->mutex isn't strictly
-* necessary on release but do it anyway.  It's easier to verify
-* and consistent with the linking path.
-*/
mutex_lock(&wq->mutex);
list_del_rcu(&pwq->pwqs_node);
is_last = list_empty(&wq->pwqs);
+   /* try to active the oldest pwq when needed */
+   if (!is_last && (wq->flags & __WQ_ORDERED))
+   pwq_adjust_max_active(oldest_pwq(wq));
mutex_unlock(&wq->mute

Re: [PATCH] workqueue: allow changing attributions of ordered workqueue

2014-04-15 Thread Lai Jiangshan
>
> So IUUC this is to allow registering ordered workqueue as WQ_SYSFS? But I 
> think

Making ordered workqueues' attributions changeable was on my TODO list.
I thought it might help you, so I prioritized it.
Never mind, I will push my patch separately if you don't need it.

> we are going to follow Tejun's suggestion to have a low level cpumask which 
> defines
> the limit for all unbound workqueues. This indeed tremendously simplifies 
> everyting.
> I'll post the series soon.

It sounds feasible.  I look forward to your patches.

>
> But maybe I'm missing other requirements that are fixed by your patch?
>
> Thanks.


Re: [PATCH V2] workqueue: fix possible race condition when rescuer VS pwq-release

2014-04-15 Thread Lai Jiangshan
On 04/16/2014 12:47 AM, Tejun Heo wrote:
> On Fri, Mar 28, 2014 at 08:07:58PM +0800, Lai Jiangshan wrote:
>> +static inline void get_unbound_pwq(struct pool_workqueue *pwq)
>> +{
>> +if (pwq->wq->flags & WQ_UNBOUND)
>> +get_pwq(pwq);
>> +}
>> +
>>  /**
>>   * put_pwq - put a pool_workqueue reference
>>   * @pwq: pool_workqueue to put
>> @@ -1075,6 +1081,12 @@ static void put_pwq(struct pool_workqueue *pwq)
>>  schedule_work(&pwq->unbound_release_work);
>>  }
>>  
>> +static inline void put_unbound_pwq(struct pool_workqueue *pwq)
>> +{
>> +if (pwq->wq->flags & WQ_UNBOUND)
>> +put_pwq(pwq);
>> +}
> 
> Ugh... please drop these helpers.
> 
>> +get_unbound_pwq(pwq);
> 
> Why not just do get_pwq() here?

The V1 patch just does get_pwq().

> 
> Thanks.
> 

1) Our aim is to protect the unbound pwq, not the percpu pwq, which can't
   be protected by get_pwq().
2) get_pwq() will confuse/surprise reviewers: destroy_workqueue() may
   destroy percpu pwqs with ref > 1.  At least we would need to add more
   comments to explain this behavior.  Original comment:
/*
 * The base ref is never dropped on per-cpu pwqs.  Directly
 * free the pwqs and wq.
 */
3) get_unbound_pwq() is self-documenting.

Thanks,
Lai


Re: [PATCH V2] workqueue: fix possible race condition when rescuer VS pwq-release

2014-04-16 Thread Lai Jiangshan
On Wed, Apr 16, 2014 at 11:23 PM, Tejun Heo  wrote:
> Hello, Lai.
>
> On Wed, Apr 16, 2014 at 09:25:16AM +0800, Lai Jiangshan wrote:
>> 1) Our aim is to protect unbound pwq, not percpu pwq which can't be be 
>> protected by get_pwq().
>> 2) get_pwq() will make reviewers confused/surprised, destroy_workqueue() may 
>> destroy percpu pwqs
>>with ref > 1. At least we need to add more comments explain this 
>> behavior. Origin comments:
>>   /*
>>* The base ref is never dropped on per-cpu pwqs.  Directly
>>* free the pwqs and wq.
>>*/

Hi, Tejun

OK, it is better to use get_pwq().  I will also change the above comment to:

  The base ref and the possible ref from the (stopped) rescuer are never
  dropped on per-cpu pwqs.
  Directly free the pwqs and wq.

The reason I quickly dropped V1 and wrote V2 is that I saw this comment;
"the base ref" is no longer precise once get_pwq() is used as in V1.

Or, to avoid changing this comment, I can also move the following code
down to the bottom of rescuer_thread():

if (kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
rescuer->task->flags &= ~PF_WQ_WORKER;
return 0;
}

(I am replying to this email in a browser; never mind the syntax.)
Maybe the second choice is better.

Any thoughts?

Thanks,
Lai.

>
> You can just comment "pwqs might go away at any time, pin it until
> rescuer is done with it" and that's actually the better way to do it.
> percpu wq's not supporting attribute changes may change in the future.
> What the code path wants is pinning down the pwq no matter where it
> came from.  There's no point in distinguishing unbound and per-cpu
> here.
>
>> 3) get_unbound_pwq() self document.
>
> Not really.  If the name is get_pwq_if_unbound(), maybe.  Functions
> which take args and become noop depending on the argument aren't
> generally good ideas.  There are specific cases that they are suitable
> but this is just gratuituous.
>
> Thanks.
>
> --
> tejun


Re: [PATCH 1/2] workqueue: fix bugs in wq_update_unbound_numa() failure path

2014-04-16 Thread Lai Jiangshan
Acked.

On Wed, Apr 16, 2014 at 1:32 PM, Daeseok Youn  wrote:
>
> wq_update_unbound_numa() failure path has the following two bugs.
>  - alloc_unbound_pwq() is called without holding wq->mutex;
>  however, if the allocation fails, it jumps to out_unlock
>  which tries to unlock wq->mutex.
>
>  - The function should switch to dfl_pwq on failure
>  but didn't do so after alloc_unbound_pwq() failure.
>
> Fix it by regrabbing wq->mutex and jumping to use_dfl_pwq on
> alloc_unbound_pwq() failure.
>
> Signed-off-by: Daeseok Youn 
> ---
>  kernel/workqueue.c |3 ++-
>  1 files changed, 2 insertions(+), 1 deletions(-)
>
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index 0ee63af..3150b21 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -4100,7 +4100,8 @@ static void wq_update_unbound_numa(struct 
> workqueue_struct *wq, int cpu,
> if (!pwq) {
> pr_warning("workqueue: allocation failed while updating NUMA 
> affinity of \"%s\"\n",
>wq->name);
> -   goto out_unlock;
> +   mutex_lock(&wq->mutex);
> +   goto use_dfl_pwq;
> }
>
> /*
> --
> 1.7.4.4
>
>


Re: [PATCH 6/6] percpu-refcount: implement percpu_ref_reinit() and percpu_ref_is_zero()

2014-06-18 Thread Lai Jiangshan
On 06/18/2014 11:32 PM, Tejun Heo wrote:
> On Wed, Jun 18, 2014 at 11:37:35AM +0800, Lai Jiangshan wrote:
>>> @@ -97,7 +98,10 @@ static inline void percpu_ref_kill(struct percpu_ref 
>>> *ref)
>>>  static inline bool __pcpu_ref_alive(struct percpu_ref *ref,
>>> unsigned __percpu **pcpu_countp)
>>>  {
>>> -   unsigned long pcpu_ptr = ACCESS_ONCE(ref->pcpu_count_ptr);
>>> +   unsigned long pcpu_ptr;
>>> +
>>> +   /* paired with smp_store_release() in percpu_ref_reinit() */
>>> +   pcpu_ptr = smp_load_acquire(&ref->pcpu_count_ptr);
>>
>>
>> Does "smp_load_acquire()" hurts the performance of percpu_ref_get/put()
>> in non-x86 system?
> 
> It's equivalent to data dependency barrier.  The only arch which needs
> something more than barrier() is alpha.  It isn't an issue.
> 

But I searched the source: smp_load_acquire() is just barrier() on x86,
arm64, ia64, s390 and sparc, but it includes a memory-barrier instruction
on the other archs.

CC Paul.  If smp_load_acquire() is sufficiently lightweight, I would
update the SRCU.

Thanks,
Lai


Re: [PATCH v2 6/6] percpu-refcount: implement percpu_ref_reinit() and percpu_ref_is_zero()

2014-06-18 Thread Lai Jiangshan
On 06/19/2014 10:20 AM, Tejun Heo wrote:
> Now that explicit invocation of percpu_ref_exit() is necessary to free
> the percpu counter, we can implement percpu_ref_reinit() which
> reinitializes a released percpu_ref.  This can be used implement
> scalable gating switch which can be drained and then re-opened without
> worrying about memory allocation failures.
> 
> percpu_ref_is_zero() is added to be used in a sanity check in
> percpu_ref_exit().  As this function will be useful for other purposes
> too, make it a public interface.
> 
> v2: Use smp_read_barrier_depends() instead of smp_load_acquire().  We
> only need data dep barrier and smp_load_acquire() is stronger and
>     heavier on some archs.  Spotted by Lai Jiangshan.
> 
> Signed-off-by: Tejun Heo 
> Cc: Kent Overstreet 
> Cc: Christoph Lameter 
> Cc: Lai Jiangshan 
> ---
>  include/linux/percpu-refcount.h |   19 +++
>  lib/percpu-refcount.c   |   35 +++
>  2 files changed, 54 insertions(+)
> 
> --- a/include/linux/percpu-refcount.h
> +++ b/include/linux/percpu-refcount.h
> @@ -67,6 +67,7 @@ struct percpu_ref {
>  
>  int __must_check percpu_ref_init(struct percpu_ref *ref,
>percpu_ref_func_t *release);
> +void percpu_ref_reinit(struct percpu_ref *ref);
>  void percpu_ref_exit(struct percpu_ref *ref);
>  void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
>percpu_ref_func_t *confirm_kill);
> @@ -99,6 +100,9 @@ static inline bool __pcpu_ref_alive(stru
>  {
>   unsigned long pcpu_ptr = ACCESS_ONCE(ref->pcpu_count_ptr);
>  
> + /* paired with smp_store_release() in percpu_ref_reinit() */
> + smp_read_barrier_depends();
> +
>   if (unlikely(pcpu_ptr & PCPU_REF_DEAD))
>   return false;
>  
> @@ -206,4 +210,19 @@ static inline void percpu_ref_put(struct
>   rcu_read_unlock_sched();
>  }
>  
> +/**
> + * percpu_ref_is_zero - test whether a percpu refcount reached zero
> + * @ref: percpu_ref to test
> + *
> + * Returns %true if @ref reached zero.
> + */
> +static inline bool percpu_ref_is_zero(struct percpu_ref *ref)
> +{
> + unsigned __percpu *pcpu_count;
> +
> + if (__pcpu_ref_alive(ref, &pcpu_count))
> + return false;
> + return !atomic_read(&ref->count);
> +}
> +
>  #endif
> --- a/lib/percpu-refcount.c
> +++ b/lib/percpu-refcount.c
> @@ -61,6 +61,41 @@ int percpu_ref_init(struct percpu_ref *r
>  EXPORT_SYMBOL_GPL(percpu_ref_init);
>  
>  /**
> + * percpu_ref_reinit - re-initialize a percpu refcount
> + * @ref: perpcu_ref to re-initialize
> + *
> + * Re-initialize @ref so that it's in the same state as when it finished
> + * percpu_ref_init().  @ref must have been initialized successfully, killed
> + * and reached 0 but not exited.
> + *
> + * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while
> + * this function is in progress.
> + */
> +void percpu_ref_reinit(struct percpu_ref *ref)
> +{
> + unsigned __percpu *pcpu_count = pcpu_count_ptr(ref);
> + int cpu;
> +
> + BUG_ON(!pcpu_count);
> + WARN_ON(!percpu_ref_is_zero(ref));
> +
> + atomic_set(&ref->count, 1 + PCPU_COUNT_BIAS);
> +
> + /*
> +  * Restore per-cpu operation.  smp_store_release() is paired with
> +  * smp_load_acquire() in __pcpu_ref_alive() and guarantees that the

s/smp_load_acquire()/smp_read_barrier_depends()/

s/smp_store_release()/smp_mb()/  if you accept my next comment.

> +  * zeroing is visible to all percpu accesses which can see the
> +  * following PCPU_REF_DEAD clearing.
> +  */
> + for_each_possible_cpu(cpu)
> + *per_cpu_ptr(pcpu_count, cpu) = 0;
> +
> + smp_store_release(&ref->pcpu_count_ptr,
> +   ref->pcpu_count_ptr & ~PCPU_REF_DEAD);

I think it would be better if smp_mb() were used.

It is documented that smp_read_barrier_depends() and smp_mb() are paired,
not smp_read_barrier_depends() and smp_store_release().

> +}
> +EXPORT_SYMBOL_GPL(percpu_ref_reinit);
> +
> +/**
>   * percpu_ref_exit - undo percpu_ref_init()
>   * @ref: percpu_ref to exit
>   *


[PATCH 2/3] workqueue: use dedicated creater kthread for all pools

2014-07-25 Thread Lai Jiangshan
 Although the early kworkers do not
depend on kworker_creater_thread, this initialization order makes the pid
of kworker_creater_thread smaller than those of the kworkers, which looks
a bit smoother.

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |  190 ++--
 1 files changed, 80 insertions(+), 110 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1d44d8d..ce8e3fc 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -152,17 +152,17 @@ struct worker_pool {
struct list_headidle_list;  /* X: list of idle workers */
struct timer_list   idle_timer; /* L: worker idle timeout */
struct timer_list   mayday_timer;   /* L: SOS timer for workers */
+   struct timer_list   cooldown_timer; /* L: cool down before retry */
 
-   /* a workers is either on busy_hash or idle_list, or the manager */
+   /* a workers is either on busy_hash or idle_list */
DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
/* L: hash of busy workers */
 
-   /* see manage_workers() for details on the two manager mutexes */
-   struct mutexmanager_arb;/* manager arbitration */
struct mutexattach_mutex;   /* attach/detach exclusion */
struct list_headworkers;/* A: attached workers */
struct completion   *detach_completion; /* all workers detached */
 
+   struct kthread_work creater_work;   /* L: work for creating a new worker */
struct ida  worker_ida; /* worker IDs for task name */
 
struct workqueue_attrs  *attrs; /* I: worker attributes */
@@ -291,6 +291,10 @@ static DEFINE_SPINLOCK(wq_mayday_lock);/* protects wq->maydays list */
 static LIST_HEAD(workqueues);  /* PL: list of all workqueues */
 static bool workqueue_freezing;/* PL: have wqs started freezing? */
 
+/* kthread worker for creating workers for pools */
+static DEFINE_KTHREAD_WORKER(kworker_creater);
+static struct task_struct *kworker_creater_thread __read_mostly;
+
 /* the per-cpu worker pools */
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
 cpu_worker_pools);
@@ -753,8 +757,7 @@ static bool need_to_create_worker(struct worker_pool *pool)
 /* Do we have too many workers and should some go away? */
 static bool too_many_workers(struct worker_pool *pool)
 {
-   bool managing = mutex_is_locked(&pool->manager_arb);
-   int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
+   int nr_idle = pool->nr_idle;
int nr_busy = pool->nr_workers - nr_idle;
 
return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
@@ -1823,113 +1826,78 @@ static void pool_mayday_timeout(unsigned long __pool)
send_mayday(work);
}
 
+   if (!pool->nr_idle)
+   mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
+
spin_unlock(&pool->lock);
spin_unlock_irq(&wq_mayday_lock);
+}
+
+static void pool_cooldown_timeout(unsigned long __pool)
+{
+   struct worker_pool *pool = (void *)__pool;
 
-   mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
+   spin_lock_irq(&pool->lock);
+   if (!pool->nr_idle)
+   queue_kthread_work(&kworker_creater, &pool->creater_work);
+   spin_unlock_irq(&pool->lock);
 }
 
-/**
- * maybe_create_worker - create a new worker if necessary
- * @pool: pool to create a new worker for
- *
- * Create a new worker for @pool if necessary.  @pool is guaranteed to
- * have at least one idle worker on return from this function.  If
- * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
- * sent to all rescuers with works scheduled on @pool to resolve
- * possible allocation deadlock.
- *
- * On return, need_to_create_worker() is guaranteed to be %false and
- * may_start_working() %true.
- *
- * LOCKING:
- * spin_lock_irq(pool->lock) which may be released and regrabbed
- * multiple times.  Does GFP_KERNEL allocations.  Called only from
- * manager.
+/*
+ * Start the mayday timer and the creater when pool->nr_idle is reduced to 0.
  *
- * Return:
- * %false if no action was taken and pool->lock stayed locked, %true
- * otherwise.
+ * Any moment when pool->nr_idle == 0, the mayday timer must be active
+ * (pending or running) to ensure the mayday can be sent, and at least one
+ * of the creater_work or the cooldown timer must be active to ensure
+ * the creating is in progress or standby.
  */
-static bool maybe_create_worker(struct worker_pool *pool)
-__releases(&pool->lock)
-__acquires(&pool->lock)
+static void start_creater_work(struct worker_pool *pool)
 {
-   if (!need_to_create_worker(pool))

[PATCH 0/3] workqueue: offload the worker-management out from kworker

2014-07-25 Thread Lai Jiangshan
Currently a kworker prefers creating a worker (if required) to processing
work items; we want processing to be the first priority.

The jobs done by the managers are serialized, so having multiple managers
is just wasteful; one worker-creater is enough.

Implementing the manager inside the worker causes much complication and
trickiness; using a dedicated creater makes things more flexible.

So we offload the worker management out of the kworkers into a single
dedicated creater kthread.  This is done in patch 2; patch 1 is the
preparation and patch 3 is a cleanup.
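
For reference, the overall shape is roughly the following (a simplified
sketch of patch 2, not the patch itself; pool_create_worker_fn() and
start_creater_kthread() are illustrative names, while start_creater_work()
and the creater_work field do appear in patch 2):

	/* one creater kthread_worker shared by all pools */
	static DEFINE_KTHREAD_WORKER(kworker_creater);
	static struct task_struct *kworker_creater_thread __read_mostly;

	/* runs in the creater kthread; free to block in GFP_KERNEL allocations */
	static void pool_create_worker_fn(struct kthread_work *work)
	{
		struct worker_pool *pool =
			container_of(work, struct worker_pool, creater_work);

		create_worker(pool);
	}

	/* called when pool->nr_idle drops to zero */
	static void start_creater_work(struct worker_pool *pool)
	{
		queue_kthread_work(&kworker_creater, &pool->creater_work);
	}

	/* boot: one creater kthread serves all pools; thread name illustrative */
	static void __init start_creater_kthread(void)
	{
		kworker_creater_thread = kthread_run(kthread_worker_fn,
						     &kworker_creater,
						     "kworker/creater");
		BUG_ON(IS_ERR(kworker_creater_thread));
	}

(pool->creater_work would be initialized with init_kthread_work() when the
pool is set up; the real patch also wires up the mayday and cooldown
timers shown in patch 2.)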

Lai Jiangshan (3):
  workqueue: migrate the new worker before add it to idle_list
  workqueue: use dedicated creater kthread for all pools
  workqueue: cleanup may_start_working()

 kernel/workqueue.c |  228 ++--
 1 files changed, 96 insertions(+), 132 deletions(-)

-- 
1.7.4.4



[PATCH 3/3] workqueue: cleanup may_start_working()

2014-07-25 Thread Lai Jiangshan
The name of may_start_working() has become misleading: the semantics of
"!pool->nr_idle" have changed, and any worker can start working regardless
of the value of pool->nr_idle.

So we remove may_start_working() and use "!pool->nr_idle" directly;
need_to_create_worker() is also removed along with it.

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |   37 +
 1 files changed, 13 insertions(+), 24 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ce8e3fc..e1ab4f9 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -735,12 +735,6 @@ static bool need_more_worker(struct worker_pool *pool)
return !list_empty(&pool->worklist) && __need_more_worker(pool);
 }
 
-/* Can I start working?  Called from busy but !running workers. */
-static bool may_start_working(struct worker_pool *pool)
-{
-   return pool->nr_idle;
-}
-
 /* Do I need to keep working?  Called from currently running workers. */
 static bool keep_working(struct worker_pool *pool)
 {
@@ -748,12 +742,6 @@ static bool keep_working(struct worker_pool *pool)
atomic_read(&pool->nr_running) <= 1;
 }
 
-/* Do we need a new worker?  Called from manager. */
-static bool need_to_create_worker(struct worker_pool *pool)
-{
-   return need_more_worker(pool) && !may_start_working(pool);
-}
-
 /* Do we have too many workers and should some go away? */
 static bool too_many_workers(struct worker_pool *pool)
 {
@@ -1815,19 +1803,20 @@ static void pool_mayday_timeout(unsigned long __pool)
spin_lock_irq(&wq_mayday_lock); /* for wq->maydays */
spin_lock(&pool->lock);
 
-   if (need_to_create_worker(pool)) {
-   /*
-* We've been trying to create a new worker but
-* haven't been successful.  We might be hitting an
-* allocation deadlock.  Send distress signals to
-* rescuers.
-*/
-   list_for_each_entry(work, &pool->worklist, entry)
-   send_mayday(work);
-   }
+   if (!pool->nr_idle) {
+   if (need_more_worker(pool)) {
+   /*
+* We've been trying to create a new worker but
+* haven't been successful.  We might be hitting an
+* allocation deadlock.  Send distress signals to
+* rescuers.
+*/
+   list_for_each_entry(work, &pool->worklist, entry)
+   send_mayday(work);
+   }
 
-   if (!pool->nr_idle)
mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
+   }
 
spin_unlock(&pool->lock);
spin_unlock_irq(&wq_mayday_lock);
@@ -2097,7 +2086,7 @@ woke_up:
goto sleep;
 
/* do we need to create worker? */
-   if (unlikely(!may_start_working(pool)))
+   if (unlikely(!pool->nr_idle))
start_creater_work(pool);
 
/*
-- 
1.7.4.4



[PATCH 1/3] workqueue: migrate the new worker before add it to idle_list

2014-07-25 Thread Lai Jiangshan
There is an undocumented requirement for create_worker(): except for the
first call, it can only be called from an existing worker (a.k.a. the
manager).

The reason is that the current create_worker() first queues the new worker
on idle_list and then wakes it up.  But the new worker is not guaranteed
to have been migrated until it is woken up.  Thus wq_worker_sleeping() may
see the new, not-yet-local worker on the idle_list if this block of code
is not executed on the local CPU (which would disable
wq_worker_sleeping()).  An existing worker is guaranteed to run on the
local CPU when !DISASSOCIATED, so create_worker() is currently required to
be called from an existing worker/manager only.

But we are planning to allow create_worker() to be called from outside its
workers, and this requirement should be relaxed.  So we exchange the order
of the code: the new worker is woken up before being queued on idle_list.

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |7 ++-
 1 files changed, 6 insertions(+), 1 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 370f947..1d44d8d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1708,8 +1708,13 @@ static struct worker *create_worker(struct worker_pool *pool)
/* start the newly created worker */
spin_lock_irq(&pool->lock);
worker->pool->nr_workers++;
-   worker_enter_idle(worker);
+   /*
+* Wake up the worker at first and then queue it to the idle_list,
+* so that it is ensued that the wq_worker_sleeping() sees the worker
+* had been migrated properly when sees this worker in the idle_list.
+*/
wake_up_process(worker->task);
+   worker_enter_idle(worker);
spin_unlock_irq(&pool->lock);
 
return worker;
-- 
1.7.4.4



[PATCH] kthread_work: add cancel_kthread_work[_sync]()

2014-07-25 Thread Lai Jiangshan
When an object or a subsystem goes away, we need to destroy the
kthread_work used by that object or subsystem.  We used to use
flush_kthread_work(), but flush_kthread_work() gives no guarantee that the
work stays stopped; that duty is pushed onto the users.

So we introduce cancel_kthread_work_sync() with a strict guarantee like
cancel_work_sync() in the workqueue code.  We also introduce
cancel_kthread_work(), which can be used by users under some conditions,
and which is required to keep the implementation of
cancel_kthread_work_sync() simple: kthread_flush_work_fn() owns the
running state of the kthread_worker and calls cancel_kthread_work() to
cancel a possibly requeued work.

Both cancel_kthread_work_sync() and cancel_kthread_work() share the code
of flush_kthread_work(), which also makes the implementation simpler.
Signed-off-by: Lai Jiangshan 
---
 include/linux/kthread.h |2 +
 kernel/kthread.c|   78 ++
 2 files changed, 66 insertions(+), 14 deletions(-)

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 790e49c..3cc3377 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -129,6 +129,8 @@ int kthread_worker_fn(void *worker_ptr);
 bool queue_kthread_work(struct kthread_worker *worker,
struct kthread_work *work);
 void flush_kthread_work(struct kthread_work *work);
+void cancel_kthread_work(struct kthread_work *work);
+void cancel_kthread_work_sync(struct kthread_work *work);
 void flush_kthread_worker(struct kthread_worker *worker);
 
 #endif /* _LINUX_KTHREAD_H */
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ef48322..b5d6844 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -622,6 +622,7 @@ EXPORT_SYMBOL_GPL(queue_kthread_work);
 
 struct kthread_flush_work {
struct kthread_work work;
+   struct kthread_work *cancel_work;
struct completion   done;
 };
 
@@ -629,24 +630,25 @@ static void kthread_flush_work_fn(struct kthread_work *work)
 {
struct kthread_flush_work *fwork =
container_of(work, struct kthread_flush_work, work);
+
+   /* cancel the possible requeued work for cancel_kthread_work_sync() */
+   if (fwork->cancel_work)
+   cancel_kthread_work(fwork->cancel_work);
complete(&fwork->done);
 }
 
-/**
- * flush_kthread_work - flush a kthread_work
- * @work: work to flush
- *
- * If @work is queued or executing, wait for it to finish execution.
- */
-void flush_kthread_work(struct kthread_work *work)
+static void __cancel_work_sync(struct kthread_work *work, bool cancel, bool sync)
 {
struct kthread_flush_work fwork = {
-   KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
-   COMPLETION_INITIALIZER_ONSTACK(fwork.done),
+   .work = KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
+   .done = COMPLETION_INITIALIZER_ONSTACK(fwork.done),
};
struct kthread_worker *worker;
bool noop = false;
 
+   if (WARN_ON(!cancel && !sync))
+   return;
+
 retry:
worker = work->worker;
if (!worker)
@@ -658,21 +660,69 @@ retry:
goto retry;
}
 
-   if (!list_empty(&work->node))
+   /* cancel the queued work */
+   if (cancel && !list_empty(&work->node))
+   list_del_init(&work->node);
+
+   /* cancel the work during flushing it if it is requeued */
+   if (cancel && sync)
+   fwork.cancel_work = work;
+
+   /* insert the kthread_flush_work when sync */
+   if (sync && !list_empty(&work->node))
insert_kthread_work(worker, &fwork.work, work->node.next);
-   else if (worker->current_work == work)
+   else if (sync && worker->current_work == work)
insert_kthread_work(worker, &fwork.work, worker->work_list.next);
else
noop = true;
 
spin_unlock_irq(&worker->lock);
 
-   if (!noop)
+   if (sync && !noop)
wait_for_completion(&fwork.done);
 }
+
+/**
+ * flush_kthread_work - flush a kthread_work
+ * @work: work to flush
+ *
+ * If @work is queued or executing, wait for it to finish execution.
+ */
+void flush_kthread_work(struct kthread_work *work)
+{
+   __cancel_work_sync(work, false, true);
+}
 EXPORT_SYMBOL_GPL(flush_kthread_work);
 
 /**
+ * cancel_kthread_work - cancel a kthread_work
+ * @work: work to cancel
+ *
+ * If @work is queued, cancel it. Note, the work maybe still
+ * be executing after it returns.
+ */
+void cancel_kthread_work(struct kthread_work *work)
+{
+   __cancel_work_sync(work, true, false);
+}
+EXPORT_SYMBOL_GPL(cancel_kthread_work);
+
+/**
+ * cancel_kthread_work_sync - cancel a kthread_work and sync it
+ * @work: work to cancel
+ *
+ * If @work is queued

[PATCH] kthread_work: remove the unused wait_queue_head

2014-07-25 Thread Lai Jiangshan
The wait_queue_head_t done has been totally unused since
flush_kthread_work() was re-implemented.  So we remove it, including its
initialization code.  Some LOCKDEP code also depends on this
wait_queue_head, so that LOCKDEP code is cleaned up as well.

Signed-off-by: Lai Jiangshan 
---
 include/linux/kthread.h |   16 +---
 1 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 7dcef33..790e49c 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -73,7 +73,6 @@ struct kthread_worker {
 struct kthread_work {
struct list_headnode;
kthread_work_func_t func;
-   wait_queue_head_t   done;
struct kthread_worker   *worker;
 };
 
@@ -85,7 +84,6 @@ struct kthread_work {
 #define KTHREAD_WORK_INIT(work, fn){   \
.node = LIST_HEAD_INIT((work).node),\
.func = (fn),   \
-   .done = __WAIT_QUEUE_HEAD_INITIALIZER((work).done), \
}
 
 #define DEFINE_KTHREAD_WORKER(worker)  \
@@ -95,24 +93,21 @@ struct kthread_work {
struct kthread_work work = KTHREAD_WORK_INIT(work, fn)
 
 /*
- * kthread_worker.lock and kthread_work.done need their own lockdep class
- * keys if they are defined on stack with lockdep enabled.  Use the
- * following macros when defining them on stack.
+ * kthread_worker.lock need its own lockdep class key if it is defined
+ * on stack with lockdep enabled.  Use the following macros when defining
+ * it on stack.
  */
 #ifdef CONFIG_LOCKDEP
 # define KTHREAD_WORKER_INIT_ONSTACK(worker)   \
({ init_kthread_worker(&worker); worker; })
 # define DEFINE_KTHREAD_WORKER_ONSTACK(worker) \
struct kthread_worker worker = KTHREAD_WORKER_INIT_ONSTACK(worker)
-# define KTHREAD_WORK_INIT_ONSTACK(work, fn)   \
-   ({ init_kthread_work((&work), fn); work; })
-# define DEFINE_KTHREAD_WORK_ONSTACK(work, fn) \
-   struct kthread_work work = KTHREAD_WORK_INIT_ONSTACK(work, fn)
 #else
 # define DEFINE_KTHREAD_WORKER_ONSTACK(worker) DEFINE_KTHREAD_WORKER(worker)
-# define DEFINE_KTHREAD_WORK_ONSTACK(work, fn) DEFINE_KTHREAD_WORK(work, fn)
 #endif
 
+# define DEFINE_KTHREAD_WORK_ONSTACK(work, fn) DEFINE_KTHREAD_WORK(work, fn)
+
 extern void __init_kthread_worker(struct kthread_worker *worker,
const char *name, struct lock_class_key *key);
 
@@ -127,7 +122,6 @@ extern void __init_kthread_worker(struct kthread_worker *worker,
memset((work), 0, sizeof(struct kthread_work)); \
INIT_LIST_HEAD(&(work)->node);  \
(work)->func = (fn);\
-   init_waitqueue_head(&(work)->done); \
} while (0)
 
 int kthread_worker_fn(void *worker_ptr);
-- 
1.7.4.4



[PATCH] kthread_work: wake up worker only when the worker is idle

2014-07-25 Thread Lai Jiangshan
If the worker task is not idle, it may be sleeping on some condition at
the request of the work it is running.  Our unfriendly wakeup in
insert_kthread_work() may confuse such a worker.

Signed-off-by: Lai Jiangshan 
---
 kernel/kthread.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/kernel/kthread.c b/kernel/kthread.c
index c2390f4..ef48322 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -591,7 +591,7 @@ static void insert_kthread_work(struct kthread_worker *worker,
 
list_add_tail(&work->node, pos);
work->worker = worker;
-   if (likely(worker->task))
+   if (!worker->current_work && likely(worker->task))
wake_up_process(worker->task);
 }
 
-- 
1.7.4.4



Re: [PATCH 2/3] workqueue: use dedicated creater kthread for all pools

2014-07-28 Thread Lai Jiangshan
On 07/29/2014 02:55 AM, Tejun Heo wrote:
> Hello, Lai.
> 
> On Sat, Jul 26, 2014 at 11:04:50AM +0800, Lai Jiangshan wrote:
>> There are some problems with the managers:
>>   1) The last idle worker prefer managing to processing.
>>  It is better that the processing of work items should be the first
>>  priority to make the whole system make progress earlier.
>>   2) managers among different pools can be parallel, but actually
>>  their major work are serialized on the kernel thread "kthreadd".
>>  These managers are sleeping and wasted when the system is lack
>>  of memory.
> 
> It's a bit difficult to get excited about this patchset given that
> this is mostly churn without many actual benefits.  Sure, it
> consolidates one-per-pool managers into one kthread_worker but it was
> one-per-pool already.  That said, I don't hate it and it may be
> considered an improvement.  I don't know.

It prefers processing works to creating workers, without any loss of the
guarantee.

Processing works makes direct progress for the system; creating a worker
adds delay and makes only indirect progress.

> 
>> This patch introduces a dedicated creater kthread which offloads the
>> managing from the workers, thus every worker makes effort to process work
>> rather than create worker, and there is no manager wasting on sleeping
>> when the system is lack of memory.  This dedicated creater kthread causes
>> a little more work serialized than before, but it is acceptable.
> 
> I do dislike the idea of creating this sort of hard serialization
> among separate worker pools.  It's probably acceptable but why are we
> doing this?  


I was about to use a per-cpu kthread_worker, but I switched to a single
global kthread_worker after I noticed that kthread_create() is already
serialized in kthreadd.

> To save one thread per pool and shed 30 lines of code
> while adding 40 lines to kthread_worker?

Our line counts differ!
Simplicity was also an aim of this patchset, and the total LOC is reduced.

> 
>>   1) the creater_work
>> Every pool has a struct kthread_work creater_work to create worker, and
>> the dedicated creater kthread processes all these creater_works of
>> all pools. struct kthread_work has itself execution exclusion, so we don't
>> need the manager_arb to handle the creating exclusion any more.
> 
> This explanation can be a bit misleading, I think.  We just don't have
> multiple workers trying to become managers anymore.
> 
>> put_unbound_pool() uses the flush_kthread_work() to synchronize with
>> the creating rather than uses the manager_arb.
>>
>>   2) the cooldown timer
>> The cooldown timer is introduced to implement the cool-down mechanism
>> rather than to causes the creater to sleep.  When the create_worker()
>> fails, the cooldown timer is requested and it will restart the creater_work.
> 
> Why?  Why doesn't the creator sleep?
> 
> ...
>>   5) the routine of the creater_work
>> The creater_work creates a worker in these two conditions:
>>   A) pool->nr_idle == 0
>>  A new worker is needed to be created obviously even there is no
>>  work item pending.  The busy workers may sleep and the pool can't
>>  serves the future new work items if no new worker is standby or
>>  being created.
> 
> This is kinda silly when the duty of worker creation is served by an
> external entity.  Why would a pool need any idle worker?

An idle worker must be ready, or at least being prepared, for
wq_worker_sleeping() or chained wakeups.
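
To illustrate why, here is a slightly simplified sketch of
wq_worker_sleeping() as it looks in this era's kernel/workqueue.c (details
trimmed): when the last running worker blocks, an already-idle worker is the
only thing the scheduler hook can wake immediately.

struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
{
        struct worker *worker = kthread_data(task), *to_wakeup = NULL;
        struct worker_pool *pool = worker->pool;

        if (worker->flags & WORKER_NOT_RUNNING)
                return NULL;

        /* last running worker going to sleep: wake an idle one, if any */
        if (atomic_dec_and_test(&pool->nr_running) &&
            !list_empty(&pool->worklist) && !list_empty(&pool->idle_list))
                to_wakeup = list_first_entry(&pool->idle_list,
                                             struct worker, entry);

        return to_wakeup ? to_wakeup->task : NULL;
}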

A percpu kthreadd could serve the wq_worker_sleeping() case instead, but it
is not a good idea to introduce a percpu kthreadd now since it has no other
user.

> insert_work() may as well just queue worker creation on demand.

Yes, that would save some workers for idle unbound pools.

> 
>>   8) init_workqueues()
>> The struct kthread_worker kworker_creater is initialized earlier than
>> worker_pools in init_workqueues() so that kworker_creater_thread is
>> created than all early kworkers.  Although the early kworkers are not
>> depends on kworker_creater_thread, but this initialization order makes
>> the pid of kworker_creater_thread smaller than kworkers which
>> seems more smooth.
> 
> Just don't create idle workers?

That is a good idea.

Thanks for review and comments.
Lai


[PATCH RFC 2/2 V2] workqueue: use dedicated creater kthread for all pools

2014-07-29 Thread Lai Jiangshan
There are some problems with the managers:
  1) The last idle worker prefers managing to processing.
     Processing work items should be the first priority so that the whole
     system makes progress earlier.
  2) Each pool always needs an additional idle worker, which is just wasteful.
  3) Managers of different pools can run in parallel, but in practice
     their major work is serialized on the kernel thread "kthreadd".
     These managers sleep and are wasted when the system is short
     of memory.
  4) It causes much complication and trickiness when the manager is
     implemented inside a worker.

This patch introduces a dedicated creater kthread which offloads the
managing from the workers, so every worker spends its effort processing
work rather than creating workers, and no idle worker or manager is wasted
sleeping.  All the manager code is removed.  The dedicated creater kthread
serializes a little more work than before, but that is acceptable since
the major work is already serialized.

We introduce the kthread_work creater_work, which calls create_worker()
to create a worker; start_creater_work(), which starts the mayday_timer
and the creater_work when needed; and pool->creating, which handles the
exclusion between the mayday_timer and the creater_work (a rough sketch
follows below).

We create a worker only when the worklist has work items pending but no
idle worker is ready.  So start_creater_work() is only called from
insert_work() or when the last idle worker goes busy while other work
items remain.  And we no longer create workers from init_workqueues(),
get_unbound_pool(), CPU_UP_PREPARE, or when the last idle worker leaves
idle; this saves code and unneeded workers.
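
As a rough, illustrative sketch of how I read that exclusion (names follow
the patch, but the body is not the actual implementation and assumes it is
called with pool->lock held):

static void start_creater_work(struct worker_pool *pool)
{
        lockdep_assert_held(&pool->lock);

        /* pool->creating excludes the mayday_timer and the creater_work */
        if (pool->creating)
                return;
        pool->creating = true;

        mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
        queue_kthread_work(&kworker_creater, &pool->creater_work);
}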

put_unbound_pool() now groups code by functionality rather than by name:
it first stops creation activity (creater_work, mayday_timer) and then
stops the idle workers and the idle_timer, roughly as sketched below.
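
A heavily simplified ordering sketch (illustrative only; the refcounting,
RCU and locking details of the real put_unbound_pool() are omitted):

        /* 1) stop creation activity first */
        del_timer_sync(&pool->mayday_timer);
        flush_kthread_work(&pool->creater_work);

        /* 2) then reap the idle workers and the idle timer */
        spin_lock_irq(&pool->lock);
        while ((worker = list_first_entry_or_null(&pool->idle_list,
                                                  struct worker, entry)))
                destroy_worker(worker);
        WARN_ON(pool->nr_workers || pool->nr_idle);
        spin_unlock_irq(&pool->lock);

        del_timer_sync(&pool->idle_timer);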

(1/2 patch is the 1/3 patch of the v1, so it is not resent.)

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |  238 ++--
 1 files changed, 63 insertions(+), 175 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1d44d8d..03e253f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -153,16 +153,16 @@ struct worker_pool {
struct timer_list   idle_timer; /* L: worker idle timeout */
struct timer_list   mayday_timer;   /* L: SOS timer for workers */
 
-   /* a workers is either on busy_hash or idle_list, or the manager */
+   /* a workers is either on busy_hash or idle_list */
DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
/* L: hash of busy workers */
 
-   /* see manage_workers() for details on the two manager mutexes */
-   struct mutexmanager_arb;/* manager arbitration */
struct mutexattach_mutex;   /* attach/detach exclusion */
struct list_headworkers;/* A: attached workers */
struct completion   *detach_completion; /* all workers detached */
 
+   struct kthread_work creater_work;   /* L: work for creating a new 
worker */
+   boolcreating;   /* L: creater_work is busy */
struct ida  worker_ida; /* worker IDs for task name */
 
struct workqueue_attrs  *attrs; /* I: worker attributes */
@@ -291,6 +291,10 @@ static DEFINE_SPINLOCK(wq_mayday_lock);/* protects 
wq->maydays list */
 static LIST_HEAD(workqueues);  /* PL: list of all workqueues */
 static bool workqueue_freezing;/* PL: have wqs started 
freezing? */
 
+/* kthread worker for creating workers for pools */
+static DEFINE_KTHREAD_WORKER(kworker_creater);
+static struct task_struct *kworker_creater_thread __read_mostly;
+
 /* the per-cpu worker pools */
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
 cpu_worker_pools);
@@ -731,12 +735,6 @@ static bool need_more_worker(struct worker_pool *pool)
return !list_empty(&pool->worklist) && __need_more_worker(pool);
 }
 
-/* Can I start working?  Called from busy but !running workers. */
-static bool may_start_working(struct worker_pool *pool)
-{
-   return pool->nr_idle;
-}
-
 /* Do I need to keep working?  Called from currently running workers. */
 static bool keep_working(struct worker_pool *pool)
 {
@@ -744,17 +742,10 @@ static bool keep_working(struct worker_pool *pool)
atomic_read(&pool->nr_running) <= 1;
 }
 
-/* Do we need a new worker?  Called from manager. */
-static bool need_to_create_worker(struct worker_pool *pool)
-{
-   return need_more_worker(pool) && !may_start_working(pool);
-}
-
 /* Do we have too many workers and should some go away? */
 static bool too_many_workers(struct worker_pool *pool)
 {
-   bool managing = mutex_

Re: [RFC PATCH 0/5] workqueue: Introduce low-level unbound wq sysfs cpumask v3

2014-07-11 Thread Lai Jiangshan
Hi, Frederic

I'd like to take over this work unless you are still working on it.
I would do some cleanup first, so progress on my side may be a bit slow.

Thanks,
Lai


On 05/17/2014 12:16 AM, Frederic Weisbecker wrote:
> So in this version I actually save the cpumask belonging to wq (before
> it's intersected against the low level cpumask) in its unbounds attrs.
> 
> But the attrs passed to pwq and worker pools have the low level cpumask
> computed against the wq cpumask.
> 
> It makes it easier that way as the wq cpumask itself remains untouched.
> It's a user setting so it must stay intact. OTOH the cpumask of pwqs and
> worker pools only belong to the implementation so it's the right
> place to store the effective cpumask.
> 
> Also wq_update_unbound_numa() is fixed, and cpu_possible_mask() 
> is restored as a base. Thanks to Lai!
> 
> Thanks,
>   Frederic
> 
> git://git.kernel.org/pub/scm/linux/kernel/git/frederic/linux-dynticks.git
> core/workqueue-v5
> ---
> 
> Frederic Weisbecker (4):
>   workqueue: Reorder sysfs code
>   workqueue: Create low-level unbound workqueues cpumask
>   workqueue: Split apply attrs code from its locking
>   workqueue: Allow modifying low level unbound workqueue cpumask
> 
> Lai Jiangshan (1):
>   workqueue: Allow changing attributions of ordered workqueues
> 
> 
>  kernel/workqueue.c | 1674 
> 
>  1 file changed, 900 insertions(+), 774 deletions(-)
> .
> 



[PATCH] workqueue: remove the del_timer_sync()s in maybe_create_worker()

2014-07-13 Thread Lai Jiangshan
The documentation says that a timer which is being deleted by
del_timer_sync() should not be restarted:
  Synchronization rules: Callers must prevent restarting of
  the timer, otherwise this function is meaningless.

A self-repeating timer may cause del_timer_sync() to spin longer, or even
to spin forever under extremely unlikely conditions (a sketch of such a
self-requeueing timer follows below).

I don't consider it a bug, but it disobeys the documentation.

To fix it, we need to:
  1) not requeue the mayday timer when !need_to_create_worker()
   This is a preparation/cleanup step for the fix.  Otherwise the timer
   would repeat forever if we only did 2).
  2) remove the del_timer_sync()s in maybe_create_worker()
   This fixes the problem.  Doing only 1) is not enough:
   need_to_create_worker() would probably still be true at the time
   del_timer_sync() is called, so the timer would still be repeating.
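
For illustration only, here is the general shape of a self-requeueing timer
like the mayday timer (made-up names; setup_timer() is assumed to have been
called with the context as the callback argument):

static void selfarm_timer_fn(unsigned long data)
{
        struct my_ctx *ctx = (struct my_ctx *)data;     /* hypothetical context */

        do_periodic_check(ctx);                         /* hypothetical helper */
        mod_timer(&ctx->timer, jiffies + HZ);           /* unconditional re-arm */
}

As long as the callback keeps re-arming the timer, del_timer_sync() has to
keep retrying, which is exactly what the documented rule asks callers to
prevent.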

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |6 ++
 1 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 35974ac..593b9bc 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1866,12 +1866,12 @@ static void pool_mayday_timeout(unsigned long __pool)
 */
list_for_each_entry(work, &pool->worklist, entry)
send_mayday(work);
+
+   mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
}
 
spin_unlock(&pool->lock);
spin_unlock_irq(&wq_mayday_lock);
-
-   mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
 }
 
 /**
@@ -1913,7 +1913,6 @@ restart:
 
worker = create_worker(pool);
if (worker) {
-   del_timer_sync(&pool->mayday_timer);
spin_lock_irq(&pool->lock);
start_worker(worker);
if (WARN_ON_ONCE(need_to_create_worker(pool)))
@@ -1931,7 +1930,6 @@ restart:
break;
}
 
-   del_timer_sync(&pool->mayday_timer);
spin_lock_irq(&pool->lock);
if (need_to_create_worker(pool))
goto restart;
-- 
1.7.7.6



Re: [PATCH 1/3] workqueue: remove the first check and the return value of maybe_create_worker()

2014-07-13 Thread Lai Jiangshan
On 07/11/2014 11:03 PM, Tejun Heo wrote:
> On Fri, Jul 11, 2014 at 12:01:03AM +0800, Lai Jiangshan wrote:
>> @@ -1887,17 +1887,11 @@ static void pool_mayday_timeout(unsigned long __pool)
>>   * spin_lock_irq(pool->lock) which may be released and regrabbed
>>   * multiple times.  Does GFP_KERNEL allocations.  Called only from
>>   * manager.
>> - *
>> - * Return:
>> - * %false if no action was taken and pool->lock stayed locked, %true
>> - * otherwise.
>>   */
>> -static bool maybe_create_worker(struct worker_pool *pool)
>> +static void maybe_create_worker(struct worker_pool *pool)
> 
> We probably should drop "maybe_" from the function name?
> 

We already have create_worker(), and maybe_create_worker() does not always
create a worker: it may not create one if the condition has changed after
the first attempt fails.


[PATCH 0/1 V2] workqueue: manager cleanup

2014-07-13 Thread Lai Jiangshan
Hi, TJ,

I dropped patch 1 and patch 2 of V1; only patch 3 is kept and rebased.
The new patch depends on last night's patch:
"workqueue: remove the del_timer_sync()s in maybe_create_worker()".

Thanks,
Lai

Lai Jiangshan (1):
  workqueue: unfold start_worker() into create_worker()

 kernel/workqueue.c |   69 ++--
 1 files changed, 13 insertions(+), 56 deletions(-)

-- 
1.7.4.4



[PATCH 1/1 V2] workqueue: unfold start_worker() into create_worker()

2014-07-13 Thread Lai Jiangshan
Simply unfold the code of start_worker() into create_worker() and
remove the original start_worker() and create_and_start_worker().

The only trade-off is the added overhead that pool->lock is released
and re-grabbed after the new worker is started.  The overhead is
acceptable since the manager is a slow path.

Because of this new locking behavior, the newly created worker may grab
the lock earlier than the manager and go on to process work items.  In
this case need_to_create_worker() may be true, as expected, and the
manager restarts without complaint.

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |   69 ++--
 1 files changed, 13 insertions(+), 56 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 2b3c12c..5b68527 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1553,7 +1553,7 @@ static void worker_enter_idle(struct worker *worker)
 (worker->hentry.next || worker->hentry.pprev)))
return;
 
-   /* can't use worker_set_flags(), also called from start_worker() */
+   /* can't use worker_set_flags(), also called from create_worker() */
worker->flags |= WORKER_IDLE;
pool->nr_idle++;
worker->last_active = jiffies;
@@ -1674,8 +1674,7 @@ static void worker_detach_from_pool(struct worker *worker,
  * create_worker - create a new workqueue worker
  * @pool: pool the new worker will belong to
  *
- * Create a new worker which is attached to @pool.  The new worker must be
- * started by start_worker().
+ * Create and start a new worker which is attached to @pool.
  *
  * CONTEXT:
  * Might sleep.  Does GFP_KERNEL allocations.
@@ -1720,6 +1719,13 @@ static struct worker *create_worker(struct worker_pool 
*pool)
/* successful, attach the worker to the pool */
worker_attach_to_pool(worker, pool);
 
+   /* start the newly created worker */
+   spin_lock_irq(&pool->lock);
+   worker->pool->nr_workers++;
+   worker_enter_idle(worker);
+   wake_up_process(worker->task);
+   spin_unlock_irq(&pool->lock);
+
return worker;
 
 fail:
@@ -1730,44 +1736,6 @@ fail:
 }
 
 /**
- * start_worker - start a newly created worker
- * @worker: worker to start
- *
- * Make the pool aware of @worker and start it.
- *
- * CONTEXT:
- * spin_lock_irq(pool->lock).
- */
-static void start_worker(struct worker *worker)
-{
-   worker->pool->nr_workers++;
-   worker_enter_idle(worker);
-   wake_up_process(worker->task);
-}
-
-/**
- * create_and_start_worker - create and start a worker for a pool
- * @pool: the target pool
- *
- * Grab the managership of @pool and create and start a new worker for it.
- *
- * Return: 0 on success. A negative error code otherwise.
- */
-static int create_and_start_worker(struct worker_pool *pool)
-{
-   struct worker *worker;
-
-   worker = create_worker(pool);
-   if (worker) {
-   spin_lock_irq(&pool->lock);
-   start_worker(worker);
-   spin_unlock_irq(&pool->lock);
-   }
-
-   return worker ? 0 : -ENOMEM;
-}
-
-/**
  * destroy_worker - destroy a workqueue worker
  * @worker: worker to be destroyed
  *
@@ -1905,18 +1873,7 @@ restart:
mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
 
while (true) {
-   struct worker *worker;
-
-   worker = create_worker(pool);
-   if (worker) {
-   spin_lock_irq(&pool->lock);
-   start_worker(worker);
-   if (WARN_ON_ONCE(need_to_create_worker(pool)))
-   goto restart;
-   return true;
-   }
-
-   if (!need_to_create_worker(pool))
+   if (create_worker(pool) || !need_to_create_worker(pool))
break;
 
schedule_timeout_interruptible(CREATE_COOLDOWN);
@@ -3543,7 +3500,7 @@ static struct worker_pool *get_unbound_pool(const struct 
workqueue_attrs *attrs)
goto fail;
 
/* create and start the initial worker */
-   if (create_and_start_worker(pool) < 0)
+   if (!create_worker(pool))
goto fail;
 
/* install */
@@ -4617,7 +4574,7 @@ static int workqueue_cpu_up_callback(struct 
notifier_block *nfb,
for_each_cpu_worker_pool(pool, cpu) {
if (pool->nr_workers)
continue;
-   if (create_and_start_worker(pool) < 0)
+   if (!create_worker(pool))
return NOTIFY_BAD;
}
break;
@@ -4917,7 +4874,7 @@ static int __init init_workqueues(void)
 
for_each_cpu_worker_pool(pool, cpu) {

[PATCH 0/1 V2] workqueue: a tiny fix for the mayday timer

2014-07-14 Thread Lai Jiangshan
Hi, TJ

The del_timer_sync()s for the mayday timer in maybe_create_worker()
do not meet the requirement of the document of the del_timer_sync().

The V1 patch incorrectly stopped the timer when !need_to_create_worker(),
but then no one could restart it when needed.  In V2, we stop the timer
only when the manager is not working.

Sorry for the incorrect V1; my general excuse is World Cup 2014.

Another patch, "[PATCH 1/1 V2] workqueue: unfold start_worker() into
create_worker()", depends on this patch.  That patch still applies on top
of this one even though this patch has been updated.

Thanks,
Lai

Lai Jiangshan (1):
  workqueue: remove the del_timer_sync()s in maybe_create_worker()

 kernel/workqueue.c |7 +++
 1 files changed, 3 insertions(+), 4 deletions(-)

-- 
1.7.4.4



[PATCH 1/1 V2] workqueue: remove the del_timer_sync()s in maybe_create_worker()

2014-07-14 Thread Lai Jiangshan
The documentation says that a timer which is being deleted by
del_timer_sync() should not be restarted:
  Synchronization rules: Callers must prevent restarting of
  the timer, otherwise this function is meaningless.

A self-repeating timer may cause del_timer_sync() to spin longer, or even
to spin forever under extremely unlikely conditions.

I don't consider it a bug, but it disobeys the documentation.

To fix it, we need to:
  1) not requeue the mayday timer unless the manager is working
   This is a preparation/cleanup step for the fix.  Otherwise the timer
   would repeat forever if we only did 2).
  2) remove the del_timer_sync()s in maybe_create_worker()
   Since the timer may still be repeating, we should not use
   del_timer_sync(); let the timer dismiss itself.

Change from V1:
  V1 incorrectly stopped the timer when !need_to_create_worker(),
  but then no one could restart it when needed.  In V2, we stop the
  timer only when the manager is not working.

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |7 +++
 1 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 338d418..9c86a64 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1864,10 +1864,11 @@ static void pool_mayday_timeout(unsigned long __pool)
send_mayday(work);
}
 
+   if (mutex_is_locked(&pool->manager_arb))
+   mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
+
spin_unlock(&pool->lock);
spin_unlock_irq(&wq_mayday_lock);
-
-   mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
 }
 
 /**
@@ -1909,7 +1910,6 @@ restart:
 
worker = create_worker(pool);
if (worker) {
-   del_timer_sync(&pool->mayday_timer);
spin_lock_irq(&pool->lock);
start_worker(worker);
if (WARN_ON_ONCE(need_to_create_worker(pool)))
@@ -1926,7 +1926,6 @@ restart:
break;
}
 
-   del_timer_sync(&pool->mayday_timer);
spin_lock_irq(&pool->lock);
if (need_to_create_worker(pool))
goto restart;
-- 
1.7.4.4



Re: [PATCH 1/1 V2] workqueue: remove the del_timer_sync()s in maybe_create_worker()

2014-07-14 Thread Lai Jiangshan
On 07/14/2014 11:33 PM, Thomas Gleixner wrote:
> On Mon, 14 Jul 2014, Tejun Heo wrote:
> 
>> Hello,
>>
>> On Mon, Jul 14, 2014 at 04:13:21PM +0800, Lai Jiangshan wrote:
>>> It is said in the document that the timer which is being
>>> deleted by del_timer_sync() should not be restarted:
>>>   Synchronization rules: Callers must prevent restarting of
>>>   the timer, otherwise this function is meaningless.
>>>
>>> Repeating timer may cause the del_timer_sync() spin longer,
>>> or even spin forever in very very very very extreme condition.
>>
>> I'm fairly sure del_timer_sync() can delete self-requeueing timers.
>> The implementation busy-waits if the queued timer is the currently
>> executing one and dequeues only while the timer isn't running which
>> should be able to handle self-requeueing ones just fine.  Thomas,
>> del_timer_sync() can reliably delete self-requeueing ones, right?
> 
> Yes. 

Then the comments of del_timer_sync() need to be updated,
if I have not misunderstood?

> If the timer callback is running on the other cpu, then it waits
> for the callback to finish before checking whether the timer is
> enqueued or not.

The syncer may be interrupted here; after it comes back, the timer
may be running again (and again and again).

The current code is an acceptable ostrich algorithm, but the documentation
needs to be updated.  Or re-implement it the way cancel_work_sync() does:
it has similar but stronger semantics, disabling requeueing while syncing
(see the sketch below).
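
For illustration, a minimal sketch of that "disable requeueing while
syncing" pattern with made-up names (memory-ordering details glossed over;
this is not proposed code):

static void selfarm_timer_fn(unsigned long data)
{
        struct my_ctx *ctx = (struct my_ctx *)data;

        do_periodic_check(ctx);                 /* hypothetical helper */
        if (!ACCESS_ONCE(ctx->canceling))       /* requeue path honours the flag */
                mod_timer(&ctx->timer, jiffies + HZ);
}

static void my_cancel_timer_sync(struct my_ctx *ctx)
{
        ctx->canceling = true;                  /* stop self-requeueing */
        del_timer_sync(&ctx->timer);            /* now guaranteed to terminate */
        ctx->canceling = false;
}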



> 
> Thanks,
> 
>   tglx
> 
> 
> .
> 



Re: [PATCH 1/1 V2] workqueue: remove the del_timer_sync()s in maybe_create_worker()

2014-07-14 Thread Lai Jiangshan
On 07/15/2014 05:42 AM, Thomas Gleixner wrote:
> On Tue, 15 Jul 2014, Lai Jiangshan wrote:
>> On 07/14/2014 11:33 PM, Thomas Gleixner wrote:
>>> On Mon, 14 Jul 2014, Tejun Heo wrote:
>>>
>>>> Hello,
>>>>
>>>> On Mon, Jul 14, 2014 at 04:13:21PM +0800, Lai Jiangshan wrote:
>>>>> It is said in the document that the timer which is being
>>>>> deleted by del_timer_sync() should not be restarted:
>>>>>   Synchronization rules: Callers must prevent restarting of
>>>>>   the timer, otherwise this function is meaningless.
>>>>>
>>>>> Repeating timer may cause the del_timer_sync() spin longer,
>>>>> or even spin forever in very very very very extreme condition.
>>>>
>>>> I'm fairly sure del_timer_sync() can delete self-requeueing timers.
>>>> The implementation busy-waits if the queued timer is the currently
>>>> executing one and dequeues only while the timer isn't running which
>>>> should be able to handle self-requeueing ones just fine.  Thomas,
>>>> del_timer_sync() can reliably delete self-requeueing ones, right?
>>>
>>> Yes. 
>>
>> The comments of the del_timer_sync() needs to be updated
>> if I did not misunderstood?
>>
>>> If the timer callback is running on the other cpu, then it waits
>>> for the callback to finish before checking whether the timer is
>>> enqueued or not.
>>
>> The syncer may be interrupted here, after it comes back, the timer
>> may be running again (and maybe again and again).
> 
> No. The del_timer_sync() code holds the base lock with interrupts
> disabled. So it can't be interrupted.
> 

WARN_ON(in_irq() && !tbase_get_irqsafe(timer->base));
for (;;) {
int ret = try_to_del_timer_sync(timer);
if (ret >= 0)
return ret;
cpu_relax();

How about when it is interrupted here?

}

> Thanks,
> 
>   tglx
> 
> 
> .
> 



[PATCH RFC] workqueue: don't grab PENDING bit on some conditions

2014-07-15 Thread Lai Jiangshan
mod_delayed_work_on() does not cope well with a work item that is being flushed.

Thread1 Thread2
/* the delayed work is pending in timer. */

/* flush it */  /* change to a smaller delay. */
flush_delayed_work();   mod_delayed_work_on();


Thread1 expects that, after flush_delayed_work() returns, the pending work
it knew about is guaranteed to be finished.  But if Thread2 is scheduled a
little later than Thread1, that pending work is dequeued and re-queued; the
workqueue subsystem then treats it as two different queueing instances, and
the guarantee Thread1 expected is broken.

The guarantee Thread1 (the workqueue user) expects seems reasonable to me,
and the workqueue subsystem should provide it.  Looked at another way,
flush_delayed_work() is still in progress when mod_delayed_work_on()
returns, so it is more acceptable for flush_delayed_work() to beat
mod_delayed_work_on().

This is achieved by introducing a KEEP_FLUSHED flag for
try_to_grab_pending().  If the work is being flushed and the KEEP_FLUSHED
flag is set, try_to_grab_pending() is not allowed to grab the PENDING bit
of the work.


There is another condition, where the user wants to speed up a delayed work.

When the user calls "mod_delayed_work_on(..., 0 /* zero delay */);", the
intention is to accelerate the work and queue it immediately.

But the work is actually slowed down when it is already queued on the
worklist, because it gets dequeued and re-queued.  So we also disallow
try_to_grab_pending() from grabbing the PENDING bit of the work in this
condition, by introducing the KEEP_QUEUED flag.  A minimal usage sketch of
the racing threads follows below.
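
As a minimal, self-contained sketch of the scenario above ('my_dwork' and
the two functions are hypothetical; only the workqueue calls are real API):

static struct delayed_work my_dwork;    /* assume INIT_DELAYED_WORK() was done */

static void thread1_flush(void)
{
        /* expects the instance it saw pending to be finished on return */
        flush_delayed_work(&my_dwork);
}

static void thread2_speed_up(void)
{
        /* merely wants a shorter delay, not a new queueing instance */
        mod_delayed_work(system_wq, &my_dwork, msecs_to_jiffies(10));
}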

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |   29 -
 1 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 338d418..3a0b055 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1141,10 +1141,16 @@ out_put:
put_pwq(pwq);
 }
 
+/* keep the work on the worklist if it is on the worklist */
+#define KEEP_QUEUED1
+/* keep the work on the worklist if it is on the worklist and being flushed */
+#define KEEP_FLUSHED   2
+
 /**
  * try_to_grab_pending - steal work item from worklist and disable irq
  * @work: work item to steal
  * @is_dwork: @work is a delayed_work
+ * @keep_flags: don't grab PENDING bit for some conditions
  * @flags: place to store irq state
  *
  * Try to grab PENDING bit of @work.  This function can handle @work in any
@@ -1156,6 +1162,8 @@ out_put:
  *  -EAGAINif PENDING couldn't be grabbed at the moment, safe to busy-retry
  *  -ENOENTif someone else is canceling @work, this state may persist
  * for arbitrarily long
+ *  -EBUSY @work is kept due to @work meets the requirement for
+ * the keep_flags, see the comments for KEEP_
  *
  * Note:
  * On >= 0 return, the caller owns @work's PENDING bit.  To avoid getting
@@ -1169,7 +1177,7 @@ out_put:
  * This function is safe to call from any context including IRQ handler.
  */
 static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
-  unsigned long *flags)
+  unsigned long keep_flags, unsigned long *flags)
 {
struct worker_pool *pool;
struct pool_workqueue *pwq;
@@ -1212,6 +1220,13 @@ static int try_to_grab_pending(struct work_struct *work, 
bool is_dwork,
 */
pwq = get_work_pwq(work);
if (pwq && pwq->pool == pool) {
+   if ((keep_flags | KEEP_QUEUED) ||
+   ((keep_flags | KEEP_FLUSHED) &&
+   (*work_data_bits(work) & WORK_STRUCT_LINKED))) {
+   spin_unlock_irqrestore(&pool->lock, *flags);
+   return -EBUSY;
+   }
+
debug_work_deactivate(work);
 
/*
@@ -1517,11 +1532,15 @@ EXPORT_SYMBOL(queue_delayed_work_on);
 bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
 struct delayed_work *dwork, unsigned long delay)
 {
+   unsigned long keep = KEEP_FLUSHED;
unsigned long flags;
int ret;
 
+   if (!delay)
+   keep |= KEEP_QUEUED;
+
do {
-   ret = try_to_grab_pending(&dwork->work, true, &flags);
+   ret = try_to_grab_pending(&dwork->work, true, keep, &flags);
} while (unlikely(ret == -EAGAIN));
 
if (likely(ret >= 0)) {
@@ -1529,7 +1548,7 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct 
*wq,
local_irq_restore(flags);
}
 
-   /* -ENOENT from try_to_grab_pending() becomes %true */
+   /* -ENOENT or -EBUSY from try_to_grab_pending() becomes %true */
return ret;
 }
 EXPORT_SYMBOL_GPL(mod_delayed_work_on);
@@ -2773,7 +2792,7 @@ static bool __cancel_work_timer(struc

[PATCH] workqueue: alloc struct worker on its local node

2014-07-15 Thread Lai Jiangshan
When create_worker() is called from a non-manager context, the struct
worker is allocated on the node of the caller, which may be different
from pool->node.

So we add a node ID argument to alloc_worker() to ensure the struct
worker is allocated on the preferred node.

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |8 
 1 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e83c42a..a4a5364 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1614,11 +1614,11 @@ static void worker_leave_idle(struct worker *worker)
list_del_init(&worker->entry);
 }
 
-static struct worker *alloc_worker(void)
+static struct worker *alloc_worker(int nid)
 {
struct worker *worker;
 
-   worker = kzalloc(sizeof(*worker), GFP_KERNEL);
+   worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, nid);
if (worker) {
INIT_LIST_HEAD(&worker->entry);
INIT_LIST_HEAD(&worker->scheduled);
@@ -1713,7 +1713,7 @@ static struct worker *create_worker(struct worker_pool 
*pool)
if (id < 0)
goto fail;
 
-   worker = alloc_worker();
+   worker = alloc_worker(pool->node);
if (!worker)
goto fail;
 
@@ -4129,7 +4129,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char 
*fmt,
if (flags & WQ_MEM_RECLAIM) {
struct worker *rescuer;
 
-   rescuer = alloc_worker();
+   rescuer = alloc_worker(NUMA_NO_NODE);
if (!rescuer)
goto err_destroy;
 
-- 
1.7.4.4



Re: [PATCH 2/2] workqueue: stronger test in process_one_work()

2014-06-26 Thread Lai Jiangshan
On 06/20/2014 03:44 AM, Tejun Heo wrote:
> On Tue, Jun 03, 2014 at 03:33:28PM +0800, Lai Jiangshan wrote:
>> When POOL_DISASSOCIATED is cleared, the running worker's local CPU should
>> be the same as pool->cpu without any exception even during cpu-hotplug.
>>
>> This fix changes "(proposition_A && proposition_B && proposition_C)"
>> to "(proposition_B && proposition_C)", so if the old compound proposition
>> is true, the new one must be true too. so this fix will not hide any
>> possible bug which can be hit by old test.
>>
>> CC: Jason J. Herne 
>> CC: Sasha Levin 
>> Signed-off-by: Lai Jiangshan 
> 
> Applied to wq/for-3.17 with minor updates.
> 
> Nice set of cleanups.  Thanks!
> 

Hi, Tejun

I found that the slightly earlier 6 patches are in wq/for-3.17,
but these two patches (in this email thread) are not in wq/for-3.17 yet.

workqueue: clear POOL_DISASSOCIATED in rebind_workers()
workqueue: stronger test in process_one_work()

Thanks,
Lai


Re: [PATCH 2/2] workqueue: stronger test in process_one_work()

2014-06-29 Thread Lai Jiangshan
Ping.

Thanks,
Lai

On 06/26/2014 07:27 PM, Lai Jiangshan wrote:
> On 06/20/2014 03:44 AM, Tejun Heo wrote:
>> On Tue, Jun 03, 2014 at 03:33:28PM +0800, Lai Jiangshan wrote:
>>> When POOL_DISASSOCIATED is cleared, the running worker's local CPU should
>>> be the same as pool->cpu without any exception even during cpu-hotplug.
>>>
>>> This fix changes "(proposition_A && proposition_B && proposition_C)"
>>> to "(proposition_B && proposition_C)", so if the old compound proposition
>>> is true, the new one must be true too. so this fix will not hide any
>>> possible bug which can be hit by old test.
>>>
>>> CC: Jason J. Herne 
>>> CC: Sasha Levin 
>>> Signed-off-by: Lai Jiangshan 
>>
>> Applied to wq/for-3.17 with minor updates.
>>
>> Nice set of cleanups.  Thanks!
>>
> 
> Hi,Tejun
> 
> I found the slight earlier 6 patches are in wq/for-3.17.
> But these two patches (in this email thread) are not in wq/for-3.17 yet.
> 
> workqueue: clear POOL_DISASSOCIATED in rebind_workers()
> workqueue: stronger test in process_one_work()
> 
> Thanks,
> Lai


Re: [PATCH tip/core/rcu 08/17] rcu: Allow post-unlock reference for rt_mutex

2014-07-08 Thread Lai Jiangshan
On 07/08/2014 06:38 AM, Paul E. McKenney wrote:
> From: "Paul E. McKenney" 
> 
> The current approach to RCU priority boosting uses an rt_mutex strictly
> for its priority-boosting side effects.  The rt_mutex_init_proxy_locked()
> function is used by the booster to initialize the lock as held by the
> boostee.  The booster then uses rt_mutex_lock() to acquire this rt_mutex,
> which priority-boosts the boostee.  When the boostee reaches the end
> of its outermost RCU read-side critical section, it checks a field in
> its task structure to see whether it has been boosted, and, if so, uses
> rt_mutex_unlock() to release the rt_mutex.  The booster can then go on
> to boost the next task that is blocking the current RCU grace period.
> 
> But reasonable implementations of rt_mutex_unlock() might result in the
> boostee referencing the rt_mutex's data after releasing it. 

_unlock(lock_ptr) should not reference lock_ptr after it has unlocked the
lock. (*)
So I think this patch is unneeded.  Although the overhead it adds is on a
slow path, it also adds review burden.
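
For illustration of rule (*), with a made-up lock type (simplified; no
memory barriers shown): once the releasing store is visible, another CPU
may acquire the lock and even free the containing object, so the unlock
path must not dereference it again.

struct my_lock {
        atomic_t        owner;          /* 0 means unlocked */
        unsigned long   nr_releases;
};

static void my_unlock(struct my_lock *lock)
{
        atomic_set(&lock->owner, 0);    /* the lock is released here ... */
        lock->nr_releases++;            /* ... so this is a use-after-release bug */
}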

And although the original rt_mutex_unlock() violated rule (*) on the fast
cmpxchg path, that is fixed now.

It is the lock subsystem's responsibility to honour this.  I would prefer
to defer adding the wait_for_completion() stuff until the boostee actually
needs to re-access the booster after rt_mutex_unlock(), rather than adding
it now.

Thanks,
Lai


Re: [PATCH tip/core/rcu 0/17] Miscellaneous fixes for 3.17

2014-07-08 Thread Lai Jiangshan
Besides patch 3, please also add my Reviewed-by for these patches:
patches 1, 2, 5, 7, 12, 13, 15, 16, 17

Reviewed-by: Lai Jiangshan 

Thanks,
Lai

On 07/08/2014 06:37 AM, Paul E. McKenney wrote:
> Hello!
> 
> This series provides miscellaneous fixes:
> 
> 1.Document deadlock-avoidance information in rcu_read_unlock()'s
>   docbook comment header.
> 
> 2.Remove obsolete references to TINY_PREEMPT_RCU.
> 
> 3.Add deadlock explanation to local_irq_save() call in
>   __lock_task_sighand().
> 
> 4.Make the rcu_node arrays be static const char * const,
>   courtesy of Fabian Frederick.
> 
> 5.Remove redundant ACCESS_ONCE() from tick_do_timer_cpu under
>   #ifdef CONFIG_NO_HZ_FULL.
> 
> 6.Eliminate read-modify-write ACCESS_ONCE() calls.
> 
> 7.Loosen __call_rcu()'s rcu_head alignment constraint to handle
>   m68k's 16-bit alignment.
> 
> 8.Allow post-unlock reference for rt_mutex.
> 
> 9.Check both root and current rcu_node structures when setting up
>   future grace periods, courtesy of Pranith Kumar.
> 
> 10.   Simplify priority boosting by putting rt_mutex in rcu_node
>   structure.
> 
> 11.   Bind grace-period kthreads to no-NO_HZ_FULL CPUs instead of the
>   timekeeping CPU, at least for CONFIG_NO_HZ_FULL_SYSIDLE=n.
> 
> 12.   Don't use NMIs to dump other CPUs' stacks.
> 
> 13.   Use __this_cpu_read() instead of per_cpu_ptr(), courtesy of Shan Wei.
> 
> 14.   Remove CONFIG_PROVE_RCU_DELAY.
> 
> 15.   Fix __rcu_reclaim to use true/false instead of 1/0.
> 
> 16.   Fix sparse warning in rcu_initiate_boost(), courtesy of Pranith
>   Kumar.
> 
> 17.   Fix sparse warning in rcu_report_unblock_qs_rnp(), again courtesy
>   of Pranith Kumar.
> 
>   Thanx, Paul
> 
> 
> 
>  b/include/linux/init_task.h   |9 --
>  b/include/linux/rcupdate.h|   45 --
>  b/include/linux/sched.h   |6 -
>  b/include/linux/tick.h|   19 
>  b/init/Kconfig|2 
>  b/kernel/rcu/rcu.h|8 +
>  b/kernel/rcu/srcu.c   |4 
>  b/kernel/rcu/tree.c   |   59 
> ++
>  b/kernel/rcu/tree.h   |8 +
>  b/kernel/rcu/tree_plugin.h|   52 +++-
>  b/kernel/rcu/update.c |3 
>  b/kernel/signal.c |4 
>  b/kernel/time/tick-sched.c|   10 ++
>  b/tools/testing/selftests/rcutorture/configs/rcu/TREE01   |1 
>  b/tools/testing/selftests/rcutorture/configs/rcu/TREE02   |1 
>  b/tools/testing/selftests/rcutorture/configs/rcu/TREE02-T |1 
>  b/tools/testing/selftests/rcutorture/configs/rcu/TREE03   |1 
>  b/tools/testing/selftests/rcutorture/configs/rcu/TREE04   |1 
>  b/tools/testing/selftests/rcutorture/configs/rcu/TREE05   |1 
>  b/tools/testing/selftests/rcutorture/configs/rcu/TREE06   |1 
>  b/tools/testing/selftests/rcutorture/configs/rcu/TREE07   |1 
>  b/tools/testing/selftests/rcutorture/configs/rcu/TREE08   |1 
>  b/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T |1 
>  b/tools/testing/selftests/rcutorture/configs/rcu/TREE09   |1 
>  24 files changed, 147 insertions(+), 93 deletions(-)
> 
> .
> 



[PATCH 1/3] workqueue: remove the first check and the return value of maybe_create_worker()

2014-07-10 Thread Lai Jiangshan
pool->lock is held from the point where worker_thread() checks the
condition until maybe_create_worker() is called; therefore the condition
cannot change in between and the first condition check is unneeded (see
the trimmed caller excerpt below).

After this unneeded check is removed, maybe_create_worker() can only
return true, so we remove its return value and adjust the call site.
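
For reference, a trimmed excerpt of the caller as I read it in this era's
worker_thread() (kernel/workqueue.c); the lock is taken before the checks
and is not dropped before manage_workers() runs:

woke_up:
        spin_lock_irq(&pool->lock);
        /* ... WORKER_DIE handling and worker_leave_idle() elided ... */
recheck:
        /* no more worker necessary? */
        if (!need_more_worker(pool))
                goto sleep;

        /* do we need to manage? */
        if (unlikely(!may_start_working(pool)) && manage_workers(worker))
                goto recheck;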

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |   18 +-
 1 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f344334..92f7ea0c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1887,17 +1887,11 @@ static void pool_mayday_timeout(unsigned long __pool)
  * spin_lock_irq(pool->lock) which may be released and regrabbed
  * multiple times.  Does GFP_KERNEL allocations.  Called only from
  * manager.
- *
- * Return:
- * %false if no action was taken and pool->lock stayed locked, %true
- * otherwise.
  */
-static bool maybe_create_worker(struct worker_pool *pool)
+static void maybe_create_worker(struct worker_pool *pool)
 __releases(&pool->lock)
 __acquires(&pool->lock)
 {
-   if (!need_to_create_worker(pool))
-   return false;
 restart:
spin_unlock_irq(&pool->lock);
 
@@ -1914,7 +1908,7 @@ restart:
start_worker(worker);
if (WARN_ON_ONCE(need_to_create_worker(pool)))
goto restart;
-   return true;
+   return;
}
 
if (!need_to_create_worker(pool))
@@ -1930,7 +1924,6 @@ restart:
spin_lock_irq(&pool->lock);
if (need_to_create_worker(pool))
goto restart;
-   return true;
 }
 
 /**
@@ -1959,7 +1952,6 @@ restart:
 static bool manage_workers(struct worker *worker)
 {
struct worker_pool *pool = worker->pool;
-   bool ret = false;
 
/*
 * Anyone who successfully grabs manager_arb wins the arbitration
@@ -1972,12 +1964,12 @@ static bool manage_workers(struct worker *worker)
 * actual management, the pool may stall indefinitely.
 */
if (!mutex_trylock(&pool->manager_arb))
-   return ret;
+   return false;
 
-   ret |= maybe_create_worker(pool);
+   maybe_create_worker(pool);
 
mutex_unlock(&pool->manager_arb);
-   return ret;
+   return true;
 }
 
 /**
-- 
1.7.4.4



[PATCH 3/3] workqueue: unfold start_worker() into create_worker()

2014-07-10 Thread Lai Jiangshan
Simply unfold the code of start_worker() into create_worker() and
remove the original start_worker() and create_and_start_worker().

maybe_create_worker() also becomes decently shorter.

The only trade-off is the added overhead that pool->lock is released
and regrabbed after the new worker is started.  The overhead is
acceptable since the manager is a slow path.

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |   68 ++--
 1 files changed, 13 insertions(+), 55 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4fdd6d0..c820057 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1553,7 +1553,7 @@ static void worker_enter_idle(struct worker *worker)
 (worker->hentry.next || worker->hentry.pprev)))
return;
 
-   /* can't use worker_set_flags(), also called from start_worker() */
+   /* can't use worker_set_flags(), also called from create_worker() */
worker->flags |= WORKER_IDLE;
pool->nr_idle++;
worker->last_active = jiffies;
@@ -1674,8 +1674,7 @@ static void worker_detach_from_pool(struct worker *worker,
  * create_worker - create a new workqueue worker
  * @pool: pool the new worker will belong to
  *
- * Create a new worker which is attached to @pool.  The new worker must be
- * started by start_worker().
+ * Create and start a new worker which is attached to @pool.
  *
  * CONTEXT:
  * Might sleep.  Does GFP_KERNEL allocations.
@@ -1720,6 +1719,13 @@ static struct worker *create_worker(struct worker_pool 
*pool)
/* successful, attach the worker to the pool */
worker_attach_to_pool(worker, pool);
 
+   /* start the newly created worker */
+   spin_lock_irq(&pool->lock);
+   worker->pool->nr_workers++;
+   worker_enter_idle(worker);
+   wake_up_process(worker->task);
+   spin_unlock_irq(&pool->lock);
+
return worker;
 
 fail:
@@ -1730,44 +1736,6 @@ fail:
 }
 
 /**
- * start_worker - start a newly created worker
- * @worker: worker to start
- *
- * Make the pool aware of @worker and start it.
- *
- * CONTEXT:
- * spin_lock_irq(pool->lock).
- */
-static void start_worker(struct worker *worker)
-{
-   worker->pool->nr_workers++;
-   worker_enter_idle(worker);
-   wake_up_process(worker->task);
-}
-
-/**
- * create_and_start_worker - create and start a worker for a pool
- * @pool: the target pool
- *
- * Grab the managership of @pool and create and start a new worker for it.
- *
- * Return: 0 on success. A negative error code otherwise.
- */
-static int create_and_start_worker(struct worker_pool *pool)
-{
-   struct worker *worker;
-
-   worker = create_worker(pool);
-   if (worker) {
-   spin_lock_irq(&pool->lock);
-   start_worker(worker);
-   spin_unlock_irq(&pool->lock);
-   }
-
-   return worker ? 0 : -ENOMEM;
-}
-
-/**
  * destroy_worker - destroy a workqueue worker
  * @worker: worker to be destroyed
  *
@@ -1895,22 +1863,12 @@ static void maybe_create_worker(struct worker_pool 
*pool)
 __releases(&pool->lock)
 __acquires(&pool->lock)
 {
-   struct worker *worker;
-
spin_unlock_irq(&pool->lock);
 
/* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
 
-   worker = create_worker(pool);
-   if (worker) {
-   del_timer_sync(&pool->mayday_timer);
-   spin_lock_irq(&pool->lock);
-   start_worker(worker);
-   return;
-   }
-
-   if (need_to_create_worker(pool))
+   if (!create_worker(pool) && need_to_create_worker(pool))
schedule_timeout_interruptible(CREATE_COOLDOWN);
 
del_timer_sync(&pool->mayday_timer);
@@ -3524,7 +3482,7 @@ static struct worker_pool *get_unbound_pool(const struct 
workqueue_attrs *attrs)
goto fail;
 
/* create and start the initial worker */
-   if (create_and_start_worker(pool) < 0)
+   if (!create_worker(pool))
goto fail;
 
/* install */
@@ -4598,7 +4556,7 @@ static int workqueue_cpu_up_callback(struct 
notifier_block *nfb,
for_each_cpu_worker_pool(pool, cpu) {
if (pool->nr_workers)
continue;
-   if (create_and_start_worker(pool) < 0)
+   if (!create_worker(pool))
return NOTIFY_BAD;
}
break;
@@ -4898,7 +4856,7 @@ static int __init init_workqueues(void)
 
for_each_cpu_worker_pool(pool, cpu) {
pool->flags &= ~POOL_DISASSOCIATED;
-   BUG_ON(create_and_start_worker(p

[PATCH 0/3] workqueue: manager cleanup

2014-07-10 Thread Lai Jiangshan
Hi, TJ

These are the cleanups for which you asked during the prevous development,
especailly the patch3 merging start_worker() into create_worker().

Thanks,
Lai


Lai Jiangshan (3):
  workqueue: remove the first check and the return value of
maybe_create_worker()
  workqueue: remove the guarantee and the retrying of
maybe_create_worker()
  workqueue: unfold start_worker() into create_worker()

 kernel/workqueue.c |  113 +++
 1 files changed, 25 insertions(+), 88 deletions(-)

-- 
1.7.4.4



[PATCH 2/3] workqueue: remove the guarantee and the retrying of maybe_create_worker()

2014-07-10 Thread Lai Jiangshan
maybe_create_worker() has a strong guarantee that there is at least one
idle worker on return from this function.  This guarantee is enforced via
the check and the retrying inside the function.

But the caller (worker_thread()) has the same check and retrying, so the
guarantee is not really required, and the check and the retrying in
maybe_create_worker() are unnecessary and redundant.

So we remove the guarantee as well as the check and the retrying.  The
caller takes responsibility for retrying when needed.

The only trade-off is that the mayday timer will be removed and re-added
across retries.  Since retrying is expected to be rare, the trade-off is
acceptable.

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |   49 ++---
 1 files changed, 18 insertions(+), 31 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 92f7ea0c..4fdd6d0 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1875,14 +1875,17 @@ static void pool_mayday_timeout(unsigned long __pool)
  * @pool: pool to create a new worker for
  *
  * Create a new worker for @pool if necessary.  @pool is guaranteed to
- * have at least one idle worker on return from this function.  If
- * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
+ * make some progresses on return from this function.
+ *   1) success to create a new idle worker.  Or
+ *   2) cool down a while after it failed.  Or
+ *   3) condition changed (no longer need to create worker) after it failed.
+ * In any case, the caller will recheck the condition and retry when needed,
+ * so this function doesn't need to retry.
+ *
+ * If creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
  * sent to all rescuers with works scheduled on @pool to resolve
  * possible allocation deadlock.
  *
- * On return, need_to_create_worker() is guaranteed to be %false and
- * may_start_working() %true.
- *
  * LOCKING:
  * spin_lock_irq(pool->lock) which may be released and regrabbed
  * multiple times.  Does GFP_KERNEL allocations.  Called only from
@@ -1892,38 +1895,26 @@ static void maybe_create_worker(struct worker_pool 
*pool)
 __releases(&pool->lock)
 __acquires(&pool->lock)
 {
-restart:
+   struct worker *worker;
+
spin_unlock_irq(&pool->lock);
 
/* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
 
-   while (true) {
-   struct worker *worker;
-
-   worker = create_worker(pool);
-   if (worker) {
-   del_timer_sync(&pool->mayday_timer);
-   spin_lock_irq(&pool->lock);
-   start_worker(worker);
-   if (WARN_ON_ONCE(need_to_create_worker(pool)))
-   goto restart;
-   return;
-   }
-
-   if (!need_to_create_worker(pool))
-   break;
+   worker = create_worker(pool);
+   if (worker) {
+   del_timer_sync(&pool->mayday_timer);
+   spin_lock_irq(&pool->lock);
+   start_worker(worker);
+   return;
+   }
 
+   if (need_to_create_worker(pool))
schedule_timeout_interruptible(CREATE_COOLDOWN);
 
-   if (!need_to_create_worker(pool))
-   break;
-   }
-
del_timer_sync(&pool->mayday_timer);
spin_lock_irq(&pool->lock);
-   if (need_to_create_worker(pool))
-   goto restart;
 }
 
 /**
@@ -1934,10 +1925,6 @@ restart:
  * to.  At any given time, there can be only zero or one manager per
  * pool.  The exclusion is handled automatically by this function.
  *
- * The caller can safely start processing works on false return.  On
- * true return, it's guaranteed that need_to_create_worker() is false
- * and may_start_working() is true.
- *
  * CONTEXT:
  * spin_lock_irq(pool->lock) which may be released and regrabbed
  * multiple times.  Does GFP_KERNEL allocations.
-- 
1.7.4.4



[PATCH] workqueue: reuse the calculated pwq in try_to_grab_pending()

2014-07-10 Thread Lai Jiangshan
Substitute pwq for get_work_pwq(work) since it has already been calculated.

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c820057..7fc3df3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1225,7 +1225,7 @@ static int try_to_grab_pending(struct work_struct *work, 
bool is_dwork,
pwq_activate_delayed_work(work);
 
list_del_init(&work->entry);
-   pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work));
+   pwq_dec_nr_in_flight(pwq, get_work_color(work));
 
/* work->data points to pwq iff queued, point to pool */
set_work_pool_and_keep_pending(work, pool->id);
-- 
1.7.4.4



Re: [PATCH RFC] workqueue: don't grab PENDING bit on some conditions

2014-07-15 Thread Lai Jiangshan
On 07/15/2014 11:58 PM, Tejun Heo wrote:
> Hello, Lai.
> 
> On Tue, Jul 15, 2014 at 05:30:10PM +0800, Lai Jiangshan wrote:
>> Thread1 expects that, after flush_delayed_work() returns, the known pending
>> work is guaranteed finished. But if Thread2 is scheduled a little later than
>> Thread1, the known pending work is dequeued and re-queued, it is considered
>> as two different works in the workqueue subsystem and the guarantee expected
> 
> They are two separate queueing instances of the same work item.

Judging from the name, I think mod_delayed_work() is expected to modify an
existing queueing instance rather than create a separate one.

> 
>> by Thread1 is broken.
> 
> The guarantee expected by thread 1 is that the most recent queueing
> instance of the work item is finished either through completing
> execution or being cancelled.  No guarantee is broken.

I don't think mod_delayed_work() is seen as a cancelling operation by the
user.  You could add comments stating that it consists of a cancelling
operation plus a requeue operation.

> 
>> The guarantee expected by Thread1/workqueue-user is reasonable for me,
>> the workqueue subsystem should provide this guarantee. In another aspect,
> 
> You're adding a new component to the existing set of guarantees.  You
> can argue for it but it's a new guarantee regardless.

So, it is an RFC.

> 
>> the flush_delayed_work() is still working when mod_delayed_work_on() returns,
>> it is more acceptable that the flush_delayed_work() beats the
>> mod_delayed_work_on().
>>
>> It is achieved by introducing a KEEP_FLUSHED flag for try_to_grab_pending().
>> If the work is being flushed and KEEP_FLUSHED flags is set,
>> we disallow try_to_grab_pending() to grab the pending of the work.
>>
>> And there is another condition that the user want to speed up a delayed work.
>>
>> When the user use "mod_delayed_work_on(..., 0 /* zero delay */);", his
>> attention is to accelerate the work and queue the work immediately.
>>
>> But the work does be slowed down when it is already queued on the worklist
>> due to the work is dequeued and re-queued. So we also disallow
>> try_to_grab_pending() to grab the pending of the work in this condition
>> by introducing KEEP_QUEUED flag.
> 
> Both are extremely marginal.  
> Do we have any actual cases any of these matters?

No such case.

I only found that the writeback subsystem (backing-dev.c, fs-writeback.c)
uses both mod_delayed_work() and flush_delayed_work(), but it seems that
when flush_delayed_work() is called, mod_delayed_work() can't be called.

> I can't see what we're gaining with the extra complexity.

Will you add some comments, or leave it as it is?

> 
>> @@ -1212,6 +1220,13 @@ static int try_to_grab_pending(struct work_struct 
>> *work, bool is_dwork,
>>   */
>>  pwq = get_work_pwq(work);
>>  if (pwq && pwq->pool == pool) {
>> +if ((keep_flags | KEEP_QUEUED) ||
>> +((keep_flags | KEEP_FLUSHED) &&
> 
> This can't be right.
> 
> Thanks.
> 



Re: [PATCH 1/1 V2] workqueue: remove the del_timer_sync()s in maybe_create_worker()

2014-07-15 Thread Lai Jiangshan
On 07/15/2014 05:42 AM, Thomas Gleixner wrote:
> On Tue, 15 Jul 2014, Lai Jiangshan wrote:
>> On 07/14/2014 11:33 PM, Thomas Gleixner wrote:
>>> On Mon, 14 Jul 2014, Tejun Heo wrote:
>>>
>>>> Hello,
>>>>
>>>> On Mon, Jul 14, 2014 at 04:13:21PM +0800, Lai Jiangshan wrote:
>>>>> It is said in the document that the timer which is being
>>>>> deleted by del_timer_sync() should not be restarted:
>>>>>   Synchronization rules: Callers must prevent restarting of
>>>>>   the timer, otherwise this function is meaningless.
>>>>>
>>>>> Repeating timer may cause the del_timer_sync() spin longer,
>>>>> or even spin forever in very very very very extreme condition.
>>>>
>>>> I'm fairly sure del_timer_sync() can delete self-requeueing timers.
>>>> The implementation busy-waits if the queued timer is the currently
>>>> executing one and dequeues only while the timer isn't running which
>>>> should be able to handle self-requeueing ones just fine.  Thomas,
>>>> del_timer_sync() can reliably delete self-requeueing ones, right?
>>>
>>> Yes. 
>>
>> The comments of the del_timer_sync() needs to be updated
>> if I did not misunderstood?
>>
>>> If the timer callback is running on the other cpu, then it waits
>>> for the callback to finish before checking whether the timer is
>>> enqueued or not.
>>
>> The syncer may be interrupted here, after it comes back, the timer
>> may be running again (and maybe again and again).
> 
> No. The del_timer_sync() code holds the base lock with interrupts
> disabled. So it can't be interrupted.
> 

del_timer_sync() busy-waits via cpu_relax() with interrupts enabled,
though. Could you address that concern?
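
(For concreteness, the kind of self-requeueing timer being discussed looks
roughly like this; the names are made up purely for illustration:)

static struct timer_list my_timer;

static void my_timer_fn(unsigned long data)
{
	/* ... periodic work ... */
	mod_timer(&my_timer, jiffies + HZ);	/* re-arms from its own callback */
}

static void my_timer_start(void)
{
	setup_timer(&my_timer, my_timer_fn, 0);
	mod_timer(&my_timer, jiffies + HZ);
}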

> 
> .
> 



Re: [PATCH 1/1 V2] workqueue: unfold start_worker() into create_worker()

2014-07-15 Thread Lai Jiangshan
On 07/14/2014 12:05 PM, Lai Jiangshan wrote:
> Simply unfold the code of start_worker() into create_worker() and
> remove the original start_worker() and create_and_start_worker().
> 
> The only trade-off is the introduced overhead that the pool->lock
> is released and re-grabbed after the newly worker is started.
> The overhead is acceptable since the manager is slow path.

Hi, TJ

Will you accept this trade-off and the patch?
If so, I will rebase this patch so that it doesn't depend on any other patch.

Thanks,
Lai

> 
> And because this new locking behavior, the newly created worker
> may grab the lock earlier than the manager and go to process
> work items. In this case, the need_to_create_worker() may be
> true as expected and the manager goes to restart without complaint.
> 
> Signed-off-by: Lai Jiangshan 
> ---
>  kernel/workqueue.c |   69 
> ++--
>  1 files changed, 13 insertions(+), 56 deletions(-)
> 
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index 2b3c12c..5b68527 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -1553,7 +1553,7 @@ static void worker_enter_idle(struct worker *worker)
>(worker->hentry.next || worker->hentry.pprev)))
>   return;
>  
> - /* can't use worker_set_flags(), also called from start_worker() */
> + /* can't use worker_set_flags(), also called from create_worker() */
>   worker->flags |= WORKER_IDLE;
>   pool->nr_idle++;
>   worker->last_active = jiffies;
> @@ -1674,8 +1674,7 @@ static void worker_detach_from_pool(struct worker 
> *worker,
>   * create_worker - create a new workqueue worker
>   * @pool: pool the new worker will belong to
>   *
> - * Create a new worker which is attached to @pool.  The new worker must be
> - * started by start_worker().
> + * Create and start a new worker which is attached to @pool.
>   *
>   * CONTEXT:
>   * Might sleep.  Does GFP_KERNEL allocations.
> @@ -1720,6 +1719,13 @@ static struct worker *create_worker(struct worker_pool 
> *pool)
>   /* successful, attach the worker to the pool */
>   worker_attach_to_pool(worker, pool);
>  
> + /* start the newly created worker */
> + spin_lock_irq(&pool->lock);
> + worker->pool->nr_workers++;
> + worker_enter_idle(worker);
> + wake_up_process(worker->task);
> + spin_unlock_irq(&pool->lock);
> +
>   return worker;
>  
>  fail:
> @@ -1730,44 +1736,6 @@ fail:
>  }
>  
>  /**
> - * start_worker - start a newly created worker
> - * @worker: worker to start
> - *
> - * Make the pool aware of @worker and start it.
> - *
> - * CONTEXT:
> - * spin_lock_irq(pool->lock).
> - */
> -static void start_worker(struct worker *worker)
> -{
> - worker->pool->nr_workers++;
> - worker_enter_idle(worker);
> - wake_up_process(worker->task);
> -}
> -
> -/**
> - * create_and_start_worker - create and start a worker for a pool
> - * @pool: the target pool
> - *
> - * Grab the managership of @pool and create and start a new worker for it.
> - *
> - * Return: 0 on success. A negative error code otherwise.
> - */
> -static int create_and_start_worker(struct worker_pool *pool)
> -{
> - struct worker *worker;
> -
> - worker = create_worker(pool);
> - if (worker) {
> - spin_lock_irq(&pool->lock);
> - start_worker(worker);
> - spin_unlock_irq(&pool->lock);
> - }
> -
> - return worker ? 0 : -ENOMEM;
> -}
> -
> -/**
>   * destroy_worker - destroy a workqueue worker
>   * @worker: worker to be destroyed
>   *
> @@ -1905,18 +1873,7 @@ restart:
>   mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
>  
>   while (true) {
> - struct worker *worker;
> -
> - worker = create_worker(pool);
> - if (worker) {
> - spin_lock_irq(&pool->lock);
> - start_worker(worker);
> - if (WARN_ON_ONCE(need_to_create_worker(pool)))
> - goto restart;
> - return true;
> - }
> -
> - if (!need_to_create_worker(pool))
> + if (create_worker(pool) || !need_to_create_worker(pool))
>   break;
>  
>   schedule_timeout_interruptible(CREATE_COOLDOWN);
> @@ -3543,7 +3500,7 @@ static struct worker_pool *get_unbound_pool(const 
> struct workqueue_attrs *attrs)
>   goto fail;
>  
>   /* create and start the initial worker */
> -

[PATCH] workqueue: wake regular worker if need_more_worker() when rescuer leave the pool

2014-07-15 Thread Lai Jiangshan
We don't need to wake up a regular worker when nr_running==1,
so need_more_worker() is sufficient here.

need_more_worker() also reads better: the name "keep_working()" implies
that the rescuer should keep working now, but the rescuer is actually
leaving.
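
For reference, the two helpers compare roughly as follows (paraphrased from
kernel/workqueue.c of this era, not a verbatim copy):

static bool need_more_worker(struct worker_pool *pool)
{
	/* work pending and nobody currently running */
	return !list_empty(&pool->worklist) && !atomic_read(&pool->nr_running);
}

static bool keep_working(struct worker_pool *pool)
{
	/* work pending and at most one runner, i.e. also true at nr_running==1 */
	return !list_empty(&pool->worklist) &&
		atomic_read(&pool->nr_running) <= 1;
}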

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index dfd9f68..7128053 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2308,11 +2308,11 @@ repeat:
put_pwq(pwq);
 
/*
-* Leave this pool.  If keep_working() is %true, notify a
+* Leave this pool.  If need_more_worker() is %true, notify a
 * regular worker; otherwise, we end up with 0 concurrency
 * and stalling the execution.
 */
-   if (keep_working(pool))
+   if (need_more_worker(pool))
wake_up_worker(pool);
 
rescuer->pool = NULL;
-- 
1.7.4.4



[PATCH] workqueue: detach rescuer from pool until the last

2014-07-15 Thread Lai Jiangshan
In 51697d393922 ("workqueue: use generic attach/detach routine for rescuers"),
the rescuer detaches itself from the pool before put_pwq() so that
put_unbound_pool() will not destroy the rescuer-attached pool.

This is unnecessary: worker_detach_from_pool() can be the last statement
that accesses the pool, just as for the regular workers, and
put_unbound_pool() will wait for the rescuer to detach before freeing the
pool.

So move worker_detach_from_pool() down to match the regular workers.

Signed-off-by: Lai Jiangshan 
---
This old logic was introduced by a patch with the same purpose that was not
accepted: https://lkml.org/lkml/2014/4/12/26

In that old patch, put_unbound_pool() still synchronized with exiting
workers via kthread_stop(), which meant put_unbound_pool() couldn't destroy
a rescuer-attached pool, so put_pwq() had to be the last statement for the
rescuer.

After several versions of the patchset, put_unbound_pool() was re-implemented
so that it can wait for any un-detached worker, including the rescuers,
but the old logic mentioned above was kept in the code.

The old logic is still correct, but it is neither graceful nor necessary.

 kernel/workqueue.c |12 +---
 1 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a791a8c..dfd9f68 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2300,11 +2300,6 @@ repeat:
move_linked_works(work, scheduled, &n);
 
process_scheduled_works(rescuer);
-   spin_unlock_irq(&pool->lock);
-
-   worker_detach_from_pool(rescuer, pool);
-
-   spin_lock_irq(&pool->lock);
 
/*
 * Put the reference grabbed by send_mayday().  @pool won't
@@ -2322,8 +2317,11 @@ repeat:
wake_up_worker(pool);
 
rescuer->pool = NULL;
-   spin_unlock(&pool->lock);
-   spin_lock(&wq_mayday_lock);
+   spin_unlock_irq(&pool->lock);
+
+   worker_detach_from_pool(rescuer, pool);
+
+   spin_lock_irq(&wq_mayday_lock);
}
 
spin_unlock_irq(&wq_mayday_lock);
-- 
1.7.4.4



[PATCH 2/2] workqueue: remove the argument @wakeup from worker_set_flags()

2014-07-16 Thread Lai Jiangshan
worker_set_flags() does not always need to wake up the next worker itself,
so the @wakeup argument can be removed; callers can use the following
combination instead when needed:

worker_set_flags();
if (need_more_worker(pool))
wake_up_worker(pool);


Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |   25 ++---
 1 files changed, 6 insertions(+), 19 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6d11b9a..1a14f18 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -867,35 +867,22 @@ struct task_struct *wq_worker_sleeping(struct task_struct 
*task, int cpu)
  * worker_set_flags - set worker flags and adjust nr_running accordingly
  * @worker: self
  * @flags: flags to set
- * @wakeup: wakeup an idle worker if necessary
  *
- * Set @flags in @worker->flags and adjust nr_running accordingly.  If
- * nr_running becomes zero and @wakeup is %true, an idle worker is
- * woken up.
+ * Set @flags in @worker->flags and adjust nr_running accordingly.
  *
  * CONTEXT:
  * spin_lock_irq(pool->lock)
  */
-static inline void worker_set_flags(struct worker *worker, unsigned int flags,
-   bool wakeup)
+static inline void worker_set_flags(struct worker *worker, unsigned int flags)
 {
struct worker_pool *pool = worker->pool;
 
WARN_ON_ONCE(worker->task != current);
 
-   /*
-* If transitioning into NOT_RUNNING, adjust nr_running and
-* wake up an idle worker as necessary if requested by
-* @wakeup.
-*/
+   /* If transitioning into NOT_RUNNING, adjust nr_running. */
if ((flags & WORKER_NOT_RUNNING) &&
!(worker->flags & WORKER_NOT_RUNNING)) {
-   if (wakeup) {
-   if (atomic_dec_and_test(&pool->nr_running) &&
-   !list_empty(&pool->worklist))
-   wake_up_worker(pool);
-   } else
-   atomic_dec(&pool->nr_running);
+   atomic_dec(&pool->nr_running);
}
 
worker->flags |= flags;
@@ -2045,7 +2032,7 @@ __acquires(&pool->lock)
 * management.  They're the scheduler's responsibility.
 */
if (unlikely(cpu_intensive))
-   worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
+   worker_set_flags(worker, WORKER_CPU_INTENSIVE);
 
/* Wake up another worker if necessary. */
if (need_more_worker(pool))
@@ -2204,7 +2191,7 @@ recheck:
}
} while (keep_working(pool));
 
-   worker_set_flags(worker, WORKER_PREP, false);
+   worker_set_flags(worker, WORKER_PREP);
 sleep:
/*
 * pool->lock is held and there's no work to process and no need to
-- 
1.7.4.4



[PATCH 1/2] workqueue: remove unneeded test before wake up next worker

2014-07-16 Thread Lai Jiangshan
In this code:

	if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
		wake_up_worker(pool);

the first test is unneeded. Even if the first test is removed, the wake-up
logic for WORKER_UNBOUND is unaffected, and no useless wake-up is introduced
for !WORKER_UNBOUND since nr_running >= 1 there, with only one exception:
a redundant wake-up is introduced in the cpu_intensive case, but that case
is rare and the next patch removes this redundant wake-up as well.

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |7 ++-
 1 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f8d54c1..6d11b9a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2047,11 +2047,8 @@ __acquires(&pool->lock)
if (unlikely(cpu_intensive))
worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
 
-   /*
-* Unbound pool isn't concurrency managed and work items should be
-* executed ASAP.  Wake up another worker if necessary.
-*/
-   if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
+   /* Wake up another worker if necessary. */
+   if (need_more_worker(pool))
wake_up_worker(pool);
 
/*
-- 
1.7.4.4



Re: [PATCH 2/3] rcu: Remove stale comment in tree.c

2014-07-16 Thread Lai Jiangshan
On 07/16/2014 09:29 PM, Pranith Kumar wrote:
> On 07/16/2014 08:47 AM, Paul E. McKenney wrote:
>> On Tue, Jul 15, 2014 at 06:57:59PM -0400, Pranith Kumar wrote:
>>>
>>> On 07/15/2014 06:53 PM, j...@joshtriplett.org wrote:
 On Tue, Jul 15, 2014 at 06:31:48PM -0400, Pranith Kumar wrote:
> This commit removes a stale comment in rcu/tree.c.
> FYI, an updated comment exists a few lines below this.
>
> Signed-off-by: Pranith Kumar 
 In general, when removing a stale comment, I'd suggest explaining why
 the comment is stale.  Was code removed without removing the
 corresponding comment, or was code changed such that the comment no
 longer applies, or...?
>>>
>>> I guess it was left out when code was moved around previously. And I did 
>>> mention that an updated comment is there a few lines below.
>>>
>>> For reference this is the new comment which is below the old comment, they 
>>> mean the same :)
>>>
>>> /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
>>
>> Indeed that is the case.
>>
>> Please update the commit log with this explanation and resend.
>>
>>  Thanx, Paul
>>
> 
> Please find the updated patch below.
> 
> --
> Pranith
> 
> From: Pranith Kumar 
> Date: Mon, 14 Jul 2014 16:01:05 -0400
> Subject: [PATCH 2/3] rcu: Remove stale comment in tree.c
> 
> This commit removes a stale comment in rcu/tree.c which was left out when some
> code was moved around previously.

Which commit caused this leftover comment? Could you mention that commit in
your changelog, e.g.:

12BitsCmmtID ("commit title...")

> 
> For reference, the following updated comment exists a few lines below this 
> which
> means the same.
> 
> /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
> 
> Signed-off-by: Pranith Kumar 
> ---
>  kernel/rcu/tree.c | 2 --
>  1 file changed, 2 deletions(-)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index a1abaa8..e67246e 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -2211,8 +2211,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct 
> rcu_state *rsp)
>   /* Adjust any no-longer-needed kthreads. */
>   rcu_boost_kthread_setaffinity(rnp, -1);
>  
> - /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
> -
>   /* Exclude any attempts to start a new grace period. */
>   mutex_lock(&rsp->onoff_mutex);
>   raw_spin_lock_irqsave(&rsp->orphan_lock, flags);



Re: [PATCH 2/3] rcu: Remove stale comment in tree.c

2014-07-16 Thread Lai Jiangshan
On 07/17/2014 09:01 AM, Pranith Kumar wrote:
> On 07/16/2014 08:55 PM, Lai Jiangshan wrote:
>> On 07/16/2014 09:29 PM, Pranith Kumar wrote:
>>> On 07/16/2014 08:47 AM, Paul E. McKenney wrote:
>>>> On Tue, Jul 15, 2014 at 06:57:59PM -0400, Pranith Kumar wrote:
>>>>>
>>>>> On 07/15/2014 06:53 PM, j...@joshtriplett.org wrote:
>>>>>> On Tue, Jul 15, 2014 at 06:31:48PM -0400, Pranith Kumar wrote:
>>>>>>> This commit removes a stale comment in rcu/tree.c.
>>>>>>> FYI, an updated comment exists a few lines below this.
>>>>>>>
>>>>>>> Signed-off-by: Pranith Kumar 
>>>>>> In general, when removing a stale comment, I'd suggest explaining why
>>>>>> the comment is stale.  Was code removed without removing the
>>>>>> corresponding comment, or was code changed such that the comment no
>>>>>> longer applies, or...?
>>>>>
>>>>> I guess it was left out when code was moved around previously. And I did 
>>>>> mention that an updated comment is there a few lines below.
>>>>>
>>>>> For reference this is the new comment which is below the old comment, 
>>>>> they mean the same :)
>>>>>
>>>>> /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. 
>>>>> */
>>>>
>>>> Indeed that is the case.
>>>>
>>>> Please update the commit log with this explanation and resend.
>>>>
>>>>Thanx, Paul
>>>>
>>>
>>> Please find the updated patch below.
>>>
>>> --
>>> Pranith
>>>
>>> From: Pranith Kumar 
>>> Date: Mon, 14 Jul 2014 16:01:05 -0400
>>> Subject: [PATCH 2/3] rcu: Remove stale comment in tree.c
>>>
>>> This commit removes a stale comment in rcu/tree.c which was left out when 
>>> some
>>> code was moved around previously.
>>
>> Which commit caused this leftover comment? Could you mention that commit in
>> your changlog?
>>
>> 12BitsCmmtID ("commit title...")
>>
> 
> Sure, please find an updated patch with Josh Triplett's sign-off added:
> 
> From: Pranith Kumar 
> Date: Mon, 14 Jul 2014 16:01:05 -0400
> Subject: [PATCH 2/3] rcu: Remove stale comment in tree.c
> 
> 
> This commit removes a stale comment in rcu/tree.c which was left out in
> commit 2036d94a7b61ca5032ce (rcu:  Rework detection of use of RCU by offline 
> CPUs)

I suggest you use the following syntax in the future:

2036d94a7b61 ("rcu:  Rework detection of use of RCU by offline CPUs")

> 
> For reference, the following updated comment exists a few lines below this 
> which
> means the same.
> 
> /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
> 
> Signed-off-by: Pranith Kumar 
> Reviewed-by: Joe Tripplett 

Reviewed-by: Lai Jiangshan 

> ---
>  kernel/rcu/tree.c | 2 --
>  1 file changed, 2 deletions(-)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index a1abaa8..e67246e 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -2211,8 +2211,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct 
> rcu_state *rsp)
>   /* Adjust any no-longer-needed kthreads. */
>   rcu_boost_kthread_setaffinity(rnp, -1);
>  
> - /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
> -
>   /* Exclude any attempts to start a new grace period. */
>   mutex_lock(&rsp->onoff_mutex);
>   raw_spin_lock_irqsave(&rsp->orphan_lock, flags);



Re: [PATCH] workqueue: initialize cpumask of wq_numa_possible_cpumask

2014-07-06 Thread Lai Jiangshan
On 07/07/2014 01:21 AM, Yasuaki Ishimatsu wrote:
> When hot-adding and onlining CPU, kernel panic occurs, showing following
> call trace.
> 
>  BUG: unable to handle kernel paging request at 1d08
>  IP: [] __alloc_pages_nodemask+0x9d/0xb10
>  PGD 0
>  Oops:  [#1] SMP
>  ...
>  Call Trace:
>   [] ? cpumask_next_and+0x35/0x50
>   [] ? find_busiest_group+0x113/0x8f0
>   [] ? deactivate_slab+0x349/0x3c0
>   [] new_slab+0x91/0x300
>   [] __slab_alloc+0x2bb/0x482
>   [] ? copy_process.part.25+0xfc/0x14c0
>   [] ? load_balance+0x218/0x890
>   [] ? sched_clock+0x9/0x10
>   [] ? trace_clock_local+0x9/0x10
>   [] kmem_cache_alloc_node+0x8c/0x200
>   [] copy_process.part.25+0xfc/0x14c0
>   [] ? trace_buffer_unlock_commit+0x4d/0x60
>   [] ? kthread_create_on_node+0x140/0x140
>   [] do_fork+0xbc/0x360
>   [] kernel_thread+0x26/0x30
>   [] kthreadd+0x2c2/0x300
>   [] ? kthread_create_on_cpu+0x60/0x60
>   [] ret_from_fork+0x7c/0xb0
>   [] ? kthread_create_on_cpu+0x60/0x60
> 
> In my investigation, I found the root cause is wq_numa_possible_cpumask.
> All entries of wq_numa_possible_cpumask is allocated by
> alloc_cpumask_var_node(). And these entries are used without initializing.
> So these entries have wrong value.
> 
> When hot-adding and onlining CPU, wq_update_unbound_numa() is called.
> wq_update_unbound_numa() calls alloc_unbound_pwq(). And alloc_unbound_pwq()
> calls get_unbound_pool(). In get_unbound_pool(), worker_pool->node is set
> as follow:
> 
> #kernel/workqueue.c
> 3592 /* if cpumask is contained inside a NUMA node, we belong to that 
> node */
> 3593 if (wq_numa_enabled) {
> 3594 for_each_node(node) {
> 3595 if (cpumask_subset(pool->attrs->cpumask,
> 3596
> wq_numa_possible_cpumask[node])) {
> 3597 pool->node = node;
> 3598 break;
> 3599 }
> 3600 }
> 3601 }
> 
> But wq_numa_possible_cpumask[node] does not have correct cpumask. So, wrong
> node is selected. As a result, kernel panic occurs.
> 
> By this patch, all entries of wq_numa_possible_cpumask are allocated by
> zalloc_cpumask_var_node to initialize them. And the panic disappeared.
> 
> Signed-off-by: Yasuaki Ishimatsu 

Hi, Yasuaki

All cpumasks in the wq_numa_possible_cpumask array are allocated in
wq_numa_init():

for_each_node(node)
BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
node_online(node) ? node : NUMA_NO_NODE));

[snip...]

wq_numa_possible_cpumask = tbl;

I can't see how this patch makes all the entries of
wq_numa_possible_cpumask zeroed.

Or perhaps I misunderstood.

Thanks,
Lai

> ---
>  kernel/workqueue.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index 6203d29..b393ded 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -3338,7 +3338,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(gfp_t 
> gfp_mask)
>   attrs = kzalloc(sizeof(*attrs), gfp_mask);
>   if (!attrs)
>   goto fail;
> - if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask))
> + if (!zalloc_cpumask_var(&attrs->cpumask, gfp_mask))
>   goto fail;
> 
>   cpumask_copy(attrs->cpumask, cpu_possible_mask);
> 
> .
> 



Re: [PATCH] workqueue: initialize cpumask of wq_numa_possible_cpumask

2014-07-06 Thread Lai Jiangshan
On 07/07/2014 08:33 AM, Yasuaki Ishimatsu wrote:
> (2014/07/07 9:19), Lai Jiangshan wrote:
>> On 07/07/2014 01:21 AM, Yasuaki Ishimatsu wrote:
>>> When hot-adding and onlining CPU, kernel panic occurs, showing following
>>> call trace.
>>>
>>>   BUG: unable to handle kernel paging request at 1d08
>>>   IP: [] __alloc_pages_nodemask+0x9d/0xb10
>>>   PGD 0
>>>   Oops:  [#1] SMP
>>>   ...
>>>   Call Trace:
>>>[] ? cpumask_next_and+0x35/0x50
>>>[] ? find_busiest_group+0x113/0x8f0
>>>[] ? deactivate_slab+0x349/0x3c0
>>>[] new_slab+0x91/0x300
>>>[] __slab_alloc+0x2bb/0x482
>>>[] ? copy_process.part.25+0xfc/0x14c0
>>>[] ? load_balance+0x218/0x890
>>>[] ? sched_clock+0x9/0x10
>>>[] ? trace_clock_local+0x9/0x10
>>>[] kmem_cache_alloc_node+0x8c/0x200
>>>[] copy_process.part.25+0xfc/0x14c0
>>>[] ? trace_buffer_unlock_commit+0x4d/0x60
>>>[] ? kthread_create_on_node+0x140/0x140
>>>[] do_fork+0xbc/0x360
>>>[] kernel_thread+0x26/0x30
>>>[] kthreadd+0x2c2/0x300
>>>[] ? kthread_create_on_cpu+0x60/0x60
>>>[] ret_from_fork+0x7c/0xb0
>>>[] ? kthread_create_on_cpu+0x60/0x60
>>>
>>> In my investigation, I found the root cause is wq_numa_possible_cpumask.
>>> All entries of wq_numa_possible_cpumask is allocated by
>>> alloc_cpumask_var_node(). And these entries are used without initializing.
>>> So these entries have wrong value.
>>>
>>> When hot-adding and onlining CPU, wq_update_unbound_numa() is called.
>>> wq_update_unbound_numa() calls alloc_unbound_pwq(). And alloc_unbound_pwq()
>>> calls get_unbound_pool(). In get_unbound_pool(), worker_pool->node is set
>>> as follow:
>>>
>>> #kernel/workqueue.c
>>> 3592 /* if cpumask is contained inside a NUMA node, we belong to 
>>> that node */
>>> 3593 if (wq_numa_enabled) {
>>> 3594 for_each_node(node) {
>>> 3595 if (cpumask_subset(pool->attrs->cpumask,
>>> 3596
>>> wq_numa_possible_cpumask[node])) {
>>> 3597 pool->node = node;
>>> 3598 break;
>>> 3599 }
>>> 3600 }
>>> 3601 }
>>>
>>> But wq_numa_possible_cpumask[node] does not have correct cpumask. So, wrong
>>> node is selected. As a result, kernel panic occurs.
>>>
>>> By this patch, all entries of wq_numa_possible_cpumask are allocated by
>>> zalloc_cpumask_var_node to initialize them. And the panic disappeared.

Hi, Yasuaki

You said the panic disappeared with the old patch; how could that be,
given that the old patch was considered incorrect?

Did the panic happen so rarely that it only appeared to have disappeared?

How did you test the new one?

From a review point of view, we definitely need to use
zalloc_cpumask_var_node() instead of alloc_cpumask_var_node() in
wq_numa_init().
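
That is, something like the following in wq_numa_init() (a sketch of the
suggested change, not the final patch):

	for_each_node(node)
		BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
				node_online(node) ? node : NUMA_NO_NODE));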

So for the new patch:

Reviewed-by: Lai Jiangshan 

Thanks,
Lai

>>>
>>> Signed-off-by: Yasuaki Ishimatsu 
>>
>> Hi, Yasuaki
>>
>> All cpumasks in the wq_numa_possible_cpumask array are allocated in
>> wq_numa_init():
>>
>> for_each_node(node)
>> BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
>> node_online(node) ? node : NUMA_NO_NODE));
>>
>> [snip...]
>>
>> wq_numa_possible_cpumask = tbl;
>>
>> I didn't find out how does this patch make the all entries of
>> wq_numa_possible_cpumask zeroed.
> 
> Sorry. I mistook. I will resend soon.
> 
> Thanks,
> Yasuaki Ishimatsu.
> 
>>
>> Or I misunderstood.
>>
>> Thanks,
>> Lai
>>
>>> ---
>>>   kernel/workqueue.c | 2 +-
>>>   1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
>>> index 6203d29..b393ded 100644
>>> --- a/kernel/workqueue.c
>>> +++ b/kernel/workqueue.c
>>> @@ -3338,7 +3338,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(gfp_t 
>>> gfp_mask)
>>>   attrs = kzalloc(sizeof(*attrs), gfp_mask);
>>>   if (!attrs)
>>>   goto fail;
>>> -if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask))
>>> +if (!zalloc_cpumask_var(&attrs->cpumask, gfp_mask))
>>>   goto fail;
>>>
>>>   cpumask_copy(attrs->cpumask, cpu_possible_mask);
>>>
>>> .
>>>
>>
> 
> 
> .
> 



Re: [PATCH tip/core/rcu 0/4] Documentation changes for 3.17

2014-07-08 Thread Lai Jiangshan
On 07/08/2014 08:14 AM, Josh Triplett wrote:
> On Mon, Jul 07, 2014 at 03:23:45PM -0700, Paul E. McKenney wrote:
>> Hello!
>>
>> This series provides a few documentation updates:
>>
>> 1.   Clarify that if there is nothing awakened, then there is no
>>  memory barrier.
>>
>> 2.   Fix broken RTFP URL, courtesy of Pranith Kumar.
>>
>> 3.   Add acquire/release barrier to memory-barrier pairing rules.
>>
>> 4.   Add pointer to percpu-ref in rcuref.txt documentation.
> 
> For all four:
> 
> Reviewed-by: Josh Triplett 
> .
> 

For patch 1 ("documentation: Clarify wake-up/memory-barrier relationship")
and patch 4 ("documentation: Add pointer to percpu-ref for RCU and refcount")

Reviewed-by: Lai Jiangshan 


Re: [PATCH tip/core/rcu 03/17] signal: Explain local_irq_save() call

2014-07-08 Thread Lai Jiangshan
On 07/08/2014 06:38 AM, Paul E. McKenney wrote:
> From: "Paul E. McKenney" 
> 
> The explicit local_irq_save() in __lock_task_sighand() is needed to avoid
> a potential deadlock condition, as noted in a841796f11c90d53 (signal:
> align __lock_task_sighand() irq disabling and RCU).  However, someone
> reading the code might be forgiven for concluding that this separate
> local_irq_save() was completely unnecessary.  This commit therefore adds
> a comment referencing the shiny new block comment on rcu_read_unlock().
> 
> Reported-by: Oleg Nesterov 
> Signed-off-by: Paul E. McKenney 
> Acked-by: Oleg Nesterov 
> ---
>  kernel/signal.c | 4 
>  1 file changed, 4 insertions(+)
> 
> diff --git a/kernel/signal.c b/kernel/signal.c
> index a4077e90f19f..46161e744760 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -1263,6 +1263,10 @@ struct sighand_struct *__lock_task_sighand(struct 
> task_struct *tsk,
>   struct sighand_struct *sighand;
>  
>   for (;;) {
> + /*
> +  * Disable interrupts early to avoid deadlocks.
> +  * See rcu_read_unlock comment header for details.
> +  */

A pair of parentheses is missing here; it should read "rcu_read_unlock()".
After that is fixed, please add:

Reviewed-by: Lai Jiangshan 


This reminds me that I should keep working on the deadlock problem where
rcu_read_unlock() overlaps with scheduler locks.

>   local_irq_save(*flags);
>   rcu_read_lock();
>   sighand = rcu_dereference(tsk->sighand);



Re: [PATCH V2] workqueue: fix possible race condition when rescuer VS pwq-release

2014-04-16 Thread Lai Jiangshan
On 04/17/2014 12:50 AM, Tejun Heo wrote:
> Hello, Lai.
> 
> On Thu, Apr 17, 2014 at 12:21:21AM +0800, Lai Jiangshan wrote:
>> OK. It is better to use get_pwq(). I will also change the above comments to:
>>
>>   The base ref and the possible ref from rerscuer(stopped) are never
>> dropped on per-cpu pwqs.
>>   Directly free the pwqs and wq.
> 
> Hmmm, why does that matter?  The base ref is the business of
> create/destroy path and as long as base ref is never put other refs
> just don't matter.
> 
>> The reason I quickly dropped V1 and wrote the V2 is that I saw this comment.
>> "The base ref" is precise after I used get_pwq() in V1.
>>
>> Or to avoid to change to this comments.
>> I can also move the following code down to the bottom of the 
>> rescuer_thread().
>>
>> if (kthread_should_stop()) {
>> __set_current_state(TASK_RUNNING);
>> rescuer->task->flags &= ~PF_WQ_WORKER;
>> return 0;
>> }
>>
>> (I reply this email on browser, never mind the syntax).
>> Maybe the second choice are better.
> 
> Hmmm... yeah, regardlesss of the above, it's a bit nasty that rescuer
> may exit with non-empty mayday list.
> 

+* The wq->maydays list may still have some pwqs linked,
+* but it is safe to free them all together since the rescuer
+* is stopped.

I just commented on it without fixing it. It was harmless before, but it
becomes harmful once a get_pwq() is added.
Sorry.
Lai


[PATCH 1/2] workqueue: rescuer_thread() processes all pwqs before exit

2014-04-16 Thread Lai Jiangshan
Before the rescuer is picked to run, the works of the @pwq may be
processed by some other workers, and destroy_workqueue() may be called
at the same time. This may result in a nasty situation where the rescuer
exits with a non-empty mayday list.

It is harmless currently: destroy_workqueue() can safely free them all
(workqueue & pwqs) together, since the rescuer is stopped and neither
the rescuer nor the mayday timer can access the mayday list any more.

But it is nasty and error-prone for future development. Fix it.

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |   12 ++--
 1 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0ee63af..832125f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2409,12 +2409,6 @@ static int rescuer_thread(void *__rescuer)
 repeat:
set_current_state(TASK_INTERRUPTIBLE);
 
-   if (kthread_should_stop()) {
-   __set_current_state(TASK_RUNNING);
-   rescuer->task->flags &= ~PF_WQ_WORKER;
-   return 0;
-   }
-
/* see whether any pwq is asking for help */
spin_lock_irq(&wq_mayday_lock);
 
@@ -2459,6 +2453,12 @@ repeat:
 
spin_unlock_irq(&wq_mayday_lock);
 
+   if (kthread_should_stop()) {
+   __set_current_state(TASK_RUNNING);
+   rescuer->task->flags &= ~PF_WQ_WORKER;
+   return 0;
+   }
+
/* rescuers should never participate in concurrency management */
WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
schedule();
-- 
1.7.4.4



[PATCH 2/2] workqueue: fix possible race condition when rescuer VS pwq-release

2014-04-16 Thread Lai Jiangshan
There is a race condition between rescuer_thread() and
pwq_unbound_release_workfn().

The works of the @pwq may be processed by some other workers, and @pwq
may be scheduled for release (because its wq's attrs were changed) before
the rescuer starts processing it. In this case
pwq_unbound_release_workfn() will corrupt the wq->maydays list, and
rescuer_thread() will access the corrupted data.

Use get_pwq() to pin the pwq until the rescuer is done with it.

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |   13 +
 1 files changed, 13 insertions(+), 0 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 832125f..77c29b7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1916,6 +1916,16 @@ static void send_mayday(struct work_struct *work)
 
/* mayday mayday mayday */
if (list_empty(&pwq->mayday_node)) {
+   /*
+* pwqs might go away at any time, pin it until
+* rescuer is done with it.
+*
+* Especially a pwq of an unbound wq may be released
+* before wq's destruction when the wq's attr is changed.
+* In this case, pwq_unbound_release_workfn() may execute
+* earlier before rescuer_thread() and corrupt wq->maydays.
+*/
+   get_pwq(pwq);
list_add_tail(&pwq->mayday_node, &wq->maydays);
wake_up_process(wq->rescuer->task);
}
@@ -2438,6 +2448,9 @@ repeat:
 
process_scheduled_works(rescuer);
 
+   /* put the reference grabbed by send_mayday(). */
+   put_pwq(pwq);
+
/*
 * Leave this pool.  If keep_working() is %true, notify a
 * regular worker; otherwise, we end up with 0 concurrency
-- 
1.7.4.4



Re: [PATCH] workqueue: add __WQ_FREEZING and remove POOL_FREEZING

2014-04-16 Thread Lai Jiangshan
On 04/17/2014 03:51 AM, Tejun Heo wrote:
> Hello,
> 
> On Tue, Mar 25, 2014 at 05:56:04PM +0800, Lai Jiangshan wrote:
>> freezing is nothing related to pools, but POOL_FREEZING adds a connection,
>> and causes freeze_workqueues_begin() and thaw_workqueues() complicated.
>>
>> Since freezing is workqueue instance attribute, so we introduce __WQ_FREEZING
>> to wq->flags instead and remove POOL_FREEZING.
>>
>> we set __WQ_FREEZING only when freezable(to simplify 
>> pwq_adjust_max_active()),
>> make freeze_workqueues_begin() and thaw_workqueues() fast skip non-freezable 
>> wq.
> 
> Please wrap the description to 80 columns.
> 
>> @@ -3730,18 +3726,13 @@ static void pwq_unbound_release_workfn(struct 
>> work_struct *work)
>>  static void pwq_adjust_max_active(struct pool_workqueue *pwq)
>>  {
>>  struct workqueue_struct *wq = pwq->wq;
>> -bool freezable = wq->flags & WQ_FREEZABLE;
>>  
>> -/* for @wq->saved_max_active */
>> +/* for @wq->saved_max_active and @wq->flags */
>>  lockdep_assert_held(&wq->mutex);
>>  
>> -/* fast exit for non-freezable wqs */
>> -if (!freezable && pwq->max_active == wq->saved_max_active)
>> -return;
>> -
> 
> Why are we removing the above?  Can't we still test __WQ_FREEZING as
> we're holding wq->mutex?  I don't really mind removing the
> optimization but the patch description at least has to explain what's
> going on.

This part was in another old patch: https://lkml.org/lkml/2013/4/3/756
I admit the changelogs (both the old patch's and this one's) are bad.
But I still think it would be better to split this into two patches:
(https://lkml.org/lkml/2013/4/3/748 & https://lkml.org/lkml/2013/4/3/756)

The two patches have different aims.

Any thoughts? And sorry that I didn't keep pushing the patches at that time.
Thanks
Lai

> 
> ...
>>  list_for_each_entry(wq, &workqueues, list) {
>> +if (!(wq->flags & WQ_FREEZABLE))
>> +continue;
> 
> Ah, okay, you're not calling the function at all if WQ_FREEZABLE is
> not set.  I couldn't really understand what you were trying to say in
> the patch description.  Can you please try to refine the description
> more?  It's better to be verbose and clear than short and difficult to
> understand.
> 
> Thanks.
> 



Re: [PATCH 1/2] workqueue: rescuer_thread() processes all pwqs before exit

2014-04-17 Thread Lai Jiangshan
On Thu, Apr 17, 2014 at 11:27 PM, Tejun Heo  wrote:
> Hello,
>
> On Thu, Apr 17, 2014 at 07:34:08AM +0800, Lai Jiangshan wrote:
>> Before the rescuer is picked to running, the works of the @pwq
>> may be processed by some other workers, and destroy_workqueue()
>> may called at the same time. This may result a nasty situation
>> that rescuer may exit with non-empty mayday list.
>>
>> It is no harm currently, destroy_workqueue() can safely to free
>> them all(workqueue&pwqs) togerther, since the rescuer is stopped.
>> No rescuer nor mayday-timer can access the mayday list.
>>
>> But it is nasty and error-prone in future develop. Fix it.
>>
>> Signed-off-by: Lai Jiangshan 
>> ---
>>  kernel/workqueue.c |   12 ++--
>>  1 files changed, 6 insertions(+), 6 deletions(-)
>>
>> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
>> index 0ee63af..832125f 100644
>> --- a/kernel/workqueue.c
>> +++ b/kernel/workqueue.c
>> @@ -2409,12 +2409,6 @@ static int rescuer_thread(void *__rescuer)
>>  repeat:
>>   set_current_state(TASK_INTERRUPTIBLE);
>>
>> - if (kthread_should_stop()) {
>> - __set_current_state(TASK_RUNNING);
>> - rescuer->task->flags &= ~PF_WQ_WORKER;
>> - return 0;
>> - }
>> -
>>   /* see whether any pwq is asking for help */
>>   spin_lock_irq(&wq_mayday_lock);
>>
>> @@ -2459,6 +2453,12 @@ repeat:
>>
>>   spin_unlock_irq(&wq_mayday_lock);
>>
>> + if (kthread_should_stop()) {
>> + __set_current_state(TASK_RUNNING);
>> + rescuer->task->flags &= ~PF_WQ_WORKER;
>> + return 0;
>> + }
>> +
>
> I don't think this is reliable.  What if mayday requests take place
> between wq_mayday_lock and kthread_should_stop() check?  We'll
> probably need to run through mayday list after checking should_stop.

It is destroy_workqueue()'s responsibility to avoid this.
destroy_workqueue() should drain all works and refuse any newly queued
work on the wq before destroying the wq.

So since there are no works, there are no new mayday requests, and no
mayday request can take place between dropping wq_mayday_lock and the
kthread_should_stop() check.

Thanks,
Lai

>
> Thanks.
>
> --
> tejun


Re: [PATCH 1/2] workqueue: rescuer_thread() processes all pwqs before exit

2014-04-17 Thread Lai Jiangshan
On Fri, Apr 18, 2014 at 12:08 AM, Tejun Heo  wrote:
> Hello, Lai.
>
> On Fri, Apr 18, 2014 at 12:04:29AM +0800, Lai Jiangshan wrote:
>> > I don't think this is reliable.  What if mayday requests take place
>> > between wq_mayday_lock and kthread_should_stop() check?  We'll
>> > probably need to run through mayday list after checking should_stop.
>>
>> It is destroy_workqueue()'s responsibility to avoid this.
>> destroy_workqueue() should drain all works and refuse any new work queued
>> on the wq before destroy the wq.
>>
>> So since there is no works, there is no new mayday request,
>> and there is no mayday request take place between wq_mayday_lock
>> and kthread_should_stop() check.
>
> Hmmm?  Isn't this the same race condition that you tried to remove by
> relocating the test?  It doesn't matter what destroy_workqueue() does,
> the rescuer may get preempted inbetween and anything can happen
> inbetween including someone maydaying and initiation of
> destroy_workqueue().  Your patch doesn't change the situation at all.
> It can still return with non-empty mayday list.

You are right. We need an additional atomic check.
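
For example, something along these lines -- sample the stop condition before
draining wq->maydays and only return after the list has been processed
(essentially what the V4 patch later in this series does):

repeat:
	set_current_state(TASK_INTERRUPTIBLE);
	should_stop = kthread_should_stop();	/* sample before draining */

	/* ... drain wq->maydays under wq_mayday_lock as before ... */

	if (should_stop) {
		__set_current_state(TASK_RUNNING);
		rescuer->task->flags &= ~PF_WQ_WORKER;
		return 0;
	}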

Thanks,
Lai

>
> Thanks.
>
> --
> tejun


[PATCH 0/8] idr: fix & cleanup

2014-04-18 Thread Lai Jiangshan
Patch 1 fixes a bug caused by overflow.
Patches 2 and 3 add checks for unallocated ids.
Patch 4 changes the returned error code.
Patches 5-8 are cleanups.

Lai Jiangshan (8):
  idr: fix overflow bug for the max-high layer
  idr: fix unexpected id-removal when idr_remove(unallocated_id)
  idr: fix NULL pointer dereference when ida_remove(unallocated_id)
  idr: fix idr_replace()'s returned error code
  idr: convert BUG_ON() to WARN_ON_ONCE() if the argument is invalid.
  idr: avoid ping-pong
  idr: don't need to shink the free list when idr_remove()
  idr: reduce the unneeded check in free_layer()

 lib/idr.c |   58 +-
 1 files changed, 17 insertions(+), 41 deletions(-)

-- 
1.7.4.4



[PATCH 3/8] idr: fix NULL pointer dereference when ida_remove(unallocated_id)

2014-04-18 Thread Lai Jiangshan
If the ida has at least one existing id and a special unallocated id is
passed to ida_remove(), the system will crash with a NULL pointer
dereference.

This special unallocated id shares its lowest idr layer with an existing
id, but would occupy a different idr slot (if it were allocated). In this
case the supposed idr slot of the unallocated id is NULL, which means
@bitmap == NULL, and when the code dereferences it, it crashes the kernel.

See the test code:

static void test3(void)
{
int id;
DEFINE_IDA(test_ida);

printk(KERN_INFO "Start test3\n");
if (ida_pre_get(&test_ida, GFP_KERNEL) < 0) return;
if (ida_get_new(&test_ida,  &id) < 0) return;
ida_remove(&test_ida, 4000); /* bug: null deference here */
printk(KERN_INFO "End of test3\n");
}

This only happens with an unallocated id, which is the caller's fault, so
it is not a bug. But it is better to add a proper check and complain
instead of crashing the kernel.

Signed-off-by: Lai Jiangshan 
---
 lib/idr.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/lib/idr.c b/lib/idr.c
index a28036d..d200d67 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -1045,7 +1045,7 @@ void ida_remove(struct ida *ida, int id)
__clear_bit(n, p->bitmap);
 
bitmap = (void *)p->ary[n];
-   if (!test_bit(offset, bitmap->bitmap))
+   if (!bitmap || !test_bit(offset, bitmap->bitmap))
goto err;
 
/* update bitmap and remove it if empty */
-- 
1.7.4.4



[PATCH 4/8] idr: fix idr_replace()'s returned error code

2014-04-18 Thread Lai Jiangshan
When a small id is not found, idr_replace() returns -ENOENT, but when
the id is too big, idr_replace() returns -EINVAL, even though there is
actually no difference between these two kinds of ids.

They are all unallocated ids, so idr_replace() should return the same
value for them: -ENOENT.

Signed-off-by: Lai Jiangshan 
---
 lib/idr.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/idr.c b/lib/idr.c
index d200d67..104f87f 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -814,10 +814,10 @@ void *idr_replace(struct idr *idp, void *ptr, int id)
 
p = idp->top;
if (!p)
-   return ERR_PTR(-EINVAL);
+   return ERR_PTR(-ENOENT);
 
if (id > idr_max(p->layer + 1))
-   return ERR_PTR(-EINVAL);
+   return ERR_PTR(-ENOENT);
 
n = p->layer * IDR_BITS;
while ((n > 0) && p) {
-- 
1.7.4.4



[PATCH 8/8] idr: reduce the unneeded check in free_layer()

2014-04-18 Thread Lai Jiangshan
If "idr->hint == p" is true, it also implies "idr->hint" is true(not NULL).

Signed-off-by: Lai Jiangshan 
---
 lib/idr.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/lib/idr.c b/lib/idr.c
index 314ea5f..08d010c 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -143,7 +143,7 @@ static void idr_layer_rcu_free(struct rcu_head *head)
 
 static inline void free_layer(struct idr *idr, struct idr_layer *p)
 {
-   if (idr->hint && idr->hint == p)
+   if (idr->hint == p)
RCU_INIT_POINTER(idr->hint, NULL);
call_rcu(&p->rcu_head, idr_layer_rcu_free);
 }
-- 
1.7.4.4



[PATCH 6/8] idr: avoid ping-pong

2014-04-18 Thread Lai Jiangshan
ida callers always call ida_pre_get() before ida_get_new*().
ida_pre_get() does layer allocation, and ida_get_new*() does layer
removal.

This causes an unneeded ping-pong, and the speculative layer removal in
ida_get_new*() cannot deliver the expected optimization.

So remove the unneeded layer removal from ida_get_new*().
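
(For reference, the caller pattern in question looks roughly like this;
alloc_id() is a made-up wrapper used purely for illustration:)

static int alloc_id(struct ida *ida, int *id)
{
	int ret;

	do {
		if (!ida_pre_get(ida, GFP_KERNEL))	/* may preallocate a layer */
			return -ENOMEM;
		ret = ida_get_new(ida, id);	/* used to also free a spare layer */
	} while (ret == -EAGAIN);

	return ret;
}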

Signed-off-by: Lai Jiangshan 
---
 lib/idr.c |   11 ---
 1 files changed, 0 insertions(+), 11 deletions(-)

diff --git a/lib/idr.c b/lib/idr.c
index e4f9965..be0b6ff 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -1001,17 +1001,6 @@ int ida_get_new_above(struct ida *ida, int starting_id, 
int *p_id)
 
*p_id = id;
 
-   /* Each leaf node can handle nearly a thousand slots and the
-* whole idea of ida is to have small memory foot print.
-* Throw away extra resources one by one after each successful
-* allocation.
-*/
-   if (ida->idr.id_free_cnt || ida->free_bitmap) {
-   struct idr_layer *p = get_from_free_list(&ida->idr);
-   if (p)
-   kmem_cache_free(idr_layer_cache, p);
-   }
-
return 0;
 }
 EXPORT_SYMBOL(ida_get_new_above);
-- 
1.7.4.4



[PATCH 7/8] idr: don't need to shink the free list when idr_remove()

2014-04-18 Thread Lai Jiangshan
Since the idr subsystem was made RCU-aware, freed layers no longer go to
the free list, so the free list will not be filled up by idr_remove() and
we don't need to shrink it there either.

"#ifndef TEST" no longer works for the user-space test, so just remove it.

Signed-off-by: Lai Jiangshan 
---
 lib/idr.c |   17 -
 1 files changed, 0 insertions(+), 17 deletions(-)

diff --git a/lib/idr.c b/lib/idr.c
index be0b6ff..314ea5f 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -18,19 +18,11 @@
  * pointer or what ever, we treat it as a (void *).  You can pass this
  * id to a user for him to pass back at a later time.  You then pass
  * that id to this code and it returns your pointer.
-
- * You can release ids at any time. When all ids are released, most of
- * the memory is returned (we keep MAX_IDR_FREE) in a local pool so we
- * don't need to go to the memory "store" during an id allocate, just
- * so you don't need to be too concerned about locking and conflicts
- * with the slab allocator.
  */
 
-#ifndef TEST// to test in user space...
 #include 
 #include 
 #include 
-#endif
 #include 
 #include 
 #include 
@@ -584,15 +576,6 @@ void idr_remove(struct idr *idp, int id)
bitmap_clear(to_free->bitmap, 0, IDR_SIZE);
free_layer(idp, to_free);
}
-   while (idp->id_free_cnt >= MAX_IDR_FREE) {
-   p = get_from_free_list(idp);
-   /*
-* Note: we don't call the rcu callback here, since the only
-* layers that fall into the freelist are those that have been
-* preallocated.
-*/
-   kmem_cache_free(idr_layer_cache, p);
-   }
return;
 }
 EXPORT_SYMBOL(idr_remove);
-- 
1.7.4.4



[PATCH 5/8] idr: convert BUG_ON() to WARN_ON_ONCE() if the argument is invalid.

2014-04-18 Thread Lai Jiangshan
When the arguments passed by the caller are invalid, WARN_ON_ONCE()
is more appropriate than BUG_ON(), which may crash the kernel.

Signed-off-by: Lai Jiangshan 
---
 lib/idr.c |9 +
 1 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/lib/idr.c b/lib/idr.c
index 104f87f..e4f9965 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -1093,13 +1093,14 @@ int ida_simple_get(struct ida *ida, unsigned int start, 
unsigned int end,
unsigned int max;
unsigned long flags;
 
-   BUG_ON((int)start < 0);
-   BUG_ON((int)end < 0);
+   if (WARN_ON_ONCE(((int)start < 0) || ((int)end < 0)))
+   return -EINVAL;
 
if (end == 0)
max = 0x8000;
else {
-   BUG_ON(end < start);
+   if (WARN_ON_ONCE(end < start))
+   return -EINVAL;
max = end - 1;
}
 
@@ -1135,7 +1136,7 @@ void ida_simple_remove(struct ida *ida, unsigned int id)
 {
unsigned long flags;
 
-   BUG_ON((int)id < 0);
+   WARN_ON_ONCE((int)id < 0);
spin_lock_irqsave(&simple_ida_lock, flags);
ida_remove(ida, id);
spin_unlock_irqrestore(&simple_ida_lock, flags);
-- 
1.7.4.4



[PATCH 2/8] idr: fix unexpected id-removal when idr_remove(unallocated_id)

2014-04-18 Thread Lai Jiangshan
If unallocated_id = (ANY * idr_max(idp->layers) + existed_id) is passed
to idr_remove(), the existed_id will be removed unexpectedly.

The following test shows this unexpected id-removal:

static void test4(void)
{
int id;
DEFINE_IDR(test_idr);

printk(KERN_INFO "Start test4\n");
id = idr_alloc(&test_idr, (void *)1, 42, 43, GFP_KERNEL);
BUG_ON(id != 42);
idr_remove(&test_idr, 42 + IDR_SIZE);
TEST_BUG_ON(idr_find(&test_idr, 42) != (void *)1);
idr_destroy(&test_idr);
printk(KERN_INFO "End of test4\n");
}

This only happens with an unallocated id, which is the caller's fault, so
it is not a bug. But it is better to add a proper check and complain
instead of removing an existed_id silently.

Signed-off-by: Lai Jiangshan 
---
 lib/idr.c |5 +
 1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/lib/idr.c b/lib/idr.c
index 4df6792..a28036d 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -562,6 +562,11 @@ void idr_remove(struct idr *idp, int id)
if (id < 0)
return;
 
+   if (id > idr_max(idp->layers)) {
+   idr_remove_warning(id);
+   return;
+   }
+
sub_remove(idp, (idp->layers - 1) * IDR_BITS, id);
if (idp->top && idp->top->count == 1 && (idp->layers > 1) &&
idp->top->ary[0]) {
-- 
1.7.4.4



[PATCH 1/8] idr: fix overflow bug for the max-high layer

2014-04-18 Thread Lai Jiangshan
In idr_replace(), when the top layer is the maximum-height layer
(p->layer == 3), "(1 << n)" overflows and the result is 0, which causes
idr_replace() to return -EINVAL even though the id is actually valid.

The following test code shows it fails to replace the value for id=((1<<27)+42):

static void test5(void)
{
int id;
DEFINE_IDR(test_idr);
#define TEST5_START ((1<<27)+42) /* use the highest layer */

printk(KERN_INFO "Start test5\n");
id = idr_alloc(&test_idr, (void *)1, TEST5_START, 0, GFP_KERNEL);
BUG_ON(id != TEST5_START);
TEST_BUG_ON(idr_replace(&test_idr, (void *)2, TEST5_START) != (void 
*)1);
idr_destroy(&test_idr);
printk(KERN_INFO "End of test5\n");
}

Fix the bug by using idr_max() instead of the incorrect open-coded check.

The same problem exists in sub_alloc(): the overflow causes sub_alloc() to
return -EAGAIN unexpectedly. But idr_get_empty_slot() calls it again with
an increased @id, so the bug is hidden there.
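
(For reference, idr_max() clamps the shift so that it cannot overflow;
paraphrased from lib/idr.c of this era, not a verbatim copy:)

static int idr_max(int layers)
{
	int bits = min_t(int, layers * IDR_BITS, MAX_IDR_SHIFT);

	return (1 << bits) - 1;
}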

CC: sta...@vger.kernel.org
Signed-off-by: Lai Jiangshan 
---
 lib/idr.c |8 +++-
 1 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/lib/idr.c b/lib/idr.c
index 2642fa8..4df6792 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -249,7 +249,7 @@ static int sub_alloc(struct idr *idp, int *starting_id, 
struct idr_layer **pa,
id = (id | ((1 << (IDR_BITS * l)) - 1)) + 1;
 
/* if already at the top layer, we need to grow */
-   if (id >= 1 << (idp->layers * IDR_BITS)) {
+   if (id > idr_max(idp->layers)) {
*starting_id = id;
return -EAGAIN;
}
@@ -811,12 +811,10 @@ void *idr_replace(struct idr *idp, void *ptr, int id)
if (!p)
return ERR_PTR(-EINVAL);
 
-   n = (p->layer+1) * IDR_BITS;
-
-   if (id >= (1 << n))
+   if (id > idr_max(p->layer + 1))
return ERR_PTR(-EINVAL);
 
-   n -= IDR_BITS;
+   n = p->layer * IDR_BITS;
while ((n > 0) && p) {
p = p->ary[(id >> n) & IDR_MASK];
n -= IDR_BITS;
-- 
1.7.4.4



[PATCH 2/2 V4] workqueue: fix possible race condition when rescuer VS pwq-release

2014-04-18 Thread Lai Jiangshan
There is a race condition between rescuer_thread() and
pwq_unbound_release_workfn().

The works of the @pwq may be processed by some other workers, and @pwq
may be scheduled for release (because its wq's attrs were changed) before
the rescuer starts processing it. In this case
pwq_unbound_release_workfn() will corrupt the wq->maydays list, and
rescuer_thread() will access the corrupted data.

Use get_pwq() to pin the pwq until the rescuer is done with it.

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |   13 +
 1 files changed, 13 insertions(+), 0 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 7539244..8c0830c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1916,6 +1916,16 @@ static void send_mayday(struct work_struct *work)
 
/* mayday mayday mayday */
if (list_empty(&pwq->mayday_node)) {
+   /*
+* pwqs might go away at any time, pin it until the
+* rescuer is done with it.
+*
+* Especially a pwq of an unbound wq may be released
+* before wq's destruction when the wq's attr is changed.
+* In this case, pwq_unbound_release_workfn() may execute
+* earlier before rescuer_thread() and corrupt wq->maydays.
+*/
+   get_pwq(pwq);
list_add_tail(&pwq->mayday_node, &wq->maydays);
wake_up_process(wq->rescuer->task);
}
@@ -2447,6 +2457,9 @@ repeat:
 
process_scheduled_works(rescuer);
 
+   /* put the reference grabbed by send_mayday(). */
+   put_pwq(pwq);
+
/*
 * Leave this pool.  If keep_working() is %true, notify a
 * regular worker; otherwise, we end up with 0 concurrency
-- 
1.7.4.4



[PATCH 1/2 V4] workqueue: rescuer_thread() processes all pwqs before exit

2014-04-18 Thread Lai Jiangshan
Before the rescuer is picked to run, the work items of the @pwq may be
processed by some other workers, and destroy_workqueue() may be called at
the same time. This can result in a nasty situation where the rescuer exits
with a non-empty mayday list.

It is harmless currently: destroy_workqueue() can safely free them all
(workqueue & pwqs) together, since the rescuer is stopped and neither the
rescuer nor the mayday timer can access the mayday list any more.

But it is nasty and error-prone for future development. Fix it.

Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c |   21 +++--
 1 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0ee63af..7539244 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2398,6 +2398,7 @@ static int rescuer_thread(void *__rescuer)
struct worker *rescuer = __rescuer;
struct workqueue_struct *wq = rescuer->rescue_wq;
struct list_head *scheduled = &rescuer->scheduled;
+   bool should_stop;
 
set_user_nice(current, RESCUER_NICE_LEVEL);
 
@@ -2408,12 +2409,14 @@ static int rescuer_thread(void *__rescuer)
rescuer->task->flags |= PF_WQ_WORKER;
 repeat:
set_current_state(TASK_INTERRUPTIBLE);
-
-   if (kthread_should_stop()) {
-   __set_current_state(TASK_RUNNING);
-   rescuer->task->flags &= ~PF_WQ_WORKER;
-   return 0;
-   }
+   /*
+* When the rescuer is requested to stop, the workqueue has no
+* work pending, but wq->maydays may still have pwq(s) queued.
+* This can happen when some other workers process all the works
+* before this rescuer is scheduled. The rescuer must process
+* all pwq(s) before exit.
+*/
+   should_stop = kthread_should_stop();
 
/* see whether any pwq is asking for help */
spin_lock_irq(&wq_mayday_lock);
@@ -2459,6 +2462,12 @@ repeat:
 
spin_unlock_irq(&wq_mayday_lock);
 
+   if (should_stop) {
+   __set_current_state(TASK_RUNNING);
+   rescuer->task->flags &= ~PF_WQ_WORKER;
+   return 0;
+   }
+
/* rescuers should never participate in concurrency management */
WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
schedule();
-- 
1.7.4.4



Re: [PATCH 2/2] workqueue: fix a possible race condition between rescuer and pwq-release

2014-04-18 Thread Lai Jiangshan
On Fri, Apr 18, 2014 at 11:06 PM, Tejun Heo  wrote:
> Hello,
>
> Applied to wq/for-3.15-fixes with put_pwq() relocated and patch
> description and comments updated.
>
> Thanks.
>  8< 
> From 4b81955722abe4306096c7b0137e0491a7ba7b0e Mon Sep 17 00:00:00 2001
> From: Lai Jiangshan 
> Date: Fri, 18 Apr 2014 11:04:16 -0400
>
> There is a race condition between rescuer_thread() and
> pwq_unbound_release_workfn().
>
> Even after a pwq is scheduled for rescue, the associated work items
> may be consumed by any worker.  If all of them are consumed before the
> rescuer gets to them and the pwq's base ref was put due to attribute
> change, the pwq may be released while still being linked on
> @wq->maydays list making the rescuer dereference already freed pwq
> later.
>
> Make send_mayday() pin the target pwq until the rescuer is done with
> it.
>
> tj: Updated comment and patch description.  Moved put_pwq() to after
> the last @pwq->pool access.  This isn't strictly necessarily for
>     correctness but is cleaner as the pool is accessed through the
> pwq.
>
> Signed-off-by: Lai Jiangshan 
> Signed-off-by: Tejun Heo 
> Cc: sta...@vger.kernel.org # v3.10+
> ---
>  kernel/workqueue.c | 7 +++
>  1 file changed, 7 insertions(+)
>
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index 6ba0c60..a6532f2 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -1916,6 +1916,12 @@ static void send_mayday(struct work_struct *work)
>
> /* mayday mayday mayday */
> if (list_empty(&pwq->mayday_node)) {
> +   /*
> +* If @pwq is for an unbound wq, its base ref may be put at
> +* any time due to an attribute change.  Pin @pwq until the
> +* rescuer is done with it.
> +*/
> +   get_pwq(pwq);
> list_add_tail(&pwq->mayday_node, &wq->maydays);
> wake_up_process(wq->rescuer->task);
> }
> @@ -2459,6 +2465,7 @@ repeat:
>
> rescuer->pool = NULL;
> spin_unlock(&pool->lock);
> +   put_pwq(pwq);   /* put the ref from send_mayday() */

put_pwq() requires pool->lock held.
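
A minimal sketch of a placement that satisfies this, following the V4 patch
earlier in the thread (fragment of the rescuer loop only, not the final
committed code):

	process_scheduled_works(rescuer);

	/* put the reference grabbed by send_mayday(); put_pwq() needs pool->lock */
	put_pwq(pwq);

	/* keep_working() notification elided */

	rescuer->pool = NULL;
	spin_unlock(&pool->lock);
	spin_lock(&wq_mayday_lock);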

> spin_lock(&wq_mayday_lock);
> }
>
> --
> 1.9.0
>


Re: [PATCH 1/8] idr: fix overflow bug for the max-high layer

2014-04-18 Thread Lai Jiangshan
On Sat, Apr 19, 2014 at 12:29 AM, Tejun Heo  wrote:
> Hello,

Hi,

Should I resend the patch with your updated changelog?
Or is there something else I need to do?

>
> Subject: idr: fix overflow bug during maximum ID calculation at maximum height
>
> On Fri, Apr 18, 2014 at 08:49:48PM +0800, Lai Jiangshan wrote:
>> In idr_replace(), when the top layer is the max-high layer(p->layer == 3),
>> The "(1 << n)" will overflow and the result is 0, it causes idr_replace()
>> return -EINVAL even the id is actually valid.
>
>   idr_replace() open-codes the logic to calculate the maximum valid ID
>   given the height of the idr tree; unfortunately, the open-coded logic
>   doesn't account for the fact that the top layer may have unused slots
>   and over-shifts the limit to zero when the tree is at its maximum
>   height.
>
>> The following test code shows it fails to replace the value for 
>> id=((1<<27)+42):
>>
>> static void test5(void)
>> {
>>   int id;
>>   DEFINE_IDR(test_idr);
>> #define TEST5_START ((1<<27)+42) /* use the highest layer */
>>
>>   printk(KERN_INFO "Start test5\n");
>>   id = idr_alloc(&test_idr, (void *)1, TEST5_START, 0, GFP_KERNEL);
>>   BUG_ON(id != TEST5_START);
>>   TEST_BUG_ON(idr_replace(&test_idr, (void *)2, TEST5_START) != (void 
>> *)1);
>>   idr_destroy(&test_idr);
>>   printk(KERN_INFO "End of test5\n");
>> }

Does the above test code need to be kept in the changelog?

Thanks,
Lai

>>
>> Fixed the bug by using idr_max() instead of the incorrect open code.
>
>   Fix the bug by using idr_max() which correctly takes into account the
>   maximum allowed shift.
>
>> There is the same problem in sub_alloc(). The overflow causes sub_alloc()
>> returns -EAGAIN unexpectedly. But the idr_get_empty_slot() will call it
>> again with increased @id. So the bug is hided.
>
>   sub_alloc() shares the same problem and may incorrectly fail with
>   -EAGAIN; however, this bug doesn't affect correct operation because
>   idr_get_empty_slot(), which already uses idr_max(), retries with the
>   increased @id in such cases.
>
>> CC: sta...@vger.kernel.org
>> Signed-off-by: Lai Jiangshan 
>
>  Acked-by: Tejun Heo 
>
> Thanks.
>
> --
> tejun


Re: [PATCH 6/8] idr: avoid ping-pong

2014-04-19 Thread Lai Jiangshan
On 04/19/2014 01:17 AM, Tejun Heo wrote:
> On Fri, Apr 18, 2014 at 08:49:53PM +0800, Lai Jiangshan wrote:
>> The ida callers always calls ida_pre_get() before ida_get_new*().
>> ida_pre_get() will do layer allocation, and ida_get_new*() will do layer 
>> removal.
>>
>> It causes an unneeded ping-pong. The speculative layer removal in
>> ida_get_new*() can't result expected optimization.
>>
>> So we remove the unneeded layer removal in ida_get_new*().
> 
> But the as comment says, ida doesn't want to keep extra layers around
> as it wants to keep its memory footprint minimal.  

It only frees one layer, and the ida_pre_get() for the next ida_get_new*()
will allocate it back again. The aim of "throwing away extra resources one
by one" can't be achieved, and it can't keep the memory footprint minimal.
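
A rough trace of the cycle (my reading of the current lib/idr.c, shown as
comments and only for illustration):

	/*
	 * ida_pre_get(ida, gfp)
	 *     -> __idr_pre_get(): kmem_cache_alloc() a spare idr_layer and
	 *        push it onto the per-idr free list
	 * ida_get_new_above(ida, ...)
	 *     -> allocates the id, then "throws away extra resources":
	 *        get_from_free_list() + kmem_cache_free() of that spare layer
	 *
	 * The next allocation repeats the same alloc/free pair, so the layer
	 * just ping-pongs between the slab and the free list instead of
	 * shrinking the footprint.
	 */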


> I think the right
> thing to do is implementing ida_preload() which is simliar to
> idr_preload() and do away with per-ida layer cache.

Yes and no.

We need a static private ida_preload() for ida_simple_get() only.

Because the IDA doesn't have any query function, its own synchronization is
enough for all use cases, so the IDA should off-load the callers'
synchronization burden.

In my todo list, the IDA only needs the following functions; other functions
will be deprecated and scheduled for removal (a usage sketch follows the list):
void ida_destroy(struct ida *ida);
void ida_init(struct ida *ida);
int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end, gfp_t gfp_mask);
void ida_simple_remove(struct ida *ida, unsigned int id);
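
A minimal usage sketch of that reduced API (the function name
example_alloc() and the ida name are made up for illustration):

	static DEFINE_IDA(example_ida);

	static int example_alloc(void)
	{
		int id;

		/* ida_simple_get() handles locking and preallocation internally */
		id = ida_simple_get(&example_ida, 0, 0, GFP_KERNEL);
		if (id < 0)
			return id;

		/* ... use the id ... */

		ida_simple_remove(&example_ida, id);
		return 0;
	}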

(I don't think we need any query functions, but...)
If in the future we need query functions such as ida_is_this_id_allocated()
or ida_find_next_[un]allocated_id(), we can expose ida_preload() and
ida_alloc() at the same time that we introduce them.

Any thought?

But we need to remove this unneeded ping-pong regardless of which plan we take.

Thanks,
Lai


[PATCH 0/9 V2] idr: fix & cleanup

2014-04-19 Thread Lai Jiangshan
Patch 1 fixes a bug caused by an overflow.
Patches 2-3 add checks for unallocated IDs.
Patch 4 changes idr_replace()'s returned error code.
Patches 5-9 are cleanups.


Lai Jiangshan (9):
  idr: fix overflow bug during maximum ID calculation at maximum height
  idr: fix unexpected ID-removal when idr_remove(unallocated_id)
**^^also consider ida_remove()
  idr: fix NULL pointer dereference when ida_remove(unallocated_id)
  idr: fix idr_replace()'s returned error code
  idr: convert BUG_ON() to WARN_ON_ONCE() if the argument is invalid.
**^^move the test into ida_remove() as tj suggested.
  idr: avoid ping-pong
  idr: don't need to shrink the free list when idr_remove()
**^^split
  idr: reduce the unneeded check in free_layer()
  idr: remove useless C-PreProcessor branch
**^^new

 lib/idr.c |   66 +---
 1 files changed, 23 insertions(+), 43 deletions(-)

-- 
1.7.4.4



[PATCH 2/9 V2] idr: fix unexpected ID-removal when idr_remove(unallocated_id)

2014-04-19 Thread Lai Jiangshan
If unallocated_id = (ANY * (idr_max(idp->layers) + 1) + existing_id) is passed
to idr_remove(), the existing_id will be removed unexpectedly.

The following test shows this unexpected id-removal:

static void test4(void)
{
int id;
DEFINE_IDR(test_idr);

printk(KERN_INFO "Start test4\n");
id = idr_alloc(&test_idr, (void *)1, 42, 43, GFP_KERNEL);
BUG_ON(id != 42);
idr_remove(&test_idr, 42 + IDR_SIZE);
TEST_BUG_ON(idr_find(&test_idr, 42) != (void *)1);
idr_destroy(&test_idr);
printk(KERN_INFO "End of test4\n");
}
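
A worked example of why the wrong slot is hit, assuming IDR_BITS == 8 (so
IDR_SIZE == 256 and IDR_MASK == 255) and a single-layer tree:

	/*
	 * id passed in:  42 + IDR_SIZE = 298
	 * layers == 1, so sub_remove() goes straight to the top layer and uses
	 *     n = id & IDR_MASK = 298 & 255 = 42
	 * i.e. the slot holding the existing id 42, which is then freed even
	 * though 298 was never allocated.
	 */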

ida_remove() shares a similar problem.

This happens only when the caller tries to free an unallocated ID, which is
the caller's fault, so it is not a bug. But it is better to add a proper
check and complain than to silently remove an existing ID.

Signed-off-by: Lai Jiangshan 
Acked-by: Tejun Heo 
---
 lib/idr.c |8 
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/lib/idr.c b/lib/idr.c
index 4df6792..c4afd94 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -562,6 +562,11 @@ void idr_remove(struct idr *idp, int id)
if (id < 0)
return;
 
+   if (id > idr_max(idp->layers)) {
+   idr_remove_warning(id);
+   return;
+   }
+
sub_remove(idp, (idp->layers - 1) * IDR_BITS, id);
if (idp->top && idp->top->count == 1 && (idp->layers > 1) &&
idp->top->ary[0]) {
@@ -1025,6 +1030,9 @@ void ida_remove(struct ida *ida, int id)
int n;
struct ida_bitmap *bitmap;
 
+   if (idr_id > idr_max(ida->idr.layers))
+   goto err;
+
/* clear full bits while looking up the leaf idr_layer */
while ((shift > 0) && p) {
n = (idr_id >> shift) & IDR_MASK;
-- 
1.7.4.4



[PATCH 9/9 V2] idr: remove useless C-PreProcessor branch

2014-04-19 Thread Lai Jiangshan
"#ifndef TEST" can't work for user space test now, just remove it.

Signed-off-by: Lai Jiangshan 
---
 lib/idr.c |2 --
 1 files changed, 0 insertions(+), 2 deletions(-)

diff --git a/lib/idr.c b/lib/idr.c
index e3c1da0..d77cdca 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -20,11 +20,9 @@
  * that id to this code and it returns your pointer.
  */
 
-#ifndef TEST// to test in user space...
 #include 
 #include 
 #include 
-#endif
 #include 
 #include 
 #include 
-- 
1.7.4.4



[PATCH 7/9 V2] idr: don't need to shrink the free list when idr_remove()

2014-04-19 Thread Lai Jiangshan
After the idr subsystem was changed to be RCU-aware, freed layers no longer
go to the free list, so the free list is not filled up by idr_remove() and
we don't need to shrink it there either.

Signed-off-by: Lai Jiangshan 
Acked-by: Tejun Heo 
---
 lib/idr.c |   16 
 1 files changed, 0 insertions(+), 16 deletions(-)

diff --git a/lib/idr.c b/lib/idr.c
index 25fe476..4a11c5d 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -18,12 +18,6 @@
  * pointer or what ever, we treat it as a (void *).  You can pass this
  * id to a user for him to pass back at a later time.  You then pass
  * that id to this code and it returns your pointer.
-
- * You can release ids at any time. When all ids are released, most of
- * the memory is returned (we keep MAX_IDR_FREE) in a local pool so we
- * don't need to go to the memory "store" during an id allocate, just
- * so you don't need to be too concerned about locking and conflicts
- * with the slab allocator.
  */
 
 #ifndef TEST// to test in user space...
@@ -584,16 +578,6 @@ void idr_remove(struct idr *idp, int id)
bitmap_clear(to_free->bitmap, 0, IDR_SIZE);
free_layer(idp, to_free);
}
-   while (idp->id_free_cnt >= MAX_IDR_FREE) {
-   p = get_from_free_list(idp);
-   /*
-* Note: we don't call the rcu callback here, since the only
-* layers that fall into the freelist are those that have been
-* preallocated.
-*/
-   kmem_cache_free(idr_layer_cache, p);
-   }
-   return;
 }
 EXPORT_SYMBOL(idr_remove);
 
-- 
1.7.4.4



[PATCH 8/9 V2] idr: reduce the unneeded check in free_layer()

2014-04-19 Thread Lai Jiangshan
If "idr->hint == p" is true, it also implies "idr->hint" is true(not NULL).

Signed-off-by: Lai Jiangshan 
Acked-by: Tejun Heo 
---
 lib/idr.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/lib/idr.c b/lib/idr.c
index 4a11c5d..e3c1da0 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -145,7 +145,7 @@ static void idr_layer_rcu_free(struct rcu_head *head)
 
 static inline void free_layer(struct idr *idr, struct idr_layer *p)
 {
-   if (idr->hint && idr->hint == p)
+   if (idr->hint == p)
RCU_INIT_POINTER(idr->hint, NULL);
call_rcu(&p->rcu_head, idr_layer_rcu_free);
 }
-- 
1.7.4.4



[PATCH 5/9 V2] idr: convert BUG_ON() to WARN_ON_ONCE() if the argument is invalid.

2014-04-19 Thread Lai Jiangshan
When the arguments passed by the caller are invalid, WARN_ON_ONCE() is more
appropriate than BUG_ON(), which may crash the kernel.

The validity check for ida_simple_remove() is moved into ida_remove(), and
idr_remove() gains the same WARN_ON_ONCE().

And when "end < start" in ida_simple_get(), it now returns -ENOSPC, as
ida_alloc() does.

Signed-off-by: Lai Jiangshan 
---
 lib/idr.c |   13 -
 1 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/lib/idr.c b/lib/idr.c
index e79e051..317fd35 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -559,7 +559,7 @@ void idr_remove(struct idr *idp, int id)
struct idr_layer *p;
struct idr_layer *to_free;
 
-   if (id < 0)
+   if (WARN_ON_ONCE(id < 0))
return;
 
if (id > idr_max(idp->layers)) {
@@ -1030,6 +1030,9 @@ void ida_remove(struct ida *ida, int id)
int n;
struct ida_bitmap *bitmap;
 
+   if (WARN_ON_ONCE(id < 0))
+   return;
+
if (idr_id > idr_max(ida->idr.layers))
goto err;
 
@@ -1096,13 +1099,14 @@ int ida_simple_get(struct ida *ida, unsigned int start, 
unsigned int end,
unsigned int max;
unsigned long flags;
 
-   BUG_ON((int)start < 0);
-   BUG_ON((int)end < 0);
+   if (WARN_ON_ONCE(((int)start < 0) || ((int)end < 0)))
+   return -EINVAL;
 
if (end == 0)
max = 0x8000;
else {
-   BUG_ON(end < start);
+   if (WARN_ON_ONCE(end < start))
+   return -ENOSPC;
max = end - 1;
}
 
@@ -1138,7 +1142,6 @@ void ida_simple_remove(struct ida *ida, unsigned int id)
 {
unsigned long flags;
 
-   BUG_ON((int)id < 0);
spin_lock_irqsave(&simple_ida_lock, flags);
ida_remove(ida, id);
spin_unlock_irqrestore(&simple_ida_lock, flags);
-- 
1.7.4.4
