[PATCH] async_tx: replace page_address with kmap_atomic

2015-07-02 Thread Yuanhan Liu
A page might belong to highmem, in which case page_address() does not
yield a usable kernel virtual address; map the page with kmap_atomic()
before touching its contents.

The strictly nested kmap_atomic()/kunmap_atomic() ordering documented in
Documentation/vm/highmem.txt is followed: mappings are released in the
reverse order they were taken.
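
For reference, a minimal sketch of that nesting rule (illustration only,
not part of the patch; the helper name is made up):

#include <linux/highmem.h>

/* Atomic kmaps are stack-like: the last page mapped must be the first
 * one unmapped. */
static void xor_two_pages(struct page *dst, struct page *src, size_t len)
{
        u8 *d = kmap_atomic(dst);       /* mapped first ... */
        u8 *s = kmap_atomic(src);       /* ... mapped second */
        size_t i;

        for (i = 0; i < len; i++)
                d[i] ^= s[i];

        kunmap_atomic(s);               /* unmapped first (LIFO) */
        kunmap_atomic(d);               /* unmapped last */
}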

CC: Dan Williams 
CC: Shaohua Li 
Signed-off-by: Yuanhan Liu 
---
 crypto/async_tx/async_pq.c  | 18 +-
 crypto/async_tx/async_raid6_recov.c | 31 ---
 crypto/async_tx/async_xor.c | 17 ++---
 3 files changed, 51 insertions(+), 15 deletions(-)

diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c
index 5d355e0..a408b7e 100644
--- a/crypto/async_tx/async_pq.c
+++ b/crypto/async_tx/async_pq.c
@@ -136,7 +136,7 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
BUG_ON(i > disks - 3); /* P or Q can't be zero */
srcs[i] = (void*)raid6_empty_zero_page;
} else {
-   srcs[i] = page_address(blocks[i]) + offset;
+   srcs[i] = kmap_atomic(blocks[i]) + offset;
if (i < disks - 2) {
stop = i;
if (start == -1)
@@ -150,6 +150,12 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
raid6_call.xor_syndrome(disks, start, stop, len, srcs);
} else
raid6_call.gen_syndrome(disks, len, srcs);
+
+   for (i = disks; i--; ) {
+   if (blocks[i])
+   kunmap_atomic(srcs[i]);
+   }
+
async_tx_sync_epilog(submit);
 }
 
@@ -395,14 +401,15 @@ async_syndrome_val(struct page **blocks, unsigned int offset, int disks,
 */
tx = NULL;
*pqres = 0;
+   s = kmap_atomic(spare) + offset;
if (p_src) {
init_async_submit(submit, ASYNC_TX_XOR_ZERO_DST, NULL,
  NULL, NULL, scribble);
tx = async_xor(spare, blocks, offset, disks-2, len, submit);
async_tx_quiesce(&tx);
-   p = page_address(p_src) + offset;
-   s = page_address(spare) + offset;
+   p = kmap_atomic(p_src) + offset;
*pqres |= !!memcmp(p, s, len) << SUM_CHECK_P;
+   kunmap_atomic(p);
}
 
if (q_src) {
@@ -411,10 +418,11 @@ async_syndrome_val(struct page **blocks, unsigned int offset, int disks,
init_async_submit(submit, 0, NULL, NULL, NULL, scribble);
tx = async_gen_syndrome(blocks, offset, disks, len, submit);
async_tx_quiesce(&tx);
-   q = page_address(q_src) + offset;
-   s = page_address(spare) + offset;
+   q = kmap_atomic(q_src) + offset;
*pqres |= !!memcmp(q, s, len) << SUM_CHECK_Q;
+   kunmap_atomic(q);
}
+   kunmap_atomic(s);
 
/* restore P, Q and submit */
P(blocks, disks) = p_src;
diff --git a/crypto/async_tx/async_raid6_recov.c b/crypto/async_tx/async_raid6_recov.c
index 934a849..abcacb0 100644
--- a/crypto/async_tx/async_raid6_recov.c
+++ b/crypto/async_tx/async_raid6_recov.c
@@ -80,9 +80,9 @@ async_sum_product(struct page *dest, struct page **srcs, unsigned char *coef,
async_tx_quiesce(&submit->depend_tx);
amul = raid6_gfmul[coef[0]];
bmul = raid6_gfmul[coef[1]];
-   a = page_address(srcs[0]);
-   b = page_address(srcs[1]);
-   c = page_address(dest);
+   a = kmap_atomic(srcs[0]);
+   b = kmap_atomic(srcs[1]);
+   c = kmap_atomic(dest);
 
while (len--) {
ax= amul[*a++];
@@ -90,6 +90,10 @@ async_sum_product(struct page *dest, struct page **srcs, unsigned char *coef,
*c++ = ax ^ bx;
}
 
+   kunmap_atomic(c);
+   kunmap_atomic(b);
+   kunmap_atomic(a);
+
return NULL;
 }
 
@@ -147,12 +151,15 @@ async_mult(struct page *dest, struct page *src, u8 coef, size_t len,
 */
async_tx_quiesce(&submit->depend_tx);
qmul  = raid6_gfmul[coef];
-   d = page_address(dest);
-   s = page_address(src);
+   d = kmap_atomic(dest);
+   s = kmap_atomic(src);
 
while (len--)
*d++ = qmul[*s++];
 
+   kunmap_atomic(s);
+   kunmap_atomic(d);
+
return NULL;
 }
 
@@ -372,10 +379,15 @@ async_raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
if (blocks[i] == NULL)
ptrs[i] = (void *) raid6_empty_zero_page;
else
-   ptrs[i] = page_address(blocks[i]);
+   ptrs[i] = kmap_atomic(blocks[i]);
 
raid6

Re: [PATCH 3/3] md/raid5: per hash value and exclusive wait_for_stripe

2015-05-13 Thread Yuanhan Liu
On Thu, May 14, 2015 at 03:45:11PM +1000, NeilBrown wrote:
> On Wed, 29 Apr 2015 10:48:55 +0800 Yuanhan Liu 
> wrote:
> 
> > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> > index 64d5bea..697d77a 100644
> > --- a/drivers/md/raid5.c
> > +++ b/drivers/md/raid5.c
> > @@ -344,7 +344,8 @@ static void release_inactive_stripe_list(struct r5conf *conf,
> >  int hash)
> >  {
> > int size;
> > -   bool do_wakeup = false;
> > +   unsigned long do_wakeup = 0;
> > +   int i = 0;
> > unsigned long flags;
> >  
> > if (hash == NR_STRIPE_HASH_LOCKS) {
> > @@ -365,15 +366,19 @@ static void release_inactive_stripe_list(struct r5conf *conf,
> > !list_empty(list))
> > atomic_dec(&conf->empty_inactive_list_nr);
> > list_splice_tail_init(list, conf->inactive_list + hash);
> > -   do_wakeup = true;
> > +   do_wakeup |= 1 << (size - 1);
> > spin_unlock_irqrestore(conf->hash_locks + hash, flags);
> > }
> > size--;
> > hash--;
> > }
> >  
> > +   for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
> > +   if (do_wakeup & (1 << i))
> > +   wake_up(&conf->wait_for_stripe[i]);
> > +   }
> > +
> 
> hi,
>  I've been doing some testing and got a lock-up in resize_stripes, waiting
>  on wait_for_stripe[].
> 
>  Looking at the above code,  I think
>   do_wakeup |= 1 << (size - 1);
>  should be
>   do_wakeup |= 1 << hash;
> 
>  do you agree?  Or am I missing something?

Right. Sorry for the careless mistake.
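
For clarity, a self-contained sketch of the corrected bookkeeping (the
wake_up() stub and the array size are stand-ins, not kernel code): each
hash whose inactive list gained stripes sets its own bit, and only those
wait queues are woken afterwards.

#include <stdio.h>

#define NR_STRIPE_HASH_LOCKS 8  /* assumes <= bits in unsigned long */

static void wake_up(int hash) { printf("wake hash %d\n", hash); }

static void release_lists(const int *touched, int n)
{
        unsigned long do_wakeup = 0;
        int i;

        for (i = 0; i < n; i++)
                do_wakeup |= 1UL << touched[i]; /* 1 << hash, not 1 << (size - 1) */

        for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
                if (do_wakeup & (1UL << i))
                        wake_up(i);
}

int main(void)
{
        int touched[] = { 1, 5 };

        release_lists(touched, 2);      /* wakes hashes 1 and 5 only */
        return 0;
}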

--yliu


Re: [PATCH 1/2] md/raid5: avoid duplicate code

2015-05-07 Thread Yuanhan Liu
On Fri, May 08, 2015 at 03:28:00PM +1000, NeilBrown wrote:
> On Wed,  6 May 2015 17:45:49 +0800 Yuanhan Liu 
> wrote:
> 
> > Move the code that puts an idle sh (hot in cache, but with zero
> > references) back to the active state into __find_stripe(), because
> > that is what every caller of __find_stripe() needs to do.
> > 
> > Moving it there avoids duplicated code and, IMO, makes a bit more
> > sense, as __find_stripe() now tells the whole story.
> 
> Thanks for this.  It is a good cleanup.
> 
> However I don't want to make any new changes to the RAID5 code until I find a
> couple of bugs that I'm hunting.  So I won't apply it just yet.
> Remind me in a couple of weeks if I seem to have forgotten.

Got it. Thanks.


--yliu
> 
> > 
> > Signed-off-by: Yuanhan Liu 
> > ---
> >  drivers/md/raid5.c | 50 ++
> >  1 file changed, 18 insertions(+), 32 deletions(-)
> > 
> > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> > index 77dfd72..e7fa818 100644
> > --- a/drivers/md/raid5.c
> > +++ b/drivers/md/raid5.c
> > @@ -567,8 +567,25 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
> >  
> > pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
> > hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
> > -   if (sh->sector == sector && sh->generation == generation)
> > +   if (sh->sector == sector && sh->generation == generation) {
> > +   if (!atomic_inc_not_zero(&sh->count)) {
> > +   spin_lock(&conf->device_lock);
> > +   if (!atomic_read(&sh->count)) {
> > +   if (!test_bit(STRIPE_HANDLE, &sh->state))
> > +   atomic_inc(&conf->active_stripes);
> > +   BUG_ON(list_empty(&sh->lru) &&
> > +  !test_bit(STRIPE_EXPANDING, &sh->state));
> > +   list_del_init(&sh->lru);
> > +   if (sh->group) {
> > +   sh->group->stripes_cnt--;
> > +   sh->group = NULL;
> > +   }
> > +   }
> > +   atomic_inc(&sh->count);
> > +   spin_unlock(&conf->device_lock);
> > +   }
> > return sh;
> > +   }
> > pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
> > return NULL;
> >  }
> > @@ -698,21 +715,6 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
> > init_stripe(sh, sector, previous);
> > atomic_inc(&sh->count);
> > }
> > -   } else if (!atomic_inc_not_zero(&sh->count)) {
> > -   spin_lock(&conf->device_lock);
> > -   if (!atomic_read(&sh->count)) {
> > -   if (!test_bit(STRIPE_HANDLE, &sh->state))
> > -   atomic_inc(&conf->active_stripes);
> > -   BUG_ON(list_empty(&sh->lru) &&
> > -  !test_bit(STRIPE_EXPANDING, &sh->state));
> > -   list_del_init(&sh->lru);
> > -   if (sh->group) {
> > -   sh->group->stripes_cnt--;
> > -   sh->group = NULL;
> > -   }
> > -   }
> > -   atomic_inc(&sh->count);
> > -   spin_unlock(&conf->device_lock);
> > }
> > } while (sh == NULL);
> >  
> > @@ -771,22 +773,6 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
> > hash = stripe_hash_locks_hash(head_sector);
> > spin_lock_irq(conf->hash_locks + hash);
> > head = __find_stripe(conf, head_sector, conf->generation);
> > -   if (head && !atomic_inc_not_zero(&head->count)) {
> > -   spin_lock(&conf->device_lock);
> > -   if (

[PATCH 2/2] md/raid5: remove unnecessary sh->count check

2015-05-06 Thread Yuanhan Liu
Remove the unnecessary "!atomic_read(&sh->count)" check: the preceding
"atomic_inc_not_zero(&sh->count)" has just failed, which already tells
us sh->count was 0.

The only reason I can think of for keeping such a check is a possible
race around the lock.

First, I doubt another process could modify an in-cache but
zero-referenced sh while it is protected by a hash lock. Hence, sh->count
should stay at 0 throughout that "if (!atomic_inc_not_zero())" block.

Second, suppose someone outside the lock could modify sh->count (by
atomic_inc?). That would actually cause a problem.

To make it clear, here are the few key lines of code:

if (!atomic_inc_not_zero(&sh->count)) {
spin_lock(&conf->device_lock);
if (!atomic_read(&sh->count)) {

}
...
}

At the time we enter the first if block, sh->count is zero. Now suppose
someone increments sh->count from elsewhere while we are acquiring the
lock: the inner if block would then be skipped, leaving fields such as
conf->active_stripes not updated properly.

So the body of the inner if block should be executed whenever we enter
the outer one, no matter whether sh->count is still 0 or not.
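
For readers skimming the (line-wrapped) diff below, this is the resulting
shape of the path, condensed from the patch itself:

        if (!atomic_inc_not_zero(&sh->count)) {
                spin_lock(&conf->device_lock);
                /* no second atomic_read(&sh->count) check here */
                if (!test_bit(STRIPE_HANDLE, &sh->state))
                        atomic_inc(&conf->active_stripes);
                BUG_ON(list_empty(&sh->lru) &&
                       !test_bit(STRIPE_EXPANDING, &sh->state));
                list_del_init(&sh->lru);
                if (sh->group) {
                        sh->group->stripes_cnt--;
                        sh->group = NULL;
                }
                atomic_inc(&sh->count);
                spin_unlock(&conf->device_lock);
        }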

Signed-off-by: Yuanhan Liu 
---

Neil, I'm a bit concerned that I missed something in this patch. Please
kindly correct me if I'm wrong :)

---
 drivers/md/raid5.c | 18 --
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e7fa818..17ece2a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -570,16 +570,14 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
if (sh->sector == sector && sh->generation == generation) {
if (!atomic_inc_not_zero(&sh->count)) {
spin_lock(&conf->device_lock);
-   if (!atomic_read(&sh->count)) {
-   if (!test_bit(STRIPE_HANDLE, &sh->state))
-   atomic_inc(&conf->active_stripes);
-   BUG_ON(list_empty(&sh->lru) &&
-  !test_bit(STRIPE_EXPANDING, &sh->state));
-   list_del_init(&sh->lru);
-   if (sh->group) {
-   sh->group->stripes_cnt--;
-   sh->group = NULL;
-   }
+   if (!test_bit(STRIPE_HANDLE, &sh->state))
+   atomic_inc(&conf->active_stripes);
+   BUG_ON(list_empty(&sh->lru) &&
+  !test_bit(STRIPE_EXPANDING, &sh->state));
+   list_del_init(&sh->lru);
+   if (sh->group) {
+   sh->group->stripes_cnt--;
+   sh->group = NULL;
}
atomic_inc(&sh->count);
spin_unlock(&conf->device_lock);
-- 
1.9.0



[PATCH 1/2] md/raid5: avoid duplicate code

2015-05-06 Thread Yuanhan Liu
Move the code that puts an idle sh (hot in cache, but with zero
references) back to the active state into __find_stripe(), because that
is what every caller of __find_stripe() needs to do.

Moving it there avoids duplicated code and, IMO, makes a bit more sense,
as __find_stripe() now tells the whole story.
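
The lockless fast path this relies on is atomic_inc_not_zero(). For
readers unfamiliar with it, a userspace C11 stand-in (my sketch, not the
kernel implementation) showing its contract: increment unless the value
is 0, and report whether the increment happened.

#include <stdatomic.h>
#include <stdio.h>

static int atomic_inc_not_zero(atomic_int *v)
{
        int old = atomic_load(v);

        while (old != 0) {
                if (atomic_compare_exchange_weak(v, &old, old + 1))
                        return 1;
                /* CAS failed: 'old' now holds the fresh value; retry */
        }
        return 0;       /* value was 0: caller must take the slow path */
}

int main(void)
{
        atomic_int count;

        atomic_init(&count, 0);
        printf("inc_not_zero(0) -> %d\n", atomic_inc_not_zero(&count)); /* 0 */
        atomic_store(&count, 2);
        printf("inc_not_zero(2) -> %d, now %d\n",
               atomic_inc_not_zero(&count), atomic_load(&count));      /* 1, 3 */
        return 0;
}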

Signed-off-by: Yuanhan Liu 
---
 drivers/md/raid5.c | 50 ++
 1 file changed, 18 insertions(+), 32 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 77dfd72..e7fa818 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -567,8 +567,25 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
 
pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
-   if (sh->sector == sector && sh->generation == generation)
+   if (sh->sector == sector && sh->generation == generation) {
+   if (!atomic_inc_not_zero(&sh->count)) {
+   spin_lock(&conf->device_lock);
+   if (!atomic_read(&sh->count)) {
+   if (!test_bit(STRIPE_HANDLE, &sh->state))
+   atomic_inc(&conf->active_stripes);
+   BUG_ON(list_empty(&sh->lru) &&
+  !test_bit(STRIPE_EXPANDING, &sh->state));
+   list_del_init(&sh->lru);
+   if (sh->group) {
+   sh->group->stripes_cnt--;
+   sh->group = NULL;
+   }
+   }
+   atomic_inc(&sh->count);
+   spin_unlock(&conf->device_lock);
+   }
return sh;
+   }
pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
return NULL;
 }
@@ -698,21 +715,6 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
init_stripe(sh, sector, previous);
atomic_inc(&sh->count);
}
-   } else if (!atomic_inc_not_zero(&sh->count)) {
-   spin_lock(&conf->device_lock);
-   if (!atomic_read(&sh->count)) {
-   if (!test_bit(STRIPE_HANDLE, &sh->state))
-   atomic_inc(&conf->active_stripes);
-   BUG_ON(list_empty(&sh->lru) &&
-  !test_bit(STRIPE_EXPANDING, &sh->state));
-   list_del_init(&sh->lru);
-   if (sh->group) {
-   sh->group->stripes_cnt--;
-   sh->group = NULL;
-   }
-   }
-   atomic_inc(&sh->count);
-   spin_unlock(&conf->device_lock);
}
} while (sh == NULL);
 
@@ -771,22 +773,6 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
hash = stripe_hash_locks_hash(head_sector);
spin_lock_irq(conf->hash_locks + hash);
head = __find_stripe(conf, head_sector, conf->generation);
-   if (head && !atomic_inc_not_zero(&head->count)) {
-   spin_lock(&conf->device_lock);
-   if (!atomic_read(&head->count)) {
-   if (!test_bit(STRIPE_HANDLE, &head->state))
-   atomic_inc(&conf->active_stripes);
-   BUG_ON(list_empty(&head->lru) &&
-  !test_bit(STRIPE_EXPANDING, &head->state));
-   list_del_init(&head->lru);
-   if (head->group) {
-   head->group->stripes_cnt--;
-   head->group = NULL;
-   }
-   }
-   atomic_inc(&head->count);
-   spin_unlock(&conf->device_lock);
-   }
spin_unlock_irq(conf->hash_locks + hash);
 
if (!head)
-- 
1.9.0



Re: [PATCH] md/raid5: init batch_xxx for new sh at resize_stripes

2015-05-04 Thread Yuanhan Liu
On Mon, May 04, 2015 at 05:24:24PM +1000, NeilBrown wrote:
> On Mon,  4 May 2015 13:50:24 +0800 Yuanhan Liu 
> wrote:
> 
> > This is to fix a kernel NULL pointer dereference oops introduced by
> > commit 59fc630b ("RAID5: batch adjacent full stripe write"), which
> > introduced several batch_* fields and initialized them in
> > grow_one_stripe(), but forgot to do the same in resize_stripes().
> > 
> > This oops can be easily triggered by the following steps:
> > 
> > __create RAID5 /dev/md0
> > __grow /dev/md0
> > mdadm --wait /dev/md0
> > dd if=/dev/zero of=/dev/md0
> > 
> > Here is the detailed oops log:
...
> > 
> > Cc: Shaohua Li 
> > Signed-off-by: Yuanhan Liu 
> > ---
> >  drivers/md/raid5.c | 4 
> >  1 file changed, 4 insertions(+)
> > 
> > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> > index 697d77a..7b074f7 100644
> > --- a/drivers/md/raid5.c
> > +++ b/drivers/md/raid5.c
> > @@ -2217,6 +2217,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
> > if (!p)
> > err = -ENOMEM;
> > }
> > +
> > +   spin_lock_init(&nsh->batch_lock);
> > +   INIT_LIST_HEAD(&nsh->batch_list);
> > +   nsh->batch_head = NULL;
> > release_stripe(nsh);
> > }
> > /* critical section pass, GFP_NOIO no longer needed */
> 
> Thanks!
> 
> However I already have the following fix queued - though not pushed  out

Yeah, much cleaner.


> you.  I probably would have got it into -rc2 except that I was chasing
> another raid5 bug.  The
>   BUG_ON(sh->batch_head);
> 
> in handle_stripe_fill() fires when I run the mdadm selftests.  I got caught
> up chasing that and didn't push the other fix.

I was not aware that there is a selftest suite for raid. I'd like to add
it to our 0day kernel testing in the near future, so that we can catch
such bugs and bisect them down right away ;)

--yliu
> 
> 
> From 3dd8ba734349e602fe17d647ce3da5f4a13748aa Mon Sep 17 00:00:00 2001
> From: NeilBrown 
> Date: Thu, 30 Apr 2015 11:24:28 +1000
> Subject: [PATCH] md/raid5 new alloc_stripe function.
> 
> 
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 77dfd720aaa0..91a1e8b26b52 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -1971,17 +1971,30 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
>   put_cpu();
>  }
>  
> +static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp)
> +{
> + struct stripe_head *sh;
> +
> + sh = kmem_cache_zalloc(sc, gfp);
> + if (sh) {
> + spin_lock_init(&sh->stripe_lock);
> + spin_lock_init(&sh->batch_lock);
> + INIT_LIST_HEAD(&sh->batch_list);
> + INIT_LIST_HEAD(&sh->lru);
> + atomic_set(&sh->count, 1);
> + }
> + return sh;
> +}
>  static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
>  {
>   struct stripe_head *sh;
> - sh = kmem_cache_zalloc(conf->slab_cache, gfp);
> +
> + sh = alloc_stripe(conf->slab_cache, gfp);
>   if (!sh)
>   return 0;
>  
>   sh->raid_conf = conf;
>  
> - spin_lock_init(&sh->stripe_lock);
> -
>   if (grow_buffers(sh, gfp)) {
>   shrink_buffers(sh);
>   kmem_cache_free(conf->slab_cache, sh);
> @@ -1990,13 +2003,8 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
>   sh->hash_lock_index =
>   conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
>   /* we just created an active stripe so... */
> - atomic_set(&sh->count, 1);
>   atomic_inc(&conf->active_stripes);
> - INIT_LIST_HEAD(&sh->lru);
>  
> - spin_lock_init(&sh->batch_lock);
> - INIT_LIST_HEAD(&sh->batch_list);
> - sh->batch_head = NULL;
>   release_stripe(sh);
>   conf->max_nr_stripes++;
>   return 1;
> @@ -2109,13 +2117,11 @@ static int resize_stripes(struct r5conf *conf, int newsize)
>   return -ENOMEM;
>  
>   for (i = conf->max_nr_stripes; i; i--) {
> - nsh = kmem_cache_zalloc(sc, GFP_KERNEL);
> + nsh = alloc_stripe(sc, GFP_KERNEL);
>   if (!nsh)
>   break;
>  
>   nsh->raid_conf = conf;
> - spin_lock_init(&nsh->stripe_lock);
> -
>   list_add(&nsh

[PATCH] md/raid5: init batch_xxx for new sh at resize_stripes

2015-05-03 Thread Yuanhan Liu
This is to fix a kernel NULL pointer dereference oops introduced by
commit 59fc630b ("RAID5: batch adjacent full stripe write"), which
introduced several batch_* fields and initialized them in
grow_one_stripe(), but forgot to do the same in resize_stripes().

This oops can be easily triggered by the following steps:

__create RAID5 /dev/md0
__grow /dev/md0
mdadm --wait /dev/md0
dd if=/dev/zero of=/dev/md0
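
A side note on why the very first write oopses (my reading of the trace
below, not text from the patch): kmem_cache_zalloc() leaves batch_list
zero-filled, and a list_head whose pointers are NULL, rather than
pointing at itself, crashes on the first list operation. A reduced,
compilable illustration of the kernel's list primitives:

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *list)
{
        list->next = list;      /* an empty list points at itself... */
        list->prev = list;      /* ...never at NULL */
}

static void list_add(struct list_head *new, struct list_head *head)
{
        new->next = head->next;
        new->prev = head;
        head->next->prev = new; /* head->next is NULL if the head was
                                 * only zeroed -> NULL pointer deref */
        head->next = new;
}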

Here is the detailed oops log:

[   32.384499] BUG: unable to handle kernel NULL pointer dereference at 
  (null)
[   32.385366] IP: [] add_stripe_bio+0x48d/0x544
[   32.385955] PGD 373f3067 PUD 36e34067 PMD 0
[   32.386404] Oops: 0002 [#1] SMP
[   32.386740] Modules linked in:
[   32.387040] CPU: 0 PID: 1059 Comm: kworker/u2:2 Not tainted 
4.0.0-next-20150427+ #107
[   32.387762] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014
[   32.388044] Workqueue: writeback bdi_writeback_workfn (flush-9:0)
[   32.388044] task: 88003d038000 ti: 88003d40c000 task.ti: 
88003d40c000
[   32.388044] RIP: 0010:[]  [] 
add_stripe_bio+0x48d/0x544
[   32.388044] RSP: :88003d40f6f8  EFLAGS: 00010046
[   32.388044] RAX:  RBX: 880037168cd0 RCX: 880037179a28
[   32.388044] RDX: 880037168d58 RSI:  RDI: 880037179a20
[   32.388044] RBP: 88003d40f738 R08: 0410 R09: 0410
[   32.388044] R10: 0410 R11: 0002 R12: 8800371799a0
[   32.388044] R13: 88003c3d0800 R14: 0001 R15: 880037179a08
[   32.388044] FS:  () GS:88003fc0() 
knlGS:
[   32.388044] CS:  0010 DS:  ES:  CR0: 8005003b
[   32.388044] CR2:  CR3: 36e33000 CR4: 06f0
[   32.388044] Stack:
[   32.388044]  0002 880037168d38 88003d40f738 
88003c3abd00
[   32.388044]  88003c2df800 88003c3d0800 0408 
88003c3d0b54
[   32.388044]  88003d40f828 8184b9ea 3d40f7e8 
0292
[   32.388044] Call Trace:
[   32.388044]  [] make_request+0x7a8/0xaee
[   32.388044]  [] ? wait_woken+0x79/0x79
[   32.388044]  [] ? kmem_cache_alloc+0x95/0x1b6
[   32.388044]  [] md_make_request+0xeb/0x1c3
[   32.388044]  [] ? mempool_alloc+0x64/0x127
[   32.388044]  [] generic_make_request+0x9c/0xdb
[   32.388044]  [] submit_bio+0xf6/0x134
[   32.388044]  [] _submit_bh+0x119/0x141
[   32.388044]  [] submit_bh+0x10/0x12
[   32.388044]  [] 
__block_write_full_page.constprop.30+0x1a3/0x2a4
[   32.388044]  [] ? I_BDEV+0xd/0xd
[   32.388044]  [] block_write_full_page+0xab/0xaf
[   32.388044]  [] blkdev_writepage+0x18/0x1a
[   32.388044]  [] __writepage+0x14/0x2d
[   32.388044]  [] write_cache_pages+0x29a/0x3a7
[   32.388044]  [] ? mapping_tagged+0x14/0x14
[   32.388044]  [] generic_writepages+0x3e/0x56
[   32.388044]  [] do_writepages+0x1e/0x2c
[   32.388044]  [] __writeback_single_inode+0x5b/0x27e
[   32.388044]  [] writeback_sb_inodes+0x1dc/0x358
[   32.388044]  [] __writeback_inodes_wb+0x7f/0xb8
[   32.388044]  [] wb_writeback+0x11a/0x271
[   32.388044]  [] ? global_dirty_limits+0x1b/0xfd
[   32.388044]  [] bdi_writeback_workfn+0x1ae/0x360
[   32.388044]  [] process_one_work+0x1c2/0x340
[   32.388044]  [] worker_thread+0x28b/0x389
[   32.388044]  [] ? cancel_delayed_work_sync+0x15/0x15
[   32.388044]  [] kthread+0xd2/0xda
[   32.388044]  [] ? kthread_create_on_node+0x17c/0x17c
[   32.388044]  [] ret_from_fork+0x42/0x70
[   32.388044]  [] ? kthread_create_on_node+0x17c/0x17c
[   32.388044] Code: 84 24 90 00 00 00 48 8d 93 88 00 00 00 49 8d 8c 24 88 00 
00 00 49 89 94 24 90 00 00 00 48 89 8b 88 00 00 00 48 89 83 90 00 00 00 <48> 89 
10 66 41 83 84 24 80 00 00 00 01 3e 0f ba 73 48 06 72 02
[   32.388044] RIP  [] add_stripe_bio+0x48d/0x544
[   32.388044]  RSP 
[   32.388044] CR2: 
[   32.388044] ---[ end trace 2b255d3f55be9eb3 ]---

Cc: Shaohua Li 
Signed-off-by: Yuanhan Liu 
---
 drivers/md/raid5.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 697d77a..7b074f7 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2217,6 +2217,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
if (!p)
err = -ENOMEM;
}
+
+   spin_lock_init(&nsh->batch_lock);
+   INIT_LIST_HEAD(&nsh->batch_list);
+   nsh->batch_head = NULL;
release_stripe(nsh);
}
/* critical section pass, GFP_NOIO no longer needed */
-- 
1.9.0



[LKP] [genirq] d5b2eacdbc2: BUG: unable to handle kernel NULL pointer dereference at (null)

2015-04-30 Thread Yuanhan Liu
FYI, we noticed the below changes on

https://github.com/jiangliu/linux.git test/irq_common_data_v2
commit d5b2eacdbc280da7c6dfbe0f52bb293ef227d349 ("genirq: Introduce struct irq_common_data to host shared irq data")


+----------------------------------------------------------+------------+------------+
|                                                          | 39fb394021 | d5b2eacdbc |
+----------------------------------------------------------+------------+------------+
| boot_successes                                           | 0          | 0          |
| boot_failures                                            | 22         | 20         |
| PM:Hibernation_image_not_present_or_could_not_be_loaded  | 22         |            |
| BUG:unable_to_handle_kernel                              | 0          | 20         |
| Oops                                                     | 0          | 20         |
| Kernel_panic-not_syncing:Fatal_exception_in_interrupt    | 0          | 20         |
| backtrace:__pci_register_driver                          | 0          | 6          |
| backtrace:e1000_init_module                              | 0          | 6          |
| backtrace:kernel_init_freeable                           | 0          | 6          |
| backtrace:ata_sff_pio_task                               | 0          | 14         |
+----------------------------------------------------------+------------+------------+


[1.351055] ata2.01: NODEV after polling detection
[1.352179] ata2.00: ATAPI: QEMU DVD-ROM, 2.1.2, max UDMA/100
[1.353501] ata2.00: configured for MWDMA2
[1.354423] BUG: unable to handle kernel NULL pointer dereference at 
  (null)
[1.356074] IP: [<  (null)>]   (null)
[1.356074] PGD 0 
[1.356074] Oops: 0010 [#1] SMP 
[1.356074] Modules linked in:
[1.356074] CPU: 0 PID: 584 Comm: kworker/0:1 Not tainted 
4.1.0-rc1-wl-ath-00905-geb3b9ec #1
[1.356074] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.7.5-20140531_083030-gandalf 04/01/2014
[1.356074] Workqueue: ata_sff ata_sff_pio_task
[1.356074] task: 880011c2af30 ti: 8800123bc000 task.ti: 
8800123bc000
[1.356074] RIP: 0010:[<>]  [<  (null)>]   
(null)
[1.356074] RSP: :880013803ee0  EFLAGS: 00010046
[1.356074] RAX: 8222b2c0 RBX: 88001349fc80 RCX: 0009
[1.356074] RDX: 88001348f400 RSI: ffc0 RDI: 88001349fc80
[1.356074] RBP: 880013803ef8 R08:  R09: 0013
[1.356074] R10: 0006 R11:  R12: 88001348f400
[1.356074] R13: 000f R14: 8800123bfc78 R15: 
[1.356074] FS:  () GS:88001380() 
knlGS:
[1.356074] CS:  0010 DS:  ES:  CR0: 8005003b
[1.356074] CR2:  CR3: 0220b000 CR4: 06f0
[1.356074] Stack:
[1.356074]  8113aa96 88001349fc80 88001348f458 
880013803f18
[1.356074]  8106bc49 8222b2c0 88001348f400 
880013803f28
[1.356074]  81138421 880013803f48 811380db 
000f
[1.356074] Call Trace:
[1.356074]   
[1.356074]  [] ? irq_move_irq+0x34/0x50
[1.356074]  [] apic_ack_edge+0x23/0x3b
[1.356074]  [] irq_chip_ack_parent+0x14/0x16
[1.356074]  [] handle_edge_irq+0xa5/0x110
[1.356074]  [] handle_irq+0x27/0x2d
[1.356074]  [] do_IRQ+0x4c/0xcf
[1.356074]  [] common_interrupt+0x73/0x73
[1.356074]   
[1.356074]  [] ? __ata_qc_complete+0xe1/0xe9
[1.356074]  [] ? _raw_spin_unlock_irqrestore+0x32/0x42
[1.356074]  [] ata_sff_hsm_move+0x258/0x66a
[1.356074]  [] ata_sff_pio_task+0x140/0x15e
[1.356074]  [] process_one_work+0x1c6/0x37b
[1.356074]  [] worker_thread+0x2ad/0x3b6
[1.356074]  [] ? rescuer_thread+0x318/0x318
[1.356074]  [] kthread+0xf8/0x100
[1.356074]  [] ? kthread_create_on_node+0x184/0x184
[1.356074]  [] ret_from_fork+0x42/0x70
[1.356074]  [] ? kthread_create_on_node+0x184/0x184
[1.356074] Code:  Bad RIP value.
[1.356074] RIP  [<  (null)>]   (null)
[1.356074]  RSP 
[1.356074] CR2: 
[1.356074] ---[ end trace d37ae2366ce94eef ]---
[1.356074] Kernel panic - not syncing: Fatal exception in interrupt



Thanks,
lkp

Re: [PATCH 2/2] md/raid5: trivial coding style fix

2015-04-30 Thread Yuanhan Liu
On Thu, Apr 30, 2015 at 05:16:50PM +1000, NeilBrown wrote:
> On Thu, 30 Apr 2015 15:01:17 +0800 Yuanhan Liu 
> wrote:
> 
> > Signed-off-by: Yuanhan Liu 
> > ---
> >  drivers/md/raid5.c | 3 +--
> >  1 file changed, 1 insertion(+), 2 deletions(-)
> > 
> > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> > index 2651bda..bae3e2c 100644
> > --- a/drivers/md/raid5.c
> > +++ b/drivers/md/raid5.c
> > @@ -5789,8 +5789,7 @@ static void raid5d(struct md_thread *thread)
> > if (released)
> > clear_bit(R5_DID_ALLOC, &conf->cache_state);
> >  
> > -   if (
> > -   !list_empty(&conf->bitmap_list)) {
> > +   if (!list_empty(&conf->bitmap_list)) {
> > /* Now is a good time to flush some bitmap updates */
> > conf->seq_flush++;
> > spin_unlock_irq(&conf->device_lock);
> 
> 
> I'm happy for these sorts of changes when you are fixing up nearby code, or
> if the change significantly improves readability.
> But I'd rather not bother with one-off trivial fixes like this.

Got it.

--yliu


Re: [PATCH 1/2] md/raid5: fix typo

2015-04-30 Thread Yuanhan Liu
On Thu, Apr 30, 2015 at 05:14:26PM +1000, NeilBrown wrote:
> On Thu, 30 Apr 2015 15:01:16 +0800 Yuanhan Liu 
> wrote:
> 
> > bion -> bios
> > 
> > Signed-off-by: Yuanhan Liu 
> > ---
> >  drivers/md/raid5.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> > index 697d77a..2651bda 100644
> > --- a/drivers/md/raid5.c
> > +++ b/drivers/md/raid5.c
> > @@ -2919,7 +2919,7 @@ schedule_reconstruction(struct stripe_head *sh, 
> > struct stripe_head_state *s,
> >  }
> >  
> >  /*
> > - * Each stripe/dev can have one or more bion attached.
> > + * Each stripe/dev can have one or more bios attached.
> >   * toread/towrite point to the first in a chain.
> >   * The bi_next chain must be in order.
> >   */
> 
> That was intentional.  "bios" as a plural looks too much like "BIOS" which is
> in the ROM of computers.
> 
> Children and oxen are plurals with an 'n' at the end.  So I used 'bion'.
> Private joke?

Interesting.

> 
> I'd rather leave it as it is.

Okay, and sorry for the noise.

--yliu


[PATCH 1/2] md/raid5: fix typo

2015-04-30 Thread Yuanhan Liu
bion -> bios

Signed-off-by: Yuanhan Liu 
---
 drivers/md/raid5.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 697d77a..2651bda 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2919,7 +2919,7 @@ schedule_reconstruction(struct stripe_head *sh, struct 
stripe_head_state *s,
 }
 
 /*
- * Each stripe/dev can have one or more bion attached.
+ * Each stripe/dev can have one or more bios attached.
  * toread/towrite point to the first in a chain.
  * The bi_next chain must be in order.
  */
-- 
1.9.0



[PATCH 2/2] md/raid5: trivial coding style fix

2015-04-30 Thread Yuanhan Liu
Signed-off-by: Yuanhan Liu 
---
 drivers/md/raid5.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2651bda..bae3e2c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5789,8 +5789,7 @@ static void raid5d(struct md_thread *thread)
if (released)
clear_bit(R5_DID_ALLOC, &conf->cache_state);
 
-   if (
-   !list_empty(&conf->bitmap_list)) {
+   if (!list_empty(&conf->bitmap_list)) {
/* Now is a good time to flush some bitmap updates */
conf->seq_flush++;
spin_unlock_irq(&conf->device_lock);
-- 
1.9.0



Re: [LKP] [RAID5] 878ee679279: -1.8% vmstat.io.bo, +40.5% perf-stat.LLC-load-misses

2015-04-29 Thread Yuanhan Liu
On Fri, Apr 24, 2015 at 12:15:59PM +1000, NeilBrown wrote:
> On Thu, 23 Apr 2015 14:55:59 +0800 Huang Ying  wrote:
> 
> > FYI, we noticed the below changes on
> > 
> > git://neil.brown.name/md for-next
> > commit 878ee6792799e2f88bdcac329845efadb205252f ("RAID5: batch adjacent full stripe write")
> 
> Hi,
>  is there any chance that you could explain what some of this means?
> There is lots of data and some very pretty graphs, but no explanation.

Hi Neil,

(Sorry for the late response: Ying is on vacation)

I guess you can simply ignore this report, as I already reported to you
a month ago that this patch makes fsmark perform better in most cases:

https://lists.01.org/pipermail/lkp/2015-March/002411.html

> 
> Which numbers are "good", which are "bad"?  Which is "worst".
> What do the graphs really show? and what would we like to see in them?
> 
> I think it is really great that you are doing this testing and reporting the
> results.  It's just so sad that I completely fail to understand them.

Sorry, that's our fault for making them hard to understand, and for
sending a duplicate report (well, the commit hash is different ;).

We need to take some time to make that data easier to understand.

--yliu

> 
> > 
> > 
> > testbox/testcase/testparams: lkp-st02/dd-write/300-5m-11HDD-RAID5-cfq-xfs-1dd
> > 
> > a87d7f782b47e030  878ee6792799e2f88bdcac3298  
> >   --  
> >  %stddev %change %stddev
> >  \  |\  
> >  59035 ±  0% +18.4%  69913 ±  1%  softirqs.SCHED
> >   1330 ± 10% +17.4%   1561 ±  4%  slabinfo.kmalloc-512.num_objs
> >   1330 ± 10% +17.4%   1561 ±  4%  slabinfo.kmalloc-512.active_objs
> > 305908 ±  0%  -1.8% 300427 ±  0%  vmstat.io.bo
> >  1 ±  0%+100.0%  2 ±  0%  vmstat.procs.r
> >   8266 ±  1% -15.7%   6968 ±  0%  vmstat.system.cs
> >  14819 ±  0%  -2.1%  14503 ±  0%  vmstat.system.in
> >  18.20 ±  6% +10.2%  20.05 ±  4%  perf-profile.cpu-cycles.raid_run_ops.handle_stripe.handle_active_stripes.raid5d.md_thread
> >   1.94 ±  9% +90.6%   3.70 ±  9%  perf-profile.cpu-cycles.async_xor.raid_run_ops.handle_stripe.handle_active_stripes.raid5d
> >   0.00 ±  0%  +Inf%  25.18 ±  3%  perf-profile.cpu-cycles.handle_active_stripes.isra.45.raid5d.md_thread.kthread.ret_from_fork
> >   0.00 ±  0%  +Inf%  14.14 ±  4%  perf-profile.cpu-cycles.async_copy_data.isra.42.raid_run_ops.handle_stripe.handle_active_stripes.raid5d
> >   1.79 ±  7%+102.9%   3.64 ±  9%  perf-profile.cpu-cycles.xor_blocks.async_xor.raid_run_ops.handle_stripe.handle_active_stripes
> >   3.09 ±  4% -10.8%   2.76 ±  4%  perf-profile.cpu-cycles.get_active_stripe.make_request.md_make_request.generic_make_request.submit_bio
> >   0.80 ± 14% +28.1%   1.02 ± 10%  perf-profile.cpu-cycles.mutex_lock.xfs_file_buffered_aio_write.xfs_file_write_iter.new_sync_write.vfs_write
> >  14.78 ±  6%-100.0%   0.00 ±  0%  perf-profile.cpu-cycles.async_copy_data.isra.38.raid_run_ops.handle_stripe.handle_active_stripes.raid5d
> >  25.68 ±  4%-100.0%   0.00 ±  0%  perf-profile.cpu-cycles.handle_active_stripes.isra.41.raid5d.md_thread.kthread.ret_from_fork
> >   1.23 ±  5%+140.0%   2.96 ±  7%  perf-profile.cpu-cycles.xor_sse_5_pf64.xor_blocks.async_xor.raid_run_ops.handle_stripe
> >   2.62 ±  6% -95.6%   0.12 ± 33%  perf-profile.cpu-cycles.analyse_stripe.handle_stripe.handle_active_stripes.raid5d.md_thread
> >   0.96 ±  9% +17.5%   1.12 ±  2%  perf-profile.cpu-cycles.xfs_ilock.xfs_file_buffered_aio_write.xfs_file_write_iter.new_sync_write.vfs_write
> >  1.461e+10 ±  0%  -5.3%  1.384e+10 ±  1%  perf-stat.L1-dcache-load-misses
> >  3.688e+11 ±  0%  -2.7%   3.59e+11 ±  0%  perf-stat.L1-dcache-loads
> >  1.124e+09 ±  0% -27.7%  8.125e+08 ±  0%  perf-stat.L1-dcache-prefetches
> >  2.767e+10 ±  0%  -1.8%  2.717e+10 ±  0%  perf-stat.L1-dcache-store-misses
> >  2.352e+11 ±  0%  -2.8%  2.287e+11 ±  0%  perf-stat.L1-dcache-stores
> >  6.774e+09 ±  0%  -2.3%   6.62e+09 ±  0%  perf-stat.L1-icache-load-misses
> >  5.571e+08 ±  0% +40.5%  7.826e+08 ±  1%  perf-stat.LLC-load-misses
> >  6.263e+09 ±  0% -13.7%  5.407e+09 ±  1%  perf-stat.LLC-loads
> >  1.914e+11 ±  0%  -4.2%  1.833e+11 ±  0%  perf-stat.branch-instructions
> >  1.145e+09 ±  2%  -5.6%  1.081e+09 ±  0%  perf-stat.branch-load-misses
> >  1.911e+11 ±  0%  -4.3%  1.829e+11 ±  0%  perf-stat.branch-loads
> >  1.142e+09 ±  2%  -5.1%  1.083e+09 ±  0%  perf-stat.branch-misses
> >  1.218e+09 ±  0% +19.8%   1.46e+09 ±  0%  perf-stat.cache-misses
> >  2.118e+10 ±  0%  -5.2%  2.007e+10 ±  0%  perf-stat.cache-references
> > 2510308 ±  1% -15.7% 2115410 ±  0%

[LKP] [block] 5a19fe29ba7: +5.4% boot-slabinfo.num_objs

2015-04-29 Thread Yuanhan Liu
FYI, we noticed the below changes on

git://git.kernel.org/pub/scm/linux/kernel/git/mlin/linux.git block-generic-req
commit 5a19fe29ba7d052c0d8fa8a2bf461abc1e4d89bb ("block: make generic_make_request handle arbitrarily sized bios")


testbox/testcase/testparams: vm-kbuild-1G/boot/1

v4.1-rc1  5a19fe29ba7d052c0d8fa8a2bf  
  --  
 %stddev %change %stddev
 \  |\  
152092 ±  0%  +5.4% 160249 ±  0%  boot-slabinfo.num_objs
 10106 ±  0% +21.6%  12293 ±  0%  boot-slabinfo.num_pages
  8.30 ± 21% -33.9%   5.48 ±  1%  boot-time.boot
  7.44 ± 23% -34.9%   4.84 ±  1%  boot-time.dhcp
 10.01 ± 17% -27.0%   7.31 ±  1%  boot-time.idle
 35507 ±  2% +17.9%  41856 ± 10%  boot-meminfo.DirectMap4k
  1558 ±  8%+276.5%   5868 ±  1%  boot-meminfo.KernelStack
480717 ±  0%  -2.8% 467414 ±  0%  boot-meminfo.MemFree
 11462 ±  1% +70.0%  19488 ±  0%  boot-meminfo.SUnreclaim
 40390 ±  0% +21.7%  49146 ±  0%  boot-meminfo.Slab

vm-kbuild-1G: qemu-system-x86_64 -enable-kvm -cpu Haswell,+smep,+smap
Memory: 1G




[Four ASCII trend charts trimmed here: boot-slabinfo.num_objs,
boot-slabinfo.num_pages, boot-meminfo.MemFree, and boot-meminfo.Slab,
each plotted across the two commits; the corresponding numeric deltas
appear in the table above.]

[PATCH 2/3] md/raid5: split wait_for_stripe and introduce wait_for_quiescent

2015-04-28 Thread Yuanhan Liu
I noticed heavy spin lock contention at get_active_stripe(), introduced
at the wake-up stage, where a bunch of processes try to re-acquire the
spin lock at once.

After giving this issue some thought, I found the contention could be
relieved (and even avoided) if we turn wait_for_stripe into a per-hash
waitqueue and make the wake-up exclusive: wake up one process at a time,
which avoids the lock contention naturally.
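
The mechanism relied on here, in a minimal kernel-style sketch
(illustration only, not code from this series; resource_available() is a
hypothetical predicate): a waiter queued with the exclusive flag is woken
alone by wake_up(), instead of the whole herd.

static void wait_for_resource(wait_queue_head_t *wq)
{
        DEFINE_WAIT(wait);

        for (;;) {
                /* queue at the tail with WQ_FLAG_EXCLUSIVE set */
                prepare_to_wait_exclusive(wq, &wait, TASK_UNINTERRUPTIBLE);
                if (resource_available())
                        break;
                schedule();
        }
        finish_wait(wq, &wait);
}

/* A wake_up(wq) wakes every non-exclusive waiter but at most ONE
 * exclusive waiter, so only one sleeper contends for the lock. */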

Before hacking on wait_for_stripe, I found it actually has two usages:
one for the array to enter or leave the quiescent state, and another
to wait for an available stripe in each of the hash lists.

So this patch splits the first usage off into a separate wait_queue,
wait_for_quiescent, and the next patch will turn the second usage into
one waitqueue for each hash value, and make it exclusive, to relieve
the lock contention.

v2: wake_up(wait_for_quiescent) when (active_stripes == 0)
Commit log refactor suggestion from Neil.

Signed-off-by: Yuanhan Liu 
---
 drivers/md/raid5.c | 15 +--
 drivers/md/raid5.h |  1 +
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 77dfd72..64d5bea 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -374,6 +374,8 @@ static void release_inactive_stripe_list(struct r5conf *conf,
 
if (do_wakeup) {
wake_up(&conf->wait_for_stripe);
+   if (atomic_read(&conf->active_stripes) == 0)
+   wake_up(&conf->wait_for_quiescent);
if (conf->retry_read_aligned)
md_wakeup_thread(conf->mddev->thread);
}
@@ -667,7 +669,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
spin_lock_irq(conf->hash_locks + hash);
 
do {
-   wait_event_lock_irq(conf->wait_for_stripe,
+   wait_event_lock_irq(conf->wait_for_quiescent,
conf->quiesce == 0 || noquiesce,
*(conf->hash_locks + hash));
sh = __find_stripe(conf, sector, conf->generation - previous);
@@ -4729,7 +4731,7 @@ static void raid5_align_endio(struct bio *bi, int error)
 raid_bi, 0);
bio_endio(raid_bi, 0);
if (atomic_dec_and_test(&conf->active_aligned_reads))
-   wake_up(&conf->wait_for_stripe);
+   wake_up(&conf->wait_for_quiescent);
return;
}
 
@@ -4824,7 +4826,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
align_bi->bi_iter.bi_sector += rdev->data_offset;
 
spin_lock_irq(&conf->device_lock);
-   wait_event_lock_irq(conf->wait_for_stripe,
+   wait_event_lock_irq(conf->wait_for_quiescent,
conf->quiesce == 0,
conf->device_lock);
atomic_inc(&conf->active_aligned_reads);
@@ -5668,7 +5670,7 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
bio_endio(raid_bio, 0);
}
if (atomic_dec_and_test(&conf->active_aligned_reads))
-   wake_up(&conf->wait_for_stripe);
+   wake_up(&conf->wait_for_quiescent);
return handled;
 }
 
@@ -6399,6 +6401,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
goto abort;
spin_lock_init(&conf->device_lock);
seqcount_init(&conf->gen_lock);
+   init_waitqueue_head(&conf->wait_for_quiescent);
init_waitqueue_head(&conf->wait_for_stripe);
init_waitqueue_head(&conf->wait_for_overlap);
INIT_LIST_HEAD(&conf->handle_list);
@@ -7422,7 +7425,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
 * active stripes can drain
 */
conf->quiesce = 2;
-   wait_event_cmd(conf->wait_for_stripe,
+   wait_event_cmd(conf->wait_for_quiescent,
atomic_read(&conf->active_stripes) == 0 &&
atomic_read(&conf->active_aligned_reads) == 0,
unlock_all_device_hash_locks_irq(conf),
@@ -7436,7 +7439,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
case 0: /* re-enable writes */
lock_all_device_hash_locks_irq(conf);
conf->quiesce = 0;
-   wake_up(&conf->wait_for_stripe);
+   wake_up(&conf->wait_for_quiescent);
wake_up(&conf->wait_for_overlap);
unlock_all_device_hash_locks_irq(conf);
break;
diff --git a/drivers/md/raid5.h b/d

[PATCH 3/3] md/raid5: per hash value and exclusive wait_for_stripe

2015-04-28 Thread Yuanhan Liu
Although there is not much performance gain for the hard disk workload,
the system time drops heavily, by up to 97%. And as expected, the
performance increases a lot, by up to 260%, for a fast device (ram disk).

v2: use bits instead of an array to note down which wait queues need to
be woken.

Signed-off-by: Yuanhan Liu 
---
 drivers/md/raid5.c | 27 +++
 drivers/md/raid5.h |  2 +-
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 64d5bea..697d77a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -344,7 +344,8 @@ static void release_inactive_stripe_list(struct r5conf *conf,
 int hash)
 {
int size;
-   bool do_wakeup = false;
+   unsigned long do_wakeup = 0;
+   int i = 0;
unsigned long flags;
 
if (hash == NR_STRIPE_HASH_LOCKS) {
@@ -365,15 +366,19 @@ static void release_inactive_stripe_list(struct r5conf *conf,
!list_empty(list))
atomic_dec(&conf->empty_inactive_list_nr);
list_splice_tail_init(list, conf->inactive_list + hash);
-   do_wakeup = true;
+   do_wakeup |= 1 << (size - 1);
spin_unlock_irqrestore(conf->hash_locks + hash, flags);
}
size--;
hash--;
}
 
+   for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
+   if (do_wakeup & (1 << i))
+   wake_up(&conf->wait_for_stripe[i]);
+   }
+
if (do_wakeup) {
-   wake_up(&conf->wait_for_stripe);
if (atomic_read(&conf->active_stripes) == 0)
wake_up(&conf->wait_for_quiescent);
if (conf->retry_read_aligned)
@@ -686,14 +691,15 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
if (!sh) {
set_bit(R5_INACTIVE_BLOCKED,
&conf->cache_state);
-   wait_event_lock_irq(
-   conf->wait_for_stripe,
+   wait_event_exclusive_cmd(
+   conf->wait_for_stripe[hash],
!list_empty(conf->inactive_list + hash) &&
(atomic_read(&conf->active_stripes)
 < (conf->max_nr_stripes * 3 / 4)
 || !test_bit(R5_INACTIVE_BLOCKED,
  &conf->cache_state)),
-   *(conf->hash_locks + hash));
spin_unlock_irq(conf->hash_locks + hash),
+   spin_lock_irq(conf->hash_locks + hash));
clear_bit(R5_INACTIVE_BLOCKED,
  &conf->cache_state);
} else {
@@ -718,6 +724,9 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
}
} while (sh == NULL);
 
+   if (!list_empty(conf->inactive_list + hash))
+   wake_up(&conf->wait_for_stripe[hash]);
+
spin_unlock_irq(conf->hash_locks + hash);
return sh;
 }
@@ -2138,7 +2147,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
cnt = 0;
list_for_each_entry(nsh, &newstripes, lru) {
lock_device_hash_lock(conf, hash);
-   wait_event_cmd(conf->wait_for_stripe,
+   wait_event_exclusive_cmd(conf->wait_for_stripe[hash],
!list_empty(conf->inactive_list + hash),
unlock_device_hash_lock(conf, hash),
lock_device_hash_lock(conf, hash));
@@ -6402,7 +6411,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
spin_lock_init(&conf->device_lock);
seqcount_init(&conf->gen_lock);
init_waitqueue_head(&conf->wait_for_quiescent);
-   init_waitqueue_head(&conf->wait_for_stripe);
+   for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
+   init_waitqueue_head(&conf->wait_for_stripe[i]);
+   }
init_waitqueue_head(&conf->wait_for_overlap);
INIT_LIST_HEAD(&conf->handle_list);
INIT_LIST_HEAD(&conf->hold_list);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 4cc05ec..6307b90 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -509,7 +509,7 @@ struct r5conf {
atomic_tempty_inactive_list_nr;
struct llist_head   released_stripes;
wait_queue_head_t   w

[PATCH 1/3 v2] wait: introduce wait_event_exclusive_cmd

2015-04-28 Thread Yuanhan Liu
It's just a variant of wait_event_cmd(), with the exclusive flag set.

For cases like RAID5, which puts many processes to sleep until 1/4 of the
resources are free, a wake_up wakes all processes up, but only one of
them can get the resource, as it is protected by a spin lock. That ends
up introducing heavy lock contention, and hurts performance badly.

Introduce wait_event_exclusive_cmd() to relieve the lock contention
naturally, by letting wake_up() wake up just one process.
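
A minimal usage sketch of the new macro (mirroring how patch 3/3 of this
series calls it; 'wq', 'lock' and resource_free() are placeholders): cmd1
runs before schedule() and cmd2 after it, which lets the caller drop and
retake a spinlock around the sleep.

        spin_lock_irq(&lock);
        wait_event_exclusive_cmd(wq, resource_free(),
                                 spin_unlock_irq(&lock), /* cmd1: before sleeping */
                                 spin_lock_irq(&lock));  /* cmd2: after waking */
        /* the lock is held again here, and we were woken exclusively */
        spin_unlock_irq(&lock);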

Cc: Ingo Molnar 
Cc: Peter Zijlstra 
v2: it's assumed that wait*() and __wait*() have the same arguments - peterz

Signed-off-by: Yuanhan Liu 
---
 include/linux/wait.h | 13 +
 1 file changed, 13 insertions(+)

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 2db8334..db78c72 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -358,6 +358,19 @@ do {   \
__ret;  \
 })
 
+#define __wait_event_exclusive_cmd(wq, condition, cmd1, cmd2)  \
+   (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 1, 0,  \
+   cmd1; schedule(); cmd2)
+/*
+ * Just like wait_event_cmd(), except it sets exclusive flag
+ */
+#define wait_event_exclusive_cmd(wq, condition, cmd1, cmd2)\
+do {   \
+   if (condition)  \
+   break;  \
+   __wait_event_exclusive_cmd(wq, condition, cmd1, cmd2);  \
+} while (0)
+
 #define __wait_event_cmd(wq, condition, cmd1, cmd2)\
(void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0,  \
cmd1; schedule(); cmd2)
-- 
1.9.0



Re: [PATCH 1/3] wait: introduce wait_event_cmd_exclusive

2015-04-28 Thread Yuanhan Liu
On Tue, Apr 28, 2015 at 04:13:15PM +0200, Peter Zijlstra wrote:
> On Mon, Apr 27, 2015 at 12:51:01PM +0800, Yuanhan Liu wrote:
> > It's just a variant of wait_event_cmd(), with the exclusive flag set.
> > 
> > For cases like RAID5, which puts many processes to sleep until 1/4 of
> > the resources are free, a wake_up wakes all processes up, but only one
> > of them can get the resource, as it is protected by a spin lock. That
> > ends up introducing heavy lock contention, and hurts performance badly.
> > 
> > Introduce wait_event_cmd_exclusive() to relieve the lock contention
> > naturally, by letting wake_up() wake up just one process.
> > 
> > Cc: Ingo Molnar 
> > Cc: Peter Zijlstra 
> > Signed-off-by: Yuanhan Liu 
> > ---
> >  include/linux/wait.h | 14 +++---
> >  1 file changed, 11 insertions(+), 3 deletions(-)
> > 
> > diff --git a/include/linux/wait.h b/include/linux/wait.h
> > index 2db8334..6c3b4de 100644
> > --- a/include/linux/wait.h
> > +++ b/include/linux/wait.h
> > @@ -358,10 +358,18 @@ do {   \
> > __ret;  \
> >  })
> >  
> > -#define __wait_event_cmd(wq, condition, cmd1, cmd2)   \
> > -   (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0,  \
> > +#define __wait_event_cmd(wq, condition, cmd1, cmd2, exclusive)\
> > +   (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, exclusive, 0, \
> > cmd1; schedule(); cmd2)
> >  
> > +
> > +#define wait_event_cmd_exclusive(wq, condition, cmd1, cmd2)\
> > +do {   \
> > +   if (condition)  \
> > +   break;  \
> > +   __wait_event_cmd(wq, condition, cmd1, cmd2, 1); \
> > +} while (0)
> > +
> >  /**
> >   * wait_event_cmd - sleep until a condition gets true
> >   * @wq: the waitqueue to wait on
> > @@ -380,7 +388,7 @@ do {   \
> >  do {   \
> > if (condition)  \
> > break;  \
> > -   __wait_event_cmd(wq, condition, cmd1, cmd2);\
> > +   __wait_event_cmd(wq, condition, cmd1, cmd2, 0); \
> >  } while (0)
> >  
> 
> No, that's wrong, it's assumed that wait*() and __wait*() have the same
> arguments.

Thanks. Will send an updated patch soon.


--yliu


[PATCH 3/3 v2] md/raid5: per hash value and exclusive wait_for_stripe

2015-04-26 Thread Yuanhan Liu
Although there is not much performance gain for the hard disk workload,
the system time drops heavily, by up to 97%. And as expected, the
performance increases a lot, by up to 260%, for a fast device (ram disk).

v2: use bits instead of an array to note down which wait queues need to
be woken.

Signed-off-by: Yuanhan Liu 
---
 drivers/md/raid5.c | 27 +++
 drivers/md/raid5.h |  2 +-
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 64d5bea..1b11bbf 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -344,7 +344,8 @@ static void release_inactive_stripe_list(struct r5conf *conf,
 int hash)
 {
int size;
-   bool do_wakeup = false;
+   unsigned long do_wakeup = 0;
+   int i = 0;
unsigned long flags;
 
if (hash == NR_STRIPE_HASH_LOCKS) {
@@ -365,15 +366,19 @@ static void release_inactive_stripe_list(struct r5conf *conf,
!list_empty(list))
atomic_dec(&conf->empty_inactive_list_nr);
list_splice_tail_init(list, conf->inactive_list + hash);
-   do_wakeup = true;
+   do_wakeup |= 1 << (size - 1);
spin_unlock_irqrestore(conf->hash_locks + hash, flags);
}
size--;
hash--;
}
 
+   for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
+   if (do_wakeup & (1 << i))
+   wake_up(&conf->wait_for_stripe[i]);
+   }
+
if (do_wakeup) {
-   wake_up(&conf->wait_for_stripe);
if (atomic_read(&conf->active_stripes) == 0)
wake_up(&conf->wait_for_quiescent);
if (conf->retry_read_aligned)
@@ -686,14 +691,15 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
if (!sh) {
set_bit(R5_INACTIVE_BLOCKED,
&conf->cache_state);
-   wait_event_lock_irq(
-   conf->wait_for_stripe,
+   wait_event_cmd_exclusive(
+   conf->wait_for_stripe[hash],
!list_empty(conf->inactive_list + hash) &&
(atomic_read(&conf->active_stripes)
 < (conf->max_nr_stripes * 3 / 4)
 || !test_bit(R5_INACTIVE_BLOCKED,
  &conf->cache_state)),
-   *(conf->hash_locks + hash));
spin_unlock_irq(conf->hash_locks + hash),
+   spin_lock_irq(conf->hash_locks + hash));
clear_bit(R5_INACTIVE_BLOCKED,
  &conf->cache_state);
} else {
@@ -718,6 +724,9 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
}
} while (sh == NULL);
 
+   if (!list_empty(conf->inactive_list + hash))
+   wake_up(&conf->wait_for_stripe[hash]);
+
spin_unlock_irq(conf->hash_locks + hash);
return sh;
 }
@@ -2138,7 +2147,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
cnt = 0;
list_for_each_entry(nsh, &newstripes, lru) {
lock_device_hash_lock(conf, hash);
-   wait_event_cmd(conf->wait_for_stripe,
+   wait_event_cmd_exclusive(conf->wait_for_stripe[hash],
!list_empty(conf->inactive_list + hash),
unlock_device_hash_lock(conf, hash),
lock_device_hash_lock(conf, hash));
@@ -6402,7 +6411,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
spin_lock_init(&conf->device_lock);
seqcount_init(&conf->gen_lock);
init_waitqueue_head(&conf->wait_for_quiescent);
-   init_waitqueue_head(&conf->wait_for_stripe);
+   for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
+   init_waitqueue_head(&conf->wait_for_stripe[i]);
+   }
init_waitqueue_head(&conf->wait_for_overlap);
INIT_LIST_HEAD(&conf->handle_list);
INIT_LIST_HEAD(&conf->hold_list);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 4cc05ec..6307b90 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -509,7 +509,7 @@ struct r5conf {
atomic_tempty_inactive_list_nr;
struct llist_head   released_stripes;
wait_queue_head_t   w

[PATCH 1/3] wait: introduce wait_event_cmd_exclusive

2015-04-26 Thread Yuanhan Liu
It's just a variant of wait_event_cmd(), with the exclusive flag set.

For cases like RAID5, which puts many processes to sleep until 1/4 of the
resources are free, a wake_up wakes all processes up, but only one of
them can get the resource, as it is protected by a spin lock. That ends
up introducing heavy lock contention, and hurts performance badly.

Introduce wait_event_cmd_exclusive() to relieve the lock contention
naturally, by letting wake_up() wake up just one process.

Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Signed-off-by: Yuanhan Liu 
---
 include/linux/wait.h | 14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 2db8334..6c3b4de 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -358,10 +358,18 @@ do {   \
__ret;  \
 })
 
-#define __wait_event_cmd(wq, condition, cmd1, cmd2)\
-   (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0,  \
+#define __wait_event_cmd(wq, condition, cmd1, cmd2, exclusive) \
+   (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, exclusive, 0, \
cmd1; schedule(); cmd2)
 
+
+#define wait_event_cmd_exclusive(wq, condition, cmd1, cmd2) \
+do {   \
+   if (condition)  \
+   break;  \
+   __wait_event_cmd(wq, condition, cmd1, cmd2, 1); \
+} while (0)
+
 /**
  * wait_event_cmd - sleep until a condition gets true
  * @wq: the waitqueue to wait on
@@ -380,7 +388,7 @@ do {   \
 do {   \
if (condition)  \
break;  \
-   __wait_event_cmd(wq, condition, cmd1, cmd2);\
+   __wait_event_cmd(wq, condition, cmd1, cmd2, 0); \
 } while (0)
 
 #define __wait_event_interruptible(wq, condition)  \
-- 
1.9.0



[PATCH 2/3 v2] md/raid5: split wait_for_stripe and introduce wait_for_quiescent

2015-04-26 Thread Yuanhan Liu
I noticed heavy spin lock contention at get_active_stripe(), introduced
at the wake-up stage, where a bunch of processes try to re-acquire the
spin lock at once.

After giving this issue some thought, I found the contention could be
relieved (and even avoided) if we turn wait_for_stripe into a per-hash
waitqueue and make the wake-up exclusive: wake up one process at a time,
which avoids the lock contention naturally.

Before hacking on wait_for_stripe, I found it actually has two usages:
one for the array to enter or leave the quiescent state, and another
to wait for an available stripe in each of the hash lists.

So this patch splits the first usage off into a separate wait_queue,
wait_for_quiescent, and the next patch will turn the second usage into
one waitqueue for each hash value, and make it exclusive, to relieve
the lock contention.

v2: wake_up(wait_for_quiescent) when (active_stripes == 0)
Commit log refactor suggestion from Neil.

Signed-off-by: Yuanhan Liu 
---
 drivers/md/raid5.c | 15 +--
 drivers/md/raid5.h |  1 +
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 77dfd72..64d5bea 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -374,6 +374,8 @@ static void release_inactive_stripe_list(struct r5conf *conf,
 
if (do_wakeup) {
wake_up(&conf->wait_for_stripe);
+   if (atomic_read(&conf->active_stripes) == 0)
+   wake_up(&conf->wait_for_quiescent);
if (conf->retry_read_aligned)
md_wakeup_thread(conf->mddev->thread);
}
@@ -667,7 +669,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
spin_lock_irq(conf->hash_locks + hash);
 
do {
-   wait_event_lock_irq(conf->wait_for_stripe,
+   wait_event_lock_irq(conf->wait_for_quiescent,
conf->quiesce == 0 || noquiesce,
*(conf->hash_locks + hash));
sh = __find_stripe(conf, sector, conf->generation - previous);
@@ -4729,7 +4731,7 @@ static void raid5_align_endio(struct bio *bi, int error)
 raid_bi, 0);
bio_endio(raid_bi, 0);
if (atomic_dec_and_test(&conf->active_aligned_reads))
-   wake_up(&conf->wait_for_stripe);
+   wake_up(&conf->wait_for_quiescent);
return;
}
 
@@ -4824,7 +4826,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
align_bi->bi_iter.bi_sector += rdev->data_offset;
 
spin_lock_irq(&conf->device_lock);
-   wait_event_lock_irq(conf->wait_for_stripe,
+   wait_event_lock_irq(conf->wait_for_quiescent,
conf->quiesce == 0,
conf->device_lock);
atomic_inc(&conf->active_aligned_reads);
@@ -5668,7 +5670,7 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
bio_endio(raid_bio, 0);
}
if (atomic_dec_and_test(&conf->active_aligned_reads))
-   wake_up(&conf->wait_for_stripe);
+   wake_up(&conf->wait_for_quiescent);
return handled;
 }
 
@@ -6399,6 +6401,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
goto abort;
spin_lock_init(&conf->device_lock);
seqcount_init(&conf->gen_lock);
+   init_waitqueue_head(&conf->wait_for_quiescent);
init_waitqueue_head(&conf->wait_for_stripe);
init_waitqueue_head(&conf->wait_for_overlap);
INIT_LIST_HEAD(&conf->handle_list);
@@ -7422,7 +7425,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
 * active stripes can drain
 */
conf->quiesce = 2;
-   wait_event_cmd(conf->wait_for_stripe,
+   wait_event_cmd(conf->wait_for_quiescent,
atomic_read(&conf->active_stripes) == 0 &&
atomic_read(&conf->active_aligned_reads) == 0,
unlock_all_device_hash_locks_irq(conf),
@@ -7436,7 +7439,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
case 0: /* re-enable writes */
lock_all_device_hash_locks_irq(conf);
conf->quiesce = 0;
-   wake_up(&conf->wait_for_stripe);
+   wake_up(&conf->wait_for_quiescent);
wake_up(&conf->wait_for_overlap);
unlock_all_device_hash_locks_irq(conf);
break;
diff --git a/drivers/md/raid5.h b/d

Re: [PATCH 2/2] md/raid5: exclusive wait_for_stripe

2015-04-26 Thread Yuanhan Liu
On Mon, Apr 27, 2015 at 10:24:05AM +1000, NeilBrown wrote:
> On Fri, 24 Apr 2015 21:39:04 +0800 Yuanhan Liu 
> wrote:
> 
> > I noticed heavy spin lock contention at get_active_stripe() with fsmark
> > multi-threaded write workloads.
> > 
> > Here is where this hot contention comes from. We have limited stripes, and
> > it's a multi-threaded write workload. Hence, those stripes will be taken
> > soon, which puts later processes to sleep waiting for free stripes. When
> > enough stripes (> 1/4 of the total) are released, all processes are woken,
> > trying to get the lock. But only one of them can get the lock for each hash
> > lock, leaving the other processes spinning for it.
> > 
> > Thus, it's ineffective to wake up all processes and let them battle for
> > a lock that only one of them can hold at a time. Instead, we could make
> > it an exclusive wake-up: wake up one process only. That avoids the heavy
> > spin lock contention naturally.
> > 
> > Here are some test results I got with this patch applied (all tests run 3 times):
> > 
> > `fsmark.files_per_sec'
> > ======================
> > 
> > next-20150317         this patch
> > --------------------  --------------------
> > metric_value ±stddev  metric_value ±stddev  change   testbox/benchmark/testcase-params
> > --------------------  --------------------  -------  ---------------------------------
> >   25.600 ±0.0           92.700 ±2.5         262.1%   ivb44/fsmark/1x-64t-4BRD_12G-RAID5-btrfs-4M-30G-fsyncBeforeClose
> >   25.600 ±0.0           77.800 ±0.6         203.9%   ivb44/fsmark/1x-64t-9BRD_6G-RAID5-btrfs-4M-30G-fsyncBeforeClose
> >   32.000 ±0.0           93.800 ±1.7         193.1%   ivb44/fsmark/1x-64t-4BRD_12G-RAID5-ext4-4M-30G-fsyncBeforeClose
> >   32.000 ±0.0           81.233 ±1.7         153.9%   ivb44/fsmark/1x-64t-9BRD_6G-RAID5-ext4-4M-30G-fsyncBeforeClose
> >   48.800 ±14.5          99.667 ±2.0         104.2%   ivb44/fsmark/1x-64t-4BRD_12G-RAID5-xfs-4M-30G-fsyncBeforeClose
> >    6.400 ±0.0           12.800 ±0.0         100.0%   ivb44/fsmark/1x-64t-3HDD-RAID5-btrfs-4M-40G-fsyncBeforeClose
> >   63.133 ±8.2           82.800 ±0.7          31.2%   ivb44/fsmark/1x-64t-9BRD_6G-RAID5-xfs-4M-30G-fsyncBeforeClose
> >  245.067 ±0.7          306.567 ±7.9          25.1%   ivb44/fsmark/1x-64t-4BRD_12G-RAID5-f2fs-4M-30G-fsyncBeforeClose
> >   17.533 ±0.3           21.000 ±0.8          19.8%   ivb44/fsmark/1x-1t-3HDD-RAID5-xfs-4M-40G-fsyncBeforeClose
> >  188.167 ±1.9          215.033 ±3.1          14.3%   ivb44/fsmark/1x-1t-4BRD_12G-RAID5-btrfs-4M-30G-NoSync
> >  254.500 ±1.8          290.733 ±2.4          14.2%   ivb44/fsmark/1x-1t-9BRD_6G-RAID5-btrfs-4M-30G-NoSync
> > 
> > `time.system_time'
> > ==================
> > 
> > next-20150317          this patch
> > ---------------------  --------------------
> > metric_value ±stddev   metric_value ±stddev  change   testbox/benchmark/testcase-params
> > ---------------------  --------------------  -------  ---------------------------------
> >  7235.603 ±1.2          185.163 ±1.9         -97.4%   ivb44/fsmark/1x-64t-4BRD_12G-RAID5-btrfs-4M-30G-fsyncBeforeClose
> >  7666.883 ±2.9          202.750 ±1.0         -97.4%   ivb44/fsmark/1x-64t-9BRD_6G-RAID5-btrfs-4M-30G-fsyncBeforeClose
> > 14567.893 ±0.7          421.230 ±0.4         -97.1%   ivb44/fsmark/1x-64t-3HDD-RAID5-btrfs-4M-40G-fsyncBeforeClose
> >  3697.667 ±14.0         148.190 ±1.7         -96.0%   ivb44/fsmark/1x-64t-4BRD_12G-RAID5-xfs-4M-30G-fsyncBeforeClose
> >  5572.867 ±3.8          310.717 ±1.4         -94.4%   ivb44/fsmark/1x-64t-9BRD_6G-RAID5-ext4-4M-30G-fsyncBeforeClose
> >  5565.050 ±0.5          313.277 ±1.5         -94.4%   ivb44/fsmark/1x-64t-4BRD_12G-RAID5-ext4-4M-30G-fsyncBeforeClose
> >  2420.707 ±17.1         171.043 ±2.7         -92.9%   ivb44/fsmark/1x-64t-9BRD_6G-RAID5-xfs-4M-30G-fsyncBeforeClose
> >  3743.300 ±4.6          379.827 ±3.5         -89.9%   iv

Re: [PATCH 1/2] md/raid5: split wait_for_stripe and introduce wait_for_quiesce

2015-04-26 Thread Yuanhan Liu
On Mon, Apr 27, 2015 at 10:10:24AM +1000, NeilBrown wrote:
> On Fri, 24 Apr 2015 21:39:03 +0800 Yuanhan Liu 
> wrote:
> 
> > If I read the code correctly, the current wait_for_stripe actually has
> > two usages:
> > 
> > - waiting until there are enough free stripes in the stripe cache,
> >   triggered when get_free_stripe() fails. This is what wait_for_stripe
> >   is literally intended for.
> > 
> > - waiting for quiesce == 0 or
> >   active_aligned_reads == 0 && active_stripes == 0
> > 
> >   It has nothing to do with wait_for_stripe literally, and releasing
> >   an active stripe won't actually wake those waiters up. Conversely, a
> >   wake_up issued for this case won't actually wake up a process waiting
> >   for a free stripe to become available.
> 
> I disagree.  Releasing an active stripe *will* (or *can*) wake up that third
> case, as it decrements "active_stripes" which will eventually reach zero.
> 
> I don't think your new code will properly wake up a process which is waiting
> for "active_stripes == 0".

Right, and thanks for pointing it out. So, is this enough?

---
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2d8fcc1..3f23035 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -383,6 +383,9 @@ static void release_inactive_stripe_list(struct r5conf *conf,
}
}
}
+
+   if (!atomic_read(&conf->active_stripes))
+   wake_up(&conf->wait_for_quiesce);
 }

 /* should hold conf->device_lock already */


Or, should I move it a bit earlier, invoking wake_up(&conf->wait_for_quiesce)
after each atomic_dec(&conf->active_stripes)?

if (atomic_dec_return(&conf->active_stripes) == 0)
wake_up(&conf->wait_for_quiesce);

> 
> > 
> > Hence, we'd better split wait_for_stripe, and here I introduce
> > wait_for_quiesce for the second usage. The name may not well taken, or
> > even taken wrongly. Feel free to correct me then.
> > 
> > This is also a prepare patch for next patch: make wait_for_stripe
> > exclusive.
> 
> I think you have this commit description upside down :-)
> 
> The real motivation is that you are seeing contention on some spinlock and so
> you want to split 'wait_for_stripe' up in to multiple wait_queues so that you
> can use exclusive wakeup.  As this is the main motivation, it should be
> stated first.
> 
> Then explain that 'wait_for_stripe' is used to wait for the array to enter or
> leave the quiescent state, and also to wait for an available stripe in each
> of the hash lists.
> 
> So this patch splits the first usage off into a separate wait_queue, and the
> next patch will split the second usage into one waitqueue for each hash value.
> 
> Then explain just is what is needed for that first step.
> 
> When you put it that way around, the patch makes lots of sense.

It does, and thanks!

> 
> So: could you please resubmit with the description the right way around, and

To make sure I followed you correctly: my patch order is correct (I mean,
split the wait queue first, and make wait_for_stripe per lock hash and
exclusive second), and what I need to do is rewrite the commit log as you
suggested and fix all the issues you pointed out. Right?

--yliu

> with an appropriate wakeup call to ensure raid5_quiesce is woken up when
> active_stripes reaches zero?
> 
> Thanks,
> NeilBrown
> 
> 
> > 
> > Signed-off-by: Yuanhan Liu 
> > ---
> >  drivers/md/raid5.c | 13 +++--
> >  drivers/md/raid5.h |  1 +
> >  2 files changed, 8 insertions(+), 6 deletions(-)
> > 
> > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> > index 9716319..b7e385f 100644
> > --- a/drivers/md/raid5.c
> > +++ b/drivers/md/raid5.c
> > @@ -667,7 +667,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
> > spin_lock_irq(conf->hash_locks + hash);
> >  
> > do {
> > -   wait_event_lock_irq(conf->wait_for_stripe,
> > +   wait_event_lock_irq(conf->wait_for_quiesce,
> > conf->quiesce == 0 || noquiesce,
> > *(conf->hash_locks + hash));
> > sh = __find_stripe(conf, sector, conf->generation - previous);
> > @@ -4725,7 +4725,7 @@ static void raid5_align_endio(struct bio *bi, int 
> > error)
> >  raid_bi, 0);
> > bio_endio(raid_bi, 0);
> > if (atomic_dec_and_test(&conf->active_aligned_reads))
> > -   wake_up(&conf->wait_for_stripe);
> >

[PATCH 1/2] md/raid5: split wait_for_stripe and introduce wait_for_quiesce

2015-04-24 Thread Yuanhan Liu
If I read the code correctly, the current wait_for_stripe actually has
two usages:

- waiting until there are enough free stripes in the stripe cache,
  triggered when get_free_stripe() fails. This is what wait_for_stripe
  is literally intended for.

- waiting for quiesce == 0 or
  active_aligned_reads == 0 && active_stripes == 0

  It has nothing to do with wait_for_stripe literally, and releasing
  an active stripe won't actually wake those waiters up. Conversely, a
  wake_up issued for this case won't actually wake up a process waiting
  for a free stripe to become available.

Hence, we'd better split wait_for_stripe, and here I introduce
wait_for_quiesce for the second usage. The name may not be well chosen,
or may even be wrong. Feel free to correct me then.

This is also a preparation patch for the next one: making wait_for_stripe
exclusive.
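
As a reading aid, the pairing of waits and wake-ups after the split
(a summary sketch, not compilable code):

	wait_for_quiesce
	  waits: quiesce == 0                (get_active_stripe, chunk_aligned_read)
	         active_stripes == 0 &&
	         active_aligned_reads == 0   (raid5_quiesce)
	  wakes: raid5_align_endio, retry_aligned_read, raid5_quiesce (state 0)

	wait_for_stripe
	  waits: a free stripe in the hash lists (get_active_stripe)
	  wakes: release_inactive_stripe_list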

Signed-off-by: Yuanhan Liu 
---
 drivers/md/raid5.c | 13 +++--
 drivers/md/raid5.h |  1 +
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9716319..b7e385f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -667,7 +667,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
spin_lock_irq(conf->hash_locks + hash);
 
do {
-   wait_event_lock_irq(conf->wait_for_stripe,
+   wait_event_lock_irq(conf->wait_for_quiesce,
conf->quiesce == 0 || noquiesce,
*(conf->hash_locks + hash));
sh = __find_stripe(conf, sector, conf->generation - previous);
@@ -4725,7 +4725,7 @@ static void raid5_align_endio(struct bio *bi, int error)
 raid_bi, 0);
bio_endio(raid_bi, 0);
if (atomic_dec_and_test(&conf->active_aligned_reads))
-   wake_up(&conf->wait_for_stripe);
+   wake_up(&conf->wait_for_quiesce);
return;
}
 
@@ -4820,7 +4820,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
align_bi->bi_iter.bi_sector += rdev->data_offset;
 
spin_lock_irq(&conf->device_lock);
-   wait_event_lock_irq(conf->wait_for_stripe,
+   wait_event_lock_irq(conf->wait_for_quiesce,
conf->quiesce == 0,
conf->device_lock);
atomic_inc(&conf->active_aligned_reads);
@@ -5659,7 +5659,7 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
bio_endio(raid_bio, 0);
}
if (atomic_dec_and_test(&conf->active_aligned_reads))
-   wake_up(&conf->wait_for_stripe);
+   wake_up(&conf->wait_for_quiesce);
return handled;
 }
 
@@ -6390,6 +6390,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
goto abort;
spin_lock_init(&conf->device_lock);
seqcount_init(&conf->gen_lock);
+   init_waitqueue_head(&conf->wait_for_quiesce);
init_waitqueue_head(&conf->wait_for_stripe);
init_waitqueue_head(&conf->wait_for_overlap);
INIT_LIST_HEAD(&conf->handle_list);
@@ -7413,7 +7414,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
 * active stripes can drain
 */
conf->quiesce = 2;
-   wait_event_cmd(conf->wait_for_stripe,
+   wait_event_cmd(conf->wait_for_quiesce,
atomic_read(&conf->active_stripes) == 0 &&
atomic_read(&conf->active_aligned_reads) == 0,
unlock_all_device_hash_locks_irq(conf),
@@ -7427,7 +7428,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
case 0: /* re-enable writes */
lock_all_device_hash_locks_irq(conf);
conf->quiesce = 0;
-   wake_up(&conf->wait_for_stripe);
+   wake_up(&conf->wait_for_quiesce);
wake_up(&conf->wait_for_overlap);
unlock_all_device_hash_locks_irq(conf);
break;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 7dc0dd8..fab53a3 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -508,6 +508,7 @@ struct r5conf {
struct list_headinactive_list[NR_STRIPE_HASH_LOCKS];
atomic_tempty_inactive_list_nr;
struct llist_head   released_stripes;
+   wait_queue_head_t   wait_for_quiesce;
wait_queue_head_t   wait_for_stripe;
wait_queue_head_t   wait_for_overlap;
unsigned long   cache_state;
-- 
1.9.0



[PATCH 2/2] md/raid5: exclusive wait_for_stripe

2015-04-24 Thread Yuanhan Liu
I noticed heavy spin lock contention at get_active_stripe() with fsmark
multi-threaded write workloads.

Here is where this hot contention comes from. We have limited stripes, and
it's a multi-threaded write workload. Hence, those stripes will be taken
soon, which puts later processes to sleep waiting for free stripes. When
enough stripes (> 1/4 of the total) are released, all processes are woken,
trying to get the lock. But only one of them can get the lock for each hash
lock, leaving the other processes spinning for it.

Thus, it's ineffective to wake up all processes and let them battle for
a lock that only one of them can hold at a time. Instead, we could make
it an exclusive wake-up: wake up one process only. That avoids the heavy
spin lock contention naturally.
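
For background, the mechanism in miniature (a generic sketch of the
waitqueue API, with the usual condition re-check loop omitted; not code
from this patch):

	wait_queue_head_t wq;	/* assumed initialized elsewhere */
	DEFINE_WAIT(wait);

	/* non-exclusive (today): every wake_up() wakes ALL such sleepers */
	prepare_to_wait(&wq, &wait, TASK_UNINTERRUPTIBLE);
	schedule();
	finish_wait(&wq, &wait);

	/* exclusive (what this patch switches to): the entry is queued
	 * with WQ_FLAG_EXCLUSIVE, so wake_up(&wq) wakes at most one */
	prepare_to_wait_exclusive(&wq, &wait, TASK_UNINTERRUPTIBLE);
	schedule();
	finish_wait(&wq, &wait);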

Here are some test results I got with this patch applied (all tests run 3 times):

`fsmark.files_per_sec'
======================

next-20150317         this patch
--------------------  --------------------
metric_value ±stddev  metric_value ±stddev  change   testbox/benchmark/testcase-params
--------------------  --------------------  -------  ---------------------------------
  25.600 ±0.0           92.700 ±2.5         262.1%   ivb44/fsmark/1x-64t-4BRD_12G-RAID5-btrfs-4M-30G-fsyncBeforeClose
  25.600 ±0.0           77.800 ±0.6         203.9%   ivb44/fsmark/1x-64t-9BRD_6G-RAID5-btrfs-4M-30G-fsyncBeforeClose
  32.000 ±0.0           93.800 ±1.7         193.1%   ivb44/fsmark/1x-64t-4BRD_12G-RAID5-ext4-4M-30G-fsyncBeforeClose
  32.000 ±0.0           81.233 ±1.7         153.9%   ivb44/fsmark/1x-64t-9BRD_6G-RAID5-ext4-4M-30G-fsyncBeforeClose
  48.800 ±14.5          99.667 ±2.0         104.2%   ivb44/fsmark/1x-64t-4BRD_12G-RAID5-xfs-4M-30G-fsyncBeforeClose
   6.400 ±0.0           12.800 ±0.0         100.0%   ivb44/fsmark/1x-64t-3HDD-RAID5-btrfs-4M-40G-fsyncBeforeClose
  63.133 ±8.2           82.800 ±0.7          31.2%   ivb44/fsmark/1x-64t-9BRD_6G-RAID5-xfs-4M-30G-fsyncBeforeClose
 245.067 ±0.7          306.567 ±7.9          25.1%   ivb44/fsmark/1x-64t-4BRD_12G-RAID5-f2fs-4M-30G-fsyncBeforeClose
  17.533 ±0.3           21.000 ±0.8          19.8%   ivb44/fsmark/1x-1t-3HDD-RAID5-xfs-4M-40G-fsyncBeforeClose
 188.167 ±1.9          215.033 ±3.1          14.3%   ivb44/fsmark/1x-1t-4BRD_12G-RAID5-btrfs-4M-30G-NoSync
 254.500 ±1.8          290.733 ±2.4          14.2%   ivb44/fsmark/1x-1t-9BRD_6G-RAID5-btrfs-4M-30G-NoSync

`time.system_time'
==================

next-20150317          this patch
---------------------  --------------------
metric_value ±stddev   metric_value ±stddev  change   testbox/benchmark/testcase-params
---------------------  --------------------  -------  ---------------------------------
 7235.603 ±1.2          185.163 ±1.9         -97.4%   ivb44/fsmark/1x-64t-4BRD_12G-RAID5-btrfs-4M-30G-fsyncBeforeClose
 7666.883 ±2.9          202.750 ±1.0         -97.4%   ivb44/fsmark/1x-64t-9BRD_6G-RAID5-btrfs-4M-30G-fsyncBeforeClose
14567.893 ±0.7          421.230 ±0.4         -97.1%   ivb44/fsmark/1x-64t-3HDD-RAID5-btrfs-4M-40G-fsyncBeforeClose
 3697.667 ±14.0         148.190 ±1.7         -96.0%   ivb44/fsmark/1x-64t-4BRD_12G-RAID5-xfs-4M-30G-fsyncBeforeClose
 5572.867 ±3.8          310.717 ±1.4         -94.4%   ivb44/fsmark/1x-64t-9BRD_6G-RAID5-ext4-4M-30G-fsyncBeforeClose
 5565.050 ±0.5          313.277 ±1.5         -94.4%   ivb44/fsmark/1x-64t-4BRD_12G-RAID5-ext4-4M-30G-fsyncBeforeClose
 2420.707 ±17.1         171.043 ±2.7         -92.9%   ivb44/fsmark/1x-64t-9BRD_6G-RAID5-xfs-4M-30G-fsyncBeforeClose
 3743.300 ±4.6          379.827 ±3.5         -89.9%   ivb44/fsmark/1x-64t-3HDD-RAID5-ext4-4M-40G-fsyncBeforeClose
 3308.687 ±6.3          363.050 ±2.0         -89.0%   ivb44/fsmark/1x-64t-3HDD-RAID5-xfs-4M-40G-fsyncBeforeClose

Where,

 1x: where 'x' means iterations or loops, corresponding to the 'L' option of fsmark

 1t, 64t: where 't' means threads

 4M: means the single file size, corresponding to the '-s' option of fsmark
 40G, 30G, 120G: means the total test size

 4BRD_12G: BRD is the ramdisk, where '4' means 4 ramdisks and '12G' means
   the size of one ramdisk. So, it would be 48G in total, and we
   made a RAID on those ramdisks.

As you can see, though there is not much performance gain for the hard
disk workload, the system time drops heavily, by up to 97%. And as
expected, the performance

performance changes on c9dc4c65: 9.8% fsmark.files_per_sec

2015-04-22 Thread Yuanhan Liu
FYI, we found a performance increase on `fsmark.files_per_sec' by
c9dc4c6578502c2085705347375b82089aad18d0, which is expected, as the commit says:

> commit c9dc4c6578502c2085705347375b82089aad18d0
> Author: Chris Mason 
> AuthorDate: Sat Apr 4 17:14:42 2015 -0700
> Commit: Chris Mason 
> CommitDate: Fri Apr 10 14:07:11 2015 -0700
> 
> Btrfs: two stage dirty block group writeout

4c6d1d85ad89fd8e32dc9204b7f944854399bda9   c9dc4c6578502c2085705347375b82089aad18d0
-----------------------------------------  -----------------------------------------
run  time(m)  metric_value  ±stddev        run  time(m)  metric_value  ±stddev   change  testbox/benchmark/testcase-params
  3      7.3        35.267     ±0.5          5      6.6        38.740     ±1.6     9.8%  ivb44/fsmark/1x-1t-1HDD-btrfs-4M-60G-NoSync


NOTE: here is some more explanation of those test parameters, so you can
  better understand what the testcase does:

  1x: where 'x' means iterations or loops, corresponding to the 'L' option of fsmark

  1t, 64t: where 't' means threads

  4M: means the single file size, corresponding to the '-s' option of fsmark
  60G: means the total test size

And FYI, here are more changes by the same commit:

4c6d1d85ad89fd8e  c9dc4c6578502c208570534737  
  --  
 %stddev %change %stddev
 \  |\  
  9864 ±  2%+156.9%  25345 ±  4%  
fsmark.time.voluntary_context_switches
 9 ±  0% +17.8% 10 ±  4%  
fsmark.time.percent_of_cpu_this_job_got
462211 ±  1% +16.8% 539707 ±  0%  fsmark.app_overhead
 35.27 ±  0%  +9.8%  38.74 ±  1%  fsmark.files_per_sec
   435 ±  0%  -9.0%396 ±  1%  fsmark.time.elapsed_time.max
   435 ±  0%  -9.0%396 ±  1%  fsmark.time.elapsed_time
  5.20 ±  2% -70.3%   1.54 ±  6%  turbostat.Pkg%pc6
   2447873 ± 42% -67.9% 785086 ± 33%  numa-numastat.node1.numa_hit
   2413662 ± 43% -68.1% 771115 ± 31%  numa-numastat.node1.local_node
  9864 ±  2%+156.9%  25345 ±  4%  time.voluntary_context_switches
187680 ± 10%+126.8% 425676 ±  7%  numa-vmstat.node1.nr_dirty
747361 ±  9%+127.8%1702809 ±  7%  numa-meminfo.node1.Dirty
   1787510 ±  1%+117.0%3878984 ±  2%  meminfo.Dirty
446861 ±  1%+117.0% 969472 ±  2%  proc-vmstat.nr_dirty
   1655962 ± 37% -59.3% 673988 ± 29%  numa-vmstat.node1.numa_local
   1036191 ±  8%+110.3%2179311 ±  3%  numa-meminfo.node0.Dirty
259069 ±  8%+110.3% 544783 ±  3%  numa-vmstat.node0.nr_dirty
   1687987 ± 37% -58.6% 698626 ± 29%  numa-vmstat.node1.numa_hit
 1 ±  0%+100.0%  2 ±  0%  vmstat.procs.b
  0.02 ±  0%+100.0%   0.04 ± 22%  turbostat.CPU%c3
  6.03 ±  1% +76.9%  10.67 ±  1%  turbostat.CPU%c1
 5.189e+08 ±  0% +72.6%  8.956e+08 ±  1%  cpuidle.C1-IVT.time
   2646692 ±  7% +75.0%4630890 ± 23%  cpuidle.C3-IVT.time
  5301 ±  6% -31.7%   3620 ±  3%  
slabinfo.btrfs_ordered_extent.active_objs
 10549 ± 16% -30.3%   7349 ± 12%  
numa-vmstat.node1.nr_slab_reclaimable
  5353 ±  6% -31.4%   3670 ±  3%  
slabinfo.btrfs_ordered_extent.num_objs
 42169 ± 16% -30.3%  29397 ± 12%  numa-meminfo.node1.SReclaimable
   1619825 ± 22% +39.4%2258188 ±  4%  proc-vmstat.pgfree
  4611 ±  7% -28.0%   3318 ±  1%  
slabinfo.btrfs_delayed_ref_head.num_objs
  4471 ±  8% -27.0%   3264 ±  2%  
slabinfo.btrfs_delayed_ref_head.active_objs
 67.93 ±  1% -24.7%  51.15 ±  4%  turbostat.Pkg%pc2
   2332975 ± 21% +45.6%3396446 ±  4%  numa-vmstat.node1.numa_other
   2300949 ± 22% +46.5%3371807 ±  4%  numa-vmstat.node1.numa_miss
   2300941 ± 22% +46.5%3371793 ±  4%  numa-vmstat.node0.numa_foreign
  2952 ±  8% -23.3%   2263 ±  3%  
slabinfo.btrfs_delayed_data_ref.num_objs
   2570716 ±  3% +25.7%3230157 ±  2%  numa-meminfo.node1.Writeback
642367 ±  3% +25.7% 807533 ±  2%  numa-vmstat.node1.nr_writeback
 95408 ± 13% -17.3%  78910 ±  6%  numa-meminfo.node1.Slab
  2803 ±  7% -21.1%   2210 ±  3%  
slabinfo.btrfs_delayed_data_ref.active_objs
   240 ±  9% +23.1%295 ± 16%  
numa-vmstat.node0.nr_page_table_pages
   4626942 ± 19% +49.6%6924087 ± 22%  cpuidle.C1E-IVT.time
   5585235 ±  0% +25.5%7011242 ±  0%  meminfo.Writeback
   1396232 ±  0% +25.5%1752892 ±  0%  proc-vmstat.nr_writeback
   962 ±  9% +23.0%   1184 ± 16%  numa-meminfo.node0.PageTables
 9 ±  0% +17.8% 10 ±  4%  time.percent_of_cpu_this_job_got
754027 ±  2% +25.2% 944312 ±

performance changes on 78373b73: -46.6% fsmark.files_per_sec, and few more

2015-04-20 Thread Yuanhan Liu
FYI, we found changes on `fsmark.files_per_sec' by 78373b7319abdf15050af5b1632c4c8b8b398f33:

> commit 78373b7319abdf15050af5b1632c4c8b8b398f33
> Author: Jaegeuk Kim 
> AuthorDate: Fri Mar 13 21:44:36 2015 -0700
> Commit: Jaegeuk Kim 
> CommitDate: Fri Apr 10 15:08:45 2015 -0700
> 
> f2fs: enhance multi-threads performance

3402e87cfb5e762f9c95071bf4a2ad65fd9392a2   78373b7319abdf15050af5b1632c4c8b8b398f33
-----------------------------------------  -----------------------------------------
run  time(m)  metric_value  ±stddev        run  time(m)  metric_value  ±stddev   change  testbox/benchmark/testcase-params
  3      0.3       490.800     ±5.7          3      0.5       262.067     ±0.4   -46.6%  ivb44/fsmark/1x-64t-4BRD_12G-RAID0-f2fs-4M-30G-fsyncBeforeClose
  3      0.3       468.367     ±3.5          3      0.5       264.467     ±0.2   -43.5%  ivb44/fsmark/1x-64t-9BRD_6G-RAID0-f2fs-4M-30G-fsyncBeforeClose
  3      0.6       211.867     ±0.7          3      0.7       191.067     ±0.5    -9.8%  ivb44/fsmark/1x-64t-4BRD_12G-RAID5-f2fs-4M-30G-fsyncBeforeClose

NOTE: here is some more info about those test parameters, so you can
  better understand what the testcase does:

  1x: where 'x' means iterations or loops, corresponding to the 'L' option of fsmark

  1t, 64t: where 't' means threads

  4M: means the single file size, corresponding to the '-s' option of fsmark
  40G, 30G, 120G: means the total test size

  4BRD_12G: BRD is the ramdisk, where '4' means 4 ramdisks and '12G' means
the size of one ramdisk. So, it would be 48G in total, and we
made a RAID on those ramdisks.


The change is a bit interesting, as you already stated clearly that this
patch is for performance gain. The patch itself is clear, too: it removes
a mutex lock. So the only reasonable cause I can think of, without too
much digging, would be that the removal of this lock reduces sleep time
and allows more processes to run, but somehow increases context switches
and CPU usage somewhere in the meantime. I guess this is what the
following changes are trying to tell us:

  29708 ±  2%   +5720.0%    1729051 ±  1%  fsmark.time.voluntary_context_switches
    302 ±  0%    +113.8%        647 ±  0%  fsmark.time.percent_of_cpu_this_job_got
  61.05 ±  0%    +214.0%     191.70 ±  0%  fsmark.time.system_time


FYI, here I list all changes for the most outstanding case:

  3      0.3       490.800     ±5.7          3      0.5       262.067     ±0.4   -46.6%  ivb44/fsmark/1x-64t-4BRD_12G-RAID0-f2fs-4M-30G-fsyncBeforeClose

3402e87cfb5e762f  78373b7319abdf15050af5b163  
  --  
 %stddev %change %stddev
 \  |\  
 29708 ±  2%   +5720.0%1729051 ±  1%  
fsmark.time.voluntary_context_switches
 61.05 ±  0%+214.0% 191.70 ±  0%  fsmark.time.system_time
   302 ±  0%+113.8%647 ±  0%  
fsmark.time.percent_of_cpu_this_job_got
 10476 ±  0% +95.4%  20467 ±  5%  fsmark.time.minor_page_faults
   490 ±  5% -46.6%262 ±  0%  fsmark.files_per_sec
 20.21 ±  0% +46.7%  29.65 ±  0%  fsmark.time.elapsed_time
 20.21 ±  0% +46.7%  29.65 ±  0%  fsmark.time.elapsed_time.max
226379 ±  0% +32.5% 299882 ±  0%  fsmark.app_overhead
 0 ±  0%  +Inf%   1045 ±  2%  proc-vmstat.numa_pages_migrated
   209 ± 26%   +3272.3%   7059 ±  3%  cpuidle.C1E-IVT.usage
   228 ± 42%+686.7%   1799 ± 14%  numa-meminfo.node0.Writeback
 14633 ±  5%   +7573.2%1122849 ±  1%  cpuidle.C1-IVT.usage
 0 ±  0%  +Inf%   1045 ±  2%  proc-vmstat.pgmigrate_success
 29708 ±  2%   +5720.0%1729051 ±  1%  time.voluntary_context_switches
 55663 ±  0%+776.9% 488081 ±  0%  cpuidle.C6-IVT.usage
56 ± 42%+718.8%464 ± 11%  numa-vmstat.node0.nr_writeback
   535 ± 29%+334.4%   2325 ± 10%  meminfo.Writeback
   129 ± 30%+295.6%511 ±  4%  proc-vmstat.nr_writeback
 59.25 ±  5% -74.2%  15.26 ±  3%  turbostat.CPU%c6
  2.58 ±  8% -74.5%   0.66 ± 11%  turbostat.Pkg%pc2
 1.551e+08 ± 14%+233.4%  5.171e+08 ±  4%  cpuidle.C1-IVT.time
 32564 ± 24%+208.1% 100330 ±  5%  softirqs.RCU
 61.05 ±  0%+214.0% 191.70 ±  0%  time.system_time
60 ± 32%+165.7%160 ± 16%  numa-vmstat.node1.nr_writeback
 2 ±  0%+200.0%  6 ±  0%  vmstat.procs.r
  3057 ±  2%+166.1%   8136 ± 22%  numa-vmstat.node0.nr_mapped
 12240 ±  2%+165.9%  32547 ± 22%  numa-meminfo.node0.Mapped
  6324 ±  3%+148.4%  15709 ±  0%  proc-vmstat.nr_mapped
   

Re: performance changes on 4400755e: 200.0% fsmark.files_per_sec, -18.1% fsmark.files_per_sec, and few more

2015-03-25 Thread Yuanhan Liu
On Wed, Mar 25, 2015 at 02:03:59PM +1100, NeilBrown wrote:
> On Wed, 18 Mar 2015 13:00:30 +0800 Yuanahn Liu 
> wrote:
> 
> > Hi,
> > 
> > FYI, we noticed performance changes on `fsmark.files_per_sec' by 
> > 4400755e356f9a2b0b7ceaa02f57b1c7546c3765:
> > 
> > > commit 4400755e356f9a2b0b7ceaa02f57b1c7546c3765
> > > Author: NeilBrown 
> > > AuthorDate: Thu Feb 26 12:47:56 2015 +1100
> > > Commit: NeilBrown 
> > > CommitDate: Wed Mar 4 13:40:19 2015 +1100
> > > 
> > > md/raid5: allow the stripe_cache to grow and shrink.
> 
> Thanks a lot for this testing!!! I was wondering how I could do some proper
> testing of this patch, and you've done it for me :-)

Welcome!

> 
> The large number of improvements is very encouraging - that is what I was
> hoping for of course.
> 
> The few regressions could be a concern.  I note that are all NoSync.
> That seems to suggest that they could just be writing more data.

It's not a time-based test, but a size-based one:

> >   40G, 30G, 120G: means the total test size

Hence, I doubt it is writing more data.


> i.e. the data is written a bit earlier (certainly possible) so it happens to
> introduce more delay 
> 
> I guess I'm not really sure how to interpret NoSync results, and suspect that
> poor NoSync result don't really reflect much on the underlying block device.
> Could that be right?

Sorry, I'm not quite sure I follow you. Poor NoSync results? Do you mean
the small numbers like 63.133, 57.600? They are in units of
files_per_sec, and the file size is 4M, so 63.133 files/s works out to
roughly 250 MB/s, which is not that bad in this case, as it's a
3-hard-disk RAID5.

> > 3   8.1   63.133 ±0.5%   3   9.2   55.633 ±0.2%  -11.9%  ivb44/fsmark/1x-1t-3HDD-RAID5-btrfs-4M-120G-NoSync

Here are a few iostat samples from 26089f4902595a2f64c512066af07af6e82eb096
of the above test:

avg-cpu:  %user   %nice %system %iowait  %steal   %idle
           0.00    0.00    0.63    1.67    0.00   97.70

Device:  rrqm/s    wrqm/s   r/s     w/s   rkB/s      wkB/s  avgrq-sz   avgqu-sz  await  r_await  w_await  svctm  %util
sdb        0.00  30353.00  0.00  240.00    0.00  121860.00   1015.50       1.29   5.35     0.00     5.35   3.50  83.90
sdc        0.00  30353.00  0.00  241.00    0.00  122372.00   1015.54       0.66   2.74     0.00     2.74   2.53  60.90
sda        0.00  30353.00  0.00  242.00    0.00  122884.00   1015.57       1.29   5.36     0.00     5.36   3.52  85.20
md0        0.00      0.00  0.00  956.00    0.00  244736.00    512.00  227231.39   0.00     0.00     0.00   1.05 100.00

avg-cpu:  %user   %nice %system %iowait  %steal   %idle
           0.02    0.00    0.69    1.69    0.00   97.60

Device:  rrqm/s    wrqm/s   r/s     w/s   rkB/s      wkB/s  avgrq-sz   avgqu-sz  await  r_await  w_await  svctm  %util
sdb        0.00  30988.00  0.00  247.00    0.00  125444.00   1015.74       1.77   7.17     0.00     7.17   4.02  99.40
sdc        0.00  30988.00  0.00  245.00    0.00  124420.00   1015.67       1.19   4.82     0.00     4.82   3.67  89.90
sda        0.00  30988.00  0.00  247.00    0.00  125444.00   1015.74       0.65   2.65     0.00     2.65   2.54  62.70
md0        0.00      0.00  0.00  976.00    0.00  249856.00    512.00  228206.37   0.00     0.00     0.00   1.02 100.00

avg-cpu:  %user   %nice %system %iowait  %steal   %idle
           0.00    0.00    0.61    1.67    0.00   97.72

Device:  rrqm/s    wrqm/s   r/s     w/s   rkB/s      wkB/s  avgrq-sz   avgqu-sz  await  r_await  w_await  svctm  %util
sdb        0.00  29718.00  0.00  235.00    0.00  119300.00   1015.32       1.35   5.71     0.00     5.71   3.71  87.20
sdc        0.00  29718.00  0.00  236.00    0.00  119812.00   1015.36       1.19   5.06     0.00     5.06   3.43  80.90
sda        0.00  29718.00  0.00  235.00    0.00  119300.00   1015.32       0.87   3.69     0.00     3.69   2.99  70.20
md0        0.00      0.00  0.00  936.00    0.00  239616.00    512.00  229157.33   0.00     0.00     0.00   1.07 100.00


And a few iostat samples of 4400755e356f9a2b0b7ceaa02f57b1c7546c3765 (the
first bad commit):

avg-cpu:  %user   %nice %system %iowait  %steal   %idle
           0.02    0.00    1.09    1.54    0.00   97.35

Device:  rrqm/s    wrqm/s   r/s     w/s   rkB/s      wkB/s  avgrq-sz   avgqu-sz   await  r_await  w_await  svctm  %util
sdb        1.00  27677.00  1.00  206.00    8.00  100516.00    971.25      27.40  130.56   196.00   130.24   4.72  97.70
sdc        0.00  27677.00  0.00  207.00    0.00  101028.00    976.12      27.05  129.43     0.00   129.43   4.61  95.50
sda        5.00  27677.00  1.00  211.00   16.00  102984.00    971.70      26.61  127.00   201.00   126.64   4.50  95.50
md0        0.00      0.00  0.00  824.00    0.0

[LKP] [sched] WARNING: CPU: 0 PID: 13608 at kernel/sched/core.c:7323 __might_sleep+0xbd/0xd0()

2014-12-02 Thread Yuanhan Liu
FYI, we noticed the below changes on

git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git master
commit 8eb23b9f35aae413140d3fda766a98092c21e9b0 ("sched: Debug nested sleeps")


+-+++
| | 26cabd3125 | 8eb23b9f35 |
+-+++
| boot_successes  | 10 | 15 |
| boot_failures   | 0  | 25 |
| WARNING:at_kernel/sched/core.c:__might_sleep()  | 0  | 5  |
| backtrace:SyS_read  | 0  | 5  |
| backtrace:vfs_read  | 0  | 5  |
| WARNING:at_kernel/sched/core.c:#__might_sleep() | 0  | 20 |
| backtrace:SyS_io_getevents  | 0  | 10 |
| backtrace:read_events   | 0  | 7  |
| backtrace:kauditd_thread| 0  | 10 |
+-+++



<4>[  839.494114] [ cut here ]
<4>[  839.494131] WARNING: CPU: 0 PID: 13608 at 
/kbuild/src/lkp/kernel/sched/core.c:7323 __might_sleep+0xbd/0xd0()
<4>[  839.494137] do not call blocking ops when !TASK_RUNNING; state=1 set at 
[] prepare_to_wait+0x2f/0x90
<4>[  839.494256] Modules linked in: tun ipmi_watchdog loop btrfs xor raid6_pq 
sg sd_mod ast snd_pcm syscopyarea sysfillrect snd_timer sysimgblt snd ie6xx_wdt 
ttm i2c_isch drm_kms_helper soundcore drm ahci libahci pcspkr i2c_ismt lpc_sch 
ipmi_si libata shpchp ipmi_msghandler acpi_cpufreq
<4>[  839.494264] CPU: 0 PID: 13608 Comm: fanotify01 Not tainted 
3.18.0-rc4-next-20141117 #1
<4>[  839.494266] Hardware name: To be filled by O.E.M. To be filled by 
O.E.M./Double Cove , BIOS BWDEXT.86B.000.012.D127 10/08/2012
<4>[  839.494273]  81b5ebb8 88023cf37d18 81892f54 
64026402
<4>[  839.494277]  88023cf37d68 88023cf37d58 8107047a 
88023cf37db8
<4>[  839.494281]  81b5f5e8 0061  
6000
<4>[  839.494285] Call Trace:
<4>[  839.494315]  [] dump_stack+0x4c/0x65
<4>[  839.494323]  [] warn_slowpath_common+0x8a/0xc0
<4>[  839.494327]  [] warn_slowpath_fmt+0x46/0x50
<4>[  839.494333]  [] ? prepare_to_wait+0x2f/0x90
<4>[  839.494337]  [] ? prepare_to_wait+0x2f/0x90
<4>[  839.494341]  [] __might_sleep+0xbd/0xd0
<4>[  839.494348]  [] mutex_lock+0x24/0x50
<4>[  839.494354]  [] fanotify_read+0xd5/0x620
<4>[  839.494370]  [] ? selinux_file_permission+0xa6/0x120
<4>[  839.494374]  [] ? wait_woken+0xc0/0xc0
<4>[  839.494381]  [] __vfs_read+0x18/0x50
<4>[  839.494385]  [] vfs_read+0x8a/0x140
<4>[  839.494390]  [] SyS_read+0x46/0xb0
<4>[  839.494403]  [] system_call_fastpath+0x12/0x17
<4>[  839.494409] ---[ end trace 5a2207521429f889 ]---
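
The flagged pattern, in miniature (an illustrative sketch, not the
fanotify code itself; wq and some_mutex are placeholder names):

	DEFINE_WAIT(wait);

	/* task state becomes TASK_INTERRUPTIBLE, i.e. != TASK_RUNNING */
	prepare_to_wait(&wq, &wait, TASK_INTERRUPTIBLE);

	/* mutex_lock() may sleep; with "sched: Debug nested sleeps"
	 * applied, its might_sleep() check fires the warning above */
	mutex_lock(&some_mutex);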



--yliu
[0.00] Initializing cgroup subsys cpuset
[0.00] Initializing cgroup subsys cpu
[0.00] Linux version 3.18.0-rc4-next-20141117 (kbuild@roam) (gcc 
version 4.9.1 (Debian 4.9.1-19) ) #1 SMP Tue Nov 18 11:46:52 CST 2014
[0.00] Command line: user=lkp 
job=/lkp/scheduled/lkp-a06/cyclic_ltp-performance-syscalls-x86_64-rhel-HEAD-efefb5ca5da52f7537c7ced03d6e53408f13a26e-0.yaml
 ARCH=x86_64 
BOOT_IMAGE=/kernel/x86_64-rhel/efefb5ca5da52f7537c7ced03d6e53408f13a26e/vmlinuz-3.18.0-rc4-next-20141117
 kconfig=x86_64-rhel commit=efefb5ca5da52f7537c7ced03d6e53408f13a26e 
branch=next/master root=/dev/ram0 max_uptime=3600 
RESULT_ROOT=/result/lkp-a06/ltp/performance-syscalls/debian-x86_64.cgz/x86_64-rhel/efefb5ca5da52f7537c7ced03d6e53408f13a26e/0
 ip=lkp-a06::dhcp earlyprintk=ttyS0,115200 debug apic=debug 
sysrq_always_enabled rcupdate.rcu_cpu_stall_timeout=100 panic=-1 
softlockup_panic=1 nmi_watchdog=panic oops=panic load_ramdisk=2 
prompt_ramdisk=0 console=ttyS0,115200 console=tty0 vga=normal rw
[0.00] e820: BIOS-provided physical RAM map:
[0.00] BIOS-e820: [mem 0x0100-0x0009e3ff] usable
[0.00] BIOS-e820: [mem 0x0009e400-0x0009] reserved
[0.00] BIOS-e820: [mem 0x000e-0x000f] reserved
[0.00] BIOS-e820: [mem 0x0010-0xbf67afff] usable
[0.00] BIOS-e820: [mem 0xbf67b000-0xbfb3dfff] ACPI NVS
[0.00] BIOS-e820: [mem 0xbfb3e000-0xbfc50fff] reserved
[0.00] BIOS-e820: [mem 0xbfc51000-0xbfc51fff] ACPI NVS
[0.00] BIOS-e820: [mem 0xbfc52000-0xbfc62fff] reserved
[0.00] BIOS-e820: [mem 0xbfc63000-0xbfc65fff] ACPI NVS
[0.00] BIOS-e820: [mem 0xbfc66000-0xbfc83fff] reserved
[

[LKP] [drm/fb] f5ef139cbe5: *ERROR* not all connectors configured

2014-12-02 Thread Yuanhan Liu
FYI, we noticed the below changes on

git://people.freedesktop.org/~airlied/linux.git radeon-mst-hacks
commit f5ef139cbe5dbd755dab3706022d7147800099a8 ("drm/fb: add support for tiled monitor configurations.")


testbox/testcase/testparams: vm-kbuild-1G/xfstests/4HDD-btrfs-generic-113

9cf13203b1fd7cc3  f5ef139cbe5dbd755dab370602  
  --  
   fail:runs  %reproductionfail:runs
   | | |
   :10 100%  10:10
kmsg.drm:drm_setup_crtcs[drm_kms_helper]]*ERROR*not_all_connectors_configured

vm-kbuild-1G: qemu-system-x86_64 -enable-kvm -cpu Haswell,+smep,+smap
Memory: 1G




To reproduce:

apt-get install ruby ruby-oj
git clone 
git://git.kernel.org/pub/scm/linux/kernel/git/wfg/lkp-tests.git
cd lkp-tests
bin/setup-local job.yaml # the job file attached in this email
bin/run-local   job.yaml





--yliu
---
testcase: xfstests
default_monitors:
  wait: pre-test
  vmstat: 
default_watchdogs:
  watch-oom: 
  watchdog: 
cpufreq_governor: 
model: qemu-system-x86_64 -enable-kvm -cpu Haswell,+smep,+smap
nr_vm: 16
nr_cpu: 2
memory: 1G
disk_type: virtio-scsi
rootfs: debian-x86_64.cgz
hdd_partitions: "/dev/sda /dev/sdb /dev/sdc /dev/sdd"
swap_partitions: "/dev/sde"
disk: 4HDD
fs:
- btrfs
xfstests:
  test:
  - generic-113
enqueue_time: 2014-11-26 13:11:19.191840759 +08:00
branch: linux-devel/devel-hourly-2014112611
commit: a7a1168f6b45bb0d29a20c942e1ca300ef54dc6e
repeat_to: 2
testbox: vm-kbuild-1G-3
tbox_group: vm-kbuild-1G
kconfig: x86_64-rhel
kernel: 
"/kernel/x86_64-rhel/a7a1168f6b45bb0d29a20c942e1ca300ef54dc6e/vmlinuz-3.18.0-rc6-wl-ath-ga7a1168f"
user: lkp
queue: rand
result_root: 
"/result/vm-kbuild-1G/xfstests/4HDD-btrfs-generic-113/debian-x86_64.cgz/x86_64-rhel/a7a1168f6b45bb0d29a20c942e1ca300ef54dc6e/0"
job_file: 
"/lkp/scheduled/vm-kbuild-1G-3/rand_xfstests-4HDD-btrfs-generic-113-debian-x86_64.cgz-x86_64-rhel-a7a1168f6b45bb0d29a20c942e1ca300ef54dc6e-1.yaml"
dequeue_time: 2014-11-26 13:25:10.605471464 +08:00
job_state: finished
loadavg: 96.37 33.89 12.20 1/593 3339
start_time: '1416979556'
end_time: '1416979727'
version: "/lkp/lkp/.src-20141126-053142"
mkfs -t btrfs /dev/sdd
mkfs -t btrfs /dev/sdc
mkfs -t btrfs /dev/sdb
mkfs -t btrfs /dev/sda
mount -t btrfs /dev/sda /fs/sda
mount -t btrfs /dev/sdb /fs/sdb
mount -t btrfs /dev/sdc /fs/sdc
mount -t btrfs /dev/sdd /fs/sdd
export TEST_DIR=/fs/sda
export TEST_DEV=/dev/sda
export FSTYP=btrfs
export SCRATCH_MNT=/fs/scratch
mkdir /fs/scratch -p
export SCRATCH_DEV_POOL="/dev/sdb /dev/sdc /dev/sdd"
./check generic/113


[LKP] [net] 4ed2d765dfa:

2014-11-24 Thread Yuanhan Liu
FYI, we noticed the below changes on

commit 4ed2d765dfaccff5ebdac68e2064b59125033a3b ("net-timestamp: TCP timestamping")


testbox/testcase/testparams: vm-vp-2G/ltp/syscalls

e7fd2885385157d4  4ed2d765dfaccff5ebdac68e20
----------------  --------------------------
       fail:runs  %reproduction  fail:runs
              :5           100%        5:5  ltp.recv01.fail
              :5           100%        5:5  ltp.recvfrom01.fail
              :5           100%        5:5  ltp.recvmsg01.fail
              :5            20%        1:5  kmsg.APIC_calibration_not_consistent_with_PM-Timer:#ms_instead_of#ms
              :5            20%        1:5  kmsg.hrtimer:interrupt_took#ns
              :5            20%        1:5  kmsg.TINFO:mlock_failed:errno=ENOMEM(#):Cannot_allocate_memory
              :5            20%        1:5  kmsg.estcases/kernel/syscalls/getgroups/../utils/compat_16.h::#-bit_version_of_getgroups()is_not_supported_on_your_platform

testbox/testcase/testparams: nhm-white/ltp/syscalls

e7fd2885385157d4  4ed2d765dfaccff5ebdac68e20
----------------  --------------------------
             :10           100%        5:5  ltp.recv01.fail
             :10           100%        5:5  ltp.recvfrom01.fail
             :10           100%        5:5  ltp.recvmsg01.fail

vm-vp-2G: qemu-system-x86_64 -enable-kvm -cpu Penryn
Memory: 2G

nhm-white: Nehalem
Memory: 6G




To reproduce:

  apt-get install ruby ruby-oj
  git clone git://git.kernel.org/pub/scm/linux/kernel/git/wfg/lkp-tests.git
  cd lkp-tests
  bin/setup-local job.yaml # the job file attached in this email
  bin/run-local   job.yaml


--yliu
---
testcase: ltp
default_monitors:
  wait: pre-test
  vmstat: 
model: qemu-system-x86_64 -enable-kvm -cpu Penryn
nr_vm: 4
nr_cpu: 4
memory: 2G
rootfs: debian-x86_64.cgz
hdd_partitions: "/dev/vdb /dev/vdc /dev/vdd /dev/vde /dev/vdf"
swap_partitions: "/dev/vda"
ltp:
  test:
  - syscalls
enqueue_time: 2014-10-02 10:07:25.199207485 +08:00
branch: net/master
commit: 0754476419f127eb8c294b17b6fc8b6787ded1e2
testbox: vm-vp-2G-3
kconfig: x86_64-rhel
kernel: 
"/kernel/x86_64-rhel/0754476419f127eb8c294b17b6fc8b6787ded1e2/vmlinuz-3.17.0-rc6-00145-g0754476"
user: lkp
queue: rand
result_root: 
"/result/vm-vp-2G/ltp/syscalls/debian-x86_64.cgz/x86_64-rhel/0754476419f127eb8c294b17b6fc8b6787ded1e2/0"
job_file: 
"/lkp/scheduled/vm-vp-2G-3/rand_ltp-syscalls-debian-x86_64.cgz-x86_64-rhel-0754476419f127eb8c294b17b6fc8b6787ded1e2-0.yaml"
dequeue_time: 2014-10-02 11:55:51.761588446 +08:00
job_state: finished
loadavg: 4.39 5.61 2.69 1/85 10461
start_time: '141188'
end_time: '141759'
version: "/lkp/lkp/.src-20141001-203321"
./runltp -f syscalls


[LKP] [nohz] 2a16fc93d2c:

2014-11-23 Thread Yuanhan Liu
FYI, we noticed the below changes on

commit 2a16fc93d2c9568e16d45db77c7b5f15e1921cf1 ("nohz: Avoid tick's double reprogramming in highres mode")


testbox/testcase/testparams: snb-drag/piglit/performance-igt-001

b5e995e671d8e4d7  2a16fc93d2c9568e16d45db77c
----------------  --------------------------
       fail:runs  %reproduction  fail:runs
              :5           100%        5:5  kmsg.drm:__gen6_gt_force_wake_get]*ERROR*Timed_out_waiting_for_forcewake_to_ack_request
              :5           100%        5:5  piglit.igt/gem_ctx_exec/reset-pin-leak.dmesg-warn

snb-drag: Sandy Bridge
Memory: 6G


<3>[   90.915459] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   90.925094] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   90.934725] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   90.944347] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   90.953956] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   90.963559] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   90.973173] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   90.982793] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   90.992405] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   91.002008] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   91.011618] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   91.021222] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   91.030825] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   91.040430] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   91.050016] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   91.059593] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   91.069152] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.


To reproduce:

apt-get install ruby ruby-oj
git clone 
git://git.kernel.org/pub/scm/linux/kernel/git/wfg/lkp-tests.git
cd lkp-tests
bin/setup-local job.yaml # the job file attached in this email
bin/run-local   job.yaml




--yliu
---
testcase: piglit
default_monitors:
  wait: pre-test
  vmstat: 
default_watchdogs:
  watch-oom: 
  watchdog: 
cpufreq_governor:
- performance
commit: 9bdebfefe1de2b6fa7e193c10411ef209b0ebc96
model: Sandy Bridge
memory: 6G
hdd_partitions: "/dev/disk/by-id/ata-ST3750528AS_6VP2W0PA-part5 
/dev/disk/by-id/ata-ST3750528AS_6VP2W0PA-part6
  /dev/disk/by-id/ata-ST3750528AS_6VP2W0PA-part7 
/dev/disk/by-id/ata-ST3750528AS_6VP2W0PA-part8
  /dev/disk/by-id/ata-ST3750528AS_6VP2W0PA-part9 
/dev/disk/by-id/ata-ST3750528AS_6VP2W0PA-part10"
swap_partitions: 
rootfs_partition: "/dev/disk/by-id/ata-ST3750528AS_6VP2W0PA-part2"
timeout: 30m
piglit:
  group:
  - igt-001
enqueue_time: 2014-10-27 03:51:37.871425766 +08:00
testbox: snb-drag
tbox_group: snb-drag
kconfig: x86_64-rhel
head_commit: 9bdebfefe1de2b6fa7e193c10411ef209b0ebc96
base_commit: cac7f2429872d3733dc3f9915857b1691da2eb2f
branch: linux-devel/devel-hourly-2014103002
kernel: 
"/kernel/x86_64-rhel/9bdebfefe1de2b6fa7e193c10411ef209b0ebc96/vmlinuz-3.18.0-rc2-g9bdebfe"
user: lkp
queue: cyclic
rootfs: debian-x86_64.cgz
result_root: 
"/result/snb-drag/piglit/performance-igt-001/debian-x86_64.cgz/x86_64-rhel/9bdebfefe1de2b6fa7e193c10411ef209b0ebc96/0"
job_file: 
"/lkp/scheduled/snb-drag/cyclic_piglit-performance-igt-001-x86_64-rhel-HEAD-9bdebfefe1de2b6fa7e193c10411ef209b0ebc96-0.yaml"
dequeue_time: 2014-10-30 03:46:50.534182476 +08:00
job_state: finished
loadavg: 0.62 0.46 0.25 1/96 9645
start_time: '1414612069'
end_time: '1414612536'
version: "/lkp/lkp/.src-20141029-214343"
echo performance > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
echo performance > /sys/devices/system/cpu/cpu1/cpufreq/scaling_governor
echo performance > /sys/devices/system/cpu/cpu2/cpufreq/scaling_governor
echo performance > /sys/devices/system/cpu/cpu3/cpufreq/scaling_governor
piglit run igt -t igt/drv_hangman/error-state-capture-bsd 
/lkp/lkp/src/tmp/piglit-results-0
piglit summary console /lkp/lkp/src/tmp/piglit-results-0
piglit run igt -t igt/gem_reset_stats/reset-co

[LKP] [x86, irq, ACPI] 5fcb864ef90: -3.3%(vm-scalability.throughput) +12.9%(turbostat.%c0)

2014-11-23 Thread Yuanhan Liu

Hi,

We noticed the below changes (NOTE: I'm not sure whether the bisect is
correct or not; I'm reporting it just FYI) on

git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git master
commit 5fcb864ef90df093d964171539c87ffa0ab49f0f ("x86, irq, ACPI: Implement interfaces to support ACPI based IOAPIC hot-removal")


testbox/testcase/testparams: 
lkp-nex06/vm-scalability/performance-300s-small-allocs-mt

ff6213974cd90e1e  5fcb864ef90df093d964171539
----------------  --------------------------
       fail:runs  %reproduction  fail:runs
              :5            20%        1:5  kmsg.CE:hpet_increased_min_delta_ns_to#nsec
 %stddev %change %stddev
 \  |\  
315326 ±  0%  -3.3% 304841 ±  0%  vm-scalability.throughput
 11.82 ±  0% +12.9%  13.34 ±  0%  turbostat.%c0
  1.34 ±  0%  +9.4%   1.46 ±  0%  turbostat.GHz
12 ± 47% +78.7% 21 ± 32%  sched_debug.cfs_rq[29]:/.load
   113 ± 26% +86.3%212 ± 28%  
sched_debug.cfs_rq[39]:/.tg_load_contrib
   106 ± 28% +89.5%202 ± 30%  
sched_debug.cfs_rq[39]:/.blocked_load_avg
66 ± 23%+120.6%145 ± 29%  
sched_debug.cfs_rq[40]:/.blocked_load_avg
70 ± 23%+113.0%150 ± 29%  
sched_debug.cfs_rq[40]:/.tg_load_contrib
 10145 ± 23% -38.3%   6255 ± 35%  numa-meminfo.node1.AnonPages
  2535 ± 23% -38.3%   1564 ± 35%  numa-vmstat.node1.nr_anon_pages
   605 ± 16% -22.0%471 ±  5%  
sched_debug.cpu#58.nr_uninterruptible
 58904 ±  7% -13.8%  50762 ±  7%  
sched_debug.cfs_rq[0]:/.min_vruntime
481299 ±  8% -13.4% 416975 ±  7%  sched_debug.cpu#0.sched_count
409009 ± 11% -15.7% 344638 ±  2%  sched_debug.cpu#4.sched_count
 52022 ± 10% -16.1%  43623 ±  2%  
sched_debug.cfs_rq[4]:/.min_vruntime
68 ±  3% -12.2% 60 ±  3%  
sched_debug.cfs_rq[4]:/.tg_runnable_contrib
  3175 ±  3% -12.1%   2791 ±  3%  
sched_debug.cfs_rq[4]:/.avg->runnable_avg_sum
 50060 ±  6% -12.3%  43914 ±  4%  
sched_debug.cfs_rq[29]:/.min_vruntime
  1751 ± 12% -15.5%   1480 ±  6%  
sched_debug.cpu#63.nr_uninterruptible
  2967 ±  6% -13.7%   2562 ±  4%  
sched_debug.cfs_rq[37]:/.avg->runnable_avg_sum
63 ±  6% -13.8% 55 ±  4%  
sched_debug.cfs_rq[37]:/.tg_runnable_contrib
  1.07 ±  2% -10.9%   0.95 ±  3%  
perf-profile.cpu-cycles.tick_nohz_restart.tick_nohz_idle_exit.cpu_startup_entry.start_secondary
  1.64 ±  2%  -8.4%   1.50 ±  4%  
perf-profile.cpu-cycles.__tick_nohz_idle_enter.tick_nohz_idle_enter.cpu_startup_entry.start_secondary
 35173 ±  5%  -9.1%  31983 ±  3%  
sched_debug.cfs_rq[56]:/.min_vruntime
  1.41 ±  2%  -8.3%   1.29 ±  4%  
perf-profile.cpu-cycles.tick_nohz_stop_sched_tick.__tick_nohz_idle_enter.tick_nohz_idle_enter.cpu_startup_entry.start_secondary
  1.63 ±  1%  -9.3%   1.48 ±  3%  
perf-profile.cpu-cycles.tick_nohz_idle_exit.cpu_startup_entry.start_secondary
 45161 ± 11% -12.8%  39358 ±  4%  
sched_debug.cfs_rq[25]:/.min_vruntime
 39201 ±  5% +17.3%  45969 ± 18%  
sched_debug.cfs_rq[8]:/.min_vruntime
  21071502 ±  0%  -3.3%   20379730 ±  0%  time.minor_page_faults
   299 ±  0%  -3.1%290 ±  0%  time.user_time
  21763267 ±  0%  -3.3%   21055329 ±  0%  time.voluntary_context_switches
142199 ±  0%  -3.1% 137732 ±  0%  vmstat.system.cs
   737 ±  0%  -2.1%721 ±  1%  time.system_time
   341 ±  0%  -2.5%333 ±  0%  time.percent_of_cpu_this_job_got

lkp-nex06: Nehalem-EX
Memory: 64G




   turbostat.%c0

14 ++---+
   |O   |
   |  O  O   O  O  O|
  13.5 O+   O O   O  O  O O  O  |
   |  O   OO   O O  |
   | O OO
13 ++   |
   ||
  12.5 ++   |
   ||
   ||
12 *+.*...*.. .*... |
   | *..*. *..  .*...*..*..*..*...  .*..*   |
   |  *.  *.|
  11.5 ++--

[LKP] [x86, PCI, MSI] BUG: unable to handle kernel NULL pointer dereference at 0000000000000002

2014-11-16 Thread Yuanhan Liu
FYI, we noticed the below changes on

https://github.com/jiangliu/linux.git irqdomain/p2v7
commit 515b463a5a4c2bac0593c6d88a475a32d65f4bcc ("x86, PCI, MSI: Use hierarchy irqdomain to manage MSI interrupts")


+--+++
|  | dadb7cd295 | 515b463a5a |
+--+++
| boot_successes   | 6  | 1  |
| early-boot-hang  | 1  ||
| boot_failures| 0  | 4  |
| BUG:unable_to_handle_kernel  | 0  | 4  |
| Oops | 0  | 4  |
| RIP:init_irq_alloc_info  | 0  | 4  |
| Kernel_panic-not_syncing:Fatal_exception | 0  | 4  |
| backtrace:init_irq_alloc_info| 0  | 4  |
| backtrace:vp_find_vqs| 0  | 4  |
| backtrace:init_vq| 0  | 4  |
| backtrace:init   | 0  | 4  |
| backtrace:kernel_init_freeable   | 0  | 4  |
+--+++


[   20.962013] BUG: unable to handle kernel NULL pointer dereference at 
0002
[   20.964023] IP: [] init_irq_alloc_info+0x13/0x1b
[   20.964023] PGD 0 
[   20.964023] Oops: 0002 [#1] SMP DEBUG_PAGEALLOC
[   20.964023] Modules linked in:
[   20.964023] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.18.0-rc4-g4ae16b6 
#1457
[   20.964023] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[   20.964023] task: 8801289c0010 ti: 8801289c4000 task.ti: 
8801289c4000
[   20.964023] RIP: 0010:[]  [] 
init_irq_alloc_info+0x13/0x1b
[   20.964023] RSP: :8801289c7928  EFLAGS: 00010246
[   20.964023] RAX:  RBX: 0002 RCX: 000a
[   20.964023] RDX: 0002 RSI:  RDI: 0002
[   20.964023] RBP: 8801289c7928 R08: 0008 R09: 
[   20.964023] R10: 8800b8399f80 R11: 0023 R12: 8800db055000
[   20.964023] R13: 8800d1ee8f98 R14: 880129cc3f80 R15: 83e36800
[   20.964023] FS:  () GS:88012a20() 
knlGS:
[   20.964023] CS:  0010 DS:  ES:  CR0: 8005003b
[   20.964023] CR2: 0002 CR3: 03e1a000 CR4: 06f0
[   20.964023] Stack:
[   20.964023]  8801289c7958 810770be 8801289c7980 
0002
[   20.964023]  83e36840 8800db055098 8801289c79d8 
8110fd29
[   20.964023]    8800db055000 
0011
[   20.964023] Call Trace:
[   20.964023]  [] pci_msi_prepare+0x2d/0x54
[   20.964023]  [] msi_domain_alloc_irqs+0x4a/0x162
[   20.964023]  [] ? dmar_find_matched_drhd_unit+0xf7/0x10b
[   20.964023]  [] pci_msi_domain_alloc_irqs+0x15/0x17
[   20.964023]  [] native_setup_msi_irqs+0x61/0x6c
[   20.964023]  [] arch_setup_msi_irqs+0xf/0x11
[   20.964023]  [] pci_msi_setup_msi_irqs+0x45/0x4c
[   20.964023]  [] pci_enable_msix+0x1d8/0x2d0
[   20.964023]  [] pci_enable_msix_range+0x31/0x50
[   20.964023]  [] vp_request_msix_vectors+0xb6/0x1f8
[   20.964023]  [] vp_try_to_find_vqs+0xae/0x43e
[   20.964023]  [] ? vsnprintf+0x374/0x3ad
[   20.964023]  [] vp_find_vqs+0x32/0x8d
[   20.964023]  [] init_vq+0x14f/0x1f8
[   20.964023]  [] virtblk_probe+0xf3/0x501
[   20.964023]  [] ? sysfs_do_create_link_sd+0x78/0xa8
[   20.964023]  [] ? vp_set_status+0x25/0x27
[   20.964023]  [] virtio_dev_probe+0xbd/0x104
[   20.964023]  [] driver_probe_device+0xb0/0x1d7
[   20.964023]  [] __driver_attach+0x62/0x85
[   20.964023]  [] ? __device_attach+0x3d/0x3d
[   20.964023]  [] bus_for_each_dev+0x6f/0x89
[   20.964023]  [] driver_attach+0x1e/0x20
[   20.964023]  [] bus_add_driver+0x110/0x1cf
[   20.964023]  [] ? nbd_init+0x39c/0x39c
[   20.964023]  [] driver_register+0x8f/0xcc
[   20.964023]  [] ? nbd_init+0x39c/0x39c
[   20.964023]  [] register_virtio_driver+0x2b/0x2d
[   20.964023]  [] init+0x5d/0x8b
[   20.964023]  [] do_one_initcall+0xee/0x17e
[   20.964023]  [] kernel_init_freeable+0x1ec/0x274
[   20.964023]  [] ? rest_init+0xcc/0xcc
[   20.964023]  [] kernel_init+0xe/0xdf
[   20.964023]  [] ret_from_fork+0x7c/0xb0
[   20.964023]  [] ? rest_init+0xcc/0xcc
[   20.964023] Code: eb 05 bb da ff ff ff 48 83 c4 28 89 d8 5b 41 5c 41 5d 41 
5e 41 5f 5d c3 0f 1f 44 00 00 55 48 89 fa b9 0a 00 00 00 31 c0 48 89 e5  ab 
5d 48 89 72 08 c3 0f 1f 44 00 00 55 48 85 f6 b9 0a 00 00 
[   20.964023] RIP  [] init_irq_alloc_info+0x13/0x1b
[   20.964023]  RSP 
[   20.964023] CR2: 0002
[   20.964023] ---[ end trace 21200aca189fb8f5 ]---
[   20.964023] Kernel panic - not syncing: Fatal exception
[   20.964023] Kernel Offse

[LKP] [LSM] Kernel panic - not syncing: No working init found.

2014-11-16 Thread Yuanhan Liu
FYI, we noticed the below changes (TBH, I don't know whether the bisect
is correct or not; sorry for the noise if not) on

git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git lsm/stacking
commit 58c4f9e3be81a85839ea229b1dd36bf55232d440 ("LSM: Refactor existing LSM stacking")


++++
|| c9979f3c6e | 58c4f9e3be |
++++
| boot_successes | 15 | 0  |
| early-boot-hang| 1  ||
| boot_failures  | 0  | 15 |
| Kernel_panic-not_syncing:No_working_init_found | 0  | 15 |
| backtrace:panic| 0  | 15 |
++++


[3.437279] Starting init: /sbin/init exists but couldn't execute it (error 
-12)
[3.438655] Starting init: /etc/init exists but couldn't execute it (error 
-13)
[3.440136] Starting init: /bin/sh exists but couldn't execute it (error -12)
[3.441487] Kernel panic - not syncing: No working init found.  Try passing 
init= option to kernel. See Linux Documentation/init.txt for guidance.
[3.443352] CPU: 0 PID: 1 Comm: swapper Not tainted 3.18.0-rc4-g49aba53 #1949
[3.443352]   f783d540 80017f88 8138c3bd 80017fa0 8138b30b 815e1f40 
f783d540
[3.443352]  815e1f40  80017fac 81389523 8152ab4d 80016000 813918e0 
81389474
[3.443352]        007b 
007b
[3.443352] Call Trace:
[3.443352]  [<8138c3bd>] dump_stack+0x16/0x18
[3.443352]  [<8138b30b>] panic+0x86/0x19e
[3.443352]  [<81389523>] kernel_init+0xaf/0xb3
[3.443352]  [<813918e0>] ret_from_kernel_thread+0x20/0x30
[3.443352]  [<81389474>] ? rest_init+0xa2/0xa2
[3.443352] Kernel Offset: 0x0 from 0x8100 (relocation range: 
0x8000-0x947fdfff)

Elapsed time: 10



--yliu
early console in setup code
Probing EDD (edd=off to disable)... ok
[0.00] Linux version 3.18.0-rc4-g49aba53 (kbuild@lkp-hsx01) (gcc 
version 4.9.1 (Debian 4.9.1-19) ) #1949 Sat Nov 15 06:21:52 CST 2014
[0.00] e820: BIOS-provided physical RAM map:
[0.00] BIOS-e820: [mem 0x-0x0009fbff] usable
[0.00] BIOS-e820: [mem 0x0009fc00-0x0009] reserved
[0.00] BIOS-e820: [mem 0x000f-0x000f] reserved
[0.00] BIOS-e820: [mem 0x0010-0x13ffdfff] usable
[0.00] BIOS-e820: [mem 0x13ffe000-0x13ff] reserved
[0.00] BIOS-e820: [mem 0xfeffc000-0xfeff] reserved
[0.00] BIOS-e820: [mem 0xfffc-0x] reserved
[0.00] Notice: NX (Execute Disable) protection missing in CPU!
[0.00] Hypervisor detected: KVM
[0.00] e820: update [mem 0x-0x0fff] usable ==> reserved
[0.00] e820: remove [mem 0x000a-0x000f] usable
[0.00] e820: last_pfn = 0x13ffe max_arch_pfn = 0x100
[0.00] initial memory mapped: [mem 0x-0x027f]
[0.00] Base memory trampoline at [8009b000] 9b000 size 16384
[0.00] init_memory_mapping: [mem 0x-0x000f]
[0.00]  [mem 0x-0x000f] page 4k
[0.00] init_memory_mapping: [mem 0x1320-0x133f]
[0.00]  [mem 0x1320-0x133f] page 2M
[0.00] init_memory_mapping: [mem 0x1000-0x131f]
[0.00]  [mem 0x1000-0x131f] page 2M
[0.00] init_memory_mapping: [mem 0x0010-0x0fff]
[0.00]  [mem 0x0010-0x001f] page 4k
[0.00]  [mem 0x0020-0x0fff] page 2M
[0.00] init_memory_mapping: [mem 0x1340-0x13ffdfff]
[0.00]  [mem 0x1340-0x13df] page 2M
[0.00]  [mem 0x13e0-0x13ffdfff] page 4k
[0.00] BRK [0x01f22000, 0x01f22fff] PGTABLE
[0.00] BRK [0x01f23000, 0x01f23fff] PGTABLE
[0.00] RAMDISK: [mem 0x135e9000-0x13fe]
[0.00] ACPI: Early table checksum verification disabled
[0.00] ACPI: RSDP 0x000FD950 14 (v00 BOCHS )
[0.00] ACPI: RSDT 0x13FFE450 34 (v01 BOCHS  BXPCRSDT 0001 BXPC 
0001)
[0.00] ACPI: FACP 0x1380 74 (v01 BOCHS  BXPCFACP 0001 BXPC 
0001)
[0.00] ACPI: DSDT 0x13FFE490 0011A9 (v01 BXPC   BXDSDT   0001 INTL 
20100528)
[0.00] ACPI: FACS 0x1340 40
[0.00] ACPI: SSDT 0x13FFF7A0 000796 (v01 BOCHS  BXPCSSDT 0001 BXPC 
0001)
[0.00] ACPI: APIC 0x13FFF680 80 (v01 BOCHS  BXPCAPIC 0001 BXPC 
0001)
[0.00] ACPI: HPET 0x13FFF640 38 (v01 BOCHS  BXPCHPET 0001 BXPC 
0001)
[0.00] ACPI: Local APIC address 0xfee0
[ 

[LKP] [x86, mm] BUG: Bad page state in process swapper/0 pfn:02500

2014-11-16 Thread Yuanhan Liu
FYI, we noticed the below changes on

git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git x86/pmd-nx
commit 3622dcc2b4f4eaf23bae2511a30fc449d0e5f0d9 ("x86, mm: set NX across entire PMD at boot")


+-------------------------------------------+------------+------------+
|                                           | b23dc5a7cc | 3622dcc2b4 |
+-------------------------------------------+------------+------------+
| boot_successes                            | 4          | 0          |
| boot_failures                             | 0          | 19         |
| BUG:Bad_page_state_in_process             | 0          | 19         |
| BUG:Bad_page_map_in_process               | 0          | 14         |
| BUG:Bad_rss-counter_state_mm:#idx:val     | 0          | 2          |
| backtrace:free_reserved_area              | 0          | 19         |
| backtrace:free_init_pages                 | 0          | 19         |
| backtrace:mark_rodata_ro                  | 0          | 19         |
| backtrace:vm_munmap                       | 0          | 2          |
| backtrace:SyS_munmap                      | 0          | 2          |
| backtrace:do_execve                       | 0          | 12         |
| backtrace:SyS_execve                      | 0          | 12         |
| backtrace:do_group_exit                   | 0          | 10         |
| backtrace:SyS_exit_group                  | 0          | 10         |
| backtrace:vfs_read                        | 0          | 3          |
| backtrace:SyS_read                        | 0          | 3          |
| general_protection_fault                  | 0          | 3          |
| RIP:release_pages                         | 0          | 3          |
| Kernel_panic-not_syncing:Fatal_exception  | 0          | 3          |
+-------------------------------------------+------------+------------+


[5.435374] PM: Hibernation image not present or could not be loaded.
[5.437869] Freeing unused kernel memory: 1448K (8215b000 - 822c5000)
[5.439558] Write protecting the kernel read-only data: 16384k
[5.441103] BUG: Bad page state in process swapper/0  pfn:02500
[5.442204] page:ea094000 count:0 mapcount:-127 mapping: (null) index:0x2
[5.443939] flags: 0x180()
[5.444891] page dumped because: nonzero mapcount
[5.445861] Modules linked in:
[5.446711] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 3.18.0-rc4-00185-g3622dcc #1
[5.448369] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[5.449450]  81cf3ba4 880037a33d78 819ea6b0 10ac
[5.451360]  ea094000 880037a33da8 8119b29c
[5.453289]  ea094000 0001  880037a33de8
[5.455234] Call Trace:
[5.455942]  [] dump_stack+0x4e/0x68
[5.456971]  [] bad_page+0xf5/0x113
[5.457972]  [] free_pages_prepare+0xbf/0x13f
[5.459067]  [] free_hot_cold_page+0x35/0x1a0
[5.460178]  [] __free_pages+0x1b/0x24
[5.461219]  [] free_reserved_area+0xaf/0x10b
[5.462339]  [] free_init_pages+0x8d/0x99
[5.463407]  [] mark_rodata_ro+0xb6/0x11c
[5.464522]  [] ? rest_init+0x89/0x89
[5.465533]  [] kernel_init+0x1d/0xdf
[5.466596]  [] ret_from_fork+0x7c/0xb0
[5.467633]  [] ? rest_init+0x89/0x89
[5.468711] Disabling lock debugging due to kernel taint
[5.470302] Freeing unused kernel memory: 1488K (8248c000 - 8260)
[5.472182] Freeing unused kernel memory: 20K (8800019fb000 - 880001a0)
[5.477823] Freeing unused kernel memory: 1812K (880001e3b000 - 88000200)
[5.582078] BUG: Bad page state in process udevd  pfn:0248c
[5.582103] BUG: Bad page state in process udevd  pfn:024a0
[5.582104] page:ea092800 count:2 mapcount:0 mapping:88003ec8ea69 index:0x2
[5.582107] flags: 0x1880068(uptodate|lru|active|swapbacked)



--yliu
early console in setup code
Probing EDD (edd=off to disable)... ok
early console in decompress_kernel

Decompressing Linux... Parsing ELF... done.
Booting the kernel.
[0.00] Initializing cgroup subsys cpuset
[0.00] Initializing cgroup subsys cpu
[0.00] Linux version 3.18.0-rc4-00185-g3622dcc (kbuild@roam) (gcc version 4.9.1 (Debian 4.9.1-19) ) #1 SMP Sat Nov 15 17:25:59 CST 2014
[0.00] Command line: user=lkp job=/lkp/scheduled/vm-vp-1G-6/rand_boot-1-debian-x86_64.cgz-x86_64-lkp-3622dcc2b4f4eaf23bae2511a30fc449d0e5f0d9-1.yaml ARCH=x86_64 BOOT_IMAGE=/kernel/x86_64-lkp/3622dcc2b4f4eaf23bae2511a30fc449d0e5f0d9/vmlinuz-3.18.0-rc4-00185-g3622dcc kconfig=x86_64-lkp commit=3622dcc2b4f4eaf23bae2511a30fc449d0e5f0d9 branch=kees/x86/pmd-nx root=/dev/ram0 max_uptime=3600 RESULT_ROOT=/result/vm-vp-1G/boot/1/debian-x86_64.cgz/x86_64-lkp/3622dcc2b4f4eaf23bae2511a30fc449d0e5f0d9/0 ip=vm-vp-1G-6::dhcp earlyprintk=ttyS0,115200 debug apic=debug sysrq_always_enabled rcupdate.rcu_cpu_stall_timeout=100 pan

Re: [LKP] [sched] 9597d64116d: -16.1% hackbench.throughput

2014-11-13 Thread Yuanhan Liu
On Wed, Nov 12, 2014 at 03:44:34PM +0100, Vincent Guittot wrote:
> On 10 November 2014 06:54,   wrote:
> > FYI, we noticed the below changes on
> >
> > https://git.linaro.org/people/mturquette/linux.git eas-next
> > commit 9597d64116d0d441dea32e7f5f05fa135d16f44b ("sched: replace 
> > capacity_factor by usage")
> >
> >
> > b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f  testbox/testcase/testparams
> > ----------------  --------------------------  ---------------------------
> >          %stddev      %change      %stddev
> >              \            |            \
> >     104249 ±  0%      -16.1%      87436 ±  0%  ivb42/hackbench/performance-50%-threads-socket
> >     104249            -16.1%      87436        GEO-MEAN hackbench.throughput
> 
> Hi yuanhan,
> 
> i understand this email as a 16% drop in hackbench performance when
> the number of group is half the number of CPUs. Is it the only test
> for which you  have seen some decreases ? where can i find the list of
> tests that you have passed ?

Sorry, the list is not accessible from outside; besides, you have to run
some commands to generate it on the fly. Anyway, I checked it for you:
we have run only hackbench/performance-50%-threads-socket on that
commit, which is expected in our system, as we bisected this issue once.

But I can run more tests (say, with 100% and 1600% CPU) on that commit
if you like, and it would also help if you could name the benchmarks you
care about most, so that we can run them for you.

--yliu
> 
> I'm going to try to reproduce the test in my local setup
> 
> Regards,
> Vincent
> 
> 
> >
> > b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
> > ----------------  --------------------------
> >       0.88 ± 25%      +209.7%       2.74 ±  5%  ivb42/hackbench/performance-50%-threads-socket
> >       0.88            +209.7%       2.74        GEO-MEAN perf-profile.cpu-cycles.ttwu_do_activate.constprop.87.sched_ttwu_pending.scheduler_ipi.smp_reschedule_interrupt.reschedule_interrupt
> >
> > b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
> > ----------------  --------------------------
> >       0.76 ± 26%      +209.2%       2.36 ±  5%  ivb42/hackbench/performance-50%-threads-socket
> >       0.76            +209.2%       2.36        GEO-MEAN perf-profile.cpu-cycles.activate_task.ttwu_do_activate.sched_ttwu_pending.scheduler_ipi.smp_reschedule_interrupt
> >
> > b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
> > ----------------  --------------------------
> >       0.76 ± 26%      +210.6%       2.35 ±  5%  ivb42/hackbench/performance-50%-threads-socket
> >       0.76            +210.6%       2.35        GEO-MEAN perf-profile.cpu-cycles.enqueue_task.activate_task.ttwu_do_activate.sched_ttwu_pending.scheduler_ipi
> >
> > b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
> > ----------------  --------------------------
> >       0.70 ± 25%      +203.1%       2.13 ±  6%  ivb42/hackbench/performance-50%-threads-socket
> >       0.70            +203.1%       2.13        GEO-MEAN perf-profile.cpu-cycles.enqueue_task_fair.enqueue_task.activate_task.ttwu_do_activate.sched_ttwu_pending
> >
> > b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
> > ----------------  --------------------------
> >     243252 ± 46%      +242.5%     833240 ± 42%  ivb42/hackbench/performance-50%-threads-socket
> >     243252            +242.5%     833240        GEO-MEAN sched_debug.cfs_rq[2]:/.spread0
> >
> > b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
> > ----------------  --------------------------
> >         98 ± 36%       -49.1%         50 ± 34%  ivb42/hackbench/performance-50%-threads-socket
> >         98             -49.1%         50        GEO-MEAN sched_debug.cfs_rq[18]:/.blocked_load_avg
> >
> > b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
> > ----------------  --------------------------
> >    1067752 ± 25%       +65.3%    1764542 ± 11%  ivb42/hackbench/performance-50%-threads-socket
> >    1067752             +65.3%    1764542        GEO-MEAN sched_debug.cfs_rq[16]:/.spread0
> >
> > b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
> > ----------------  --------------------------
> >     923375 ± 22%       +96.3%    1812750 ± 21%  ivb42/hackbench/performance-50%-threads-socket
> >     923375             +96.3%    1812750        GEO-MEAN sched_debug.cfs_rq[14]:/.spread0
> >
> > b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
> > ----------------  --------------------------
> >    1008818 ± 20%       +70.9%    1724167 ± 14%  ivb42/hackbench/performance-50%-threads-socket
> >    1008818             +70.9%    1724167        GEO-MEAN sched_debug.cfs_rq[6]:/.spread0
> >
> > b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
> > ----------------  --------------------------
> >    1109100 ± 25%       +53.9%    1707190 ± 16%  ivb42/hackbench/performance-50%-threads-socket
> >    1109100             +53.9%    1707190        GEO-MEAN sched_debug.cfs_rq[15]:/.spread0
> >
> > b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
> > ----------------  --------------------------

Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000

2014-11-07 Thread Yuanhan Liu
On Fri, Nov 07, 2014 at 10:35:44AM +0100, Ard Biesheuvel wrote:
> On 7 November 2014 10:26, Yuanhan Liu  wrote:
> > On Fri, Nov 07, 2014 at 10:03:55AM +0100, Ard Biesheuvel wrote:
> >> On 7 November 2014 09:46, Yuanhan Liu  wrote:
> >> > On Fri, Nov 07, 2014 at 09:23:56AM +0100, Ard Biesheuvel wrote:
> >> >> On 7 November 2014 09:13, Yuanhan Liu  
> >> >> wrote:
> >> >> > On Fri, Nov 07, 2014 at 08:44:40AM +0100, Ard Biesheuvel wrote:
> >> >> >> On 7 November 2014 08:37, Yuanhan Liu  
> >> >> >> wrote:
> >> >> >> > On Fri, Nov 07, 2014 at 08:17:36AM +0100, Ard Biesheuvel wrote:
> >> >> >> >> On 7 November 2014 06:47, LKP  wrote:
> >> >> >> >> > FYI, we noticed the below changes on
> >> >> >> >> >
> >> >> >> >> > https://git.linaro.org/people/ard.biesheuvel/linux-arm 
> >> >> >> >> > efi-for-3.19
> >> >> >> >> > commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 ("dmi: add 
> >> >> >> >> > support for SMBIOS 3.0 64-bit entry point")
> >> >> >> >> >
> >> >> >> >> >
> >> >> >> >> > +-----------------------+------------+------------+
> >> >> >> >> > |                       | 2fa165a26c | aacdce6e88 |
> >> >> >> >> > +-----------------------+------------+------------+
> >> >> >> >> > | boot_successes        | 20         | 10         |
> >> >> >> >> > | early-boot-hang       | 1          |            |
> >> >> >> >> > | boot_failures         | 0          | 5          |
> >> >> >> >> > | PANIC:early_exception | 0          | 5          |
> >> >> >> >> > +-----------------------+------------+------------+
> >> >> >> >> >
> >> >> >> >> >
> >> >> >> >> > [0.00] BIOS-e820: [mem 
> >> >> >> >> > 0x0001-0x00036fff] usable
> >> >> >> >> > [0.00] bootconsole [earlyser0] enabled
> >> >> >> >> > [0.00] NX (Execute Disable) protection: active
> >> >> >> >> > PANIC: early exception 0e rip 10:81899e6b error 9 cr2 
> >> >> >> >> > ff24
> >> >> >> >> > [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 
> >> >> >> >> > 3.18.0-rc2-gc5221e6 #1
> >> >> >> >> > [0.00]   82203d30 
> >> >> >> >> > 819f0a6e 03f8
> >> >> >> >> > [0.00]  ff24 82203e18 
> >> >> >> >> > 823701b0 82511401
> >> >> >> >> > [0.00]   0ba3 
> >> >> >> >> >  ff24
> >> >> >> >> > [0.00] Call Trace:
> >> >> >> >> > [0.00]  [] dump_stack+0x4e/0x68
> >> >> >> >> > [0.00]  [] early_idt_handler+0x90/0xb7
> >> >> >> >> > [0.00]  [] ? 
> >> >> >> >> > dmi_save_one_device+0x81/0x81
> >> >> >> >> > [0.00]  [] ? dmi_table+0x3f/0x94
> >> >> >> >> > [0.00]  [] ? dmi_table+0x16/0x94
> >> >> >> >> > [0.00]  [] ? 
> >> >> >> >> > dmi_save_one_device+0x81/0x81
> >> >> >> >> > [0.00]  [] ? 
> >> >> >> >> > dmi_save_one_device+0x81/0x81
> >> >> >> >> > [0.00]  [] dmi_walk_early+0x44/0x69
> >> >> >> >> > [0.00]  [] dmi_present+0x180/0x1ff
> >> >> >> >> > [0.00]  [] 
> >> >> >> >> > dmi_scan_machine+0x144/0x191
> >> >> >> >> > [0.00]  [] ? loglevel+0x31/0x31
> >> >> >> >> > [0.00]  [] setup_arch+0x490/0xc73
> >> >> >> >> > [0.00]  [] ? printk+0x4d/0x4f
> >> >> 

Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000

2014-11-07 Thread Yuanhan Liu
On Fri, Nov 07, 2014 at 09:16:02AM +, Matt Fleming wrote:
> On Fri, 2014-11-07 at 08:17 +0100, Ard Biesheuvel wrote:
> > On 7 November 2014 06:47, LKP  wrote:
> > > FYI, we noticed the below changes on
> > >
> > > https://git.linaro.org/people/ard.biesheuvel/linux-arm efi-for-3.19
> > > commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 ("dmi: add support for 
> > > SMBIOS 3.0 64-bit entry point")
> > >
> > >
> > > +-----------------------+------------+------------+
> > > |                       | 2fa165a26c | aacdce6e88 |
> > > +-----------------------+------------+------------+
> > > | boot_successes        | 20         | 10         |
> > > | early-boot-hang       | 1          |            |
> > > | boot_failures         | 0          | 5          |
> > > | PANIC:early_exception | 0          | 5          |
> > > +-----------------------+------------+------------+
> > >
> > >
> > > [0.00] BIOS-e820: [mem 0x0001-0x00036fff] 
> > > usable
> > > [0.00] bootconsole [earlyser0] enabled
> > > [0.00] NX (Execute Disable) protection: active
> > > PANIC: early exception 0e rip 10:81899e6b error 9 cr2 
> > > ff24
> > > [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 
> > > 3.18.0-rc2-gc5221e6 #1
> > > [0.00]   82203d30 819f0a6e 
> > > 03f8
> > > [0.00]  ff24 82203e18 823701b0 
> > > 82511401
> > > [0.00]   0ba3  
> > > ff24
> > > [0.00] Call Trace:
> > > [0.00]  [] dump_stack+0x4e/0x68
> > > [0.00]  [] early_idt_handler+0x90/0xb7
> > > [0.00]  [] ? dmi_save_one_device+0x81/0x81
> > > [0.00]  [] ? dmi_table+0x3f/0x94
> > > [0.00]  [] ? dmi_table+0x16/0x94
> > > [0.00]  [] ? dmi_save_one_device+0x81/0x81
> > > [0.00]  [] ? dmi_save_one_device+0x81/0x81
> > > [0.00]  [] dmi_walk_early+0x44/0x69
> > > [0.00]  [] dmi_present+0x180/0x1ff
> > > [0.00]  [] dmi_scan_machine+0x144/0x191
> > > [0.00]  [] ? loglevel+0x31/0x31
> > > [0.00]  [] setup_arch+0x490/0xc73
> > > [0.00]  [] ? printk+0x4d/0x4f
> > > [0.00]  [] start_kernel+0x9c/0x43f
> > > [0.00]  [] ? early_idt_handlers+0x120/0x120
> > > [0.00]  [] x86_64_start_reservations+0x2a/0x2c
> > > [0.00]  [] x86_64_start_kernel+0x13b/0x14a
> > > [0.00] RIP 0x4
> > >
> > 
> > This is most puzzling. Could anyone decode the exception?
> > This looks like the non-EFI path through dmi_scan_machine(), which
> > calls dmi_present() /after/ calling dmi_smbios3_present(), which
> > apparently has not found the _SM3_ header tag. Or could the call stack
> > be inaccurate?
> 
> The code triggered a page fault while trying to access
> 0xff24, caused because the reserved bit was set in the page
> table and no page was found. Looks like it jumped through a bogus
> pointer.
> 
> And yes, the callstack may definitely be wrong - the stack dumper is
> just scraping addresses from the stack, as indicated by the '?' symbol.
> 
> Yuanhan, what symbol does 0x81899e6b (the faulting instruction)
> translate to?

I couldn't find the System.map for that kernel, so I switched to another
kernel; here is the new panic dmesg:

PANIC: early exception 0e rip 10:8167aa1a error 9 cr2 ff240001
[0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 3.18.0-rc2-8-g4d3a0be #66
[0.00]  0ba3 81bcfd10 818010a4 03f8
[0.00]  003e 81bcfdf8 81d801b0 617420534f49424d
[0.00]  001f ff24  ff24
[0.00] Call Trace:
[0.00]  [] dump_stack+0x46/0x58
[0.00]  [] early_idt_handler+0x90/0xb7
[0.00]  [] ? dmi_format_ids.constprop.9+0x13c/0x13c
[0.00]  [] ? dmi_table+0x4a/0xf0
[0.00]  [] ? printk+0x61/0x63
[0.00]  [] ? dmi_format_ids.constprop.9+0x13c/0x13c
[0.00]  [] ? dmi_format_ids.constprop.9+0x13c/0x13c
[0.00]  [] dmi_walk_early+0x6b/0x90
[0.00]  [] dmi_present+0x1b4/0x23f
[0.00]  [] dmi_scan_machine+0x1d4/0x23a
[0.00]  [] ? early_idt_handlers+0x120/0x120
[0.00]  [] setup_arch+0x462/0xcc6
[0.00]  [] ? early_idt_handlers+0x120/0x120
[0.00]  [] ? early_idt_handler+0x47/0xb7
[0.00]  [] ? early_idt_handlers+0x120/0x120
[0.00]  [] start_kernel+0x97/0x456
[0.00]  [] ? early_idt_handlers+0x120/0x120
[0.00]  [] ? early_idt_handlers+0x120/0x120
[0.00]  [] x86_64_start_reservations+0x2a/0x2c
[0.00]  [] x86_64_start_kernel+0x13e/0x14d
[0.00] RIP 0xba2


The address changed to 10:8167aa1a, and in the System.map we have:

  8167a9d0 t dmi_table
  8167aac0 T dmi_name_in_vendors
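
That puts the faulting RIP inside dmi_table: 0x8167aa1a - 0x8167a9d0 =
0x4a, matching the dmi_table+0x4a frame in the trace above. A small
helper along these lines does the lookup mechanically (an illustrative
sketch only, assuming System.map's usual "address type name" lines):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Mini addr2sym: print the last System.map symbol at or below the
 * given address, plus the offset into it.
 * Usage: ./addr2sym System.map ffffffff8167aa1a */
int main(int argc, char **argv)
{
	if (argc != 3)
		return 1;

	FILE *f = fopen(argv[1], "r");
	if (!f) {
		perror("fopen");
		return 1;
	}

	unsigned long long target = strtoull(argv[2], NULL, 16);
	unsigned long long addr, best_addr = 0;
	char type, name[256], best[256] = "?";

	while (fscanf(f, "%llx %c %255s", &addr, &type, name) == 3) {
		if (addr <= target && addr >= best_addr) {
			best_addr = addr;
			strcpy(best, name);
		}
	}
	printf("%s+0x%llx\n", best, target - best_addr);
	fclose(f);
	return 0;
}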

Sorry, I don't k

Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000

2014-11-07 Thread Yuanhan Liu
On Fri, Nov 07, 2014 at 10:03:55AM +0100, Ard Biesheuvel wrote:
> On 7 November 2014 09:46, Yuanhan Liu  wrote:
> > On Fri, Nov 07, 2014 at 09:23:56AM +0100, Ard Biesheuvel wrote:
> >> On 7 November 2014 09:13, Yuanhan Liu  wrote:
> >> > On Fri, Nov 07, 2014 at 08:44:40AM +0100, Ard Biesheuvel wrote:
> >> >> On 7 November 2014 08:37, Yuanhan Liu  
> >> >> wrote:
> >> >> > On Fri, Nov 07, 2014 at 08:17:36AM +0100, Ard Biesheuvel wrote:
> >> >> >> On 7 November 2014 06:47, LKP  wrote:
> >> >> >> > FYI, we noticed the below changes on
> >> >> >> >
> >> >> >> > https://git.linaro.org/people/ard.biesheuvel/linux-arm efi-for-3.19
> >> >> >> > commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 ("dmi: add support 
> >> >> >> > for SMBIOS 3.0 64-bit entry point")
> >> >> >> >
> >> >> >> >
> >> >> >> > +-----------------------+------------+------------+
> >> >> >> > |                       | 2fa165a26c | aacdce6e88 |
> >> >> >> > +-----------------------+------------+------------+
> >> >> >> > | boot_successes        | 20         | 10         |
> >> >> >> > | early-boot-hang       | 1          |            |
> >> >> >> > | boot_failures         | 0          | 5          |
> >> >> >> > | PANIC:early_exception | 0          | 5          |
> >> >> >> > +-----------------------+------------+------------+
> >> >> >> >
> >> >> >> >
> >> >> >> > [0.00] BIOS-e820: [mem 
> >> >> >> > 0x0001-0x00036fff] usable
> >> >> >> > [0.00] bootconsole [earlyser0] enabled
> >> >> >> > [0.00] NX (Execute Disable) protection: active
> >> >> >> > PANIC: early exception 0e rip 10:81899e6b error 9 cr2 
> >> >> >> > ff24
> >> >> >> > [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 
> >> >> >> > 3.18.0-rc2-gc5221e6 #1
> >> >> >> > [0.00]   82203d30 819f0a6e 
> >> >> >> > 03f8
> >> >> >> > [0.00]  ff24 82203e18 823701b0 
> >> >> >> > 82511401
> >> >> >> > [0.00]   0ba3  
> >> >> >> > ff24
> >> >> >> > [0.00] Call Trace:
> >> >> >> > [0.00]  [] dump_stack+0x4e/0x68
> >> >> >> > [0.00]  [] early_idt_handler+0x90/0xb7
> >> >> >> > [0.00]  [] ? 
> >> >> >> > dmi_save_one_device+0x81/0x81
> >> >> >> > [0.00]  [] ? dmi_table+0x3f/0x94
> >> >> >> > [0.00]  [] ? dmi_table+0x16/0x94
> >> >> >> > [0.00]  [] ? 
> >> >> >> > dmi_save_one_device+0x81/0x81
> >> >> >> > [0.00]  [] ? 
> >> >> >> > dmi_save_one_device+0x81/0x81
> >> >> >> > [0.00]  [] dmi_walk_early+0x44/0x69
> >> >> >> > [0.00]  [] dmi_present+0x180/0x1ff
> >> >> >> > [0.00]  [] dmi_scan_machine+0x144/0x191
> >> >> >> > [0.00]  [] ? loglevel+0x31/0x31
> >> >> >> > [0.00]  [] setup_arch+0x490/0xc73
> >> >> >> > [0.00]  [] ? printk+0x4d/0x4f
> >> >> >> > [0.00]  [] start_kernel+0x9c/0x43f
> >> >> >> > [0.00]  [] ? 
> >> >> >> > early_idt_handlers+0x120/0x120
> >> >> >> > [0.00]  [] 
> >> >> >> > x86_64_start_reservations+0x2a/0x2c
> >> >> >> > [0.00]  [] 
> >> >> >> > x86_64_start_kernel+0x13b/0x14a
> >> >> >> > [0.00] RIP 0x4
> >> >> >> >
> >> >> >>
> >> >> >> This is most puzzling. Could anyone decode the exception?
> >> >> >> This looks like the non-EFI path through dmi_scan_machine(), which

Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000

2014-11-07 Thread Yuanhan Liu
On Fri, Nov 07, 2014 at 09:23:56AM +0100, Ard Biesheuvel wrote:
> On 7 November 2014 09:13, Yuanhan Liu  wrote:
> > On Fri, Nov 07, 2014 at 08:44:40AM +0100, Ard Biesheuvel wrote:
> >> On 7 November 2014 08:37, Yuanhan Liu  wrote:
> >> > On Fri, Nov 07, 2014 at 08:17:36AM +0100, Ard Biesheuvel wrote:
> >> >> On 7 November 2014 06:47, LKP  wrote:
> >> >> > FYI, we noticed the below changes on
> >> >> >
> >> >> > https://git.linaro.org/people/ard.biesheuvel/linux-arm efi-for-3.19
> >> >> > commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 ("dmi: add support 
> >> >> > for SMBIOS 3.0 64-bit entry point")
> >> >> >
> >> >> >
> >> >> > +-----------------------+------------+------------+
> >> >> > |                       | 2fa165a26c | aacdce6e88 |
> >> >> > +-----------------------+------------+------------+
> >> >> > | boot_successes        | 20         | 10         |
> >> >> > | early-boot-hang       | 1          |            |
> >> >> > | boot_failures         | 0          | 5          |
> >> >> > | PANIC:early_exception | 0          | 5          |
> >> >> > +-----------------------+------------+------------+
> >> >> >
> >> >> >
> >> >> > [0.00] BIOS-e820: [mem 0x0001-0x00036fff] 
> >> >> > usable
> >> >> > [0.00] bootconsole [earlyser0] enabled
> >> >> > [0.00] NX (Execute Disable) protection: active
> >> >> > PANIC: early exception 0e rip 10:81899e6b error 9 cr2 
> >> >> > ff24
> >> >> > [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 
> >> >> > 3.18.0-rc2-gc5221e6 #1
> >> >> > [0.00]   82203d30 819f0a6e 
> >> >> > 03f8
> >> >> > [0.00]  ff24 82203e18 823701b0 
> >> >> > 82511401
> >> >> > [0.00]   0ba3  
> >> >> > ff24
> >> >> > [0.00] Call Trace:
> >> >> > [0.00]  [] dump_stack+0x4e/0x68
> >> >> > [0.00]  [] early_idt_handler+0x90/0xb7
> >> >> > [0.00]  [] ? dmi_save_one_device+0x81/0x81
> >> >> > [0.00]  [] ? dmi_table+0x3f/0x94
> >> >> > [0.00]  [] ? dmi_table+0x16/0x94
> >> >> > [0.00]  [] ? dmi_save_one_device+0x81/0x81
> >> >> > [0.00]  [] ? dmi_save_one_device+0x81/0x81
> >> >> > [0.00]  [] dmi_walk_early+0x44/0x69
> >> >> > [0.00]  [] dmi_present+0x180/0x1ff
> >> >> > [0.00]  [] dmi_scan_machine+0x144/0x191
> >> >> > [0.00]  [] ? loglevel+0x31/0x31
> >> >> > [0.00]  [] setup_arch+0x490/0xc73
> >> >> > [0.00]  [] ? printk+0x4d/0x4f
> >> >> > [0.00]  [] start_kernel+0x9c/0x43f
> >> >> > [0.00]  [] ? early_idt_handlers+0x120/0x120
> >> >> > [0.00]  [] 
> >> >> > x86_64_start_reservations+0x2a/0x2c
> >> >> > [0.00]  [] x86_64_start_kernel+0x13b/0x14a
> >> >> > [0.00] RIP 0x4
> >> >> >
> >> >>
> >> >> This is most puzzling. Could anyone decode the exception?
> >> >> This looks like the non-EFI path through dmi_scan_machine(), which
> >> >> calls dmi_present() /after/ calling dmi_smbios3_present(), which
> >> >> apparently has not found the _SM3_ header tag. Or could the call stack
> >> >> be inaccurate?
> >> >>
> >> >> Anyway, it would be good to know the exact type of the platform,
> >> >
> >> > It's a Nehalem-EP machine, with 16 CPUs and 12G of memory.
> >> >
> >> >> and
> >> >> perhaps we could find out if there is an inadvertent _SM3_ tag
> >> >> somewhere in the 0xF - 0xF range?
> >> >
> >> > Sorry, how?
> >> >
> >>
> >> That's not a brand new machine, so I suppose there wouldn't be a
> >> SMBIOS 3.0 header lurking in there.
> >>
> >> Anyway, if you are i

Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000

2014-11-07 Thread Yuanhan Liu
On Fri, Nov 07, 2014 at 08:44:40AM +0100, Ard Biesheuvel wrote:
> On 7 November 2014 08:37, Yuanhan Liu  wrote:
> > On Fri, Nov 07, 2014 at 08:17:36AM +0100, Ard Biesheuvel wrote:
> >> On 7 November 2014 06:47, LKP  wrote:
> >> > FYI, we noticed the below changes on
> >> >
> >> > https://git.linaro.org/people/ard.biesheuvel/linux-arm efi-for-3.19
> >> > commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 ("dmi: add support for 
> >> > SMBIOS 3.0 64-bit entry point")
> >> >
> >> >
> >> > +-----------------------+------------+------------+
> >> > |                       | 2fa165a26c | aacdce6e88 |
> >> > +-----------------------+------------+------------+
> >> > | boot_successes        | 20         | 10         |
> >> > | early-boot-hang       | 1          |            |
> >> > | boot_failures         | 0          | 5          |
> >> > | PANIC:early_exception | 0          | 5          |
> >> > +-----------------------+------------+------------+
> >> >
> >> >
> >> > [0.00] BIOS-e820: [mem 0x0001-0x00036fff] 
> >> > usable
> >> > [0.00] bootconsole [earlyser0] enabled
> >> > [0.00] NX (Execute Disable) protection: active
> >> > PANIC: early exception 0e rip 10:81899e6b error 9 cr2 
> >> > ff24
> >> > [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 
> >> > 3.18.0-rc2-gc5221e6 #1
> >> > [0.00]   82203d30 819f0a6e 
> >> > 03f8
> >> > [0.00]  ff24 82203e18 823701b0 
> >> > 82511401
> >> > [0.00]   0ba3  
> >> > ff24
> >> > [0.00] Call Trace:
> >> > [0.00]  [] dump_stack+0x4e/0x68
> >> > [0.00]  [] early_idt_handler+0x90/0xb7
> >> > [0.00]  [] ? dmi_save_one_device+0x81/0x81
> >> > [0.00]  [] ? dmi_table+0x3f/0x94
> >> > [0.00]  [] ? dmi_table+0x16/0x94
> >> > [0.00]  [] ? dmi_save_one_device+0x81/0x81
> >> > [0.00]  [] ? dmi_save_one_device+0x81/0x81
> >> > [0.00]  [] dmi_walk_early+0x44/0x69
> >> > [0.00]  [] dmi_present+0x180/0x1ff
> >> > [0.00]  [] dmi_scan_machine+0x144/0x191
> >> > [0.00]  [] ? loglevel+0x31/0x31
> >> > [0.00]  [] setup_arch+0x490/0xc73
> >> > [0.00]  [] ? printk+0x4d/0x4f
> >> > [0.00]  [] start_kernel+0x9c/0x43f
> >> > [0.00]  [] ? early_idt_handlers+0x120/0x120
> >> > [0.00]  [] x86_64_start_reservations+0x2a/0x2c
> >> > [0.00]  [] x86_64_start_kernel+0x13b/0x14a
> >> > [0.00] RIP 0x4
> >> >
> >>
> >> This is most puzzling. Could anyone decode the exception?
> >> This looks like the non-EFI path through dmi_scan_machine(), which
> >> calls dmi_present() /after/ calling dmi_smbios3_present(), which
> >> apparently has not found the _SM3_ header tag. Or could the call stack
> >> be inaccurate?
> >>
> >> Anyway, it would be good to know the exact type of the platform,
> >
> > It's a Nehalem-EP machine, with 16 CPUs and 12G of memory.
> >
> >> and
> >> perhaps we could find out if there is an inadvertent _SM3_ tag
> >> somewhere in the 0xF - 0xF range?
> >
> > Sorry, how?
> >
> 
> That's not a brand new machine, so I suppose there wouldn't be a
> SMBIOS 3.0 header lurking in there.
> 
> Anyway, if you are in a position to try things, could you apply this
> 
> --- a/drivers/firmware/dmi_scan.c
> +++ b/drivers/firmware/dmi_scan.c
> @@ -617,7 +617,7 @@ void __init dmi_scan_machine(void)
> memset(buf, 0, 16);
> for (q = p; q < p + 0x1; q += 16) {
> memcpy_fromio(buf + 16, q, 16);
> -   if (!dmi_smbios3_present(buf) || !dmi_present(buf)) {
> +   if (!dmi_present(buf)) {
> dmi_available = 1;
> dmi_early_unmap(p, 0x1);
> goto out;
> 
> and try again?

kernel boots perfectly with this patch applied.

--yliu

> That is the only change that is relevant to the non-EFI
> code path which this machine appears to take, so if this fixes things,
> that would be valuable information even if it doesn't tell us exactly
> what is going wrong.
> 
> Thanks,
> Ard.
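
As for the earlier question of how to look for an inadvertent _SM3_
tag in the 0xF0000-0xFFFFF range: one way would be a scan like the
sketch below (illustrative only; needs root and a kernel that permits
reading that range via /dev/mem). It walks the area in the same
16-byte steps the scan loop in dmi_scan_machine() uses:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/mem", O_RDONLY);
	if (fd < 0) {
		perror("open /dev/mem");
		return 1;
	}

	/* map the legacy BIOS area the SMBIOS entry point lives in */
	char *p = mmap(NULL, 0x10000, PROT_READ, MAP_SHARED, fd, 0xF0000);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	for (long off = 0; off < 0x10000; off += 16) {
		if (!memcmp(p + off, "_SM3_", 5))
			printf("_SM3_ anchor at 0x%lx\n",
			       (unsigned long)(0xF0000 + off));
		else if (!memcmp(p + off, "_SM_", 4))
			printf("_SM_ anchor at 0x%lx\n",
			       (unsigned long)(0xF0000 + off));
	}
	munmap(p, 0x10000);
	close(fd);
	return 0;
}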


Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000

2014-11-06 Thread Yuanhan Liu
On Fri, Nov 07, 2014 at 08:17:36AM +0100, Ard Biesheuvel wrote:
> On 7 November 2014 06:47, LKP  wrote:
> > FYI, we noticed the below changes on
> >
> > https://git.linaro.org/people/ard.biesheuvel/linux-arm efi-for-3.19
> > commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 ("dmi: add support for 
> > SMBIOS 3.0 64-bit entry point")
> >
> >
> > +-----------------------+------------+------------+
> > |                       | 2fa165a26c | aacdce6e88 |
> > +-----------------------+------------+------------+
> > | boot_successes        | 20         | 10         |
> > | early-boot-hang       | 1          |            |
> > | boot_failures         | 0          | 5          |
> > | PANIC:early_exception | 0          | 5          |
> > +-----------------------+------------+------------+
> >
> >
> > [0.00] BIOS-e820: [mem 0x0001-0x00036fff] usable
> > [0.00] bootconsole [earlyser0] enabled
> > [0.00] NX (Execute Disable) protection: active
> > PANIC: early exception 0e rip 10:81899e6b error 9 cr2 
> > ff24
> > [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 3.18.0-rc2-gc5221e6 
> > #1
> > [0.00]   82203d30 819f0a6e 
> > 03f8
> > [0.00]  ff24 82203e18 823701b0 
> > 82511401
> > [0.00]   0ba3  
> > ff24
> > [0.00] Call Trace:
> > [0.00]  [] dump_stack+0x4e/0x68
> > [0.00]  [] early_idt_handler+0x90/0xb7
> > [0.00]  [] ? dmi_save_one_device+0x81/0x81
> > [0.00]  [] ? dmi_table+0x3f/0x94
> > [0.00]  [] ? dmi_table+0x16/0x94
> > [0.00]  [] ? dmi_save_one_device+0x81/0x81
> > [0.00]  [] ? dmi_save_one_device+0x81/0x81
> > [0.00]  [] dmi_walk_early+0x44/0x69
> > [0.00]  [] dmi_present+0x180/0x1ff
> > [0.00]  [] dmi_scan_machine+0x144/0x191
> > [0.00]  [] ? loglevel+0x31/0x31
> > [0.00]  [] setup_arch+0x490/0xc73
> > [0.00]  [] ? printk+0x4d/0x4f
> > [0.00]  [] start_kernel+0x9c/0x43f
> > [0.00]  [] ? early_idt_handlers+0x120/0x120
> > [0.00]  [] x86_64_start_reservations+0x2a/0x2c
> > [0.00]  [] x86_64_start_kernel+0x13b/0x14a
> > [0.00] RIP 0x4
> >
> 
> This is most puzzling. Could anyone decode the exception?
> This looks like the non-EFI path through dmi_scan_machine(), which
> calls dmi_present() /after/ calling dmi_smbios3_present(), which
> apparently has not found the _SM3_ header tag. Or could the call stack
> be inaccurate?
> 
> Anyway, it would be good to know the exact type of the platform,

It's a Nehalem-EP machine, with 16 CPUs and 12G of memory.

> and
> perhaps we could find out if there is an inadvertent _SM3_ tag
> somewhere in the 0xF - 0xF range?

Sorry, how?

--yliu


Re: [PATCH 0/3] Shrinkers and proportional reclaim

2014-05-22 Thread Yuanhan Liu
On Thu, May 22, 2014 at 05:30:51PM +0100, Mel Gorman wrote:
> On Fri, May 23, 2014 at 12:14:16AM +0800, Yuanhan Liu wrote:
> > On Thu, May 22, 2014 at 10:09:36AM +0100, Mel Gorman wrote:
> > > This series is aimed at regressions noticed during reclaim activity. The
> > > first two patches are shrinker patches that were posted ages ago but never
> > > merged for reasons that are unclear to me. I'm posting them again to see 
> > > if
> > > there was a reason they were dropped or if they just got lost. Dave?  
> > > Time?
> > > The last patch adjusts proportional reclaim. Yuanhan Liu, can you retest
> > > the vm scalability test cases on a larger machine? Hugh, does this work
> > > for you on the memcg test cases?
> > 
> > Sure, and here is the result. I applied these 3 patches on v3.15-rc6;
> > the head commit is 60c10afd. e82e0561 is the old commit that introduced
> > the regression. The test server has 512G of memory and 120 CPUs.
> > 
> > It's a simple result; if you need more data, I can gather it and send
> > it to you tomorrow:
> > 
> > e82e0561    v3.15-rc6   60c10afd
> > --------    ---------   --------
> > 18560785    12232122    38868453
> >             -34%        +109%
> > 
> > As you can see, the performance is back, and it is way better ;)
> > 
> 
> Thanks a lot for that and the quick response. It is much appreciated.

You're welcome! And sorry, I made a silly mistake. Those numbers are
right, but I set up the wrong comparison base; I should have compared
against e82e0561's parent, which is 75485363ce85526 in the table below.

Here are the detailed results to make up for that mistake ;)

Legend:
~XX%    - stddev percent (3 runs for each kernel)
[+-]XX% - change percent


75485363ce85526  e82e0561dae9f3ae5a21fc2d3  v3.15-rc6  60c10afd233f3344479d229dc
---------------  -------------------------  ---------  -------------------------
  35979244 ~ 0%    -48.4%   18560785 ~ 0%    -66.0%   12235090 ~ 0%     +8.0%   38868453 ~ 0%   vm-scalability.throughput

     28138 ~ 0%  +7448.2%    2123943 ~ 0%  +2724.5%     794777 ~ 0%     +1.6%      28598 ~ 0%   proc-vmstat.allocstall

       544 ~ 6%    -95.2%         26 ~ 0%    -96.5%         19 ~21%     -6.9%        506 ~ 6%   numa-vmstat.node2.nr_isolated_file
  12009832 ~11%   +368.1%   56215319 ~ 0%   +312.9%   49589361 ~ 1%     +0.7%   12091235 ~ 5%   numa-numastat.node3.numa_foreign
       560 ~ 5%    -95.7%         24 ~12%    -96.9%         17 ~10%     -8.7%        511 ~ 2%   numa-vmstat.node1.nr_isolated_file
   8740137 ~12%   +574.0%   58910256 ~ 0%   +321.0%   36798827 ~ 0%    +21.0%   10578905 ~13%   numa-vmstat.node0.numa_other
   8734988 ~12%   +574.4%   58904944 ~ 0%   +321.2%   36794158 ~ 0%    +21.0%   10572718 ~13%   numa-vmstat.node0.numa_miss
      1308 ~12%   -100.0%          0 ~ 0%   -100.0%          0          +23.3%       1612 ~18%   proc-vmstat.pgscan_direct_throttle
  12294788 ~11%   +401.2%   61622745 ~ 0%   +332.6%   53190547 ~ 0%    -13.2%   10667387 ~ 5%   numa-numastat.node1.numa_foreign
       576 ~ 6%    -91.2%         50 ~22%    -94.3%         33 ~20%    -18.1%        472 ~ 1%   numa-vmstat.node0.nr_isolated_file
        12 ~24%  +2400.0%        316 ~ 4% +13543.7%       1728 ~ 5%   +155.3%         32 ~29%   proc-vmstat.compact_stall
       572 ~ 2%    -96.4%         20 ~18%    -97.6%         13 ~11%    -17.5%        472 ~ 2%   numa-vmstat.node3.nr_isolated_file
      3012 ~12%  +2388.4%      74959 ~ 0%   +254.7%      10685 ~ 1%    -45.4%       1646 ~ 1%   proc-vmstat.pageoutrun
      2312 ~ 3%    -94.2%        133 ~ 4%    -95.8%         97 ~ 8%    -12.6%       2021 ~ 2%   proc-vmstat.nr_isolated_file
   2575163 ~ 0%  +2779.1%   74141888 ~ 0%   +958.0%   27244229 ~ 0%     -1.3%    2542941 ~ 0%   proc-vmstat.pgscan_direct_dma32
  21916603 ~13%  +2519.8%  5.742e+08 ~ 0%  +2868.9%  6.507e+08 ~ 0%    -16.1%   18397644 ~ 5%   proc-vmstat.pgscan_kswapd_normal
     53306 ~24%  +1077.9%     627895 ~ 0%  +2066.2%    1154741 ~ 0%    +23.5%      65815 ~24%   proc-vmstat.pgscan_kswapd_dma32
   2575163 ~ 0%  +2778.6%   74129497 ~ 0%   +957.8%   27239606 ~ 0%     -1.3%    2542353 ~ 0%   proc-vmstat.pgsteal_direct_dma32
  21907744 ~14%  +2520.8%  5.742e+08 ~ 0%  +2870.0%  6.507e+08 ~ 0%    -16.1%   18386641 ~ 5%   proc-vmstat.pgsteal_kswapd_normal
     53306 ~24%  +1077.7%     627796 ~ 0%  +2065.7%    1154436 ~ 0%    +23.3%      65731 ~24%   proc-vmstat.pgsteal_kswapd_dma32
   2967449 ~ 0%  +2432.7%   75156011 ~ 0%   +869.9%   28781337 ~ 0%     -0.7%    2945933 ~ 0%   proc-vmstat.pgalloc_dma32
  13081172 ~11%   +599.4%   91495653 ~ 0%   +337.1%   57180622 ~ 0%

Re: [PATCH 0/3] Shrinkers and proportional reclaim

2014-05-22 Thread Yuanhan Liu
On Thu, May 22, 2014 at 10:09:36AM +0100, Mel Gorman wrote:
> This series is aimed at regressions noticed during reclaim activity. The
> first two patches are shrinker patches that were posted ages ago but never
> merged for reasons that are unclear to me. I'm posting them again to see if
> there was a reason they were dropped or if they just got lost. Dave?  Time?
> The last patch adjusts proportional reclaim. Yuanhan Liu, can you retest
> the vm scalability test cases on a larger machine? Hugh, does this work
> for you on the memcg test cases?

Sure, and here is the result. I applied these 3 patches on v3.15-rc6;
the head commit is 60c10afd. e82e0561 is the old commit that introduced
the regression. The test server has 512G of memory and 120 CPUs.

It's a simple result; if you need more data, I can gather it and send
it to you tomorrow:

e82e0561    v3.15-rc6   60c10afd
--------    ---------   --------
18560785    12232122    38868453
            -34%        +109%

As you can see, the performance is back, and it is way better ;)

--yliu
> 
> Based on ext4, I get the following results but unfortunately my larger test
> machines are all unavailable so this is based on a relatively small machine.
> 
> postmark
>   3.15.0-rc53.15.0-rc5
>  vanilla   proportion-v1r4
> Ops/sec Transactions 21.00 (  0.00%)   25.00 ( 19.05%)
> Ops/sec FilesCreate  39.00 (  0.00%)   45.00 ( 15.38%)
> Ops/sec CreateTransact   10.00 (  0.00%)   12.00 ( 20.00%)
> Ops/sec FilesDeleted   6202.00 (  0.00%) 6202.00 (  0.00%)
> Ops/sec DeleteTransact   11.00 (  0.00%)   12.00 (  9.09%)
> Ops/sec DataRead/MB  25.97 (  0.00%)   30.02 ( 15.59%)
> Ops/sec DataWrite/MB 49.99 (  0.00%)   57.78 ( 15.58%)
> 
> ffsb (mail server simulator)
>  3.15.0-rc5 3.15.0-rc5
> vanillaproportion-v1r4
> Ops/sec readall   9402.63 (  0.00%)  9805.74 (  4.29%)
> Ops/sec create4695.45 (  0.00%)  4781.39 (  1.83%)
> Ops/sec delete 173.72 (  0.00%)   177.23 (  2.02%)
> Ops/sec Transactions 14271.80 (  0.00%) 14764.37 (  3.45%)
> Ops/sec Read37.00 (  0.00%)38.50 (  4.05%)
> Ops/sec Write   18.20 (  0.00%)18.50 (  1.65%)
> 
> dd of a large file
> 3.15.0-rc53.15.0-rc5
>vanilla   proportion-v1r4
> WallTime DownloadTar   75.00 (  0.00%)   61.00 ( 18.67%)
> WallTime DD   423.00 (  0.00%)  401.00 (  5.20%)
> WallTime Delete 2.00 (  0.00%)5.00 (-150.00%)
> 
> stutter (times mmap latency during large amounts of IO)
> 
> 3.15.0-rc53.15.0-rc5
>vanilla   proportion-v1r4
> Unit >5ms Delays  80252. (  0.00%)  81523. ( -1.58%)
> Unit Mmap min 8.2118 (  0.00%)  8.3206 ( -1.33%)
> Unit Mmap mean   17.4614 (  0.00%) 17.2868 (  1.00%)
> Unit Mmap stddev 24.9059 (  0.00%) 34.6771 (-39.23%)
> Unit Mmap max  2811.6433 (  0.00%)   2645.1398 (  5.92%)
> Unit Mmap 90%20.5098 (  0.00%) 18.3105 ( 10.72%)
> Unit Mmap 93%22.9180 (  0.00%) 20.1751 ( 11.97%)
> Unit Mmap 95%25.2114 (  0.00%) 22.4988 ( 10.76%)
> Unit Mmap 99%46.1430 (  0.00%) 43.5952 (  5.52%)
> Unit Ideal  Tput 85.2623 (  0.00%) 78.8906 (  7.47%)
> Unit Tput min44.0666 (  0.00%) 43.9609 (  0.24%)
> Unit Tput mean   45.5646 (  0.00%) 45.2009 (  0.80%)
> Unit Tput stddev  0.9318 (  0.00%)  1.1084 (-18.95%)
> Unit Tput max46.7375 (  0.00%) 46.7539 ( -0.04%)
> 
>  fs/super.c  | 16 +---
>  mm/vmscan.c | 36 +---
>  2 files changed, 34 insertions(+), 18 deletions(-)
> 
> -- 
> 1.8.4.5


Re: performance regression due to commit e82e0561("mm: vmscan: obey proportional scanning requirements for kswapd")

2014-03-17 Thread Yuanhan Liu
On Sat, Mar 15, 2014 at 08:56:10PM -0700, Hugh Dickins wrote:
> On Fri, 14 Mar 2014, Mel Gorman wrote:
> > On Thu, Mar 13, 2014 at 05:44:57AM -0700, Hugh Dickins wrote:
> > > On Wed, 12 Mar 2014, Mel Gorman wrote:
> > > > On Tue, Feb 18, 2014 at 04:01:22PM +0800, Yuanhan Liu wrote:
... snip ...

> > > I missed Yuanhan's mail, but seeing your reply reminds me of another
> > > issue with that proportionality patch - or perhaps more thought would
> > > show them to be two sides of the same issue, with just one fix required.
> > > Let me throw our patch into the cauldron.
> > > 
> > > [PATCH] mm: revisit shrink_lruvec's attempt at proportionality
> > > 
> > > We have a memcg reclaim test which exerts a certain amount of pressure,
> > > and expects to see a certain range of page reclaim in response.  It's a
> > > very wide range allowed, but the test repeatably failed on v3.11 onwards,
> > > because reclaim goes wild and frees up almost everything.
> > > 
> > > This wild behaviour bisects to Mel's "scan_adjusted" commit e82e0561dae9
> > > "mm: vmscan: obey proportional scanning requirements for kswapd".  That
> > > attempts to achieve proportionality between anon and file lrus: to the
> > > extent that once one of those is empty, it then tries to empty the other.
> > > Stop that.
> > > 
> > > Signed-off-by: Hugh Dickins 
> > > ---
> > > 
> > > We've been running happily with this for months; but all that time it's
> > > been on my TODO list with a "needs more thought" tag before we could
> > > upstream it, and I never got around to that.  We also have a somewhat
> > > similar, but older and quite independent, fix to get_scan_count() from
> > > Suleiman, which I'd meant to send along at the same time: I'll dig that
> > > one out tomorrow or the day after.
> 
> I've sent that one out now in a new thread
> https://lkml.org/lkml/2014/3/15/168
> and also let's tie these together with Hannes's
> https://lkml.org/lkml/2014/3/14/277
> 
> > > 
> > 
> > I ran a battery of page reclaim related tests against it on top of
> > 3.14-rc6. Workloads showed small improvements in their absolute performance
> > but actual IO behaviour looked much better in some tests.  This is the
> > iostats summary for the test that showed the biggest different -- dd of
> > a large file on ext3.
> > 
> > 3.14.0-rc6  3.14.0-rc6
> >vanilla  proportional-v1r1
> > Meansda-avgqz   1045.64 224.18  
> > Meansda-await   2120.12 506.77  
> > Meansda-r_await 18.61   19.78   
> > Meansda-w_await 11089.602126.35 
> > Max sda-avgqz   2294.39 787.13  
> > Max sda-await   7074.79 2371.67 
> > Max sda-r_await 503.00  414.00  
> > Max sda-w_await 35721.937249.84 
> > 
> > Not all workloads benefitted. The same workload on ext4 showed no useful
> > difference. btrfs looks like
> > 
> >  3.14.0-rc6 3.14.0-rc6
> >vanilla  proportional-v1r1
> > Meansda-avgqz   762.69  650.39  
> > Meansda-await   2438.46 2495.15 
> > Meansda-r_await 44.18   47.20   
> > Meansda-w_await 6109.19 5139.86 
> > Max sda-avgqz   2203.50 1870.78 
> > Max sda-await   7098.26 6847.21 
> > Max sda-r_await 63.02   156.00  
> > Max sda-w_await 19921.7011085.13
> > 
> > Better but not as dramatically so. I didn't analyse why. A workload that
> > had a large anonymous mapping with large amounts of IO in the background
> > did not show any regressions so based on that and the fact the patch looks
> > ok, here goes nothing;
> > 
> > Acked-by: Mel Gorman 
> 
> Big thank you, Mel, for doing so much work on it, and so very quickly.
> I get quite lost in the numbers myself: I'm much more convinced of it
> by your numbers and ack.
> 
> > 
> > You say it's already been tested for months but it would be nice if the
> > workload that generated this thread was also tested.
> 
> Yes indeed: Yuanhan, do you have time to try this patch for your
> testcase?  I'm hoping it will prove at least as effective as your
> own suggested patch, but

Re: performance regression due to commit e82e0561("mm: vmscan: obey proportional scanning requirements for kswapd")

2014-03-13 Thread Yuanhan Liu
On Wed, Mar 12, 2014 at 04:54:47PM +, Mel Gorman wrote:
> On Tue, Feb 18, 2014 at 04:01:22PM +0800, Yuanhan Liu wrote:
> > Hi,
> > 
> > Commit e82e0561("mm: vmscan: obey proportional scanning requirements for
> > kswapd") caused a big performance regression(73%) for vm-scalability/
> > lru-file-readonce testcase on a system with 256G memory without swap.
> > 
> > That testcase simply looks like this:
> >  truncate -s 1T /tmp/vm-scalability.img
> >  mkfs.xfs -q /tmp/vm-scalability.img
> >  mount -o loop /tmp/vm-scalability.img /tmp/vm-scalability
> > 
> >  SPARESE_FILE="/tmp/vm-scalability/sparse-lru-file-readonce"
> >  for i in `seq 1 120`; do
> >  truncate $SPARESE_FILE-$i -s 36G
> >  timeout --foreground -s INT 300 dd bs=4k if=$SPARESE_FILE-$i 
> > of=/dev/null
> >  done
> > 
> >  wait
> > 
> 
> The filename implies that it's a sparse file with no IO but does not say
> what the truncate function/program/whatever actually does.

It's actually the /usr/bin/truncate file from coreutils.
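
Under the hood that boils down to a single ftruncate(2): the file's
apparent size grows, but no blocks are allocated, so reads of the hole
return zeros without any disk IO. An illustrative sketch (not the LKP
harness code):

#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	int fd = open("sparse-demo", O_CREAT | O_RDWR, 0644);

	/* like `truncate -s 1G sparse-demo`: 1GB apparent size */
	if (fd < 0 || ftruncate(fd, 1L << 30)) {
		perror("truncate");
		return 1;
	}

	struct stat st;
	fstat(fd, &st);
	/* st_size is 1GB; st_blocks stays 0 since nothing was written */
	printf("size=%lld blocks=%lld\n",
	       (long long)st.st_size, (long long)st.st_blocks);
	close(fd);
	return 0;
}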

> If it's really a
> sparse file then the dd process should be reading zeros and writing them to
> NULL without IO. Where are pages being dirtied?

Sorry, my bad. I was wrong: I meant "the speed of allocating new
pages", not "the speed of dirtying pages".

> Does the truncate command
> really create a sparse file or is it something else?
> 
> > Actually, it's not the newly added code (obey proportional scanning)
> > in that commit that caused the regression. Instead, it's the following
> > change:
> > +
> > +   if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
> > +   continue;
> > +
> > 
> > 
> > -   if (nr_reclaimed >= nr_to_reclaim &&
> > -   sc->priority < DEF_PRIORITY)
> > +   if (global_reclaim(sc) && !current_is_kswapd())
> > break;
> > 
> > The difference is that we might reclaim more than requested before
> > in the first reclaim round (sc->priority == DEF_PRIORITY).
> > 
> > So, for a testcase like lru-file-readonce, the dirty rate is fast, and
> > reclaiming SWAP_CLUSTER_MAX (32 pages) each time is not enough to catch
> > up with the dirty rate. Thus page allocation stalls, and performance drops:
> > 
> >O for e82e0561
> >* for parent commit
> > 
> > [ASCII plots elided: proc-vmstat.allocstall stays around 2e+05 on
> > the parent commit (*) but jumps to ~1.8e+06 with e82e0561 (O);
> > vm-scalability.throughput holds ~2e+07 on the parent (*) and drops
> > to 6e+06-8e+06 with e82e0561 (O).]

Re: performance regression due to commit e82e0561("mm: vmscan: obey proportional scanning requirements for kswapd")

2014-03-07 Thread Yuanhan Liu
ping...

On Tue, Feb 18, 2014 at 04:01:22PM +0800, Yuanhan Liu wrote:
> Hi,
> 
> Commit e82e0561("mm: vmscan: obey proportional scanning requirements for
> kswapd") caused a big performance regression(73%) for vm-scalability/
> lru-file-readonce testcase on a system with 256G memory without swap.
> 
> That testcase simply looks like this:
>  truncate -s 1T /tmp/vm-scalability.img
>  mkfs.xfs -q /tmp/vm-scalability.img
>  mount -o loop /tmp/vm-scalability.img /tmp/vm-scalability
> 
>  SPARESE_FILE="/tmp/vm-scalability/sparse-lru-file-readonce"
>  for i in `seq 1 120`; do
>  truncate $SPARESE_FILE-$i -s 36G
>  timeout --foreground -s INT 300 dd bs=4k if=$SPARESE_FILE-$i 
> of=/dev/null
>  done
> 
>  wait
> 
> Actually, it's not the newly added code (obey proportional scanning)
> in that commit that caused the regression. Instead, it's the following
> change:
> +
> +   if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
> +   continue;
> +
> 
> 
> -   if (nr_reclaimed >= nr_to_reclaim &&
> -   sc->priority < DEF_PRIORITY)
> +   if (global_reclaim(sc) && !current_is_kswapd())
> break;
> 
> The difference is that we might reclaim more than requested before
> in the first reclaim round (sc->priority == DEF_PRIORITY).
> 
> So, for a testcase like lru-file-readonce, the dirty rate is fast, and
> reclaiming SWAP_CLUSTER_MAX (32 pages) each time is not enough to catch
> up with the dirty rate. Thus page allocation stalls, and performance drops:
> 
>O for e82e0561
>* for parent commit
> 
> [ASCII plots elided: proc-vmstat.allocstall stays around 2e+05 on the
> parent commit (*) but jumps to ~1.8e+06 with e82e0561 (O);
> vm-scalability.throughput holds ~2e+07 on the parent (*) and drops to
> 6e+06-8e+06 with e82e0561 (O).]
> 
> I made a patch that simply keeps reclaiming more if sc->priority ==
> DEF_PRIORITY. I'm not sure whether it's the right way to go. Anyway, I
> pasted it here for comments.
> 
> ---
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 26ad67f..37004a8 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1828,7 +1828,16 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
>   unsigned long nr_reclaimed = 0;
>   

performance regression due to commit e82e0561("mm: vmscan: obey proportional scanning requirements for kswapd")

2014-02-18 Thread Yuanhan Liu
Hi,

Commit e82e0561("mm: vmscan: obey proportional scanning requirements for
kswapd") caused a big performance regression(73%) for vm-scalability/
lru-file-readonce testcase on a system with 256G memory without swap.

That testcase simply looks like this:
 truncate -s 1T /tmp/vm-scalability.img
 mkfs.xfs -q /tmp/vm-scalability.img
 mount -o loop /tmp/vm-scalability.img /tmp/vm-scalability

 SPARESE_FILE="/tmp/vm-scalability/sparse-lru-file-readonce"
 for i in `seq 1 120`; do
 truncate $SPARESE_FILE-$i -s 36G
 timeout --foreground -s INT 300 dd bs=4k if=$SPARESE_FILE-$i 
of=/dev/null
 done

 wait

Actually, it's not the newly added code (obey proportional scanning)
in that commit that caused the regression. Instead, it's the following
change:
+
+   if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
+   continue;
+


-   if (nr_reclaimed >= nr_to_reclaim &&
-   sc->priority < DEF_PRIORITY)
+   if (global_reclaim(sc) && !current_is_kswapd())
break;

The difference is that we might reclaim more than requested before
in the first reclaim round (sc->priority == DEF_PRIORITY).

So, for a testcase like lru-file-readonce, the dirty rate is fast, and
reclaiming SWAP_CLUSTER_MAX (32 pages, i.e. just 128KB with 4KB pages)
each time is not enough to catch up with the dirty rate. Thus page
allocation stalls, and performance drops:

   O for e82e0561
   * for parent commit

[ASCII plots elided: proc-vmstat.allocstall stays around 2e+05 on the
parent commit (*) but jumps to ~1.8e+06 with e82e0561 (O);
vm-scalability.throughput holds ~2e+07 on the parent (*) and drops to
6e+06-8e+06 with e82e0561 (O).]

I made a patch that simply keeps reclaiming more if sc->priority ==
DEF_PRIORITY. I'm not sure whether it's the right way to go. Anyway,
I've pasted it here for comments.

---
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 26ad67f..37004a8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1828,7 +1828,16 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
unsigned long nr_reclaimed = 0;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
struct blk_plug plug;
-   bool scan_adjusted = false;
+   /*
+* On large memory systems, direct reclaiming of SWAP_CLUSTER_MAX
+* pages each time may not catch up with the dirty rate in some cases (say,
+* vm-scalability/lru-file-readonce), which may increase the
+* page allocation stall latency in the end.
+*
+* Here we try to reclaim more than requested for the first round
+* (sc->priority == DEF_PRIORITY) to reduce such latency.
+*/
+   bool scan_adjusted = sc->priority == DEF_PRIORITY;
 
  

Re: changes caused by 0d11e6ac("blk-mq: fix use-after-free of request")

2013-12-19 Thread Yuanhan Liu
On Wed, Dec 18, 2013 at 11:29:30AM +0100, Matias Bjørling wrote:
> On 12/18/2013 09:50 AM, Yuanhan Liu wrote:
> >Hi,
> >
> >FYI, we noticed some changes caused by 0d11e6ac("blk-mq: fix use-after-free 
> >of request"):
> >
> 
> The blk-mq accounting was faulty up to that commit. We should
> compare the blk-mq with the previous block layer.
> 
> Could you try to revert the following patches:
> 
> f02b9ac virtio-blk: virtqueue_kick() must be ordered with other...
> 1cf7e9c virtio_blk: blk-mq support
> 
> and compare the two runs (upto 0d11e6ac applied, and the same, with
> the two patches reverted)

Hi Matias,

You are right. Those counters go back to normal with the two patches
reverted (d1b4e3825c8848b0ea0f).

959a35f13eb785f982d7   0d11e6aca396e679c07b   d1b4e3825c8848b0ea0f
--------------------   --------------------   --------------------
                0.00         60.02 ~42%                       0.00   vpx/micro/xfstests/4HDD-btrfs-generic-quick
                0.00        367.81 ~27%                       0.00   vpx/micro/xfstests/4HDD-ext4-generic-mid
                0.00        411.64 ~13%                       0.00   vpx/micro/xfstests/4HDD-xfs-generic-mid
                0.00        208.39 ~10%                       0.00   vpx/micro/xfstests/4HDD-xfs-generic-quick
                0.00       1047.86                            0.00   TOTAL iostat.vdd.await

959a35f13eb785f982d7   0d11e6aca396e679c07b   d1b4e3825c8848b0ea0f
--------------------   --------------------   --------------------
                0.00        301.60 ~34%                       0.00   vpx/micro/xfstests/4HDD-btrfs-generic-mid
                0.00        249.16 ~12%                       0.00   vpx/micro/xfstests/4HDD-btrfs-generic-quick
                0.00         51.45 ~26%                       0.00   vpx/micro/xfstests/4HDD-ext4-generic-mid
                0.00         91.51 ~21%                       0.04   vpx/micro/xfstests/4HDD-xfs-generic-127
                0.00       1919.27 ~43%                       0.00   vpx/micro/xfstests/4HDD-xfs-generic-mid
                0.00        121.04 ~11%                       0.00   vpx/micro/xfstests/4HDD-xfs-generic-quick
                0.00       2734.03                            0.04   TOTAL iostat.vda.r_await

959a35f13eb785f982d7   0d11e6aca396e679c07b   d1b4e3825c8848b0ea0f
--------------------   --------------------   --------------------
                0.00        406.12 ~10%                       0.00   vpx/micro/xfstests/4HDD-btrfs-generic-mid
                0.00        433.66 ~ 7%                       0.00   vpx/micro/xfstests/4HDD-btrfs-generic-quick
                0.00        807.79 ~15%                       0.00   vpx/micro/xfstests/4HDD-ext4-generic-mid
                0.00         42.94 ~67%                       0.51   vpx/micro/xfstests/4HDD-xfs-generic-127
                0.00        592.20 ~16%                       0.00   vpx/micro/xfstests/4HDD-xfs-generic-mid
                0.00        401.74 ~12%                       0.00   vpx/micro/xfstests/4HDD-xfs-generic-quick
                0.00       2684.45                            0.51   TOTAL iostat.vda.w_await



--yliu


changes caused by 0d11e6ac("blk-mq: fix use-after-free of request")

2013-12-18 Thread Yuanhan Liu
Hi,

FYI, we noticed some changes caused by 0d11e6ac("blk-mq: fix use-after-free of 
request"):

- 959a35f13eb785f982d7 is parent of 0d11e6aca396e679c07b
- kbuildx and vpx are KVM testbox

959a35f13eb785f982d7  0d11e6aca396e679c07b
--------------------  --------------------
            40592.80  +314.1%    168112.22  kbuildx/sysbench/fileio/600s-100%-1HDD-xfs-64G-1024-seqrewr-sync
            19763.87  +941.6%    205868.32  kbuildx/sysbench/fileio/600s-100%-1HDD-xfs-64G-1024-seqwr-sync
            60356.68  +519.6%    373980.55  TOTAL iostat.vdb.wkB/s

959a35f13eb785f982d7  0d11e6aca396e679c07b
--------------------  --------------------
                0.64  +345.7%         2.87  kbuildx/sysbench/fileio/600s-100%-1HDD-xfs-64G-1024-seqrewr-sync
                0.64  +345.7%         2.87  TOTAL iostat.vdb.wrqm/s

959a35f13eb785f982d7  0d11e6aca396e679c07b
--------------------  --------------------
                0.00                303.01  vpx/micro/xfstests/4HDD-btrfs-generic-mid
                0.00                252.31  vpx/micro/xfstests/4HDD-btrfs-generic-quick
                0.00                163.05  vpx/micro/xfstests/4HDD-ext4-generic-mid
                0.00               2442.01  vpx/micro/xfstests/4HDD-xfs-generic-127
                0.00                507.41  vpx/micro/xfstests/4HDD-xfs-generic-mid
                0.00                404.00  vpx/micro/xfstests/4HDD-xfs-generic-quick
                0.00               4071.80  TOTAL iostat.vda.r/s

959a35f13eb785f982d7  0d11e6aca396e679c07b
--------------------  --------------------
                0.00                860.50  vpx/micro/xfstests/4HDD-btrfs-generic-mid
                0.00                112.49  vpx/micro/xfstests/4HDD-btrfs-generic-quick
                0.00                360.58  vpx/micro/xfstests/4HDD-ext4-generic-mid
                0.00               4937.56  vpx/micro/xfstests/4HDD-xfs-generic-127
                0.00               2070.37  vpx/micro/xfstests/4HDD-xfs-generic-mid
                0.00                106.65  vpx/micro/xfstests/4HDD-xfs-generic-quick
                0.00               8448.15  TOTAL iostat.vda.w/s

959a35f13eb785f982d7  0d11e6aca396e679c07b
--------------------  --------------------
           166400.75                  0.45  vpx/micro/xfstests/4HDD-btrfs-generic-mid
            13934.99                  0.12  vpx/micro/xfstests/4HDD-btrfs-generic-quick
             5483.25                  0.83  vpx/micro/xfstests/4HDD-ext4-generic-mid
           168450.24                  0.51  vpx/micro/xfstests/4HDD-xfs-generic-127
           316831.98                  0.32  vpx/micro/xfstests/4HDD-xfs-generic-mid
            28037.99                  0.23  vpx/micro/xfstests/4HDD-xfs-generic-quick
           699139.21                  2.45  TOTAL iostat.vda.avgqu-sz

959a35f13eb785f982d7  0d11e6aca396e679c07b
--------------------  --------------------
                0.00                326.86  vpx/micro/xfstests/4HDD-btrfs-generic-mid
                0.00                281.25  vpx/micro/xfstests/4HDD-btrfs-generic-quick
                0.00                811.90  vpx/micro/xfstests/4HDD-ext4-generic-mid
                0.00                 47.78  vpx/micro/xfstests/4HDD-xfs-generic-127
                0.00               2117.30  vpx/micro/xfstests/4HDD-xfs-generic-mid
                0.00                163.09  vpx/micro/xfstests/4HDD-xfs-generic-quick
                0.00               3748.18  TOTAL iostat.vda.await

959a35f13eb785f982d7  0d11e6aca396e679c07b
--------------------  --------------------
                0.00                301.60  vpx/micro/xfstests/4HDD-btrfs-generic-mid
                0.00                249.16  vpx/micro/xfstests/4HDD-btrfs-generic-quick
                0.00                 51.45  vpx/micro/xfstests/4HDD-ext4-generic-mid
                0.00                 91.51  vpx/micro/xfstests/4HDD-xfs-generic-127
                0.00               1919.27  vpx/micro/xfstests/4HDD-xfs-generic-mid
                0.00                121.04  vpx/micro/xfstests/4HDD-xfs-generic-quick
                0.00               2734.03  TOTAL iostat.vda.r_await

959a35f13eb785f982d7  0d11e6aca396e679c07b
--------------------  --------------------
                0.00                406.12  vpx/micro/xfstests/4HDD-btrfs-generic-mid
                0.00                433.66  vpx/micro/xfstests/4HDD-btrfs-generic-quick
                0.00                807.7

Re: BUG: sleeping function called from invalid context at kernel/locking/rwsem.c:20

2013-12-09 Thread Yuanhan Liu
On Thu, Dec 05, 2013 at 05:50:19PM -0500, Tejun Heo wrote:
> On Thu, Dec 05, 2013 at 11:10:51AM +0800, Yuanhan Liu wrote:
> > Greetings,
> > 
> > I got the below dmesg and the first bad commit is
> > 
> > commit 4b93dc9b1c684d0587fe44d36bbfdf45bd3bea9d
> > Author: Tejun Heo 
> > AuthorDate: Thu Nov 28 14:54:43 2013 -0500
> > Commit: Greg Kroah-Hartman 
> > CommitDate: Fri Nov 29 18:16:08 2013 -0800
> > 
> > sysfs, kernfs: prepare mount path for kernfs
> 
> Oops, can you please try the following patch?

Sorry for being a bit late, but it does work!

--yliu
> 
> diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
> index e7e3aa8..8d07527 100644
> --- a/fs/sysfs/mount.c
> +++ b/fs/sysfs/mount.c
> @@ -45,8 +45,10 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
>  
>  static void sysfs_kill_sb(struct super_block *sb)
>  {
> + void *ns = (void *)kernfs_super_ns(sb);
> +
>   kernfs_kill_sb(sb);
> - kobj_ns_drop(KOBJ_NS_TYPE_NET, (void *)kernfs_super_ns(sb));
> + kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
>  }
>  
>  static struct file_system_type sysfs_fs_type = {
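
The fix looks like the classic capture-before-free pattern:
kernfs_kill_sb() tears down the superblock, so anything derived from
sb has to be read out first. A standalone sketch of the same bug
shape, with illustrative names only:

#include <stdio.h>
#include <stdlib.h>

struct sb {
	void *ns;
};

/* buggy shape: sb->ns is read after kill has freed sb */
static void kill_sb_buggy(struct sb *sb)
{
	free(sb);		/* like kernfs_kill_sb(): sb is gone */
	/* drop_ns(sb->ns);	   would be a use-after-free here   */
}

/* fixed shape: capture the needed field while sb is still alive */
static void kill_sb_fixed(struct sb *sb)
{
	void *ns = sb->ns;	/* read first, as the patch does */

	free(sb);
	printf("dropping ns %p after sb is gone\n", ns);
}

int main(void)
{
	struct sb *sb = malloc(sizeof(*sb));

	sb->ns = (void *)0x1;	/* stand-in for a namespace pointer */
	kill_sb_fixed(sb);
	(void)kill_sb_buggy;	/* shown for contrast, not called */
	return 0;
}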


Re: [PATCH 4/4] sched: bias to target cpu load to reduce task moving

2013-12-04 Thread Yuanhan Liu
On Tue, Dec 03, 2013 at 05:05:56PM +0800, Alex Shi wrote:
> Task migration happens when the target is just a bit less loaded than
> the source cpu. To reduce how often this happens, aggravate the target
> cpu load with sd->imbalance_pct/100.
> 
> This patch removes the hackbench thread regression on Daniel's
> Intel Core2 server.
> 
> a5d6e63   +patch1~3   +patch1~4
> hackbench -T -s 4096 -l 1000 -g 10 -f 40
> 27.914" 38.694"   28.587"
> 28.390" 38.341"   29.513"
> 28.048" 38.626"   28.706"
> 
> Signed-off-by: Alex Shi 

Hi Alex,

We observed a 150% performance gain on the vm-scalability/300s-mmap-pread-seq
testcase with this patch applied. Here is the list of changes we have seen so far:

testbox : brickland
testcase: vm-scalability/300s-mmap-pread-seq


f1b6442c7dd12802e622  d70495ef86f397816d73
     (parent commit)         (this commit)
--------------------  --------------------
         26393249.80  +150.9%   66223933.60  vm-scalability.throughput

              225.12   -49.9%        112.75  time.elapsed_time
            36333.40   -90.7%       3392.20  vmstat.system.cs
                2.40  +375.0%         11.40  vmstat.cpu.id
          3770081.60   -97.7%      87673.40  time.major_page_faults
          3975276.20   -97.0%     117409.60  time.voluntary_context_switches
                3.05  +301.7%         12.24  iostat.cpu.idle
            21118.41   -70.3%       6277.19  time.system_time
               18.40  +130.4%         42.40  vmstat.cpu.us
               77.00   -41.3%         45.20  vmstat.cpu.sy
            47459.60   -31.3%      32592.20  vmstat.system.in
            82435.40   -12.1%      72443.60  time.involuntary_context_switches
             5128.13   +14.0%       5848.30  time.user_time
            11656.20    -7.8%      10745.60  time.percent_of_cpu_this_job_got
        1069997484.80    +0.3% 1073679919.00  time.minor_page_faults


--yliu
> ---
>  kernel/sched/fair.c | 18 --
>  1 file changed, 12 insertions(+), 6 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index bccdd89..c49b7ba 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -978,7 +978,7 @@ static inline unsigned long group_weight(struct 
> task_struct *p, int nid)
>  
>  static unsigned long weighted_cpuload(const int cpu);
>  static unsigned long source_load(int cpu);
> -static unsigned long target_load(int cpu);
> +static unsigned long target_load(int cpu, int imbalance_pct);
>  static unsigned long power_of(int cpu);
>  static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
>  
> @@ -3809,11 +3809,17 @@ static unsigned long source_load(int cpu)
>   * Return a high guess at the load of a migration-target cpu weighted
>   * according to the scheduling class and "nice" value.
>   */
> -static unsigned long target_load(int cpu)
> +static unsigned long target_load(int cpu, int imbalance_pct)
>  {
>   struct rq *rq = cpu_rq(cpu);
>   unsigned long total = weighted_cpuload(cpu);
>  
> + /*
> +  * without cpu_load decay, in most of time cpu_load is same as total
> +  * so we need to make target a bit heavier to reduce task migration
> +  */
> + total = total * imbalance_pct / 100;
> +
>   if (!sched_feat(LB_BIAS))
>   return total;
>  
> @@ -4033,7 +4039,7 @@ static int wake_affine(struct sched_domain *sd, struct 
> task_struct *p, int sync)
>   this_cpu  = smp_processor_id();
>   prev_cpu  = task_cpu(p);
>   load  = source_load(prev_cpu);
> - this_load = target_load(this_cpu);
> + this_load = target_load(this_cpu, 100);
>  
>   /*
>* If sync wakeup then subtract the (maximum possible)
> @@ -4089,7 +4095,7 @@ static int wake_affine(struct sched_domain *sd, struct 
> task_struct *p, int sync)
>  
>   if (balanced ||
>   (this_load <= load &&
> -  this_load + target_load(prev_cpu) <= tl_per_task)) {
> +  this_load + target_load(prev_cpu, 100) <= tl_per_task)) {
>   /*
>* This domain has SD_WAKE_AFFINE and
>* p is cache cold in this domain, and
> @@ -4135,7 +4141,7 @@ find_idlest_group(struct sched_domain *sd, struct 
> task_struct *p, int this_cpu)
>   if (local_group)
>   load = source_load(i);
>   else
> - load = target_load(i);
> + load = target_load(i, sd->imbalance_pct);
>  
>   avg_load += load;
>   }
> @@ -5478,7 +5484,7 @@ static inline void update_sg_lb_stats(struct lb_env 
> *env,
>  
>   /* Bias balancing toward cpus of our domain */
>   if (local_group)
> - load = target_l
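
To make the bias concrete, here is a small standalone sketch (illustrative
only, not from the patch) of the arithmetic target_load() gains above,
assuming a typical sd->imbalance_pct of 125:

	#include <stdio.h>

	/* Same arithmetic as the patch: total * imbalance_pct / 100. */
	static unsigned long biased_target_load(unsigned long load, int imbalance_pct)
	{
		return load * imbalance_pct / 100;
	}

	int main(void)
	{
		unsigned long src = 1000, dst = 950;

		/* Unbiased, dst (950) looks lighter than src (1000) and the
		 * task would migrate; biased, dst appears as 1187, so the
		 * near-tie no longer triggers a move.  Callers that pass
		 * 100 (e.g. wake_affine) keep the old unbiased behaviour. */
		printf("raw dst=%lu biased dst=%lu src=%lu\n",
		       dst, biased_target_load(dst, 125), src);
		return 0;
	}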

Re: kernel panic due to "PCI / ACPI: Use acpi_find_child_device() for child devices lookup"

2013-11-30 Thread Yuanhan Liu
On Fri, Nov 29, 2013 at 04:33:00PM +0100, Rafael J. Wysocki wrote:
> On Friday, November 29, 2013 01:25:52 PM Yuanhan Liu wrote:
> > Greetings,
> > 
> > We got the follow kernel panic dmesg(full dmesg is attached):
> 
> That patch has been updated in linux-next recently, can you please
> check if you are able to reproduce the problem with the new version?

Hi Rafael,

Yeah, lkp-ib03 boots fine with linux-pm/linux-next. Sorry for the noise.

--yliu


Re: WARNING: CPU: 0 PID: 1 at init/main.c:711 do_one_initcall()

2013-11-26 Thread Yuanhan Liu
On Mon, Nov 25, 2013 at 03:57:40PM +0200, Boaz Harrosh wrote:
> On 11/25/2013 03:25 PM, Yuanhan Liu wrote:
> > 
> > Hi Boaz,
> > 
> > We are running a 0day kernel testing system. We automatically test all
> > developers' trees we track in our system. And obviously, linux-open-osd
> > is in that list.
> > 
> > This system can't tell whether a branch is experimental unless:
> > - you put one extra "Dont-Auto-Build" line in the head commit log, or
> > 
> > - the branch name contains "experimental", say exofs_ioctl-experimental.
> > 
> > If neither option is convenient for you, you can ask us to remove your
> > tree from that list. Then you will never get a report like this from us.
> > However, you would lose the chance to have build, boot and performance
> > bugs found automatically for you ;)
> > 
> > --yliu
> > 
> 
> 
> Ha OK very cool. I will remember to put -experimental on the branch name
> this is fine I will do it ASAP.
> 
> Thanks so much. Do you have some web based info on the build system?

Sorry, nope.

> Do you have a place I can see test results and tests summery?

If you like, I can add you to the build-notify list. Once a build
finishes, you will get an email like the following:

--yliu


From: kbuild test robot 
To: Yuanhan Liu 
Subject: [yuanhan:branch] a828a375f9fcc422c8d2613d774d031fa8a02a97 BUILD SUCCESS

git://bee.sh.intel.com/git/yliu/linux.git  branch
a828a375f9fcc422c8d2613d774d031fa8a02a97  branch: commit summary

elapsed time: 262m

configs tested: 189

arm   allnoconfig
arm   allmodconfig
arm at91_dt_defconfig
arm   imx_v6_v7_defconfig
arm  marzen_defconfig
arm   omap2plus_defconfig
arm  prima2_defconfig
arm s3c2410_defconfig
arm   spear13xx_defconfig
arm   tegra_defconfig
avr32  atngw100_defconfig
avr32 atstk1006_defconfig
frv defconfig
m68k  amiga_defconfig
m68k   m5475evb_defconfig
m68k  multi_defconfig
microblaze  mmu_defconfig
microblaze    nommu_defconfig
mn10300 asb2364_defconfig
openrisc    or1ksim_defconfig
tile tilegx_defconfig
um  defconfig
x86_64 acpi-redef
x86_64 randconfig-a0-1105
x86_64 randconfig-a1-1105
x86_64 randconfig-a2-1105
x86_64 randconfig-a3-1105
x86_64 randconfig-a4-1105
x86_64 randconfig-a5-1105
i386   randconfig-c0-1105
i386   randconfig-c1-1105
i386   randconfig-c2-1105
i386   randconfig-c3-1105
i386   randconfig-c4-1105
i386   randconfig-c5-1105
i386   randconfig-c6-1105
i386   randconfig-c7-1105
i386   randconfig-c8-1105
i386   randconfig-c9-1105
x86_64 randconfig-c0-1105
x86_64 randconfig-c1-1105
x86_64 randconfig-c2-1105
x86_64 randconfig-c3-1105
x86_64 randconfig-c4-1105
x86_64 randconfig-c5-1105
x86_64 randconfig-c6-1105
x86_64 randconfig-c7-1105
x86_64 randconfig-c8-1105
x86_64 randconfig-c9-1105
ia64 alldefconfig
ia64 allmodconfig
ia64  allnoconfig
ia64defconfig
mips allmodconfig
mips  allnoconfig
mips  fuloong2e_defconfig
i386 randconfig-i000-1105
i386 randconfig-i001-1105
i386 randconfig-i002-1105
i386 randconfig-i003-1105
i386 randconfig-i004-1105
i386 randconfig-i005-1105
i386 randconfig-i006-1105
i386 randconfig-i007-1105
i386 randconfig-i008-1105
i386 randconfig-i009-1105
powerpc  chroma_defconfig
powerpc   corenet64_smp_defconfig
powerpcgamecube_defconfig
powerpc linkstation_defconfig
powerpc wii_defconfig
sparc

Re: WARNING: CPU: 0 PID: 1 at init/main.c:711 do_one_initcall()

2013-11-25 Thread Yuanhan Liu
On Mon, Nov 25, 2013 at 12:43:42PM +0200, Boaz Harrosh wrote:
> On 11/22/2013 08:02 AM, Yuanhan Liu wrote:
> > Greetings,
> > 
> > I got the below dmesg and the first bad commit is
> > 
> > commit 20545536cd8ea949c61527b6395ec8c0d2c237b1
> > Author: Boaz Harrosh 
> > Date:   Thu Jul 19 15:22:37 2012 +0300
> > 
> > RFC: do_xor_speed Broken on UML do to jiffies
> > 
> 
> Hi Sir Yuanhan.
> 
> I understand that you are running exofs_ioctl branch on linux-open-osd.git .
> Please tell me more why you choose to run this branch it is an experimental

Hi Boaz,

We are running a 0day kernel testing system. We automatically test all
developers' trees we track in our system. And obviously, linux-open-osd
is in that list.

This system can't tell whether a branch is experimental unless:
- you put one extra "Dont-Auto-Build" line in the head commit log, or

- the branch name contains "experimental", say exofs_ioctl-experimental.

If neither option is convenient for you, you can ask us to remove your
tree from that list. Then you will never get a report like this from us.
However, you would lose the chance to have build, boot and performance
bugs found automatically for you ;)

--yliu

> pNFS+Ganesha+exofs branch that we are working on around here. It might have
> problems.
> 
> Yes this patch has problems, I know. I have it in my tree because I need
> it if I want to use XOR engine with a UML system. If you do need to run
> this branch *exofs_ioctl* on your system then it is best you revert this
> patch.
> 
> Thanks for the report I think I'll just remove that patch and run with it
> locally.
> 
> Cheers
> Boaz
> 
> > Remember that hang I reported a while back on UML. Well
> > I'm at it again, and it still hangs and I found why.
> > 
> > I have dprinted jiffies and it never advances during the
> > loop at do_xor_speed. Therefore it is stuck in an endless
> > loop. I have also dprinted current_kernel_time() and it
> > returns the same constant value as well.
> > 
> > Note that it does usually work on UML, only during
> > the modprobe of xor.ko while that test is running. It looks
> > like some locking is preventing the clock from ticking.
> > 
> > However ktime_get_ts does work for me so I changed the code
> > as below, so I can work. See how I put several safety
> > guards, to never get hangs again.
> > And I think my time-based approach is more accurate than the
> > previous system.
> > 
> > UML guys please investigate the jiffies issue? what is
> > xor.ko not doing right?
> > 
> > Signed-off-by: Boaz Harrosh 
> > 
> > +--++
> > |  ||
> > +--++
> > | boot_successes   | 0  |
> > | boot_failures| 29 |
> > | WARNING:CPU:PID:at_init/main.c:do_one_initcall() | 29 |
> > | initcall_calibrate_xor_blocks_returned_with_preemption_imbalance | 29 |
> > +--++
> > 
> > [0.127025]generic_sse:   148.363 MB/sec
> > [0.127478] xor: using function: prefetch64-sse (152.727 MB/sec)
> > [0.128017] [ cut here ]
> > [0.128531] WARNING: CPU: 0 PID: 1 at init/main.c:711 
> > do_one_initcall+0x105/0x115()
> > [0.129018] initcall calibrate_xor_blocks+0x0/0x144 returned with 
> > preemption imbalance 
> > [0.130013] Modules linked in:
> > [0.130357] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 
> > 3.12.0-11285-gb242bff #91
> > [0.131013]   88000d0dde00 8161acc5 
> > 88000d0dde48
> > [0.132554]  88000d0dde38 81052de9 81000316 
> > 81a77cfd
> > [0.133380]     
> > 88000d0dde98
> > [0.134213] Call Trace:
> > [0.134493]  [] dump_stack+0x4e/0x7a
> > [0.135017]  [] warn_slowpath_common+0x75/0x8e
> > [0.135654]  [] ? do_one_initcall+0x105/0x115
> > [0.136015]  [] ? do_xor_speed+0xdd/0xdd
> > [0.137016]  [] warn_slowpath_fmt+0x47/0x49
> > [0.137628]  [] ? free_pages+0x51/0x53
> > [0.138015]  [] ? do_xor_speed+0xdd/0xdd
> > [0.138623]  [] do_one_initcall+0
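
As a rough sketch of the time-based calibration Boaz describes above
(measure with ktime instead of jiffies, with safety guards so a stuck
clock cannot hang boot; names here are illustrative, not the actual
xor.ko code):

	#include <linux/ktime.h>
	#include <linux/time.h>

	#define MAX_CALIB_LOOPS	(1 << 20)	/* safety guard: never spin forever */

	static unsigned long count_xor_loops(void (*do_one_xor)(void))
	{
		struct timespec start, now;
		unsigned long count = 0;
		s64 elapsed_ns;

		ktime_get_ts(&start);
		do {
			do_one_xor();
			ktime_get_ts(&now);
			elapsed_ns = timespec_to_ns(&now) - timespec_to_ns(&start);
		} while (++count < MAX_CALIB_LOOPS && elapsed_ns < NSEC_PER_MSEC);

		/* loops completed in ~1ms; throughput follows from the block size */
		return count;
	}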

WARNING at net/core/dev.c:netdev_all_upper_get_next_dev_rcu()

2013-11-24 Thread Yuanhan Liu
Greetings,

We got the following warning:
  [   25.040056] BIOS EDD facility v0.16 2004-Jun-25, 0 devices found
  [   25.047312] EDD information not available.
  [   25.637680] [ cut here ]
  [   25.643383] WARNING: CPU: 10 PID: 1 at 
/c/kernel-tests/src/x86_64/net/core/dev.c:4503 
netdev_all_upper_get_next_dev_rcu+0x40/0x84()
  [   25.657508] Modules linked in:
  [   25.661515] CPU: 10 PID: 1 Comm: swapper/0 Not tainted 
3.12.0-11530-g873cd59 #1751
  [   25.670889] Hardware name: Intel Corporation LH Pass/S4600LH, BIOS 
SE5C600.86B.99.02.1047.032320122259 03/23/2012
  [   25.683647]  0001 880427c9bc68 81a4168e 

  [   25.693182]  880427c9bca0 810c5530 81934918 
880427c9bce8
  [   25.702742]  880818e98000  0040 
880427c9bcb0
  [   25.712306] Call Trace:
  [   25.715532]  [] dump_stack+0x4d/0x66
  [   25.721770]  [] warn_slowpath_common+0x7f/0x98
  [   25.728974]  [] ? 
netdev_all_upper_get_next_dev_rcu+0x40/0x84
  [   25.738020]  [] warn_slowpath_null+0x1a/0x1c
  [   25.745022]  [] 
netdev_all_upper_get_next_dev_rcu+0x40/0x84
  [   25.753517]  [] ixgbe_configure+0x74f/0x786
  [   25.760431]  [] ixgbe_open+0x18e/0x409
  [   25.766895]  [] ? raw_notifier_call_chain+0x14/0x16
  [   25.774608]  [] ? call_netdevice_notifiers_info+0x52/0x59
  [   25.782914]  [] __dev_open+0x90/0xd0
  [   25.789174]  [] __dev_change_flags+0xa9/0x14b
  [   25.796279]  [] dev_change_flags+0x26/0x59
  [   25.803136]  [] ip_auto_config+0x204/0xe82
  [   25.809974]  [] ? lock_release_holdtime.part.7+0xcc/0xd9
  [   25.818181]  [] ? 
tcp_set_default_congestion_control+0xb4/0xb9
  [   25.827349]  [] ? _raw_spin_unlock+0x27/0x32
  [   25.834353]  [] ? root_nfs_parse_addr+0xaf/0xaf
  [   25.841668]  [] do_one_initcall+0xa4/0x13a
  [   25.848495]  [] ? parse_args+0x261/0x33f
  [   25.855127]  [] kernel_init_freeable+0x1d9/0x25f
  [   25.862516]  [] ? do_early_param+0x88/0x88
  [   25.869348]  [] ? rest_init+0xcd/0xcd
  [   25.875669]  [] kernel_init+0xe/0x109
  [   25.882011]  [] ret_from_fork+0x7c/0xb0
  [   25.888535]  [] ? rest_init+0xcd/0xcd
  [   25.894850] ---[ end trace 083c1411a531ab55 ]---
  [   25.904421] pps pps0: new PPS source ptp0
  [   25.909429] ixgbe :06:00.0: registered PHC device on eth0
  [   26.321643] IPv6: ADDRCONF(NETDEV_UP): eth0: link is not ready

And the first bad commit is:

  commit 2a47fa45d4dfbc54659d28de311a1f764b296a3c
  Author: John Fastabend 
  Date:   Wed Nov 6 09:54:52 2013 -0800
  
  ixgbe: enable l2 forwarding acceleration for macvlans
  
  Now that l2 acceleration ops are in place from the prior patch,
  enable ixgbe to take advantage of these operations.  Allow it to
  allocate queues for a macvlan so that when we transmit a frame,
  we can do the switching in hardware inside the ixgbe card, rather
  than in software.
  
  Signed-off-by: John Fastabend 
  Signed-off-by: Neil Horman 
  CC: Andy Gospodarek 
  CC: "David S. Miller" 
  Signed-off-by: David S. Miller 
  
  :04 04 6407c4e5932446e035cfd57b786845e49746948f 
60c62718a990436d6d4589b8124affaa0412aa14 Mdrivers
  bisect run success
  
  # bad: [873cd59de3c0e84596ee1790fb3047df45d0da43] Merge 
'drm-exynos/exynos-drm-fixes' into devel-hourly-2013112214
  # good: [5e01dc7b26d9f24f39abace5da98ccbd6a5ceb52] Linux 3.12
  git bisect start '873cd59de3c0e84596ee1790fb3047df45d0da43' 
'5e01dc7b26d9f24f39abace5da98ccbd6a5ceb52' '--'
  # good: [5cbb3d216e2041700231bcfc383ee5f8b7fc8b74] Merge branch 'akpm' 
(patches from Andrew Morton)
  git bisect good 5cbb3d216e2041700231bcfc383ee5f8b7fc8b74
  # bad: [3aeb58ab6216d864821e8dafb248e8d77403f3e9] Merge branch 'for-linus' of 
git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
  git bisect bad 3aeb58ab6216d864821e8dafb248e8d77403f3e9
  # bad: [dcd607718385d02ce3741de225927a57f528f93b] inet: fix a UFO regression
  git bisect bad dcd607718385d02ce3741de225927a57f528f93b
  # good: [3ba405db1c1b05d157474c71e559393f7ea436ad] gianfar: Simplify MQ 
polling to avoid soft lockup
  git bisect good 3ba405db1c1b05d157474c71e559393f7ea436ad
  # good: [ba275241030cfe87b87d6592345c7e7ebd9b6fba] virtio-net: coalesce rx 
frags when possible during rx
  git bisect good ba275241030cfe87b87d6592345c7e7ebd9b6fba
  # good: [a72e25f78134cc0c1ef2adc99d6c3680ebd80e35] Merge branch 
'for-linville' of git://github.com/kvalo/ath
  git bisect good a72e25f78134cc0c1ef2adc99d6c3680ebd80e35
  # good: [53c5a099b8fd45632f4021f0a908b43aabe883fc] rt2x00: rt2800lib: 
autodetect 5GHz band support
  git bisect good 53c5a099b8fd45632f4021f0a908b43aabe883fc
  # good: [01925efdf7e03b4b803b5c9f985163d687f7f017] Merge branch 'master' of 
git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless
  git bisect good 01925efdf7e03b4b803b5c9f985163d687f7f017
  # good: [95ed40196f965177ee0d044ab304e5cab3aee9c1] Merge branch 
'tipc_fragmentation'
  gi

[PATCH] kernel: remove CONFIG_USE_GENERIC_SMP_HELPERS cleanly

2013-11-18 Thread Yuanhan Liu
Remove the CONFIG_USE_GENERIC_SMP_HELPERS leftovers from commit
0a06ff06 ("kernel: remove CONFIG_USE_GENERIC_SMP_HELPERS").

Cc: Christoph Hellwig 
Cc: Andrew Morton 
Signed-off-by: Yuanhan Liu 
---
 drivers/block/null_blk.c |8 
 net/Kconfig  |4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index b5d8423..ea192ec 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -223,7 +223,7 @@ static void null_softirq_done_fn(struct request *rq)
blk_end_request_all(rq, 0);
 }
 
-#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS)
+#ifdef CONFIG_SMP
 
 static void null_ipi_cmd_end_io(void *data)
 {
@@ -260,7 +260,7 @@ static void null_cmd_end_ipi(struct nullb_cmd *cmd)
put_cpu();
 }
 
-#endif /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */
+#endif /* CONFIG_SMP */
 
 static inline void null_handle_cmd(struct nullb_cmd *cmd)
 {
@@ -270,7 +270,7 @@ static inline void null_handle_cmd(struct nullb_cmd *cmd)
end_cmd(cmd);
break;
case NULL_IRQ_SOFTIRQ:
-#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS)
+#ifdef CONFIG_SMP
null_cmd_end_ipi(cmd);
 #else
end_cmd(cmd);
@@ -571,7 +571,7 @@ static int __init null_init(void)
 {
unsigned int i;
 
-#if !defined(CONFIG_SMP) || !defined(CONFIG_USE_GENERIC_SMP_HELPERS)
+#if !defined(CONFIG_SMP)
if (irqmode == NULL_IRQ_SOFTIRQ) {
pr_warn("null_blk: softirq completions not available.\n");
pr_warn("null_blk: using direct completions.\n");
diff --git a/net/Kconfig b/net/Kconfig
index 0715db6..d334678 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -224,7 +224,7 @@ source "net/hsr/Kconfig"
 
 config RPS
boolean
-   depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS
+   depends on SMP && SYSFS
default y
 
 config RFS_ACCEL
@@ -235,7 +235,7 @@ config RFS_ACCEL
 
 config XPS
boolean
-   depends on SMP && USE_GENERIC_SMP_HELPERS
+   depends on SMP
default y
 
 config NETPRIO_CGROUP
-- 
1.7.7.6



Re: performance regressions by "seqcount: Add lockdep functionality to seqcount/seqlock structures"

2013-11-13 Thread Yuanhan Liu
On Wed, Nov 13, 2013 at 09:40:48AM -0800, John Stultz wrote:
> On 11/13/2013 01:14 AM, Yuanhan Liu wrote:
> > Hi,
> >
> > FYI, we found some performance regressions caused by commit 1ca7d67c
> > ("seqcount: Add lockdep functionality to seqcount/seqlock structures")
> 
> So this is expected. seqlock readers are usually very very cheap

Yeah, sorry for not mentioning that we knew it's expected, but we wanted
to show you the exact numbers for the slowdown.

Thanks.

--yliu
> operations, and we're now doing lockdep tracking on every iteration
> around the loop. As the lockdep help states:
> 
>   | If you say Y here, the lock dependency engine will do      |
>   | additional runtime checks to debug itself, at the price    |
>   | of more runtime overhead.                                  |
> 
> 
> So now since we're also tracking seqlocks in addition to spinlocks, it
> creates more overhead.
> 
> 
> Disabling CONFIG_LOCKDEP should restore performance.
> 
> thanks
> -john
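
For reference, a minimal sketch of the read-side pattern involved
(foo_lock/foo_value are hypothetical, not from the commit): each pass
through read_seqbegin() is now also a lockdep-tracked acquire/release,
which is where the per-iteration overhead comes from with CONFIG_LOCKDEP
enabled:

	#include <linux/seqlock.h>

	static DEFINE_SEQLOCK(foo_lock);
	static u64 foo_value;

	static u64 foo_read(void)
	{
		unsigned seq;
		u64 val;

		do {
			seq = read_seqbegin(&foo_lock);	/* lockdep-tracked per iteration */
			val = foo_value;
		} while (read_seqretry(&foo_lock, seq));

		return val;
	}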


Re: [PATCH 0/4] per anon_vma lock and turn anon_vma rwsem lock to rwlock_t

2013-11-05 Thread Yuanhan Liu
On Tue, Nov 05, 2013 at 11:10:43AM +0800, Yuanhan Liu wrote:
> On Mon, Nov 04, 2013 at 05:44:00PM -0800, Tim Chen wrote:
> > On Mon, 2013-11-04 at 11:59 +0800, Yuanhan Liu wrote:
> > > On Fri, Nov 01, 2013 at 08:15:13PM -0700, Davidlohr Bueso wrote:
> > > > On Fri, 2013-11-01 at 18:16 +0800, Yuanhan Liu wrote:
> > > > > On Fri, Nov 01, 2013 at 09:21:46AM +0100, Ingo Molnar wrote:
> > > > > > 
> > > > > > * Yuanhan Liu  wrote:
> > > > > > 
> > > > > > > > Btw., another _really_ interesting comparison would be against 
> > > > > > > > the latest rwsem patches. Mind doing such a comparison?
> > > > > > > 
> > > > > > > Sure. Where can I get it? Are they on some git tree?
> > > > > > 
> > > > > > I've Cc:-ed Tim Chen who might be able to point you to the latest 
> > > > > > version.
> > > > > > 
> > > > > > The last on-lkml submission was in this thread:
> > > > > > 
> > > > > >   Subject: [PATCH v8 0/9] rwsem performance optimizations
> > > > > > 
> > > > > 
> > > > > Thanks.
> > > > > 
> > > > > I queued bunchs of tests about one hour ago, and already got some
> > > > > results(If necessary, I can add more data tomorrow when those tests 
> > > > > are
> > > > > finished):
> > > > 
> > > > What kind of system are you using to run these workloads on?
> > > 
> > > I queued jobs on 5 testboxes:
> > >   - brickland1: 120 core Ivybridge server
> > >   - lkp-ib03:   48 core Ivybridge server
> > >   - lkp-sb03:   32 core Sandybridge server
> > >   - lkp-nex04:  64 core NHM server
> > >   - lkp-a04:Atom server
> > > > 
> > > > > 
> > > > > 
> > > > >v3.12-rc7  fe001e3de090e179f95d  
> > > > >     
> > > > > -9.3%   
> > > > > brickland1/micro/aim7/shared
> > > > > +4.3%   
> > > > > lkp-ib03/micro/aim7/fork_test
> > > > > +2.2%   
> > > > > lkp-ib03/micro/aim7/shared
> > > > > -2.6%   TOTAL 
> > > > > aim7.2000.jobs-per-min
> > > > > 
> > > > 
> > > > Sorry if I'm missing something, but could you elaborate more on what
> > > > these percentages represent?
> > > 
> > >v3.12-rc7  fe001e3de090e179f95d  
> > >     
> > > -9.3%   
> > > brickland1/micro/aim7/shared
> > > 
> > > 
> > > -2.6%   TOTAL 
> > > aim7.2000.jobs-per-min
> > > 
> > > The comparison base is v3.12-rc7, and we got a 9.3% performance regression
> > > at commit fe001e3de090e179f95d, which is the head of the rwsem performance
> > > optimizations patch set.
> > 
> > Yuanhan, thanks for the data.  This I assume is with the entire rwsem
> > v8 patchset.
> 
> Yes, it is; 9 patches in total.
> 
> > Any idea of the run variation on the workload?
> 
> Your concern is right. The variation is quite big on the 
> brickland1/micro/aim7/shared
> testcase.
> 
>* - v3.12-rc7
>O - fe001e3de090e179f95d
> 
>  brickland1/micro/aim7/shared: aim7.2000.jobs-per-min
> 
>32 +++
>   | |
>31 ++  .*.   |
>   |      ...|
>30 ++    ... |
>   |... ..   |
>29 ++     ...|
>   | *
>28 ++... |
>   | |
>27 ++|
>   *.O
>26 O+|
>   |O|
>25 +++
> 

Tim,

Please ignore this "regression", it disappears when I run that testcase
6 times both for v3.12-rc7 and fe001e3de090e179f95d.

I guess 2000 users is a bit small for a 120 core IVB server. I may try to
increase the user count and test again to see how it behaves
with your patches applied.

Sorry for the inconvenience.

--yliu



Re: [PATCH 0/4] per anon_vma lock and turn anon_vma rwsem lock to rwlock_t

2013-11-04 Thread Yuanhan Liu
On Mon, Nov 04, 2013 at 05:44:00PM -0800, Tim Chen wrote:
> On Mon, 2013-11-04 at 11:59 +0800, Yuanhan Liu wrote:
> > On Fri, Nov 01, 2013 at 08:15:13PM -0700, Davidlohr Bueso wrote:
> > > On Fri, 2013-11-01 at 18:16 +0800, Yuanhan Liu wrote:
> > > > On Fri, Nov 01, 2013 at 09:21:46AM +0100, Ingo Molnar wrote:
> > > > > 
> > > > > * Yuanhan Liu  wrote:
> > > > > 
> > > > > > > Btw., another _really_ interesting comparison would be against 
> > > > > > > the latest rwsem patches. Mind doing such a comparison?
> > > > > > 
> > > > > > Sure. Where can I get it? Are they on some git tree?
> > > > > 
> > > > > I've Cc:-ed Tim Chen who might be able to point you to the latest 
> > > > > version.
> > > > > 
> > > > > The last on-lkml submission was in this thread:
> > > > > 
> > > > >   Subject: [PATCH v8 0/9] rwsem performance optimizations
> > > > > 
> > > > 
> > > > Thanks.
> > > > 
> > > > I queued bunchs of tests about one hour ago, and already got some
> > > > results(If necessary, I can add more data tomorrow when those tests are
> > > > finished):
> > > 
> > > What kind of system are you using to run these workloads on?
> > 
> > I queued jobs on 5 testboxes:
> >   - brickland1: 120 core Ivybridge server
> >   - lkp-ib03:   48 core Ivybridge server
> >   - lkp-sb03:   32 core Sandybridge server
> >   - lkp-nex04:  64 core NHM server
> >   - lkp-a04:Atom server
> > > 
> > > > 
> > > > 
> > > >v3.12-rc7  fe001e3de090e179f95d  
> > > >     
> > > > -9.3%   
> > > > brickland1/micro/aim7/shared
> > > > +4.3%   
> > > > lkp-ib03/micro/aim7/fork_test
> > > > +2.2%   
> > > > lkp-ib03/micro/aim7/shared
> > > > -2.6%   TOTAL 
> > > > aim7.2000.jobs-per-min
> > > > 
> > > 
> > > Sorry if I'm missing something, but could you elaborate more on what
> > > these percentages represent?
> > 
> >v3.12-rc7  fe001e3de090e179f95d  
> >     
> > -9.3%   
> > brickland1/micro/aim7/shared
> > 
> > 
> > -2.6%   TOTAL 
> > aim7.2000.jobs-per-min
> > 
> > The comparison base is v3.12-rc7, and we got a 9.3% performance regression
> > at commit fe001e3de090e179f95d, which is the head of the rwsem performance
> > optimizations patch set.
> 
> Yuanhan, thanks for the data.  This I assume is with the entire rwsem
> v8 patchset.

Yes, it is; 9 patches in total.

> Any idea of the run variation on the workload?

Your concern is right. The variation is quite big on the 
brickland1/micro/aim7/shared
testcase.

   * - v3.12-rc7
   O - fe001e3de090e179f95d

 brickland1/micro/aim7/shared: aim7.2000.jobs-per-min

   [ASCII plot: per-run aim7.2000.jobs-per-min, y-axis 25 to 32; the
   v3.12-rc7 (*) runs spread over roughly 27-31 while the
   fe001e3de090e179f95d (O) runs cluster around 25-27.]


--yliu
> > 
> > "brickland1/micro/aim7/shared" tells the testbox(brickland1) and testcase:
> > shared workfile of aim7.
> > 
> > The

Re: [PATCH 0/4] per anon_vma lock and turn anon_vma rwsem lock to rwlock_t

2013-11-03 Thread Yuanhan Liu
On Fri, Nov 01, 2013 at 08:15:13PM -0700, Davidlohr Bueso wrote:
> On Fri, 2013-11-01 at 18:16 +0800, Yuanhan Liu wrote:
> > On Fri, Nov 01, 2013 at 09:21:46AM +0100, Ingo Molnar wrote:
> > > 
> > > * Yuanhan Liu  wrote:
> > > 
> > > > > Btw., another _really_ interesting comparison would be against 
> > > > > the latest rwsem patches. Mind doing such a comparison?
> > > > 
> > > > Sure. Where can I get it? Are they on some git tree?
> > > 
> > > I've Cc:-ed Tim Chen who might be able to point you to the latest 
> > > version.
> > > 
> > > The last on-lkml submission was in this thread:
> > > 
> > >   Subject: [PATCH v8 0/9] rwsem performance optimizations
> > > 
> > 
> > Thanks.
> > 
> > I queued bunchs of tests about one hour ago, and already got some
> > results(If necessary, I can add more data tomorrow when those tests are
> > finished):
> 
> What kind of system are you using to run these workloads on?

I queued jobs on 5 testboxes:
  - brickland1: 120 core Ivybridge server
  - lkp-ib03:   48 core Ivybridge server
  - lkp-sb03:   32 core Sandybridge server
  - lkp-nex04:  64 core NHM server
  - lkp-a04:Atom server
> 
> > 
> > 
> >    v3.12-rc7  fe001e3de090e179f95d
> >    ---------  --------------------
> >        -9.3%  brickland1/micro/aim7/shared
> >        +4.3%  lkp-ib03/micro/aim7/fork_test
> >        +2.2%  lkp-ib03/micro/aim7/shared
> >        -2.6%  TOTAL aim7.2000.jobs-per-min
> > 
> 
> Sorry if I'm missing something, but could you elaborate more on what
> these percentages represent?

   v3.12-rc7  fe001e3de090e179f95d
   ---------  --------------------
       -9.3%  brickland1/micro/aim7/shared

       -2.6%  TOTAL aim7.2000.jobs-per-min

The comparison base is v3.12-rc7, and we got a 9.3% performance regression
at commit fe001e3de090e179f95d, which is the head of the rwsem performance
optimizations patch set.

"brickland1/micro/aim7/shared" tells the testbox(brickland1) and testcase:
shared workfile of aim7.

The last line tells which field we are comparing; it's
"aim7.2000.jobs-per-min" in this case. 2000 means 2000 users in aim7.

> Are they anon vma rwsem + optimistic
> spinning patches vs anon vma rwlock?

I tested "[PATCH v8 0/9] rwsem performance optimizations" only.

> 
> Also, I see your running aim7, you might be interested in some of the
> results I found when trying out Ingo's rwlock conversion patch on a
> largish 80 core system: https://lkml.org/lkml/2013/9/29/280

Besides aim7, I also tested dbench, hackbench, netperf and pigz. And as you
can imagine and see from the data, aim7 benefits most from the anon_vma
optimization work due to the high contention on the anon_vma lock.

Thanks.

--yliu



Re: [PATCH 4/4] mm/rmap.c: move anon_vma initialization code into anon_vma_ctor

2013-11-03 Thread Yuanhan Liu
On Fri, Nov 01, 2013 at 11:04:40AM -0700, Linus Torvalds wrote:
> On Fri, Nov 1, 2013 at 12:54 AM, Yuanhan Liu
>  wrote:
> > @@ -67,19 +67,7 @@ static struct kmem_cache *anon_vma_chain_cachep;
> >
> >  static inline struct anon_vma *anon_vma_alloc(void)
> >  {
> > -   struct anon_vma *anon_vma;
> > -
> > -   anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
> > -   if (anon_vma) {
> > -   atomic_set(&anon_vma->refcount, 1);
> > -   /*
> > -* Initialise the anon_vma root to point to itself. If 
> > called
> > -* from fork, the root will be reset to the parents 
> > anon_vma.
> > -*/
> > -   anon_vma->root = anon_vma;
> > -   }
> > -
> > -   return anon_vma;
> > +   return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
> >  }
> >
> >  static inline void anon_vma_free(struct anon_vma *anon_vma)
> > @@ -293,8 +281,15 @@ static void anon_vma_ctor(void *data)
> > struct anon_vma *anon_vma = data;
> >
> > rwlock_init(&anon_vma->rwlock);
> > -   atomic_set(&anon_vma->refcount, 0);
> > anon_vma->rb_root = RB_ROOT;
> > +
> > +   atomic_set(&anon_vma->refcount, 1);
> > +   /*
> > +* Initialise the anon_vma root to point to itself. If called
> > +* from fork, the root will be reset to the parents anon_vma.
> > +*/
> > +   anon_vma->root = anon_vma;
> > +
> >  }
> 
> This looks totally invalid.
> 
> The slab constructor is *not* called on every allocation.

Sorry, I didn't know that :(

And thanks for the detailed info very much!

--yliu

> Quite the
> reverse. Constructors are called when the underlying allocation is
> initially done, and then *not* done again, even if that particular
> object may be allocated and free'd many times.
> 
> So the reason we can do
> 
> atomic_set(&anon_vma->refcount, 0);
> 
> in a constructor is that anybody who frees that allocation will do so
> only when the refcount goes back down to zero, so zero is "valid
> state" while the slab entry stays on some percpu freelist.
> 
> But the same is ABSOLUTELY NOT TRUE of the value "1", nor is it true
> of the anon_vma->root. When the anonvma gets free'd, those values will
> *not* be the same (the refcount has been decremented to zero, and the
> root will have been set to whatever the root was.
> 
> So the rule about constructors is that the values they construct
> absolutely *have* to be the ones they get free'd with. With one
> special case.
> 
> Using slab constructors is almost always a mistake. The original
> Sun/Solaris argument for them was to avoid initialization costs in
> allocators, and that was pure and utter bullshit (initializing a whole
> cacheline is generally cheaper than not initializing it and having to
> fetch it from L3 caches, but it does hide the cost so that it is now
> spread out in the users rather than in the allocator).
> 
> So the _original_ reason for slab is pure and utter BS, and we've
> removed pretty much all uses of the constructors.
> 
> In fact, the only valid reason for using them any more is the special
> case: locks and RCU.
> 
> The reason we still have constructors is that sometimes we want to
> keep certain data structures "alive" across allocations together with
> SLAB_DESTROY_BY_RCU (which delays the actual *page* destroying by RCU,
> but the allocation can go to the free-list and get re-allocated
> without a RCU grace-period).
> 
> But because allocations can now "stay active" over a
> alloc/free/alloc-again sequence, that means that the allocation
> sequence MUST NOT re-initialize the lock, because some RCU user may
> still be looking at those fields (and in particular, unlocking an
> allocation that in the meantime got free'd and re-allocated).
> 
> So these days, the *only* valid pattern for slab constructors is
> together with SLAB_DESTROY_BY_RCU, and making sure that the fields
> that RCU readers look at (and in particular, change) are "stable" over
> such re-allocations.
> 
> Your patch is horribly wrong.
> 
>   Linus
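
For reference, a minimal sketch of the one still-valid constructor
pattern described above: SLAB_DESTROY_BY_RCU plus a ctor that only
initializes the fields RCU readers may touch across a free/re-alloc.
"foo" is a hypothetical structure, not kernel code:

	#include <linux/slab.h>
	#include <linux/spinlock.h>

	struct foo {
		spinlock_t	lock;		/* must stay valid across free + re-alloc */
		atomic_t	refcount;	/* freed at 0, so 0 is a safe ctor value */
	};

	static struct kmem_cache *foo_cachep;

	static void foo_ctor(void *data)
	{
		struct foo *f = data;

		spin_lock_init(&f->lock);	/* never re-initialized on re-alloc */
		atomic_set(&f->refcount, 0);
	}

	static int __init foo_init(void)
	{
		foo_cachep = kmem_cache_create("foo", sizeof(struct foo), 0,
					       SLAB_DESTROY_BY_RCU, foo_ctor);
		return foo_cachep ? 0 : -ENOMEM;
	}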


Re: [PATCH 1/4] mm/rmap: per anon_vma lock

2013-11-01 Thread Yuanhan Liu
On Fri, Nov 01, 2013 at 11:22:24AM +0100, Peter Zijlstra wrote:
> On Fri, Nov 01, 2013 at 05:38:44PM +0800, Yuanhan Liu wrote:
> > On Fri, Nov 01, 2013 at 09:43:29AM +0100, Peter Zijlstra wrote:
> > > On Fri, Nov 01, 2013 at 03:54:24PM +0800, Yuanhan Liu wrote:
> > > > @@ -497,15 +495,20 @@ static void vma_rb_erase(struct vm_area_struct 
> > > > *vma, struct rb_root *root)
> > > >   * anon_vma_interval_tree_post_update_vma().
> > > >   *
> > > >   * The entire update must be protected by exclusive mmap_sem and by
> > > > - * the root anon_vma's mutex.
> > > > + * the anon_vma's mutex.
> > > >   */
> > > >  static inline void
> > > >  anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
> > > >  {
> > > > struct anon_vma_chain *avc;
> > > >  
> > > > -   list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
> > > > -   anon_vma_interval_tree_remove(avc, 
> > > > &avc->anon_vma->rb_root);
> > > > +   list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) {
> > > > +   struct anon_vma *anon_vma = avc->anon_vma;
> > > > +
> > > > +   anon_vma_lock_write(anon_vma);
> > > > +   anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
> > > > +   anon_vma_unlock_write(anon_vma);
> > > > +   }
> > > >  }
> > > >  
> > > >  static inline void
> > > > @@ -513,8 +516,13 @@ anon_vma_interval_tree_post_update_vma(struct 
> > > > vm_area_struct *vma)
> > > >  {
> > > > struct anon_vma_chain *avc;
> > > >  
> > > > -   list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
> > > > -   anon_vma_interval_tree_insert(avc, 
> > > > &avc->anon_vma->rb_root);
> > > > +   list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) {
> > > > +   struct anon_vma *anon_vma = avc->anon_vma;
> > > > +
> > > > +   anon_vma_lock_write(anon_vma);
> > > > +   anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
> > > > +   anon_vma_unlock_write(anon_vma);
> > > > +   }
> > > >  }
> > > >  
> > > >  static int find_vma_links(struct mm_struct *mm, unsigned long addr,
> > > > @@ -781,7 +789,6 @@ again:  remove_next = 1 + (end 
> > > > > next->vm_end);
> > > > if (anon_vma) {
> > > > VM_BUG_ON(adjust_next && next->anon_vma &&
> > > >   anon_vma != next->anon_vma);
> > > > -   anon_vma_lock_write(anon_vma);
> > > > anon_vma_interval_tree_pre_update_vma(vma);
> > > > if (adjust_next)
> > > > anon_vma_interval_tree_pre_update_vma(next);
> > > > @@ -845,7 +852,6 @@ again:  remove_next = 1 + (end 
> > > > > next->vm_end);
> > > > anon_vma_interval_tree_post_update_vma(vma);
> > > > if (adjust_next)
> > > > anon_vma_interval_tree_post_update_vma(next);
> > > > -   anon_vma_unlock_write(anon_vma);
> > > > }
> > > > if (mapping)
> > > > mutex_unlock(&mapping->i_mmap_mutex);
> > > 
> > > AFAICT this isn't correct at all. We used to protect the vma interval
> > > tree with the root lock, now we don't.
> > 
> > We still use a lock to protect the anon_vma interval tree, but we lock each
> > anon_vma's own interval tree this time.
> 
> Which lock? What protects the chain you're iterating in
> anon_vma_interval_tree_{pre,post}_update_vma() ?

Sorry, I may be wrong again this time. But isn't the vma->anon_vma_chain
list protected by mmap_sem & page_table_lock?
struct vm_area_struct {
...
struct list_head anon_vma_chain; /* Serialized by mmap_sem &
  * page_table_lock */
...
}

So, my understanding was that you don't need an extra lock to iterate the
vma->anon_vma_chain list. However, you need to acquire avc->anon_vma's lock
to insert/remove an avc from its interval tree.

Thanks.

--yliu
> 
> > > All we've got left is the
> > > mmap_sem, but anon_vma chains can cross address-spaces and thus we're up
> > > some creek without no paddle.
> > 
> > Yep; however, you still need to acquire the address-space-crossing
> > anon_vma's lock to modify anything.
> 
> -ENOPARSE.


Re: [PATCH 1/4] mm/rmap: per anon_vma lock

2013-11-01 Thread Yuanhan Liu
On Fri, Nov 01, 2013 at 01:07:45PM +0100, Peter Zijlstra wrote:
> On Fri, Nov 01, 2013 at 07:44:29PM +0800, Yuanhan Liu wrote:
> > commit 012f18004da33ba672e3c60838cc4898126174d3
> > Author: Rik van Riel 
> > Date:   Mon Aug 9 17:18:40 2010 -0700
> > 
> > mm: always lock the root (oldest) anon_vma
> > 
> > Always (and only) lock the root (oldest) anon_vma whenever we do 
> > something
> > in an anon_vma.  The recently introduced anon_vma scalability is due to
> > the rmap code scanning only the VMAs that need to be scanned.  Many 
> > common
> > operations still took the anon_vma lock on the root anon_vma, so always
> > taking that lock is not expected to introduce any scalability issues.
> > 
> > However, always taking the same lock does mean we only need to take one
> > lock, which means rmap_walk on pages from any anon_vma in the vma is
> > excluded from occurring during an munmap, expand_stack or other 
> > operation
> > that needs to exclude rmap_walk and similar functions.
> > 
> > Also add the proper locking to vma_adjust.
> > 
> > Signed-off-by: Rik van Riel 
> > Tested-by: Larry Woodman 
> > Acked-by: Larry Woodman 
> > Reviewed-by: Minchan Kim 
> > Reviewed-by: KAMEZAWA Hiroyuki 
> > Acked-by: Mel Gorman 
> > Acked-by: Linus Torvalds 
> > Signed-off-by: Andrew Morton 
> > Signed-off-by: Linus Torvalds 
> 
> Right that commit did.

Sorry again for that! I was just being brain dead :(

> I'm still not sure why you change both the
> locking proper and the locking primitive used in one patch set.

Converting the rwsem to a rwlock slightly depends on the per anon_vma lock,
as it's a bad idea to do avc allocation inside a spin lock.

Without converting the rwsem to a rwlock, it's not that useful to introduce
the per anon_vma lock; or worse, it may introduce regressions.
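
To illustrate the ordering constraint, a sketch using the existing rmap
helpers (anon_vma_chain_alloc/anon_vma_chain_link from mm/rmap.c; the
wrapper function is hypothetical, not a complete patch): the GFP_KERNEL
allocation may sleep, so it has to happen before a spinning lock is taken:

	static int link_one_avc(struct vm_area_struct *vma, struct anon_vma *anon_vma)
	{
		struct anon_vma_chain *avc;

		avc = anon_vma_chain_alloc(GFP_KERNEL);	/* may sleep: allocate unlocked */
		if (!avc)
			return -ENOMEM;

		anon_vma_lock_write(anon_vma);	/* spins after patch 2: no sleeping inside */
		anon_vma_chain_link(vma, avc, anon_vma);
		anon_vma_unlock_write(anon_vma);
		return 0;
	}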

> 
> Also, changing the locking proper requires a very detailed explanation
> on why it is correct;

Thanks for the tip. And yes, this patch really lacks some explanation.
I tried to find some potential races. I then dug through the git history
and found the lock was per anon_vma when avc was first introduced.
It was changed to root locking not to fix a race, thus I think we
can change it back, and this time, for a performance boost.

The anon_vma lock shows the biggest lock contention on our many-core (say,
120 core) servers in /proc/lock_stat. I found Ingo's patch makes it better,
and since it's a spin lock, I then tried to narrow down the lock range.
Hence, I wrote this patch.

This patch may be wrong, but I guess it's still worth sending out
for comments.

> we've had far too many 'fun' issues with the
> anon_vma locking in the past.

Yeah, I know. Say, http://lwn.net/Articles/383162/ ;)

Thanks.

--yliu


Re: [PATCH 1/4] mm/rmap: per anon_vma lock

2013-11-01 Thread Yuanhan Liu
On Fri, Nov 01, 2013 at 11:15:14AM +0100, Peter Zijlstra wrote:
> On Fri, Nov 01, 2013 at 06:07:07PM +0800, Yuanhan Liu wrote:
> > > I also want to point out that lately we've seen several changes sent
> > > out that relax locking with no accompanying explanation of why the
> > > relaxed locking would be safe. Please don't do that - having a lot of
> > > performance data is worthless if you can't explain why the new locking
> > > is safe.
> > 
> > Agreed.
> > 
> > > And I'm not asking to prove a negative ('lack of any possible
> > > races') there, but at least in this case one could dig out why the
> > > root anon vma locking was introduced and if they think that this
> > > reason doesn't apply anymore, explain why...
> > 
> > It was introduced by commit 2b575eb6(And, BTW, I'm sorry that this commit 
> > log
> > about bb4aa39676f is wrong)
> > 
> >commit 2b575eb64f7a9c701fb4bfdb12388ac547f6c2b6
> >Author: Peter Zijlstra 
> >Date:   Tue May 24 17:12:11 2011 -0700
> >
> >mm: convert anon_vma->lock to a mutex
> >
> >Straightforward conversion of anon_vma->lock to a mutex.
> >
> > As you can see, Peter didn't say why at the time. Honestly speaking, that
> > was my original concern as well. I tried to find some possible races;
> > I guess I may have missed something.
> 
> Bullshit; I didn't change the locking. I only changed the lock primitive
> from a spinlock to a mutex. The anon_vma->root->lock is completely
> unrelated to this change.

Oops, sorry for that. Just made a *horrible* mistake: it was commit
012f18004da33ba672e3c60838cc4898126174d3.


commit 012f18004da33ba672e3c60838cc4898126174d3
Author: Rik van Riel 
Date:   Mon Aug 9 17:18:40 2010 -0700

mm: always lock the root (oldest) anon_vma

Always (and only) lock the root (oldest) anon_vma whenever we do something
in an anon_vma.  The recently introduced anon_vma scalability is due to
the rmap code scanning only the VMAs that need to be scanned.  Many common
operations still took the anon_vma lock on the root anon_vma, so always
taking that lock is not expected to introduce any scalability issues.

However, always taking the same lock does mean we only need to take one
lock, which means rmap_walk on pages from any anon_vma in the vma is
excluded from occurring during an munmap, expand_stack or other operation
that needs to exclude rmap_walk and similar functions.

Also add the proper locking to vma_adjust.

Signed-off-by: Rik van Riel 
Tested-by: Larry Woodman 
Acked-by: Larry Woodman 
Reviewed-by: Minchan Kim 
Reviewed-by: KAMEZAWA Hiroyuki 
Acked-by: Mel Gorman 
Acked-by: Linus Torvalds 
Signed-off-by: Andrew Morton 
Signed-off-by: Linus Torvalds 




Re: [PATCH 0/4] per anon_vma lock and turn anon_vma rwsem lock to rwlock_t

2013-11-01 Thread Yuanhan Liu
On Fri, Nov 01, 2013 at 09:21:46AM +0100, Ingo Molnar wrote:
> 
> * Yuanhan Liu  wrote:
> 
> > > Btw., another _really_ interesting comparison would be against 
> > > the latest rwsem patches. Mind doing such a comparison?
> > 
> > Sure. Where can I get it? Are they on some git tree?
> 
> I've Cc:-ed Tim Chen who might be able to point you to the latest 
> version.
> 
> The last on-lkml submission was in this thread:
> 
>   Subject: [PATCH v8 0/9] rwsem performance optimizations
> 

Thanks.

I queued bunchs of tests about one hour ago, and already got some
results(If necessary, I can add more data tomorrow when those tests are
finished):


   v3.12-rc7  fe001e3de090e179f95d
   ---------  --------------------
       -9.3%  brickland1/micro/aim7/shared
       +4.3%  lkp-ib03/micro/aim7/fork_test
       +2.2%  lkp-ib03/micro/aim7/shared
       -2.6%  TOTAL aim7.2000.jobs-per-min

   v3.12-rc7  fe001e3de090e179f95d
   ---------  --------------------
   204056.67   -23.5%   156082.33  brickland1/micro/aim7/shared
    79248.00  +144.3%   193617.25  lkp-ib03/micro/aim7/fork_test
   298355.33   -25.2%   223084.67  lkp-ib03/micro/aim7/shared
   581660.00    -1.5%   572784.25  TOTAL time.involuntary_context_switches

   v3.12-rc7  fe001e3de090e179f95d
   ---------  --------------------
    22487.33    -4.7%    21429.33  brickland1/micro/aim7/dbase
    61412.67   -29.1%    43511.00  brickland1/micro/aim7/shared
   531142.00   -27.7%   383818.75  lkp-ib03/micro/aim7/fork_test
    20158.33   -50.9%     9899.67  lkp-ib03/micro/aim7/shared
   635200.33   -27.8%   458658.75  TOTAL vmstat.system.in

   v3.12-rc7  fe001e3de090e179f95d
   ---------  --------------------
     6408.67    -4.5%     6117.33  brickland1/micro/aim7/dbase
    87856.00   -39.5%    53170.67  brickland1/micro/aim7/shared
  1043620.00   -28.0%   751214.75  lkp-ib03/micro/aim7/fork_test
    47152.33   -38.0%    29245.33  lkp-ib03/micro/aim7/shared
  1185037.00   -29.1%   839748.08  TOTAL vmstat.system.cs

   v3.12-rc7  fe001e3de090e179f95d
   ---------  --------------------
       13295.00   -10.0%      11960.00  brickland1/micro/aim7/dbase
     1901175.00   -35.5%    1226787.33  brickland1/micro/aim7/shared
       13951.00    -6.5%      13051.00  lkp-ib03/micro/aim7/dbase
   239773251.17   -30.9%  165727820.75  lkp-ib03/micro/aim7/fork_test
     1014933.67   -31.1%     699259.67  lkp-ib03/micro/aim7/shared
   242716605.83   -30.9%  167678878.75  TOTAL time.voluntary_context_switches

   v3.12-rc7  fe001e3de090e179f95d
   ---------  --------------------
        9.56    -1.0%        9.46  brickland1/micro/aim7/dbase
       11.01   -10.1%        9.90  brickland1/micro/aim7/shared
       36.23   +15.3%       41.77  lkp-ib03/micro/aim7/fork_test
       10.51   -11.9%        9.26  lkp-ib03/micro/aim7/shared
       67.31    +4.6%       70.39  TOTAL iostat.cpu.system

   v3.12-rc7  fe001e3de090e179f95d
   ---------  --------------------
       36.39    -3.6%       35.09  brickland1/micro/aim7/dbase
       34.97    -8.1%       32.13  brickland1/micro/aim7/shared
       20.34    +6.7%       21.70  lkp-ib03/micro/aim7/shared
       91.70    -3.0%       88.92  TOTAL boottime.dhcp

   v3.12-rc7  fe001e3de090e179f95d
   ---------  --------------------
       60.00    +6.7%       64.00  brickland1/micro/aim7/shared
       60.83    -9.2%       55.25  lkp-ib03/micro/aim7/fork_test
      120.83    -1.3%      119.25  TOTAL vmstat.cpu.id

   v3.12-rc7  fe001e3de090e179f95d
   ---------  --------------------
      345.50    -1.1%      341.73  brickland1/micro/aim7/dbase
     3788.80   +11.5%     4223.15  lkp-ib03/micro/aim7/fork_test
      108.29    -7.1%      100.62  lkp-ib03/micro/aim7/shared
     4242.59   +10.0%     4665.50  TOTAL time.system_time

   v3.12-rc7  fe001e3de090e179f95d  
    

Re: [PATCH 1/4] mm/rmap: per anon_vma lock

2013-11-01 Thread Yuanhan Liu
On Fri, Nov 01, 2013 at 02:22:25AM -0700, Michel Lespinasse wrote:
> On Fri, Nov 1, 2013 at 1:43 AM, Peter Zijlstra  wrote:
> > AFAICT this isn't correct at all. We used to protect the vma interval
> > tree with the root lock, now we don't. All we've got left is the
> > mmap_sem, but anon_vma chains can cross address-spaces and thus we're up
> > some creek without no paddle.
> 
> Yes, that was my first thought as well (though I wanted to double
> check at first).
> 
> I also want to point out that lately we've seen several changes sent
> out that relax locking with no accompanying explanation of why the
> relaxed locking would be safe. Please don't do that - having a lot of
> performance data is worthless if you can't explain why the new locking
> is safe.

Agreed.

> And I'm not asking to prove a negative ('lack of any possible
> races') there, but at least in this case one could dig out why the
> root anon vma locking was introduced and if they think that this
> reason doesn't apply anymore, explain why...

It was introduced by commit 2b575eb6 (and, BTW, I'm sorry that this commit
log about bb4aa39676f is wrong):

   commit 2b575eb64f7a9c701fb4bfdb12388ac547f6c2b6
   Author: Peter Zijlstra 
   Date:   Tue May 24 17:12:11 2011 -0700
   
   mm: convert anon_vma->lock to a mutex
   
   Straightforward conversion of anon_vma->lock to a mutex.
   
   Signed-off-by: Peter Zijlstra 
   Acked-by: Hugh Dickins 
   Reviewed-by: KOSAKI Motohiro 
   Cc: Benjamin Herrenschmidt 
   Cc: David Miller 
   Cc: Martin Schwidefsky 
   Cc: Russell King 
   Cc: Paul Mundt 
   Cc: Jeff Dike 
   Cc: Richard Weinberger 
   Cc: Tony Luck 
   Cc: KAMEZAWA Hiroyuki 
   Cc: Mel Gorman 
   Cc: Nick Piggin 
   Cc: Namhyung Kim 
   Signed-off-by: Andrew Morton 
   Signed-off-by: Linus Torvalds 


As you can see, Peter didn't say why at the time. Honestly speaking, that
was my original concern as well. I tried to find some possible races;
I guess I may have missed something.

Thanks.

--yliu


Re: [PATCH 1/4] mm/rmap: per anon_vma lock

2013-11-01 Thread Yuanhan Liu
On Fri, Nov 01, 2013 at 09:43:29AM +0100, Peter Zijlstra wrote:
> On Fri, Nov 01, 2013 at 03:54:24PM +0800, Yuanhan Liu wrote:
> > @@ -497,15 +495,20 @@ static void vma_rb_erase(struct vm_area_struct *vma, 
> > struct rb_root *root)
> >   * anon_vma_interval_tree_post_update_vma().
> >   *
> >   * The entire update must be protected by exclusive mmap_sem and by
> > - * the root anon_vma's mutex.
> > + * the anon_vma's mutex.
> >   */
> >  static inline void
> >  anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
> >  {
> > struct anon_vma_chain *avc;
> >  
> > -   list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
> > -   anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
> > +   list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) {
> > +   struct anon_vma *anon_vma = avc->anon_vma;
> > +
> > +   anon_vma_lock_write(anon_vma);
> > +   anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
> > +   anon_vma_unlock_write(anon_vma);
> > +   }
> >  }
> >  
> >  static inline void
> > @@ -513,8 +516,13 @@ anon_vma_interval_tree_post_update_vma(struct 
> > vm_area_struct *vma)
> >  {
> > struct anon_vma_chain *avc;
> >  
> > -   list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
> > -   anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
> > +   list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) {
> > +   struct anon_vma *anon_vma = avc->anon_vma;
> > +
> > +   anon_vma_lock_write(anon_vma);
> > +   anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
> > +   anon_vma_unlock_write(anon_vma);
> > +   }
> >  }
> >  
> >  static int find_vma_links(struct mm_struct *mm, unsigned long addr,
> > @@ -781,7 +789,6 @@ again:  remove_next = 1 + (end > 
> > next->vm_end);
> > if (anon_vma) {
> > VM_BUG_ON(adjust_next && next->anon_vma &&
> >   anon_vma != next->anon_vma);
> > -   anon_vma_lock_write(anon_vma);
> > anon_vma_interval_tree_pre_update_vma(vma);
> > if (adjust_next)
> > anon_vma_interval_tree_pre_update_vma(next);
> > @@ -845,7 +852,6 @@ again:  remove_next = 1 + (end > 
> > next->vm_end);
> > anon_vma_interval_tree_post_update_vma(vma);
> > if (adjust_next)
> > anon_vma_interval_tree_post_update_vma(next);
> > -   anon_vma_unlock_write(anon_vma);
> > }
> > if (mapping)
> > mutex_unlock(&mapping->i_mmap_mutex);
> 
> AFAICT this isn't correct at all. We used to protect the vma interval
> tree with the root lock, now we don't.

We still use a lock to protect the anon_vma interval tree, but we lock each
anon_vma's own interval tree this time.

> All we've got left is the
> mmap_sem, but anon_vma chains can cross address-spaces and thus we're up
> some creek without no paddle.

Yep; however, you still need to acquire the address-space-crossing anon_vma's
lock to modify anything.

Say, here is a chart: http://people.freedesktop.org/~yliu/anon_vma.png.
Let's take the 3rd chart as an example, and assume we will unlink vma C.

The steps are (see the sketch after this list):
- lock c, and remove avc between c and C
- lock b, and remove avc between b and C
- lock a, and remove avc between a and C
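
Expressed as code, the walk uses the per anon_vma locking from patch 1
(a sketch built from the patch's own helpers; the wrapper function name
is illustrative): each anon_vma on C's chain is locked individually
while its avc is removed, instead of holding one root lock across the
whole walk:

	static void unlink_vma_from_its_anon_vmas(struct vm_area_struct *vma)
	{
		struct anon_vma_chain *avc, *next;

		/* the walk itself is serialized by mmap_sem & page_table_lock;
		 * only the tree modification needs each anon_vma's own lock */
		list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
			struct anon_vma *anon_vma = avc->anon_vma;	/* c, then b, then a */

			anon_vma_lock_write(anon_vma);
			anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
			anon_vma_unlock_write(anon_vma);
		}
	}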

Thanks.

--yliu



Re: [PATCH 0/4] per anon_vma lock and turn anon_vma rwsem lock to rwlock_t

2013-11-01 Thread Yuanhan Liu
On Fri, Nov 01, 2013 at 09:01:36AM +0100, Ingo Molnar wrote:
> 
> * Yuanhan Liu  wrote:
> 
> > Patch 1 turns locking the anon_vma's root into locking the anon_vma itself,
> > making it a per anon_vma lock, which would reduce contention.
> > 
> > At the same time, the lock range then becomes quite small, basically
> > a call of anon_vma_interval_tree_insert(). Patch 2 turns the rwsem into a
> > rwlock_t. It's a patch from Ingo; I just made some changes to let it apply
> > on top of patch 1.
> > 
> > Patch 3 is from Peter. It was a diff, I edited it to be a patch ;)
> > 
> > Here is the detailed changed stats with this patch applied. The test base 
> > is v3.12-rc7,
> > and 1c00bef768d4341afa7d is patch 1, e3e37183ee805f33e88f is patch 2.
> > 
> > NOTE: both commits are compared to base v3.12-rc7.
> > 
> >   1c00bef768d4341afa7d  e3e37183ee805f33e88f
> >   --------------------  --------------------
> >        +35.0%                +89.9%  brickland1/micro/aim7/fork_test
> >        +28.4%                +49.3%  lkp-ib03/micro/aim7/fork_test
> >         +2.0%                 +2.7%  lkp-ib03/micro/aim7/shared
> >         -0.4%                 +0.0%  lkp-sb03/micro/aim7/dbase
> >        +16.4%                +59.0%  lkp-sb03/micro/aim7/fork_test
> >         +0.1%                 +0.3%  lkp-sb03/micro/aim7/shared
> >         +2.2%                 +5.0%  TOTAL aim7.2000.jobs-per-min
> 
> Impressive!
> 
> >   1c00bef768d4341afa7d  e3e37183ee805f33e88f
> >   --------------------  --------------------
> >     -25.9%   1008.55    -47.3%    717.39  brickland1/micro/aim7/fork_test
> >      -1.4%    641.19     -3.4%    628.45  brickland1/micro/hackbench/1600%-process-pipe
> >      -1.0%    122.84     +1.1%    125.36  brickland1/micro/netperf/120s-200%-UDP_RR
> >      +0.0%    121.29     +0.2%    121.57  lkp-a04/micro/netperf/120s-200%-TCP_SENDFILE
> >     -22.1%    351.41    -26.3%    332.54  lkp-ib03/micro/aim7/fork_test
> >      -1.9%     31.33     -2.6%     31.11  lkp-ib03/micro/aim7/shared
> >      -0.4%    630.36     +0.4%    635.05  lkp-ib03/micro/hackbench/1600%-process-socket
> >      -0.0%    612.62     +1.8%    623.80  lkp-ib03/micro/hackbench/1600%-threads-socket
> >     -14.1%    340.30    -37.1%    249.26  lkp-sb03/micro/aim7/fork_test
> >      -0.1%     41.31     -0.3%     41.22  lkp-sb03/micro/aim7/shared
> >      -0.0%    614.26     +0.6%    617.81  lkp-sb03/micro/hackbench/1600%-process-socket
> >     -10.4%   4515.47    -18.2%   4123.55  TOTAL time.elapsed_time
> 
> Here you scared me for a second with those negative percentages! :-)

Aha.. 

> 
> >   1c00bef768d4341afa7d  e3e37183ee805f33e88f
> >   --------------------  --------------------
> >     +26.7%   323386.33    -75.7%     61980.00  brickland1/micro/aim7/fork_test
> >     -22.9%    67734.00    -64.1%     31531.33  brickland1/micro/aim7/shared
> >      +0.4%     3303.67     -0.8%      3264.33  brickland1/micro/dbench/100%
> >      +0.7%  1871483.67     -0.4%   1850846.00  brickland1/micro/netperf/120s-200%-TCP_MAERTS
> >      -1.0%   109553.00     +0.4%    111038.67  brickland1/micro/pigz/100%
> >      -0.7%    13600.67     +0.1%     13718.67  lkp-a04/micro/netperf/120s-200%-TCP_CRR
> >      -4.6%   995898.00    -85.2%    154621.40  lkp-ib03/micro/aim7/fork_test
> >     -31.8%    32178.00    -50.3%     23442.67  lkp-ib03/micro/aim7/shared
> >      +1.1%  7466432.67     -0.7%   7334831.67  lkp-ib03/micro/hackbench/1600%-threads-pipe
> >      +2.5%  1044936.33     -1.3%   1006084.00  lkp-ib03/micro/hackbench/1600%-threads-socket
> >      -1.3%  5635979.00     +0.2%   5721011.67  lkp-ib03/micro/netperf/120s-200%-TCP_RR
> >     -24.3%    42853.33    -56.8%     24484.33  lkp-nex04/micro/aim7/shared
> >     -23.3%   754297.67    -83.2%    165479.00  lkp-sb03/micro/aim7/fork_test
> > -7.4% 21586.00   -24.1% 17698.33  
&g

[PATCH 0/4] per anon_vma lock and turn anon_vma rwsem lock to rwlock_t

2013-11-01 Thread Yuanhan Liu
      -11.8%    35885.00    -3.8%    39142.00  lkp-sb03/micro/hackbench/1600%-process-socket
       -2.2%  1392771.67    -6.0%  1338125.33  lkp-sb03/micro/hackbench/1600%-threads-pipe
      +14.0%    11281.67    +0.7%     9959.67  lkp-sb03/micro/netperf/120s-200%-TCP_SENDFILE
       -8.6%  2986551.83   -38.8%  1998635.80  TOTAL vmstat.system.in

  1c00bef768d4341afa7d  e3e37183ee805f33e88f
  --------------------  --------------------
       +1.4%      350.18    +0.1%      345.91  brickland1/micro/aim7/dbase
       -6.7%    13193.68  +291.0%    55316.25  brickland1/micro/aim7/fork_test
       -7.9%      369.69   -16.5%      334.93  brickland1/micro/aim7/shared
       +1.7%    15760.12    +0.2%    15522.64  brickland1/micro/dbench/100%
      +87.1%        0.19   +54.8%        0.16  brickland1/micro/netperf/120s-200%-TCP_RR
       +1.0%      168.83    -0.6%      166.16  lkp-ib03/micro/aim7/dbase
       -4.2%      103.73   -11.0%       96.37  lkp-ib03/micro/aim7/shared
       +0.4%    21365.40    +2.0%    21720.47  lkp-ib03/micro/hackbench/1600%-process-pipe
       +0.0%    26374.89    +1.9%    26868.37  lkp-ib03/micro/hackbench/1600%-threads-socket
       +1.7%      147.79    +1.0%      146.73  lkp-sb03/micro/aim7/dbase
       -0.8%     2927.50    +5.0%     3099.96  lkp-sb03/micro/aim7/fork_test
       -1.1%       95.14    -5.0%       91.38  lkp-sb03/micro/aim7/shared
       +0.0%    15758.69    +0.8%    15871.96  lkp-sb03/micro/hackbench/1600%-process-socket
       +1.8%       79.81    +0.0%       78.42  lkp-sb03/micro/pigz/100%
       -0.7%    96695.65   +43.5%   139659.72  TOTAL time.system_time

  1c00bef768d4341afa7d  e3e37183ee805f33e88f
  --------------------  --------------------
       +1.4%        9.69    -0.8%        9.48  brickland1/micro/aim7/dbase
      +33.7%       16.60  +680.3%       96.90  brickland1/micro/aim7/fork_test
       -9.8%        9.93   -21.5%        8.64  brickland1/micro/aim7/shared
       +1.6%       18.43    +0.2%       18.16  brickland1/micro/dbench/100%
       -0.1%       94.68    +0.1%       94.82  brickland1/micro/netperf/120s-200%-TCP_MAERTS
       +3.3%       89.08    +3.6%       89.40  brickland1/micro/netperf/120s-200%-TCP_RR
       -0.4%       90.08    -0.0%       90.44  lkp-a04/micro/netperf/120s-200%-TCP_RR
       +0.1%       90.36    +0.3%       90.55  lkp-a04/micro/netperf/120s-200%-UDP_RR
      +31.5%       47.63  +143.9%       88.37  lkp-ib03/micro/aim7/fork_test
       -8.5%        9.62   -16.7%        8.75  lkp-ib03/micro/aim7/shared
       +0.0%       87.20    -0.2%       86.99  lkp-ib03/micro/hackbench/1600%-threads-pipe
       +0.3%       88.70    +0.6%       88.98  lkp-ib03/micro/hackbench/1600%-threads-socket
       -0.4%       88.10    -0.1%       88.39  lkp-ib03/micro/netperf/120s-200%-TCP_RR
       -1.6%       92.55    -0.4%       93.61  lkp-ib03/micro/netperf/120s-200%-TCP_SENDFILE
       +7.0%       12.22   -13.4%        9.89  lkp-nex04/micro/aim7/shared
      +18.5%       59.24   +77.0%       88.53  lkp-sb03/micro/aim7/fork_test
       -2.5%        9.86    -9.5%        9.15  lkp-sb03/micro/aim7/shared
       +0.1%       84.76    +0.2%       84.86  lkp-sb03/micro/hackbench/1600%-process-socket
       -0.0%       87.91    -0.5%       87.54  lkp-sb03/micro/hackbench/1600%-threads-pipe
       +0.2%       88.95    +0.1%       88.86  lkp-sb03/micro/hackbench/1600%-threads-socket
       +0.4%       83.69    +0.1%       83.40  lkp-sb03/micro/netperf/120s-200%-TCP_CRR
       +2.1%     1259.30   +13.9%     1405.69  TOTAL iostat.cpu.system

Cc: Ingo Molnar 
Cc: Linus Torvalds 
Cc: Andrew Morton 
Cc: Rik van Riel 
Cc: Peter Zijlstra 
Cc: Michel Lespinasse 
---
Ingo Molnar (1):
  mm/rmap: convert anon_vma rwsem to rwlock_t

Peter Zijlstra (1):
  mm/rmap: cleanup unnecessary code

Yuanhan Liu (2):
  mm/rmap: per anon_vma lock
  mm/rmap.c: move anon_vma initialization code into anon_vma_ctor

 include/linux/mmu_notifier.h |2 +-
 include/linux/rmap.h |   19 ++---
 mm/huge_memory.c |4 +-
 mm/mmap.c|   48 ++--
 mm/rmap.c|  172 +++--
 5 files changed, 66 insertions(+), 179 deletions(-)

-- 
1.7.7.6



[PATCH 2/4] mm/rmap: convert anon_vma rwsem to rwlock_t

2013-11-01 Thread Yuanhan Liu
From: Ingo Molnar 

This lock basically covers the insertion of an avc into the rb tree, so
turn the rwsem into a rwlock_t.

With the preceding per-anon_vma lock patch applied, performance nearly
doubles in some heavy fork workloads of aim7 on some platforms.
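
For reference, here is a minimal userspace sketch (illustration only, not
the kernel code) of the read/write discipline the anon_vma helpers below
wrap -- write side for modifying the rb tree, read side for walking it --
with POSIX pthread_rwlock_t standing in for the kernel rwlock_t. One thing
this model cannot show: rwlock_t is a spinning lock, so unlike an rwsem it
must never be held across anything that sleeps.

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t model_lock = PTHREAD_RWLOCK_INITIALIZER;
static int nr_avcs;                     /* stands in for the rb tree */

static void model_insert_avc(void)      /* cf. anon_vma_lock_write() */
{
	pthread_rwlock_wrlock(&model_lock);
	nr_avcs++;                      /* modification under the write lock */
	pthread_rwlock_unlock(&model_lock);
}

static void model_walk(void)            /* cf. anon_vma_lock_read() */
{
	pthread_rwlock_rdlock(&model_lock);
	printf("%d avcs\n", nr_avcs);   /* walking under the read lock */
	pthread_rwlock_unlock(&model_lock);
}

int main(void)
{
	model_insert_avc();
	model_walk();
	return 0;
}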

aim7.2000.jobs-per-min
-------------------------  ------
lkp-ib03/aim7/shared        +2.7%
brickland1/aim7/fork_test  +89.9%
lkp-ib03/aim7/fork_test    +49.3%
lkp-sb03/aim7/fork_test    +59.0%
lkp-sb03/aim7/shared        +0.3%

time.voluntary_context_switches
-------------------------------  ------
brickland1/aim7/shared           -60.8%
brickland1/aim7/fork_test        -75.8%
brickland1/pigz/100%              +0.4%
lkp-ib03/aim7/fork_test          -81.4%
lkp-ib03/aim7/shared             -46.8%
lkp-ib03/dbench/100%              -0.6%
lkp-nex04/aim7/shared            -54.2%
lkp-sb03/aim7/fork_test          -80.2%
lkp-sb03/aim7/shared             -26.7%

Cc: Linus Torvalds 
Cc: Andrew Morton 
Cc: Rik van Riel 
Cc: Peter Zijlstra 
Cc: Michel Lespinasse 
Signed-off-by: Ingo Molnar 
Signed-off-by: Yuanhan Liu 
---
 include/linux/mmu_notifier.h |2 +-
 include/linux/rmap.h |   19 +--
 mm/huge_memory.c |4 ++--
 mm/mmap.c|   10 +-
 mm/rmap.c|   16 
 5 files changed, 25 insertions(+), 26 deletions(-)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index deca874..628e807 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -151,7 +151,7 @@ struct mmu_notifier_ops {
  * Therefore notifier chains can only be traversed when either
  *
  * 1. mmap_sem is held.
- * 2. One of the reverse map locks is held (i_mmap_mutex or anon_vma->rwsem).
+ * 2. One of the reverse map locks is held (i_mmap_mutex or anon_vma->rwlock).
  * 3. No other concurrent thread can access the list (release)
  */
 struct mmu_notifier {
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index f450f84..2bb0a1f 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -7,7 +7,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 #include 
 
 /*
@@ -26,7 +26,7 @@
  */
 struct anon_vma {
struct anon_vma *root;  /* Root of this anon_vma tree */
-   struct rw_semaphore rwsem;  /* W: modification, R: walking the list */
+   rwlock_t rwlock;            /* W: modification, R: walking the list */
/*
 * The refcount is taken on an anon_vma when there is no
 * guarantee that the vma of page tables will exist for
@@ -64,7 +64,7 @@ struct anon_vma_chain {
struct vm_area_struct *vma;
struct anon_vma *anon_vma;
struct list_head same_vma;   /* locked by mmap_sem & page_table_lock */
-   struct rb_node rb;  /* locked by anon_vma->rwsem */
+   struct rb_node rb;  /* locked by anon_vma->rwlock */
unsigned long rb_subtree_last;
 #ifdef CONFIG_DEBUG_VM_RB
unsigned long cached_vma_start, cached_vma_last;
@@ -108,37 +108,36 @@ static inline void vma_lock_anon_vma(struct vm_area_struct *vma)
 {
struct anon_vma *anon_vma = vma->anon_vma;
if (anon_vma)
-   down_write(&anon_vma->rwsem);
+   write_lock(&anon_vma->rwlock);
 }
 
 static inline void vma_unlock_anon_vma(struct vm_area_struct *vma)
 {
struct anon_vma *anon_vma = vma->anon_vma;
if (anon_vma)
-   up_write(&anon_vma->rwsem);
+   write_unlock(&anon_vma->rwlock);
 }
 
 static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
 {
-   down_write(&anon_vma->rwsem);
+   write_lock(&anon_vma->rwlock);
 }
 
 static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
 {
-   up_write(&anon_vma->rwsem);
+   write_unlock(&anon_vma->rwlock);
 }
 
 static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
 {
-   down_read(&anon_vma->rwsem);
+   read_lock(&anon_vma->rwlock);
 }
 
 static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
 {
-   up_read(&anon_vma->rwsem);
+   read_unlock(&anon_vma->rwlock);
 }
 
-
 /*
  * anon_vma helper functions.
  */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 73cc8ef..a1e6cb2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1542,7 +1542,7 @@ static int __split_huge_page_splitting(struct page *page,
 * We can't temporarily set the pmd to null in order
 * to split it, the pmd must remain marked huge at all
 * times or the VM won't take the pmd_trans_huge paths
-* and it won't wait on the anon_vma->rwsem to
+* and it won't wait on the anon_vma->rwlock to
 * serialize against split_huge_page*.
 */
pmdp_splitting_flush(vma, 

[PATCH 1/4] mm/rmap: per anon_vma lock

2013-11-01 Thread Yuanhan Liu
It used to be a per anon_vma lock. Back then, it was a spinlock. It was
later changed to a mutex, and a regression was reported. Commit
bb4aa396 ("mm: avoid repeated anon_vma lock/unlock sequences in
anon_vma_clone") turned locking an anon_vma into locking its root.

Change it back to a per anon_vma lock because:
- the next patch will turn the rwsem lock into a rwlock,
  so the above regression should be avoided.

- we no longer need to do the avc allocation inside the lock, which is
  necessary for turning the rwsem into a rwlock

- cleaner code: no need to iterate twice in unlink_anon_vmas()

- and it boosts performance in some cases, as it makes the locked range
  smaller, which in turn reduces contention (see the sketch after the
  numbers below).

  The performance boost will be more obvious with the next patch applied.

  aim7.2000.jobs-per-min
  -------------------------  ------
  brickland1/aim7/fork_test  +35.0%
  lkp-ib03/aim7/fork_test    +28.4%
  lkp-ib03/aim7/shared        +2.0%
  lkp-sb03/aim7/dbase         -0.4%
  lkp-sb03/aim7/fork_test    +16.4%
  lkp-sb03/aim7/shared        +0.1%

  time.voluntary_context_switches
  -------------------------------  ------
  brickland1/aim7/fork_test         -6.0%
  brickland1/aim7/shared           -21.0%
  brickland1/pigz/100%              -1.0%
  lkp-ib03/aim7/fork_test          -24.3%
  lkp-ib03/aim7/shared             -23.3%
  lkp-ib03/dbench/100%              +1.4%
  lkp-nex04/aim7/shared            -19.0%
  lkp-sb03/aim7/fork_test          -31.0%
  lkp-sb03/aim7/shared              -8.4%
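
To illustrate the scope change (a userspace sketch under simplified
assumptions, not the kernel code): before this patch, every anon_vma in a
tree serialized on its root's lock; afterwards, each anon_vma is locked
individually, so unrelated anon_vmas in the same tree no longer contend.

#include <pthread.h>

struct anon_vma_model {
	struct anon_vma_model *root;    /* root of the anon_vma tree */
	pthread_mutex_t lock;           /* stand-in for the rwsem */
};

/* before: all anon_vmas in a tree contend on the root's lock */
static void lock_via_root(struct anon_vma_model *av)
{
	pthread_mutex_lock(&av->root->lock);
}

/* after: each anon_vma is locked on its own */
static void lock_per_object(struct anon_vma_model *av)
{
	pthread_mutex_lock(&av->lock);
}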

Cc: Ingo Molnar 
Cc: Linus Torvalds 
Cc: Andrew Morton 
Cc: Rik van Riel 
Cc: Peter Zijlstra 
Cc: Michel Lespinasse 
Signed-off-by: Yuanhan Liu 
---
 include/linux/rmap.h |   12 
 mm/huge_memory.c |4 +-
 mm/mmap.c|   46 +++---
 mm/rmap.c|   76 --
 4 files changed, 43 insertions(+), 95 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 6dacb93..f450f84 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -108,34 +108,34 @@ static inline void vma_lock_anon_vma(struct vm_area_struct *vma)
 {
struct anon_vma *anon_vma = vma->anon_vma;
if (anon_vma)
-   down_write(&anon_vma->root->rwsem);
+   down_write(&anon_vma->rwsem);
 }
 
 static inline void vma_unlock_anon_vma(struct vm_area_struct *vma)
 {
struct anon_vma *anon_vma = vma->anon_vma;
if (anon_vma)
-   up_write(&anon_vma->root->rwsem);
+   up_write(&anon_vma->rwsem);
 }
 
 static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
 {
-   down_write(&anon_vma->root->rwsem);
+   down_write(&anon_vma->rwsem);
 }
 
 static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
 {
-   up_write(&anon_vma->root->rwsem);
+   up_write(&anon_vma->rwsem);
 }
 
 static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
 {
-   down_read(&anon_vma->root->rwsem);
+   down_read(&anon_vma->rwsem);
 }
 
 static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
 {
-   up_read(&anon_vma->root->rwsem);
+   up_read(&anon_vma->rwsem);
 }
 
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 610e3df..73cc8ef 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1542,7 +1542,7 @@ static int __split_huge_page_splitting(struct page *page,
 * We can't temporarily set the pmd to null in order
 * to split it, the pmd must remain marked huge at all
 * times or the VM won't take the pmd_trans_huge paths
-* and it won't wait on the anon_vma->root->rwsem to
+* and it won't wait on the anon_vma->rwsem to
 * serialize against split_huge_page*.
 */
pmdp_splitting_flush(vma, address, pmd);
@@ -1747,7 +1747,7 @@ static int __split_huge_page_map(struct page *page,
return ret;
 }
 
-/* must be called with anon_vma->root->rwsem held */
+/* must be called with anon_vma->rwsem held */
 static void __split_huge_page(struct page *page,
  struct anon_vma *anon_vma,
  struct list_head *list)
diff --git a/mm/mmap.c b/mm/mmap.c
index 9d54851..b81d3a3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -414,10 +414,8 @@ void validate_mm(struct mm_struct *mm)
struct vm_area_struct *vma = mm->mmap;
while (vma) {
struct anon_vma_chain *avc;
-   vma_lock_anon_vma(vma);
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
anon_vma_interval_tree_verify(avc);
-   vma_unlock_anon_vma(vma);
highest_address = vma->vm_end;
vma = vma->vm_next;
i++;
@@ -497,15 +495,20 @@ static void vma_rb_e

[PATCH 4/4] mm/rmap.c: move anon_vma initialization code into anon_vma_ctor

2013-11-01 Thread Yuanhan Liu
Cc: Ingo Molnar 
Cc: Linus Torvalds 
Cc: Andrew Morton 
Cc: Rik van Riel 
Cc: Peter Zijlstra 
Cc: Michel Lespinasse 
Signed-off-by: Yuanhan Liu 
---
 mm/rmap.c |   23 +--
 1 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index 246b5fe..831dd4e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -67,19 +67,7 @@ static struct kmem_cache *anon_vma_chain_cachep;
 
 static inline struct anon_vma *anon_vma_alloc(void)
 {
-   struct anon_vma *anon_vma;
-
-   anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
-   if (anon_vma) {
-   atomic_set(&anon_vma->refcount, 1);
-   /*
-* Initialise the anon_vma root to point to itself. If called
-* from fork, the root will be reset to the parents anon_vma.
-*/
-   anon_vma->root = anon_vma;
-   }
-
-   return anon_vma;
+   return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
 }
 
 static inline void anon_vma_free(struct anon_vma *anon_vma)
@@ -293,8 +281,15 @@ static void anon_vma_ctor(void *data)
struct anon_vma *anon_vma = data;
 
rwlock_init(&anon_vma->rwlock);
-   atomic_set(&anon_vma->refcount, 0);
anon_vma->rb_root = RB_ROOT;
+
+   atomic_set(&anon_vma->refcount, 1);
+   /*
+* Initialise the anon_vma root to point to itself. If called
+* from fork, the root will be reset to the parents anon_vma.
+*/
+   anon_vma->root = anon_vma;
+
 }
 
 void __init anon_vma_init(void)
-- 
1.7.7.6



[PATCH 3/4] mm/rmap: cleanup unnecessary code

2013-11-01 Thread Yuanhan Liu
From: Peter Zijlstra 

Quote from Peter: [ edited by Yuanhan Liu ]
You can remove all that -- all that trickery was only needed because the
lock could sleep;
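
As an aside, here is a userspace sketch (C11 atomics; illustrative only)
of the pattern being deleted: with a sleeping lock, a walker had to pin
the object with an atomic_inc_not_zero-style refcount before it could
block, whereas the non-sleeping rwlock taken under RCU keeps the object
safe by itself.

#include <stdatomic.h>
#include <stdbool.h>

struct obj {
	atomic_int refcount;
};

/* the old pattern: pin first, because taking the lock may sleep */
static bool pin_before_sleeping_lock(struct obj *o)
{
	int c = atomic_load(&o->refcount);
	do {
		if (c == 0)
			return false;   /* object already on its way out */
	} while (!atomic_compare_exchange_weak(&o->refcount, &c, c + 1));
	/* now it is safe to block on a sleeping lock */
	return true;
}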

Cc: Ingo Molnar 
Cc: Linus Torvalds 
Cc: Andrew Morton 
Cc: Rik van Riel 
Cc: Michel Lespinasse 
Signed-off-by: Peter Zijlstra 
---
 mm/rmap.c |   71 +++--
 1 files changed, 8 insertions(+), 63 deletions(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index 22e8172..246b5fe 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -85,29 +85,6 @@ static inline struct anon_vma *anon_vma_alloc(void)
 static inline void anon_vma_free(struct anon_vma *anon_vma)
 {
VM_BUG_ON(atomic_read(&anon_vma->refcount));
-
-   /*
-* Synchronize against page_lock_anon_vma_read() such that
-* we can safely hold the lock without the anon_vma getting
-* freed.
-*
-* Relies on the full mb implied by the atomic_dec_and_test() from
-* put_anon_vma() against the acquire barrier implied by
-* down_read_trylock() from page_lock_anon_vma_read(). This orders:
-*
-* page_lock_anon_vma_read()VS  put_anon_vma()
-*   down_read_trylock()  atomic_dec_and_test()
-*   LOCK MB
-*   atomic_read()rwlock_is_locked()
-*
-* LOCK should suffice since the actual taking of the lock must
-* happen _before_ what follows.
-*/
-   if (!write_can_lock(&anon_vma->rwlock)) {
-   anon_vma_lock_write(anon_vma);
-   anon_vma_unlock_write(anon_vma);
-   }
-
kmem_cache_free(anon_vma_cachep, anon_vma);
 }
 
@@ -387,10 +364,6 @@ out:
 
 /*
  * Similar to page_get_anon_vma() except it locks the anon_vma.
- *
- * Its a little more complex as it tries to keep the fast path to a single
- * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
- * reference like with page_get_anon_vma() and then block on the mutex.
  */
 struct anon_vma *page_lock_anon_vma_read(struct page *page)
 {
@@ -405,50 +378,22 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page)
goto out;
 
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
-   if (read_trylock(&anon_vma->rwlock)) {
-   /*
-* If the page is still mapped, then this anon_vma is still
-* its anon_vma, and holding the mutex ensures that it will
-* not go away, see anon_vma_free().
-*/
-   if (!page_mapped(page)) {
-   read_unlock(&anon_vma->rwlock);
-   anon_vma = NULL;
-   }
-   goto out;
-   }
-
-   /* trylock failed, we got to sleep */
-   if (!atomic_inc_not_zero(&anon_vma->refcount)) {
-   anon_vma = NULL;
-   goto out;
-   }
-
-   if (!page_mapped(page)) {
-   put_anon_vma(anon_vma);
-   anon_vma = NULL;
-   goto out;
-   }
-
-   /* we pinned the anon_vma, its safe to sleep */
-   rcu_read_unlock();
anon_vma_lock_read(anon_vma);
 
-   if (atomic_dec_and_test(&anon_vma->refcount)) {
-   /*
-* Oops, we held the last refcount, release the lock
-* and bail -- can't simply use put_anon_vma() because
-* we'll deadlock on the anon_vma_lock_write() recursion.
-*/
+   /*
+* If this page is still mapped, then its anon_vma cannot have been
+* freed.  But if it has been unmapped, we have no security against the
+* anon_vma structure being freed and reused (for another anon_vma:
+* SLAB_DESTROY_BY_RCU guarantees that)
+*/
+   if (!page_mapped(page)) {
anon_vma_unlock_read(anon_vma);
-   __put_anon_vma(anon_vma);
anon_vma = NULL;
}
 
-   return anon_vma;
-
 out:
rcu_read_unlock();
+
return anon_vma;
 }
 
-- 
1.7.7.6



Re: [tip:sched/core] sched, idle: Fix the idle polling state logic

2013-10-20 Thread Yuanhan Liu
On Wed, Sep 25, 2013 at 09:38:19AM -0700, tip-bot for Peter Zijlstra wrote:
> Commit-ID:  ea8117478918a4734586d35ff530721b682425be
> Gitweb: http://git.kernel.org/tip/ea8117478918a4734586d35ff530721b682425be
> Author: Peter Zijlstra 
> AuthorDate: Wed, 11 Sep 2013 12:43:13 +0200
> Committer:  Ingo Molnar 
> CommitDate: Wed, 25 Sep 2013 13:53:10 +0200
> 
> sched, idle: Fix the idle polling state logic
> 
> Mike reported that commit 7d1a9417 ("x86: Use generic idle loop")
> regressed several workloads and caused excessive reschedule
> interrupts.

Hi,

JFYI, this patch does reduce interrupts in our test:

   platform: Sandbridge
   testcase: aim7/creat-clo

   * --  with this patch
   O --  without this patch
interrupts.RES

   [ASCII plot: without the patch (O), interrupts.RES holds steady at
    roughly 1e+08 across the run; with the patch (*), it stays near zero]



vmstat.system.in

   [ASCII plot: without the patch (O), vmstat.system.in sits around 12-16;
    with the patch (*), it stays around 2]
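
The mechanism explained in the quoted commit below -- set TIF_NEED_RESCHED
and skip the wakeup IPI when the idle CPU is polling on its flags word --
is what kills these interrupts. A tiny C11 sketch of the idea (hypothetical
flag names, not the kernel's):

#include <stdatomic.h>
#include <stdbool.h>

enum { F_NEED_RESCHED = 1, F_POLLING = 2 };

/* one idle "cpu" worth of thread-info flags; it idles in polling mode */
static _Atomic int thread_flags = F_POLLING;

/* waker side: an IPI is only needed when the target is not polling */
static bool wake_needs_ipi(void)
{
	int old = atomic_fetch_or(&thread_flags, F_NEED_RESCHED);
	return !(old & F_POLLING);
}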


--yliu

> 
> The patch in question failed to notice that the x86 code had an
> inverted sense of the polling state versus the new generic code (x86:
> default polling, generic: default !polling).
> 
> Fix the two prominent x86 mwait based idle drivers and introduce a few
> new generic polling helpers (fixing the wrong smp_mb__after_clear_bit
> usage).
> 
> Also switch the idle routines to using tif_need_resched() which is an
> immediate TIF_NEED_RESCHED test as opposed to need_resched which will
> end up being slightly different.
> 
> Reported-by: Mike Galbraith 
> Signed-off-by: Peter Zijlstra 
> Cc: l...@kernel.org
> Cc: t...@linutronix.de
> Link: http://lkml.kernel.org/n/tip-nc03imb0etuefmzybzj7s...@git.kernel.org
> Signed-off-by: Ingo Molnar 
> ---
>  arch/x86/kernel/process.c |  6 ++--
>  drivers/acpi/processor_idle.c | 46 ++---
>  drivers/idle/intel_idle.c |  2 +-
>  include/linux/sched.h | 78 +++
>  include/linux/thread_info.h   |  2 ++
>  kernel/cpu/idle.c |  9 +++--
>  6 files changed, 91 insertions(+), 52 deletions(-)
> 
> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
> index c83516b..3fb8d95 100644
> --- a/arch/x86/kernel/process.c
> +++ b/arch/x86/kernel/process.c
> @@ -391,9 +391,9 @@ static void amd_e400_idle(void)
>* The switch back from broadcast mode needs to be
>* called with interrupts disabled.
>*/
> -  local_irq_disable();
> -  clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
> -  local_irq_enable();
> + local_irq_disable();
> + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
> + local_irq_enable();
>   } else
>

Re: [tip:sched/core] sched/balancing: Fix cfs_rq-> task_h_load calculation

2013-09-29 Thread Yuanhan Liu
On Fri, Sep 20, 2013 at 06:46:59AM -0700, tip-bot for Vladimir Davydov wrote:
> Commit-ID:  7e3115ef5149fc502e3a2e80719dba54a8e7409d
> Gitweb: http://git.kernel.org/tip/7e3115ef5149fc502e3a2e80719dba54a8e7409d
> Author: Vladimir Davydov 
> AuthorDate: Sat, 14 Sep 2013 19:39:46 +0400
> Committer:  Ingo Molnar 
> CommitDate: Fri, 20 Sep 2013 11:59:39 +0200
> 
> sched/balancing: Fix cfs_rq->task_h_load calculation
> 
> Patch a003a2 (sched: Consider runnable load average in move_tasks())
> sets all top-level cfs_rqs' h_load to rq->avg.load_avg_contrib, which is
> always 0. This mistype leads to all tasks having weight 0 when load
> balancing in a cpu-cgroup enabled setup. There obviously should be sum
> of weights of all runnable tasks there instead. Fix it.

Hi Vladimir,

FYI, here we found a 17% netperf regression caused by this patch. Here are
some changed stats between this commit, 7e3115ef5149fc502e3a2e80719dba54a8e7409d,
and its parent (3029ede39373c368f402a76896600d85a4f7121b).

NOTE: both commits have been tested 10+ times.

stat                                                                  7e3115ef5149fc502e3a2e80719   3029ede39373c368f402a768966
netperf.Throughput_Mbps                                               [ 2515       - 2593       ]  --  [ 3010       - 3042       ]
lock_stat.&(&base->lock)->rlock.contentions.run_timer_softirq        [ 188        - 261        ]  --  [ 1907       - 2018       ]
lock_stat.&rq->lock.contentions                                       [ 16885      - 23944      ]  --  [ 2.6544e+05 - 2.8201e+05 ]
lock_stat.slock-AF_INET/1.contentions.release_sock                    [ 28993      - 34079      ]  --  [ 2.6537e+05 - 2.7814e+05 ]
lock_stat.slock-AF_INET/1.contentions.tcp_v4_rcv                      [ 54906      - 64453      ]  --  [ 4.6572e+05 - 4.8895e+05 ]
lock_stat.slock-AF_INET/1.contentions                                 [ 54778      - 64265      ]  --  [ 4.6503e+05 - 4.8831e+05 ]
lock_stat.slock-AF_INET/1.contentions.lock_sock_nested                [ 25382      - 29998      ]  --  [ 1.9822e+05 - 2.0934e+05 ]
lock_stat.slock-AF_INET.contentions.lock_sock_nested                  [ 1.5861e+05 - 1.8802e+05 ]  --  [ 1.2317e+06 - 1.3016e+06 ]
lock_stat.slock-AF_INET.contentions.tcp_v4_rcv                        [ 1.9181e+05 - 2.2617e+05 ]  --  [ 1.5482e+06 - 1.6346e+06 ]
lock_stat.slock-AF_INET.contentions                                   [ 1.9259e+05 - 2.269e+05  ]  --  [ 1.5536e+06 - 1.6403e+06 ]
lock_stat.&(&base->lock)->rlock.contentions                           [ 5658       - 9045       ]  --  [ 1.3812e+05 - 1.478e+05  ]
lock_stat.&(&base->lock)->rlock.contentions.lock_timer_base           [ 11006      - 17636      ]  --  [ 2.7183e+05 - 2.9104e+05 ]
lock_stat.slock-AF_INET.contentions.release_sock                      [ 33931      - 39607      ]  --  [ 3.2735e+05 - 3.4512e+05 ]
lock_stat.&(&base->lock)->rlock.contentions.mod_timer                 [ 93         - 152        ]  --  [ 2347       - 2643       ]
lock_stat.&(&zone->lock)->rlock.contentions.__free_pages_ok           [ 6.4647e+07 - 6.6226e+07 ]  --  [ 5.3604e+07 - 5.5065e+07 ]
vmstat.system.in                                                      [ 8921       - 9414       ]  --  [ 27103      - 28369      ]
vmstat.system.cs                                                      [ 1.4924e+05 - 1.9988e+05 ]  --  [ 6.1384e+05 - 6.4036e+05 ]
lock_stat.&(&zone->lock)->rlock.contentions                           [ 6.7612e+07 - 6.9817e+07 ]  --  [ 5.7419e+07 - 5.8889e+07 ]
lock_stat.rcu_node_1.contentions.rcu_process_callbacks                [ 81543      - 87346      ]  --  [ 97955      - 1.0295e+05 ]
iostat.cpu.user                                                       [ 1.4141     - 1.5051     ]  --  [ 2.1044     - 2.1732     ]
lock_stat.&(&zone->lock)->rlock.contentions.get_page_from_freelist    [ 7.0564e+07 - 7.3656e+07 ]  --  [ 6.1222e+07 - 6.2746e+07 ]
lock_stat.&rq->lock.contentions.__schedule                            [ 8276       - 11422      ]  --  [ 1.1656e+05 - 1.9275e+05 ]
iostat.cpu.system                                                     [ 95.387     - 95.516     ]  --  [ 94.736     - 94.81      ]
vmstat.cpu.sy                                                         [ 96         - 96         ]  --  [ 95         - 95         ]



And here are the text plot charts for those changed stats:

* for 7e3115ef5149fc502e3a2e80719dba54a8e7409d(this commit)
O for 3029ede39373c368f402a76896600d85a4f7121b(parent)

   netperf.Throughput_Mbps

   [ASCII plot, truncated in the archive: the parent commit (O) holds
    around 3000-3100 Mbps; the points for this commit (*) are cut off]

Re: [PATCH] x86, mm: fix boot hang regression

2013-05-25 Thread Yuanhan Liu
On Sat, May 25, 2013 at 12:31:43AM -0700, Yinghai Lu wrote:
> On Fri, May 24, 2013 at 9:30 PM, Yuanhan Liu
>  wrote:
> > Commit 8d57470d introduced a kernel panic while setting mem=2G at
> > boot time, and commit c9b3234a6 turned the kernel panic into a hang.
> >
> > The reason is the same: we are accessing a BAD address; I mean
> > the mapping is broken.
> >
> > Here is a mem mapping range dumped at boot time:
> > [mem 0x00000000-0x000fffff] page 4k  (0)
> > [mem 0x7fe00000-0x7fffffff] page 1G  (1)
> > [mem 0x7c000000-0x7fdfffff] page 1G  (2)
> > [mem 0x00100000-0x001fffff] page 4k  (3)
> > [mem 0x00200000-0x7bffffff] page 2M  (4)
> >
> ...
> > I reported this panic regression a long time ago, but I didn't notice the
> > above panic->hang change before, which might have confused Yinghai when
> > trying to understand what happened from the 2 logs I sent before (one is
> > from 8d57470d, the other from the HEAD commit at that time, which had
> > turned into a hang as stated).
> > More, it seems that Yinghai couldn't reproduce it. And I was busy with
> > something else. I finally got a day yesterday (and a good mood ;).
> >
> > Last, thanks to Changlong for his effort bisecting the two commits above.
> > ---
> >  arch/x86/mm/init_64.c |   51 +---
> >  1 files changed, 43 insertions(+), 8 deletions(-)
> 
> oh, I know the reason, my intel box has acpi or reserved area just below 2GiB.

Due to 1GB page mapping support, it seems that this issue does not
exist on platforms before Sandybridge.

> 
> your patch is not right fix.
> 
> Attached patch should fix the problem.

Firstly, your patch works and feel free to add:
Tested-by: Yuanhan Liu 

That said, your patch fixes this issue on the source side. I thought of
that before, too, but I think it's better to fix it on the root side.
That's why I sent a patch trying to fix it at the mapping stage; meanwhile,
I was planning to send another patch later to fix this issue on the source
side. And you did that.

On the other hand, since we support splitting a PUD (and PMD), I guess we
should make it do the right work, just in case we meet such a case later.

So, I still think my patch does fix something and should be merged,
unless I did it wrongly. Please correct me if I'm wrong.

Thanks.

--yliu


[PATCH] x86, mm: fix boot hang regression

2013-05-24 Thread Yuanhan Liu
Commit 8d57470d introduced a kernel panic while setting mem=2G at
boot time, and commit c9b3234a6 turned the kernel panic into a hang.

The reason is the same: we are accessing a BAD address; I mean
the mapping is broken.

Here is a mem mapping range dumped at boot time:
[mem 0x00000000-0x000fffff] page 4k  (0)
[mem 0x7fe00000-0x7fffffff] page 1G  (1)
[mem 0x7c000000-0x7fdfffff] page 1G  (2)
[mem 0x00100000-0x001fffff] page 4k  (3)
[mem 0x00200000-0x7bffffff] page 2M  (4)

We met no problems while setting the memory map for regions (0) to
(3). But we had already set a PG_LEVEL_1G mapping for pud index 0x1 at (1).

And the pud index comes to 0x1 as well while setting the
0x40000000-0x7bf00000 part of (4). What's more, that is a PG_LEVEL_2M
mapping, which results in splitting the PG_LEVEL_1G mapping. This breaks
the former mappings for (1) and (2). At the same time, due to "end" being
set to 0x7c000000, we missed the chance to fix it at phys_pmd_init() in
this code:
	if (address >= end) {
		...
		continue;
	}

Thus, using an extra flag to indicate we are splitting a large PUD (or
PMD) and changing the above if statement to the following makes this
issue go away:
	if (address >= end && !splitting) {
		...
	}
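
A quick userspace check of the pud-index arithmetic (a sketch; the shift
and index helper are redefined locally, not taken from kernel headers)
shows why (1), (2) and the upper part of (4) all land on pud index 0x1:

#include <stdio.h>

#define MODEL_PUD_SHIFT 30                      /* one PUD entry maps 1G */

static unsigned long model_pud_index(unsigned long addr)
{
	return (addr >> MODEL_PUD_SHIFT) & 0x1ff;
}

int main(void)
{
	printf("%lu\n", model_pud_index(0x7fe00000UL)); /* region (1): 1 */
	printf("%lu\n", model_pud_index(0x7c000000UL)); /* region (2): 1 */
	printf("%lu\n", model_pud_index(0x40000000UL)); /* part of (4): 1 */
	return 0;
}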

Reported-by: LKP 
CC: For 3.9+ 
Cc: H. Peter Anvin 
Cc: Yinghai Lu 
Bisected-by: "Xie, ChanglongX" 
Signed-off-by: Yuanhan Liu 

---
I reported this panic regression a long time ago, but I didn't notice the
above panic->hang change before, which might have confused Yinghai when
trying to understand what happened from the 2 logs I sent before (one is
from 8d57470d, the other from the HEAD commit at that time, which had
turned into a hang as stated).
More, it seems that Yinghai couldn't reproduce it. And I was busy with
something else. I finally got a day yesterday (and a good mood ;).

Last, thanks to Changlong for his effort bisecting the two commits above.
---
 arch/x86/mm/init_64.c |   51 +---
 1 files changed, 43 insertions(+), 8 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index bb00c46..e4c7038 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -401,7 +401,7 @@ void __init cleanup_highmap(void)
 
 static unsigned long __meminit
 phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
- pgprot_t prot)
+ pgprot_t prot, bool split_pmd)
 {
unsigned long pages = 0, next;
unsigned long last_map_addr = end;
@@ -411,7 +411,7 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
 
for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) {
next = (addr & PAGE_MASK) + PAGE_SIZE;
-   if (addr >= end) {
+   if (addr >= end && !split_pmd) {
if (!after_bootmem &&
    !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) &&
    !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN))
@@ -446,7 +446,7 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
 
 static unsigned long __meminit
 phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
- unsigned long page_size_mask, pgprot_t prot)
+ unsigned long page_size_mask, pgprot_t prot, bool split_pud)
 {
unsigned long pages = 0, next;
unsigned long last_map_addr = end;
@@ -457,9 +457,10 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
pmd_t *pmd = pmd_page + pmd_index(address);
pte_t *pte;
pgprot_t new_prot = prot;
+   bool split_pmd = false;
 
next = (address & PMD_MASK) + PMD_SIZE;
-   if (address >= end) {
+   if (address >= end && !split_pud) {
if (!after_bootmem &&
    !e820_any_mapped(address & PMD_MASK, next, E820_RAM) &&
    !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN))
@@ -472,7 +473,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
spin_lock(&init_mm.page_table_lock);
pte = (pte_t *)pmd_page_vaddr(*pmd);
last_map_addr = phys_pte_init(pte, address,
-   end, prot);
+   end, prot, split_pmd);
spin_unlock(&init_mm.page_table_lock);
continue;
}
@@ -495,6 +496,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
  

[tip:core/locking] rwsem-spinlock: Implement writer lock-stealing for better scalability

2013-02-22 Thread tip-bot for Yuanhan Liu
Commit-ID:  41ef8f826692c8f65882bec0a8211bd4d1d2d19a
Gitweb: http://git.kernel.org/tip/41ef8f826692c8f65882bec0a8211bd4d1d2d19a
Author: Yuanhan Liu 
AuthorDate: Fri, 1 Feb 2013 18:59:16 +0800
Committer:  Ingo Molnar 
CommitDate: Tue, 19 Feb 2013 08:43:39 +0100

rwsem-spinlock: Implement writer lock-stealing for better scalability

We (Linux Kernel Performance project) found a regression
introduced by commit:

  5a505085f043 mm/rmap: Convert the struct anon_vma::mutex to an rwsem

which converted all anon_vma::mutex locks to rwsem write locks.

The semantics are the same, but the behavioral difference is
quite huge in some cases. After investigating it we found the
root cause: mutexes support lock stealing while rwsems don't.

Here is the link for the detailed regression report:

  https://lkml.org/lkml/2013/1/29/84

Ingo suggested adding write lock stealing to rwsems:

"I think we should allow lock-steal between rwsem writers - that
 will not hurt fairness as most rwsem fairness concerns relate to
 reader vs. writer fairness"

And here is the rwsem-spinlock version.

With this patch, we got a double performance increase in one
test box with following aim7 workfile:

FILESIZE: 1M
POOLSIZE: 10M
10 fork_test

 /usr/bin/time output w/o patch           /usr/bin/time output with patch
 ---------------------------------        ----------------------------------
 Percent of CPU this job got: 369%        Percent of CPU this job got: 537%
 Voluntary context switches: 640595016    Voluntary context switches: 157915561

We got a 45% increase in CPU usage and saved about 3/4 voluntary context 
switches.
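
The behavioural difference is small enough to capture in a few lines.
A simplified, single-threaded sketch (not the lib/rwsem-spinlock.c code;
"activity" follows the kernel convention of 0 = free, -1 = write-locked,
>0 = reader count):

#include <stdbool.h>

struct rwsem_model {
	int activity;                   /* 0 free, -1 writer, >0 readers */
	int nr_waiters;                 /* tasks queued on the wait list */
};

/* fair behaviour (rwsem before this patch): defer to queued waiters */
static bool write_lock_fair(struct rwsem_model *sem)
{
	if (sem->activity == 0 && sem->nr_waiters == 0) {
		sem->activity = -1;
		return true;
	}
	return false;                   /* caller queues itself and sleeps */
}

/* stealing behaviour (this patch; what mutexes already do): a writer
 * may grab a free lock even if other tasks are already queued */
static bool write_lock_stealing(struct rwsem_model *sem)
{
	if (sem->activity == 0) {
		sem->activity = -1;
		return true;
	}
	return false;
}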

Reported-by: LKP project 
Suggested-by: Ingo Molnar 
Signed-off-by: Yuanhan Liu 
Cc: Alex Shi 
Cc: David Howells 
Cc: Michel Lespinasse 
Cc: Linus Torvalds 
Cc: Andrew Morton 
Cc: Peter Zijlstra 
Cc: Anton Blanchard 
Cc: Arjan van de Ven 
Cc: paul.gortma...@windriver.com
Link: 
http://lkml.kernel.org/r/1359716356-23865-1-git-send-email-yuanhan@linux.intel.com
Signed-off-by: Ingo Molnar 
---
 lib/rwsem-spinlock.c | 69 ++--
 1 file changed, 24 insertions(+), 45 deletions(-)

diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c
index 7e0d6a5..7542afb 100644
--- a/lib/rwsem-spinlock.c
+++ b/lib/rwsem-spinlock.c
@@ -73,20 +73,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
goto dont_wake_writers;
}
 
-   /* if we are allowed to wake writers try to grant a single write lock
-* if there's a writer at the front of the queue
-* - we leave the 'waiting count' incremented to signify potential
-*   contention
+   /*
+* as we support write lock stealing, we can't set sem->activity
+* to -1 here to indicate we get the lock. Instead, we wake it up
+* to let it go get it again.
 */
if (waiter->flags & RWSEM_WAITING_FOR_WRITE) {
-   sem->activity = -1;
-   list_del(&waiter->list);
-   tsk = waiter->task;
-   /* Don't touch waiter after ->task has been NULLed */
-   smp_mb();
-   waiter->task = NULL;
-   wake_up_process(tsk);
-   put_task_struct(tsk);
+   wake_up_process(waiter->task);
goto out;
}
 
@@ -121,18 +114,10 @@ static inline struct rw_semaphore *
 __rwsem_wake_one_writer(struct rw_semaphore *sem)
 {
struct rwsem_waiter *waiter;
-   struct task_struct *tsk;
-
-   sem->activity = -1;
 
waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
-   list_del(&waiter->list);
+   wake_up_process(waiter->task);
 
-   tsk = waiter->task;
-   smp_mb();
-   waiter->task = NULL;
-   wake_up_process(tsk);
-   put_task_struct(tsk);
return sem;
 }
 
@@ -204,7 +189,6 @@ int __down_read_trylock(struct rw_semaphore *sem)
 
 /*
  * get a write lock on the semaphore
- * - we increment the waiting count anyway to indicate an exclusive lock
  */
 void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
 {
@@ -214,37 +198,32 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
 
raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
-   if (sem->activity == 0 && list_empty(&sem->wait_list)) {
-   /* granted */
-   sem->activity = -1;
-   raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-   goto out;
-   }
-
-   tsk = current;
-   set_task_state(tsk, TASK_UNINTERRUPTIBLE);
-
/* set up my own style of waitqueue */
+   tsk = current;
waiter.task = tsk;
waiter.flags = RWSEM_WAITING_FOR_WRITE;
-   get_task_struct(tsk);
-
list_add_tail(&waiter.list, &sem->wait_list);
 
-   /* we don't need to touch the sema

[tip:core/locking] locking/stat: Fix a typo

2013-02-22 Thread tip-bot for Yuanhan Liu
Commit-ID:  0be5c8ff58cf7a66019af2f1236daff731ed318c
Gitweb: http://git.kernel.org/tip/0be5c8ff58cf7a66019af2f1236daff731ed318c
Author: Yuanhan Liu 
AuthorDate: Thu, 24 Jan 2013 17:22:44 +0800
Committer:  Ingo Molnar 
CommitDate: Tue, 19 Feb 2013 08:42:37 +0100

locking/stat: Fix a typo

s/STATS/STAT

Signed-off-by: Yuanhan Liu 
Cc: Peter Zijlstra 
Link: 
http://lkml.kernel.org/r/1359019365-23646-1-git-send-email-yuanhan@linux.intel.com
Signed-off-by: Ingo Molnar 
---
 Documentation/lockstat.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/lockstat.txt b/Documentation/lockstat.txt
index cef00d4..dd2f7b2 100644
--- a/Documentation/lockstat.txt
+++ b/Documentation/lockstat.txt
@@ -65,7 +65,7 @@ that had to wait on lock acquisition.
 
  - CONFIGURATION
 
-Lock statistics are enabled via CONFIG_LOCK_STATS.
+Lock statistics are enabled via CONFIG_LOCK_STAT.
 
  - USAGE
 


[tip:core/locking] rwsem-spinlock: Implement writer lock-stealing for better scalability

2013-02-18 Thread tip-bot for Yuanhan Liu
Commit-ID:  5dae63c442131f1b0a66abd43fdc861031f13ca6
Gitweb: http://git.kernel.org/tip/5dae63c442131f1b0a66abd43fdc861031f13ca6
Author: Yuanhan Liu 
AuthorDate: Fri, 1 Feb 2013 18:59:16 +0800
Committer:  Ingo Molnar 
CommitDate: Mon, 18 Feb 2013 10:10:21 +0100

rwsem-spinlock: Implement writer lock-stealing for better scalability

We (Linux Kernel Performance project) found a regression
introduced by commit:

  5a505085f043 mm/rmap: Convert the struct anon_vma::mutex to an rwsem

which converted all anon_vma::mutex locks to rwsem write locks.

The semantics are the same, but the behavioral difference is
quite huge in some cases. After investigating it we found the
root cause: mutexes support lock stealing while rwsems don't.

Here is the link for the detailed regression report:

  https://lkml.org/lkml/2013/1/29/84

Ingo suggested adding write lock stealing to rwsems:

"I think we should allow lock-steal between rwsem writers - that
 will not hurt fairness as most rwsem fairness concerns relate to
 reader vs. writer fairness"

And here is the rwsem-spinlock version.

With this patch, we got a double performance increase in one
test box with following aim7 workfile:

FILESIZE: 1M
POOLSIZE: 10M
10 fork_test

 /usr/bin/time output w/o patch           /usr/bin/time output with patch
 ---------------------------------        ----------------------------------
 Percent of CPU this job got: 369%        Percent of CPU this job got: 537%
 Voluntary context switches: 640595016    Voluntary context switches: 157915561

We got a 45% increase in CPU usage and saved about 3/4 voluntary context 
switches.

Reported-by: LKP project 
Suggested-by: Ingo Molnar 
Signed-off-by: Yuanhan Liu 
Cc: Alex Shi 
Cc: David Howells 
Cc: Michel Lespinasse 
Cc: Linus Torvalds 
Cc: Andrew Morton 
Cc: Peter Zijlstra 
Cc: Anton Blanchard 
Cc: Arjan van de Ven 
Cc: paul.gortma...@windriver.com
Link: 
http://lkml.kernel.org/r/1359716356-23865-1-git-send-email-yuanhan@linux.intel.com
Signed-off-by: Ingo Molnar 
---
 lib/rwsem-spinlock.c | 69 ++--
 1 file changed, 24 insertions(+), 45 deletions(-)

diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c
index 7e0d6a5..7542afb 100644
--- a/lib/rwsem-spinlock.c
+++ b/lib/rwsem-spinlock.c
@@ -73,20 +73,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
goto dont_wake_writers;
}
 
-   /* if we are allowed to wake writers try to grant a single write lock
-* if there's a writer at the front of the queue
-* - we leave the 'waiting count' incremented to signify potential
-*   contention
+   /*
+* as we support write lock stealing, we can't set sem->activity
+* to -1 here to indicate we get the lock. Instead, we wake it up
+* to let it go get it again.
 */
if (waiter->flags & RWSEM_WAITING_FOR_WRITE) {
-   sem->activity = -1;
-   list_del(&waiter->list);
-   tsk = waiter->task;
-   /* Don't touch waiter after ->task has been NULLed */
-   smp_mb();
-   waiter->task = NULL;
-   wake_up_process(tsk);
-   put_task_struct(tsk);
+   wake_up_process(waiter->task);
goto out;
}
 
@@ -121,18 +114,10 @@ static inline struct rw_semaphore *
 __rwsem_wake_one_writer(struct rw_semaphore *sem)
 {
struct rwsem_waiter *waiter;
-   struct task_struct *tsk;
-
-   sem->activity = -1;
 
waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
-   list_del(&waiter->list);
+   wake_up_process(waiter->task);
 
-   tsk = waiter->task;
-   smp_mb();
-   waiter->task = NULL;
-   wake_up_process(tsk);
-   put_task_struct(tsk);
return sem;
 }
 
@@ -204,7 +189,6 @@ int __down_read_trylock(struct rw_semaphore *sem)
 
 /*
  * get a write lock on the semaphore
- * - we increment the waiting count anyway to indicate an exclusive lock
  */
 void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
 {
@@ -214,37 +198,32 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
 
raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
-   if (sem->activity == 0 && list_empty(&sem->wait_list)) {
-   /* granted */
-   sem->activity = -1;
-   raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-   goto out;
-   }
-
-   tsk = current;
-   set_task_state(tsk, TASK_UNINTERRUPTIBLE);
-
/* set up my own style of waitqueue */
+   tsk = current;
waiter.task = tsk;
waiter.flags = RWSEM_WAITING_FOR_WRITE;
-   get_task_struct(tsk);
-
list_add_tail(&waiter.list, &sem->wait_list);
 
-   /* we don't need to touch the sema

Re: [PATCH v2] rwsem-spinlock: let rwsem write lock stealable

2013-02-16 Thread Yuanhan Liu
Hi Ingo, 

Ping...

On Fri, Feb 01, 2013 at 06:59:16PM +0800, Yuanhan Liu wrote:
> We (Linux Kernel Performance project) found a regression introduced by
> commit 5a50508, which just converted all mutex locks to rwsem write locks.
> The semantics are the same, but the difference in results is quite huge
> in some cases. After investigation, we found the root cause: mutexes
> support lock stealing. Here is the link for the detailed regression report:
> https://lkml.org/lkml/2013/1/29/84
> 
> Ingo suggested adding write lock stealing to rwsem as well:
> "I think we should allow lock-steal between rwsem writers - that
>  will not hurt fairness as most rwsem fairness concerns relate to
>  reader vs. writer fairness"
> 
> And here is the rwsem-spinlock version.
> 
> With this patch, we got a double performance increase in one test box
> with following aim7 workfile:
> FILESIZE: 1M
> POOLSIZE: 10M
> 10 fork_test
> 
> some /usr/bin/time output w/o patch      some /usr/bin/time output with patch
> 
> Percent of CPU this job got: 369%        Percent of CPU this job got: 537%
> Voluntary context switches: 640595016    Voluntary context switches: 157915561
> 
> You will see we got a 45% increase in CPU usage and saved about 3/4 of
> the voluntary context switches.
> 
> Here is the .nr_running field for all CPUs from /proc/sched_debug.
> 
> output w/o this patch:
> --
> cpu 00:   0   0   ...   0   0   0   0   0   0   0   1   0   1  0   0
> cpu 01:   0   0   ...   1   0   0   0   0   0   1   1   0   1  0   0
> cpu 02:   0   0   ...   1   1   0   0   0   1   0   0   1   0  1   1
> cpu 03:   0   0   ...   0   1   0   0   0   1   1   0   1   1  0   0
> cpu 04:   0   1   ...   0   0   2   1   1   2   1   0   1   0  1   0
> cpu 05:   0   1   ...   0   0   2   1   1   2   1   1   1   1  0   0
> cpu 06:   0   0   ...   2   0   0   1   0   0   1   0   0   0  0   0
> cpu 07:   0   0   ...   2   0   0   0   1   0   1   1   0   0  1   0
> cpu 08:   0   0   ...   1   0   0   0   1   0   0   1   0   0  0   1
> cpu 09:   0   0   ...   1   0   0   0   1   0   0   1   0   0  0   1
> cpu 10:   0   0   ...   0   0   0   2   0   0   1   0   1   1  1   2
> cpu 11:   0   0   ...   0   0   0   2   2   0   1   0   1   0  1   2
> cpu 12:   0   0   ...   2   0   0   0   1   1   3   1   1   1  1   0
> cpu 13:   0   0   ...   2   0   0   0   1   1   3   1   1   0  1   1
> cpu 14:   0   0   ...   0   0   0   2   0   0   1   1   0   0  1   0
> cpu 15:   0   0   ...   1   0   0   2   0   0   1   1   0   0  0   0
> 
> output with this patch:
> ---
> cpu 00:   0   0   ...   1   1   2   1   1   1   2   1   1   1  1   3
> cpu 01:   0   0   ...   1   1   1   1   1   1   2   1   1   1  1   3
> cpu 02:   0   0   ...   2   2   3   2   0   2   1   2   1   1  1   1
> cpu 03:   0   0   ...   2   2   3   2   1   2   1   2   1   1  1   1
> cpu 04:   0   1   ...   2   0   0   1   0   1   3   1   1   1  1   1
> cpu 05:   0   1   ...   2   0   1   1   0   1   2   1   1   1  1   1
> cpu 06:   0   0   ...   2   1   1   2   0   1   2   1   1   1  2   1
> cpu 07:   0   0   ...   2   1   1   2   0   1   2   1   1   1  2   1
> cpu 08:   0   0   ...   1   1   1   1   1   1   1   1   1   1  0   0
> cpu 09:   0   0   ...   1   1   1   1   1   1   1   1   1   1  0   0
> cpu 10:   0   0   ...   1   1   1   0   0   1   1   1   1   1  0   0
> cpu 11:   0   0   ...   1   1   1   0   0   1   1   1   1   2  1   0
> cpu 12:   0   0   ...   1   1   1   0   1   1   0   0   0   1  2   1
> cpu 13:   0   0   ...   1   1   1   0   1   1   1   0   1   2  2   0
> cpu 14:   0   0   ...   2   0   0   0   0   1   1   1   1   1  2   2
> cpu 15:   0   0   ...   2   0   0   1   0   1   1   1   1   1  2   2
> 
> Where you can see that CPU is much busier with this patch.
> 
> v2: make it stealable at __down_write_trylock as well, as pointed out by Michel
> 
> Reported-by: LKP project 
> Suggested-by: Ingo Molnar 
> Cc: David Howells 
> Cc: Michel Lespinasse 
> Signed-off-by: Yuanhan Liu 
> ---
>  lib/rwsem-spinlock.c |   69 +
>  1 files changed, 24 insertions(+), 45 deletions(-)
> 
> diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c
> index 7e0d6a5..7542afb 100644
> --- a/lib/rwsem-spinlock.c
> +++ b/lib/rwsem-spinlock.c
> @@ -73,20 +73,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewr

[PATCH v2] rwsem-spinlock: let rwsem write lock stealable

2013-02-01 Thread Yuanhan Liu
We (Linux Kernel Performance project) found a regression introduced by
commit 5a50508, which just converted all mutex locks to rwsem write locks.
The semantics are the same, but the difference in results is quite huge
in some cases. After investigation, we found the root cause: mutexes
support lock stealing. Here is the link for the detailed regression report:
https://lkml.org/lkml/2013/1/29/84

Ingo suggested adding write lock stealing to rwsem as well:
"I think we should allow lock-steal between rwsem writers - that
 will not hurt fairness as most rwsem fairness concerns relate to
 reader vs. writer fairness"

And here is the rwsem-spinlock version.

With this patch, we got a double performance increase in one test box
with following aim7 workfile:
FILESIZE: 1M
POOLSIZE: 10M
10 fork_test

some /usr/bin/time output w/o patch      some /usr/bin/time output with patch

Percent of CPU this job got: 369%        Percent of CPU this job got: 537%
Voluntary context switches: 640595016    Voluntary context switches: 157915561

You will see we got a 45% increase in CPU usage and saved about 3/4 of
the voluntary context switches.

Here is the .nr_running field for all CPUs from /proc/sched_debug.

output w/o this patch:
--
cpu 00:   0   0   ...   0   0   0   0   0   0   0   1   0   1  0   0
cpu 01:   0   0   ...   1   0   0   0   0   0   1   1   0   1  0   0
cpu 02:   0   0   ...   1   1   0   0   0   1   0   0   1   0  1   1
cpu 03:   0   0   ...   0   1   0   0   0   1   1   0   1   1  0   0
cpu 04:   0   1   ...   0   0   2   1   1   2   1   0   1   0  1   0
cpu 05:   0   1   ...   0   0   2   1   1   2   1   1   1   1  0   0
cpu 06:   0   0   ...   2   0   0   1   0   0   1   0   0   0  0   0
cpu 07:   0   0   ...   2   0   0   0   1   0   1   1   0   0  1   0
cpu 08:   0   0   ...   1   0   0   0   1   0   0   1   0   0  0   1
cpu 09:   0   0   ...   1   0   0   0   1   0   0   1   0   0  0   1
cpu 10:   0   0   ...   0   0   0   2   0   0   1   0   1   1  1   2
cpu 11:   0   0   ...   0   0   0   2   2   0   1   0   1   0  1   2
cpu 12:   0   0   ...   2   0   0   0   1   1   3   1   1   1  1   0
cpu 13:   0   0   ...   2   0   0   0   1   1   3   1   1   0  1   1
cpu 14:   0   0   ...   0   0   0   2   0   0   1   1   0   0  1   0
cpu 15:   0   0   ...   1   0   0   2   0   0   1   1   0   0  0   0

output with this patch:
---
cpu 00:   0   0   ...   1   1   2   1   1   1   2   1   1   1  1   3
cpu 01:   0   0   ...   1   1   1   1   1   1   2   1   1   1  1   3
cpu 02:   0   0   ...   2   2   3   2   0   2   1   2   1   1  1   1
cpu 03:   0   0   ...   2   2   3   2   1   2   1   2   1   1  1   1
cpu 04:   0   1   ...   2   0   0   1   0   1   3   1   1   1  1   1
cpu 05:   0   1   ...   2   0   1   1   0   1   2   1   1   1  1   1
cpu 06:   0   0   ...   2   1   1   2   0   1   2   1   1   1  2   1
cpu 07:   0   0   ...   2   1   1   2   0   1   2   1   1   1  2   1
cpu 08:   0   0   ...   1   1   1   1   1   1   1   1   1   1  0   0
cpu 09:   0   0   ...   1   1   1   1   1   1   1   1   1   1  0   0
cpu 10:   0   0   ...   1   1   1   0   0   1   1   1   1   1  0   0
cpu 11:   0   0   ...   1   1   1   0   0   1   1   1   1   2  1   0
cpu 12:   0   0   ...   1   1   1   0   1   1   0   0   0   1  2   1
cpu 13:   0   0   ...   1   1   1   0   1   1   1   0   1   2  2   0
cpu 14:   0   0   ...   2   0   0   0   0   1   1   1   1   1  2   2
cpu 15:   0   0   ...   2   0   0   1   0   1   1   1   1   1  2   2

Where you can see that CPU is much busier with this patch.

v2: make it stealable at __down_write_trylock as well, as pointed out by Michel

Reported-by: LKP project 
Suggested-by: Ingo Molnar 
Cc: David Howells 
Cc: Michel Lespinasse 
Signed-off-by: Yuanhan Liu 
---
 lib/rwsem-spinlock.c |   69 +
 1 files changed, 24 insertions(+), 45 deletions(-)

diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c
index 7e0d6a5..7542afb 100644
--- a/lib/rwsem-spinlock.c
+++ b/lib/rwsem-spinlock.c
@@ -73,20 +73,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
goto dont_wake_writers;
}
 
-   /* if we are allowed to wake writers try to grant a single write lock
-* if there's a writer at the front of the queue
-* - we leave the 'waiting count' incremented to signify potential
-*   contention
+   /*
+* as we support write lock stealing, we can't set sem->activity
+* to -1 here to indicate we get the lock. Instead, we wake it up
+* to let it go get it again.

Re: aim7 performance regression by commit 5a50508 report from LKP

2013-02-01 Thread Yuanhan Liu
On Thu, Jan 31, 2013 at 12:22:52PM +0100, Ingo Molnar wrote:
> 
> * Yuanhan Liu  wrote:
> 
> > > or whether the lock hold times could be reduced drastically
> > 
> > I also found one, but it doesn't sound like the one will 
> > reduce lock hold times drastically:
> >
> > vma_lock_anon_vma() seems to cover too much code in
> > expand_up/downwards.
> > 
> > Well, again, it's quite a tiny optimization for reducing the 
> > coverage.
> 
> The fundamental problem is I think that there's a single anon 
> vma lock for the whole workload, right?

Yes.
> 
> Is that really fundamentally needed, could that be spread out 
> perhaps?

I'm digging into this. 

Thanks.

--yliu
> 
> ( But first we want to see how much of the regression we can fix 
>   in an equivalent-locking-patterns fashion - improving locking
>   is a separate matter. )
> 
> Thanks,
> 
>   Ingo


Re: [PATCH] rwsem-spinlock: let rwsem write lock stealable

2013-01-31 Thread Yuanhan Liu
On Thu, Jan 31, 2013 at 10:18:18PM +0100, Ingo Molnar wrote:
> 
> * Yuanhan Liu  wrote:
> 
> > On Thu, Jan 31, 2013 at 02:12:28PM +0100, Ingo Molnar wrote:
> > > 
> > > * Yuanhan Liu  wrote:
> > > 
> > > > BTW, mind to tell a nice test case for mmap_sem?
> > > 
> > > this one was write-hitting on mmap_sem pretty hard, last I 
> > > checked:
> > > 
> > >   http://people.redhat.com/mingo/threaded-mmap-stresstest/
> > 
> > Thanks!
> > 
> > Is there any pass condition? I tested it for a while; at least I
> > found no oops or any noise in the dmesg output. Is that OK?
> 
> Yeah, not crashing and not hanging is the expected behavior.

Good to know.

> 
> > Well, sometimes, it will quit peacefully. Sometimes it will
> > not. ps -eo 'pid,state,wchan,comm' shows that it is sleeping
> > at futex_wait_queue_me().
> > 
> > NOTE: this happens both with and w/o this patch. Thus it may
> > not be an issue introduced by this patch?
> 
> hm, that's unexpected - it's expected to loop infinitely.

Really sorry about that. My bad. I modified the code a bit: removed the two
//'s, so that the thread will exit after count > 100.

Sorry again :(

--yliu

> I have 
> a newer version (attached) - is that exiting too?
> 
> Maybe this triggers spuriously:
> 
> if (!info->si_addr)
> raise(SIGABRT); /* Allow GDB backtrace */
> 
> although then you should see the SIGABRT as an irregular exit 
> IIRC.
> 
> Thanks,
> 
>   Ingo




Re: [PATCH] rwsem-spinlock: let rwsem write lock stealable

2013-01-31 Thread Yuanhan Liu
On Thu, Jan 31, 2013 at 02:12:28PM +0100, Ingo Molnar wrote:
> 
> * Yuanhan Liu  wrote:
> 
> > BTW, mind to tell a nice test case for mmap_sem?
> 
> this one was write-hitting on mmap_sem pretty hard, last I 
> checked:
> 
>   http://people.redhat.com/mingo/threaded-mmap-stresstest/

Thanks!

Is there any pass condition? I tested it for a while; at least I found no
oops or any noise in the dmesg output. Is that OK?

Well, sometimes, it will quit peacefully. Sometimes it will not.
ps -eo 'pid,state,wchan,comm' shows that it is sleeping at
futex_wait_queue_me().

NOTE: this happens both with and w/o this patch. Thus it may not be an
issue introduced by this patch?

Thanks.

--yliu



Re: [PATCH] rwsem-spinlock: let rwsem write lock stealable

2013-01-31 Thread Yuanhan Liu
On Thu, Jan 31, 2013 at 03:57:51AM -0800, Michel Lespinasse wrote:
> On Wed, Jan 30, 2013 at 1:14 AM, Yuanhan Liu
>  wrote:
> > We(Linux Kernel Performance project) found a regression introduced by
> > commit 5a50508, which just convert all mutex lock to rwsem write lock.
> > The semantics is same, but the results is quite huge in some cases.
> > After investigation, we found the root cause: mutex support lock
> > stealing. Here is the link for the detailed regression report:
> > https://lkml.org/lkml/2013/1/29/84
> >
> > Ingo suggests to add write lock stealing to rwsem as well:
> > "I think we should allow lock-steal between rwsem writers - that
> >  will not hurt fairness as most rwsem fairness concerns relate to
> >  reader vs. writer fairness"
> >
> > I then tried it with rwsem-spinlock first as I found it much easier to
> > implement it than lib/rwsem.c. And here I sent out this patch first for
> > comments. I'd try lib/rwsem.c later once the change to rwsem-spinlock
> > is OK to you guys.
> 
> I noticed that you haven't modified __down_write_trylock() - for
> consistency with __down_write() you should replace
> if (sem->activity == 0 && list_empty(&sem->wait_list)) {
> with
> if (sem->activity == 0) {

Yes, my bad for missing that. Thanks a lot for pointing it out. Will fix it.
> 
> Other than that, I like the idea. I was originally uncomfortable with
> doing lock stealing for the rwsem, but I think doing it for writers
> only as you propose should be fine. Readers wait for any queued
> writers, and in exchange they are guaranteed to get the lock once
> they've blocked.

> You *still* want to check for regressions that this
> change might cause - not with anon_vma as this was a mutex not long
> ago, but possibly with mmap_sem

Yes. Well, at least it passed Fengguang's 0-DAY test, which runs lots
of tests on almost all ARCHs. Though you remind me that I only enabled
RWSEM_GENERIC_SPINLOCK for the x86 ARCH, so I need to enable
RWSEM_GENERIC_SPINLOCK for all ARCHs and test again.

BTW, mind telling me a nice test case for mmap_sem?

> - but I'm crossing my fingers and
> thinking that it'll most likely turn out fine.

Thanks!

> 
> I may be able to help with the non-spinlock version of this as I still
> remember how this works.

That would be great! Especially since I will soon be on vacation for
Chinese New Year.


--yliu


Re: [PATCH] rwsem-spinlock: let rwsem write lock stealable

2013-01-31 Thread Yuanhan Liu
On Thu, Jan 31, 2013 at 11:45:41AM +0100, Ingo Molnar wrote:
> 
> * Yuanhan Liu  wrote:
> 
> > > > output with this patch:
> > > > ---
> > > > cpu 00:   0   0   ...   1   1   2   1   1   1   2   1   1   1  1   3
> > > > cpu 01:   0   0   ...   1   1   1   1   1   1   2   1   1   1  1   3
> > > > cpu 02:   0   0   ...   2   2   3   2   0   2   1   2   1   1  1   1
> > > > cpu 03:   0   0   ...   2   2   3   2   1   2   1   2   1   1  1   1
> > > > cpu 04:   0   1   ...   2   0   0   1   0   1   3   1   1   1  1   1
> > > > cpu 05:   0   1   ...   2   0   1   1   0   1   2   1   1   1  1   1
> > > > cpu 06:   0   0   ...   2   1   1   2   0   1   2   1   1   1  2   1
> > > > cpu 07:   0   0   ...   2   1   1   2   0   1   2   1   1   1  2   1
> > > > cpu 08:   0   0   ...   1   1   1   1   1   1   1   1   1   1  0   0
> > > > cpu 09:   0   0   ...   1   1   1   1   1   1   1   1   1   1  0   0
> > > > cpu 10:   0   0   ...   1   1   1   0   0   1   1   1   1   1  0   0
> > > > cpu 11:   0   0   ...   1   1   1   0   0   1   1   1   1   2  1   0
> > > > cpu 12:   0   0   ...   1   1   1   0   1   1   0   0   0   1  2   1
> > > > cpu 13:   0   0   ...   1   1   1   0   1   1   1   0   1   2  2   0
> > > > cpu 14:   0   0   ...   2   0   0   0   0   1   1   1   1   1  2   2
> > > > cpu 15:   0   0   ...   2   0   0   1   0   1   1   1   1   1  2   2
> > > > 
> > > > Where you can see that CPU is much busier with this patch.
> > > 
> > > That looks really good - quite similar to how it behaved 
> > > with mutexes, right?
> > 
> > Yes :)
> > 
> > And the result is almost the same as with the mutex lock when
> > MUTEX_SPIN_ON_OWNER is disabled; that's the reason you see massive
> > numbers of processes (about 100) queued on each CPU in my last report:
> > https://lkml.org/lkml/2013/1/29/84
> 
> Just curious: how does MUTEX_SPIN_ON_OWNER versus 
> !MUTEX_SPIN_ON_OWNER compare, for this particular, 
> massively-contended anon-vma locks benchmark?

In the above testcase, MUTEX_SPIN_ON_OWNER is doing a slightly better
job (like 3%~4%) than !MUTEX_SPIN_ON_OWNER.

> 
> > > Does this recover most of the performance regression?
> > 
> > Yes, there is only a 10% gap here then. I guess that's because 

Sorry, to be accurate, it's about a 14% gap when MUTEX_SPIN_ON_OWNER is
enabled.

> > I used the general rwsem lock implementation (lib/rwsem-spinlock.c),
> > but not the XADD one (lib/rwsem.c). I guess the gap may be a little
> > smaller if we do the same thing to lib/rwsem.c.
> 
> Is part of the gap due to MUTEX_SPIN_ON_OWNER perhaps?

Nope, !MUTEX_SPIN_ON_OWNER does introduce a little performance drop, just
as stated above.

So, to make it clear, here is the list:

lock case                            performance drop compared to mutex lock
---------                            ---------------------------------------
mutex lock w/o MUTEX_SPIN_ON_OWNER   3.x%
rwsem-spinlock with write stealing   14.x%
rwsem-spinlock                       >100%


> 
> I'm surprised that rwsem-spinlock versus rwsem.c would show a 
> 10% performance difference -

Yes, it may not. There is only about a 0.9% performance difference in the
above test between rwsem-spinlock and the XADD rwsem. The difference may be
enlarged when both have write lock stealing enabled, which we will know
only after we do the same thing to lib/rwsem.c.

Thanks.

--yliu

> assuming you have lock 
> debugging/tracing disabled in the .config.
> 
> ( Once the performance regression is fixed, another thing to 
>   check would be to reduce anon-vma lock contention. )
> 
> Thanks,
> 
>   Ingo


Re: [PATCH] rwsem-spinlock: let rwsem write lock stealable

2013-01-31 Thread Yuanhan Liu
On Thu, Jan 31, 2013 at 10:39:31AM +0100, Ingo Molnar wrote:
> 
> * Yuanhan Liu  wrote:
> 
> > We (the Linux Kernel Performance project) found a regression
> > introduced by commit 5a50508, which converts all mutex locks to
> > rwsem write locks. The semantics are the same, but the performance
> > difference is quite large in some cases. After investigation, we
> > found the root cause: mutex supports lock stealing. Here is the
> > link for the detailed regression report:
> > https://lkml.org/lkml/2013/1/29/84
> > 
> > Ingo suggested adding write lock stealing to rwsem as well:
> > "I think we should allow lock-steal between rwsem writers - that
> >  will not hurt fairness as most rwsem fairness concerns relate to
> >  reader vs. writer fairness"
> > 
> > I tried it with rwsem-spinlock first, as I found it much easier to
> > implement than in lib/rwsem.c, and I sent this patch out first for
> > comments. I'd try lib/rwsem.c later once the rwsem-spinlock change
> > is OK to you guys.
> > 
> > With this patch, we got a 2x performance increase on one test box
> > with the following aim7 workfile:
> > FILESIZE: 1M
> > POOLSIZE: 10M
> > 10 fork_test
> > 
> > /usr/bin/time output          w/o patch      with patch
> >
> > Percent of CPU this job got:  369%           537%
> > Voluntary context switches:   640595016      157915561
> > 
> > You can see we got a 45% increase in CPU usage and saved about 3/4
> > of the voluntary context switches.
> > 
> > 
> > Here is the .nr_running field for all CPUs from /proc/sched_debug.
> > 
> > output w/o this patch:
> > --
> > cpu 00:   0   0   ...   0   0   0   0   0   0   0   1   0   1  0   0
> > cpu 01:   0   0   ...   1   0   0   0   0   0   1   1   0   1  0   0
> > cpu 02:   0   0   ...   1   1   0   0   0   1   0   0   1   0  1   1
> > cpu 03:   0   0   ...   0   1   0   0   0   1   1   0   1   1  0   0
> > cpu 04:   0   1   ...   0   0   2   1   1   2   1   0   1   0  1   0
> > cpu 05:   0   1   ...   0   0   2   1   1   2   1   1   1   1  0   0
> > cpu 06:   0   0   ...   2   0   0   1   0   0   1   0   0   0  0   0
> > cpu 07:   0   0   ...   2   0   0   0   1   0   1   1   0   0  1   0
> > cpu 08:   0   0   ...   1   0   0   0   1   0   0   1   0   0  0   1
> > cpu 09:   0   0   ...   1   0   0   0   1   0   0   1   0   0  0   1
> > cpu 10:   0   0   ...   0   0   0   2   0   0   1   0   1   1  1   2
> > cpu 11:   0   0   ...   0   0   0   2   2   0   1   0   1   0  1   2
> > cpu 12:   0   0   ...   2   0   0   0   1   1   3   1   1   1  1   0
> > cpu 13:   0   0   ...   2   0   0   0   1   1   3   1   1   0  1   1
> > cpu 14:   0   0   ...   0   0   0   2   0   0   1   1   0   0  1   0
> > cpu 15:   0   0   ...   1   0   0   2   0   0   1   1   0   0  0   0
> > 
> > output with this patch:
> > ---
> > cpu 00:   0   0   ...   1   1   2   1   1   1   2   1   1   1  1   3
> > cpu 01:   0   0   ...   1   1   1   1   1   1   2   1   1   1  1   3
> > cpu 02:   0   0   ...   2   2   3   2   0   2   1   2   1   1  1   1
> > cpu 03:   0   0   ...   2   2   3   2   1   2   1   2   1   1  1   1
> > cpu 04:   0   1   ...   2   0   0   1   0   1   3   1   1   1  1   1
> > cpu 05:   0   1   ...   2   0   1   1   0   1   2   1   1   1  1   1
> > cpu 06:   0   0   ...   2   1   1   2   0   1   2   1   1   1  2   1
> > cpu 07:   0   0   ...   2   1   1   2   0   1   2   1   1   1  2   1
> > cpu 08:   0   0   ...   1   1   1   1   1   1   1   1   1   1  0   0
> > cpu 09:   0   0   ...   1   1   1   1   1   1   1   1   1   1  0   0
> > cpu 10:   0   0   ...   1   1   1   0   0   1   1   1   1   1  0   0
> > cpu 11:   0   0   ...   1   1   1   0   0   1   1   1   1   2  1   0
> > cpu 12:   0   0   ...   1   1   1   0   1   1   0   0   0   1  2   1
> > cpu 13:   0   0   ...   1   1   1   0   1   1   1   0   1   2  2   0
> > cpu 14:   0   0   ...   2   0   0   0   0   1   1   1   1   1  2   2
> > cpu 15:   0   0   ...   2   0   0   1   0   1   1   1   1   1  2   2
> > 
> > You can see that the CPUs are much busier with this patch.
> 
> That looks really good - quite similar to how it behaved
> with mutexes, right?

[PATCH 1/3] namespaces: utsname: fix wrong comment about clone_uts_ns()

2013-01-31 Thread Yuanhan Liu
Fix the wrong comment about the return value of clone_uts_ns()

Cc: Serge Hallyn 
Signed-off-by: Yuanhan Liu 
---
 kernel/utsname.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/kernel/utsname.c b/kernel/utsname.c
index 08b197e..a47fc5d 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -30,7 +30,7 @@ static struct uts_namespace *create_uts_ns(void)
 /*
  * Clone a new ns copying an original utsname, setting refcount to 1
  * @old_ns: namespace to clone
- * Return NULL on error (failure to kmalloc), new ns otherwise
+ * Return ERR_PTR(-ENOMEM) on error (failure to kmalloc), new ns otherwise
  */
 static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
  struct uts_namespace *old_ns)
-- 
1.7.7.6
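
(The practical consequence of the fixed comment: a caller must check
the return value with IS_ERR(), not against NULL. A hypothetical
caller-side sketch, not part of the patch:)

	new_ns = clone_uts_ns(user_ns, old_ns);
	if (IS_ERR(new_ns))
		return PTR_ERR(new_ns);	/* e.g. -ENOMEM */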



[PATCH 2/3] sysctl: put get/put_uts() into CONFIG_PROC_SYSCTL code block

2013-01-31 Thread Yuanhan Liu
Put get/put_uts() into the CONFIG_PROC_SYSCTL code block, as they are
used only when CONFIG_PROC_SYSCTL is enabled.

Signed-off-by: Yuanhan Liu 
---
 kernel/utsname_sysctl.c |3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 63da38c..4f69f9a 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -15,6 +15,8 @@
 #include 
 #include 
 
+#ifdef CONFIG_PROC_SYSCTL
+
 static void *get_uts(ctl_table *table, int write)
 {
char *which = table->data;
@@ -38,7 +40,6 @@ static void put_uts(ctl_table *table, int write, void *which)
up_write(&uts_sem);
 }
 
-#ifdef CONFIG_PROC_SYSCTL
 /*
  * Special case of dostring for the UTS structure. This has locks
  * to observe. Should this be in kernel/sys.c 
-- 
1.7.7.6



[PATCH 3/3] nsproxy: remove duplicate task_cred_xxx for user_ns

2013-01-31 Thread Yuanhan Liu
We can use user_ns, which was already assigned from task_cred_xxx(tsk,
user_ns) at the beginning of copy_namespaces().

Cc: Serge Hallyn 
Signed-off-by: Yuanhan Liu 
---
 kernel/nsproxy.c |3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 78e2ecb..b781e66 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -153,8 +153,7 @@ int copy_namespaces(unsigned long flags, struct task_struct 
*tsk)
goto out;
}
 
-   new_ns = create_new_namespaces(flags, tsk,
-  task_cred_xxx(tsk, user_ns), tsk->fs);
+   new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
if (IS_ERR(new_ns)) {
err = PTR_ERR(new_ns);
goto out;
-- 
1.7.7.6



Re: aim7 performance regression by commit 5a50508 report from LKP

2013-01-30 Thread Yuanhan Liu
On Tue, Jan 29, 2013 at 10:12:45AM +0100, Ingo Molnar wrote:
> 
> * Yuanhan Liu  wrote:
> 
> > On Tue, Jan 29, 2013 at 09:44:00AM +0100, Ingo Molnar wrote:
> > > 
> > > * Yuanhan Liu  wrote:
> > > 
> > > > [...]
> > > 
> > > Very nice measurements and analysis, thanks!
> > > 
> > > > As stated above, with a mutex anybody has a chance to own the
> > > > lock once somebody releases it. With an rwsem write lock, only
> > > > one task can own the lock next, and it is known already: the
> > > > one at the head of the wait list. That would result in more
> > > > contention in the rwsem write lock case, especially if the task
> > > > that _will_ own the lock is not running now.
> > > 
> > > I think we should allow lock-steal between rwsem writers - that 
> > > will not hurt fairness as most rwsem fairness concerns relate to 
> > > reader vs. writer fairness.
> > 
> > Agreed, and I'm sure this will improve performance and may 
> > make this performance regression go away.
> > 
> > David, is that OK with you? If so, I'll give it a try.
> 
> I'm not David but please try it :-)
> 
> Making rwsem behavior and scalability similar to mutexes would 
> have numerous advantages.
> 
> > > Am I correct to assume that all relevant users in this 
> > > workload are down_write() users?
> > 
> > Yes, as commit 5a50508 just converts all mutex locks to down_write().
> 
> A second track of inquiry would be to see whether any of the key 
> usage sites could be converted to down_read()

I tried before, and it seems I didn't find one.

Well, I did find an irrelevant one: vma_lock_anon_vma() at
validate_mm() in mm/mmap.c. That function does real work only if
CONFIG_DEBUG_VM_RB is set, and there is no vma_lock_anon_vma_read() or
anything similar, so I guess it may not be worth converting.

> or whether the 
> lock hold times could be reduced drastically

I also found one, but it doesn't sound like it would reduce lock hold
times drastically:
   vma_lock_anon_vma() seems to cover too much code in
   expand_upwards()/expand_downwards().

Again, reducing the coverage would be quite a tiny optimization.

Thanks.

--yliu



> - but I doubt 
> that's really possible on such heavily forking workloads.
> 
> Thanks,
> 
>   Ingo


[PATCH] rwsem-spinlock: let rwsem write lock stealable

2013-01-30 Thread Yuanhan Liu
We (the Linux Kernel Performance project) found a regression introduced
by commit 5a50508, which converts all mutex locks to rwsem write locks.
The semantics are the same, but the performance difference is quite
large in some cases. After investigation, we found the root cause:
mutex supports lock stealing. Here is the link for the detailed
regression report:
https://lkml.org/lkml/2013/1/29/84

Ingo suggested adding write lock stealing to rwsem as well:
"I think we should allow lock-steal between rwsem writers - that
 will not hurt fairness as most rwsem fairness concerns relate to
 reader vs. writer fairness"

I tried it with rwsem-spinlock first, as I found it much easier to
implement than in lib/rwsem.c, and I sent this patch out first for
comments. I'd try lib/rwsem.c later once the rwsem-spinlock change
is OK to you guys.
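
To make the intended semantics concrete, here is a minimal userspace
model of a stealable write lock (pthread-based, with made-up names;
this illustrates the idea, it is not the patch itself):

#include <pthread.h>

struct wsem_model {
	pthread_mutex_t wait_lock;
	pthread_cond_t  wait;
	int		activity;	/* 0 = free, -1 = write-locked */
};

/* initialize as:
 * struct wsem_model s = { PTHREAD_MUTEX_INITIALIZER,
 *                         PTHREAD_COND_INITIALIZER, 0 }; */

void wsem_down_write(struct wsem_model *s)
{
	pthread_mutex_lock(&s->wait_lock);
	/* loop instead of waiting for a hand-off: every wakeup races
	 * with newly arriving writers, and whoever sees activity == 0
	 * first wins -- that race is the "steal" */
	while (s->activity != 0)
		pthread_cond_wait(&s->wait, &s->wait_lock);
	s->activity = -1;
	pthread_mutex_unlock(&s->wait_lock);
}

void wsem_up_write(struct wsem_model *s)
{
	pthread_mutex_lock(&s->wait_lock);
	s->activity = 0;		/* release without granting */
	pthread_cond_signal(&s->wait);
	pthread_mutex_unlock(&s->wait_lock);
}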

With this patch, we got a 2x performance increase on one test box with
the following aim7 workfile:
FILESIZE: 1M
POOLSIZE: 10M
10 fork_test

/usr/bin/time output          w/o patch      with patch

Percent of CPU this job got:  369%           537%
Voluntary context switches:   640595016      157915561

You can see we got a 45% increase in CPU usage and saved about 3/4 of
the voluntary context switches.


Here is the .nr_running field for all CPUs from /proc/sched_debug.

output w/o this patch:
--
cpu 00:   0   0   ...   0   0   0   0   0   0   0   1   0   1  0   0
cpu 01:   0   0   ...   1   0   0   0   0   0   1   1   0   1  0   0
cpu 02:   0   0   ...   1   1   0   0   0   1   0   0   1   0  1   1
cpu 03:   0   0   ...   0   1   0   0   0   1   1   0   1   1  0   0
cpu 04:   0   1   ...   0   0   2   1   1   2   1   0   1   0  1   0
cpu 05:   0   1   ...   0   0   2   1   1   2   1   1   1   1  0   0
cpu 06:   0   0   ...   2   0   0   1   0   0   1   0   0   0  0   0
cpu 07:   0   0   ...   2   0   0   0   1   0   1   1   0   0  1   0
cpu 08:   0   0   ...   1   0   0   0   1   0   0   1   0   0  0   1
cpu 09:   0   0   ...   1   0   0   0   1   0   0   1   0   0  0   1
cpu 10:   0   0   ...   0   0   0   2   0   0   1   0   1   1  1   2
cpu 11:   0   0   ...   0   0   0   2   2   0   1   0   1   0  1   2
cpu 12:   0   0   ...   2   0   0   0   1   1   3   1   1   1  1   0
cpu 13:   0   0   ...   2   0   0   0   1   1   3   1   1   0  1   1
cpu 14:   0   0   ...   0   0   0   2   0   0   1   1   0   0  1   0
cpu 15:   0   0   ...   1   0   0   2   0   0   1   1   0   0  0   0

output with this patch:
---
cpu 00:   0   0   ...   1   1   2   1   1   1   2   1   1   1  1   3
cpu 01:   0   0   ...   1   1   1   1   1   1   2   1   1   1  1   3
cpu 02:   0   0   ...   2   2   3   2   0   2   1   2   1   1  1   1
cpu 03:   0   0   ...   2   2   3   2   1   2   1   2   1   1  1   1
cpu 04:   0   1   ...   2   0   0   1   0   1   3   1   1   1  1   1
cpu 05:   0   1   ...   2   0   1   1   0   1   2   1   1   1  1   1
cpu 06:   0   0   ...   2   1   1   2   0   1   2   1   1   1  2   1
cpu 07:   0   0   ...   2   1   1   2   0   1   2   1   1   1  2   1
cpu 08:   0   0   ...   1   1   1   1   1   1   1   1   1   1  0   0
cpu 09:   0   0   ...   1   1   1   1   1   1   1   1   1   1  0   0
cpu 10:   0   0   ...   1   1   1   0   0   1   1   1   1   1  0   0
cpu 11:   0   0   ...   1   1   1   0   0   1   1   1   1   2  1   0
cpu 12:   0   0   ...   1   1   1   0   1   1   0   0   0   1  2   1
cpu 13:   0   0   ...   1   1   1   0   1   1   1   0   1   2  2   0
cpu 14:   0   0   ...   2   0   0   0   0   1   1   1   1   1  2   2
cpu 15:   0   0   ...   2   0   0   1   0   1   1   1   1   1  2   2

You can see that the CPUs are much busier with this patch.

Suggested-by: Ingo Molnar 
Cc: David Howells 
Signed-off-by: Yuanhan Liu 
---
 lib/rwsem-spinlock.c |   65 +-
 1 files changed, 22 insertions(+), 43 deletions(-)

diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c
index 7e0d6a5..a32b85e 100644
--- a/lib/rwsem-spinlock.c
+++ b/lib/rwsem-spinlock.c
@@ -73,20 +73,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
goto dont_wake_writers;
}
 
-   /* if we are allowed to wake writers try to grant a single write lock
-* if there's a writer at the front of the queue
-* - we leave the 'waiting count' incremented to signify potential
-*   contention
+   /*
+* as we support write lock stealing, we can't set sem->activity
+* to -1 here to indicate we get the lock
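
(For reference, under write stealing the wake path for a queued writer
reduces to just waking it, no longer granting it the lock. Roughly,
and hedged since the diff above is truncated here:)

static inline struct rw_semaphore *
__rwsem_wake_one_writer(struct rw_semaphore *sem)
{
	struct rwsem_waiter *waiter;

	waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
	/* just wake it; the writer re-tries the grab itself and may
	 * lose to a stealing newcomer */
	wake_up_process(waiter->task);
	return sem;
}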

Re: aim7 performance regression by commit 5a50508 report from LKP

2013-01-29 Thread Yuanhan Liu
On Tue, Jan 29, 2013 at 09:44:00AM +0100, Ingo Molnar wrote:
> 
> * Yuanhan Liu  wrote:
> 
> > [...]
> 
> Very nice measurements and analysis, thanks!
> 
> > As stated above, with a mutex anybody has a chance to own the lock
> > once somebody releases it. With an rwsem write lock, only one task
> > can own the lock next, and it is known already: the one at the head
> > of the wait list. That would result in more contention in the rwsem
> > write lock case, especially if the task that _will_ own the lock is
> > not running now.
> 
> I think we should allow lock-steal between rwsem writers - that 
> will not hurt fairness as most rwsem fairness concerns relate to 
> reader vs. writer fairness.

Agreed, and I'm sure this will improve performance and may make this
performance regression go away. 

David, is that OK with you? If so, I'll give it a try.

> 
> Am I correct to assume that all relevant users in this workload 
> are down_write() users?

Yes, as commit 5a50508 just converts all mutex locks to down_write().

Thanks.

--yliu
> 
> You can see the type of lock use in:
> 
>perf record -g
>perf report
> 
> I bet that allowing rwsem writer lock-steal would improve other 
> workloads as well.


aim7 performance regression by commit 5a50508 report from LKP

2013-01-29 Thread Yuanhan Liu
Hi,

Here we, LKP (Linux Kernel Performance, a project that runs performance
benchmarks against the Linux kernel and tries to find and fix
performance regressions), found a 10%-20% performance regression in the
aim7 benchmark introduced by commit 5a50508: "mm/rmap: Convert
the struct anon_vma::mutex to an rwsem".

It is quite easy to reproduce this regression with the workfile.shared
workload of the aim7 benchmark[0]. Moreover, you will see a more
obvious performance drop by setting fork_test as the only workload;
performance drops 2-4x on our test boxes.

And here is the workfile with which you can see an obvious performance drop:

--- workfile --
FILESIZE: 1M
POOLSIZE: 10M
10 fork_test
---

Commit 5a50508 just converts all mutex locks to down_write(), which, in
my opinion, has the same semantics. Yet the result is quite different
on the above workload.

I tried to figure out the root cause, ran some tests, and collected
some performance counters. For example, here is some /usr/bin/time
output:

commit d28d433(good one, the commit before 5a50508):
-
  User time (seconds): 76.12
  System time (seconds): 4387.35
  Percent of CPU this job got: 599%
  Voluntary context switches: 191800723
  Involuntary context switches: 41312548

commit 5a50508(bad one):

  User time (seconds): 98.20
  System time (seconds): 5623.07
  Percent of CPU this job got: 373%
  Voluntary context switches: 615474680
  Involuntary context switches: 20388913


You can see that:
 1. The CPU is much idler at commit 5a50508 than at d28d433 (373% vs 599%).

 2. Commit 5a50508 involves more voluntary context switches than
    d28d433 (615474680 vs 191800723), about 3.2 times as many.


The performance drop and the above data troubled me a lot, as I had
thought the rwsem write lock and the mutex were the same.

I then checked the difference between the mutex and rwsem
implementations, and found the key difference, which I guess is the
root cause:

It's about the condition on which we quit the for(;;) loop; more
accurately, how we detect that we got the lock.

mutex side
==
In __mutex_lock_common:

for (;;) {
	/* anyone -- queued or not -- who sees count == 1 takes the lock */
	if (atomic_xchg(&lock->count, -1) == 1)
		break;
}


In __mutex_unlock_common_slowpath

atomic_set(&lock->count, 1);
wake_up_process(waiter->task);


rwsem side
==
In rwsem_down_failed_common(lock part):

waiter.task = current;

for (;;) {
	/* only the releaser clears waiter.task, so only the waiter at
	 * the head of the list can ever pass this test */
	if (!waiter.task)
		break;
}

In __rwsem_do_wake(unlock part):

list_del(&waiter->list);
tsk = waiter->task;
smp_mb();
waiter->task = NULL;
wake_up_process(tsk);


Summary
===
In a word: the mutex uses lock->count to detect whether we can get the
lock, while the rwsem checks waiter->task. And that is the key
difference.

Say processes A and B try to get the mutex in that order. A gets it
first, and B blocks (it goes to sleep by voluntarily calling
schedule()). After a while, A releases the lock by setting lock->count
to 1 and calling wake_up_process() to wake up B. But before B is woken
up, another process C tries to get the mutex, and C will find the
following condition true:

if (atomic_xchg(&lock->count, -1) == 1)
break;

Then C gets the lock, bypassing B. In that case, one schedule is saved.

With rwsem, however, you will see no such lock-bypassing, as all
processes are put into the wait list in order and woken up in order.

And that's why you see more voluntary context switches with the rwsem
write lock than with the mutex.
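
The two exit conditions can be modeled in a few lines of compilable
userspace C (C11 atomics; the names are made up for illustration):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

/* mutex style: the releaser sets count back to 1 and wakes the head
 * waiter, but ANY task that runs first can flip it to -1 and win. */
static atomic_int count = ATOMIC_VAR_INIT(1);

static bool mutex_style_try_acquire(void)
{
	return atomic_exchange(&count, -1) == 1;	/* C can win here */
}

/* rwsem style: the releaser picks the head waiter and clears its
 * ->task; only that waiter's condition ever becomes true, so a
 * latecomer cannot take the lock even while the head is asleep. */
struct rwsem_waiter_model {
	_Atomic(void *) task;
};

static bool rwsem_style_got_lock(struct rwsem_waiter_model *w)
{
	return atomic_load(&w->task) == NULL;
}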

"This can't explain '1. CPU is much idler at commit 5a50508 than
d28d433 (373% vs 599%)' well", added by Huang Ying. He suggests to look
this issue from another side: who owns the lock.

As stated above, anybody can have a chance to own the lock in mutex once
somebody release the lock. Well, there is only one to own the lock in
rwsem write lock, and the one is known already: the one in the head of
wait list. That would result to more contention in rwsem write lock case,
especially if the one _will_ own the lock is not running now.

He also suggested getting some scheduler data, so I sampled some data
from /proc/sched_debug. The following is a list of the .nr_running
field for all CPUs:

commit d28d433 (the good one):
(sampled every 2 seconds; only a few samples are shown)
->
cpu 00:   0   0   ... 102 103  99 110 105 100 104 108 106  98 103 103   98  40  38
cpu 01:   0   0   ... 103 103  99 110 106 100 104 108 106  98 103 103   98  40  38
cpu 02:   0   0   ... 106 112 114 104 109 112 107 105 110 109 104 102  109  41  42
cpu 03:   0   0   ... 106 112 114 104 109 113 107 106 110 109 104 102  109  41  42
cpu 04:   0   0   ...  95 106 102  98 102 102 102 109 105  93  91 103   98  39  38

[tip:core/locking] locking/stat: Fix a typo

2013-01-24 Thread tip-bot for Yuanhan Liu
Commit-ID:  5b43715b9d27e4e6620264113169bb8e4e607205
Gitweb: http://git.kernel.org/tip/5b43715b9d27e4e6620264113169bb8e4e607205
Author: Yuanhan Liu 
AuthorDate: Thu, 24 Jan 2013 17:22:44 +0800
Committer:  Ingo Molnar 
CommitDate: Thu, 24 Jan 2013 10:59:43 +0100

locking/stat: Fix a typo

s/STATS/STAT

Signed-off-by: Yuanhan Liu 
Cc: Peter Zijlstra 
Link: 
http://lkml.kernel.org/r/1359019365-23646-1-git-send-email-yuanhan@linux.intel.com
Signed-off-by: Ingo Molnar 
---
 Documentation/lockstat.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/lockstat.txt b/Documentation/lockstat.txt
index cef00d4..dd2f7b2 100644
--- a/Documentation/lockstat.txt
+++ b/Documentation/lockstat.txt
@@ -65,7 +65,7 @@ that had to wait on lock acquisition.
 
  - CONFIGURATION
 
-Lock statistics are enabled via CONFIG_LOCK_STATS.
+Lock statistics are enabled via CONFIG_LOCK_STAT.
 
  - USAGE
 


Re: [PATCH 2/2] mutex: use spin_[un]lock instead of arch_spin_[un]lock

2013-01-24 Thread Yuanhan Liu
On Thu, Jan 24, 2013 at 11:14:50AM +0100, Ingo Molnar wrote:
> 
> * Yuanhan Liu  wrote:
> 
> > On Thu, Jan 24, 2013 at 10:58:07AM +0100, Ingo Molnar wrote:
> > > 
> > > * Yuanhan Liu  wrote:
> > > 
> > > > Use spin_[un]lock instead of arch_spin_[un]lock in mutex-debug.h so
> > > > that we can collect the lock statistics of spin_lock_mutex from
> > > > /proc/lock_stat.
> > > > 
> > > > Cc: Ingo Molnar 
> > > > Signed-off-by: Yuanhan Liu 
> > > > ---
> > > >  kernel/mutex-debug.h |4 ++--
> > > >  1 files changed, 2 insertions(+), 2 deletions(-)
> > > > 
> > > > diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
> > > > index 0799fd3..556c0bc 100644
> > > > --- a/kernel/mutex-debug.h
> > > > +++ b/kernel/mutex-debug.h
> > > > @@ -43,13 +43,13 @@ static inline void mutex_clear_owner(struct mutex 
> > > > *lock)
> > > > \
> > > > DEBUG_LOCKS_WARN_ON(in_interrupt());\
> > > > local_irq_save(flags);  \
> > > > -   arch_spin_lock(&(lock)->rlock.raw_lock);\
> > > > +   spin_lock(lock);\
> > > 
> > > But in that case it could probably use the spin_lock_irqsave() 
> > > primitive, right?
> > 
> > Right, in that case I should use spin_lock_irqsave.
> > 
> > But one question: why do we use spin_lock() in kernel/mutex.h,
> > but 'local_irq_save(); arch_spin_lock()' in
> > kernel/mutex-debug.h?
> > 
> > Shouldn't we keep it consistent? Say, use spin_lock_irqsave()?
> 
> I think we did it to increase performance with lockdep enabled - 
> this particular lockdep annotation, given the short codepaths, 
> is not that hard to verify - and if it breaks it will break a 
> thousand mutex locking places in the kernel.

Thanks for the explanation.
> 
> So maybe it's better to leave it alone - maybe add a comment 
> that explains the reason.

Sorry, I may not have gotten your point clearly. Should I make another
patch to convert 'local_irq_save(..); arch_spin_lock(..);' in
kernel/mutex-debug.h to spin_lock_irqsave() then?
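
Something like the following, I suppose (an untested sketch of that
conversion, based on the macros shown in the diff above):

#define spin_lock_mutex(lock, flags)				\
	do {							\
		struct mutex *l = container_of(lock, struct mutex, wait_lock); \
								\
		DEBUG_LOCKS_WARN_ON(in_interrupt());		\
		spin_lock_irqsave(lock, flags);			\
		DEBUG_LOCKS_WARN_ON(l->magic != l);		\
	} while (0)

#define spin_unlock_mutex(lock, flags)				\
	do {							\
		spin_unlock_irqrestore(lock, flags);		\
		preempt_check_resched();			\
	} while (0)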

Thanks.

--yliu


Re: [PATCH 2/2] mutex: use spin_[un]lock instead of arch_spin_[un]lock

2013-01-24 Thread Yuanhan Liu
On Thu, Jan 24, 2013 at 10:58:07AM +0100, Ingo Molnar wrote:
> 
> * Yuanhan Liu  wrote:
> 
> > Use spin_[un]lock instead of arch_spin_[un]lock in mutex-debug.h so
> > that we can collect the lock statistics of spin_lock_mutex from
> > /proc/lock_stat.
> > 
> > Cc: Ingo Molnar 
> > Signed-off-by: Yuanhan Liu 
> > ---
> >  kernel/mutex-debug.h |4 ++--
> >  1 files changed, 2 insertions(+), 2 deletions(-)
> > 
> > diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
> > index 0799fd3..556c0bc 100644
> > --- a/kernel/mutex-debug.h
> > +++ b/kernel/mutex-debug.h
> > @@ -43,13 +43,13 @@ static inline void mutex_clear_owner(struct mutex *lock)
> > \
> > DEBUG_LOCKS_WARN_ON(in_interrupt());\
> > local_irq_save(flags);  \
> > -   arch_spin_lock(&(lock)->rlock.raw_lock);\
> > +   spin_lock(lock);\
> 
> But in that case it could probably use the spin_lock_irqsave() 
> primitive, right?

Right, in that case I should use spin_lock_irqsave.

But one question: why do we use spin_lock() in kernel/mutex.h, but
'local_irq_save(); arch_spin_lock()' in kernel/mutex-debug.h?

Shouldn't we keep it consistent? Say, use spin_lock_irqsave()?

Thanks.

--yliu
> 
> > DEBUG_LOCKS_WARN_ON(l->magic != l); \
> > } while (0)
> >  
> >  #define spin_unlock_mutex(lock, flags) \
> > do {\
> > -   arch_spin_unlock(&(lock)->rlock.raw_lock);  \
> > +   spin_unlock(lock);  \
> > local_irq_restore(flags);   \
> > preempt_check_resched();\
> 
> And here spin_unlock_irqrestore().
> 
> Thanks,
> 
>   Ingo

