Re: [PATCH 2/2] fs/ext4/namei.c: reducing contention on s_orphan_lock mmutex

2013-10-03 Thread Thavatchai Makphaibulchoke
On 10/03/2013 06:41 PM, Andreas Dilger wrote:

>> +struct inode *next_inode;
> 
> Stack space in the kernel is not so abundant that all (or any?) of these
> should get their own local variable.
> 
>>
>> -if (!EXT4_SB(sb)->s_journal)
> 
> Same here.
> 
> 
> Cheers, Andreas

Thanks Andreas for the comments.  On larger machines with processors with lots 
of register, with the compiler optimization I don't think it matters whether 
stack variables or repeated common expressions are used.  On smaller machines 
with processors with limited number of registers, this will be a problem.  I'll 
fix these on my rework.

Thanks,
Mak.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] fs/ext4/namei.c: reducing contention on s_orphan_lock mmutex

2013-10-03 Thread Andreas Dilger

On 2013-10-02, at 9:36 AM, T Makphaibulchoke wrote:

> Instead of using a single per super block mutex, s_orphan_lock, to serialize
> all orphan list updates, a separate mutex and spinlock are used to
> protect the on disk and in memory orphan lists respecvitely.
> 
> At the same time, a per inode mutex is used to serialize orphan list
> updates of a single inode.
> 
> Signed-off-by: T. Makphaibulchoke 
> ---
> fs/ext4/namei.c | 139 
> 1 file changed, 100 insertions(+), 39 deletions(-)
> 
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index 35f55a0..d7d3d0f 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -2554,12 +2554,24 @@ int ext4_orphan_add(handle_t *handle, struct inode 
> *inode)
>   struct super_block *sb = inode->i_sb;
>   struct ext4_iloc iloc;
>   int err = 0, rc;
> + struct ext4_inode_info *ext4_inode = EXT4_I(inode);
> + struct ext4_sb_info *ext4_sb = EXT4_SB(sb);
> + struct mutex *inode_orphan_mutex;
> + spinlock_t *orphan_lock;
> + struct mutex *ondisk_orphan_mutex;
> + struct list_head *prev;
> + unsigned int next_inum;
> + struct inode *next_inode;

Stack space in the kernel is not so abundant that all (or any?) of these
should get their own local variable.

> 
> - if (!EXT4_SB(sb)->s_journal)
> + if (ext4_sb->s_journal)
>   return 0;
> 
> - mutex_lock(_SB(sb)->s_orphan_lock);
> - if (!list_empty(_I(inode)->i_orphan))
> + inode_orphan_mutex = _inode->i_orphan_mutex;
> + orphan_lock = _sb->s_orphan_lock;
> + ondisk_orphan_mutex = _sb->s_ondisk_orphan_lock;
> + mutex_lock(inode_orphan_mutex);
> + prev = ext4_inode->i_orphan.prev;
> + if (prev != _inode->i_orphan)
>   goto out_unlock;
> 
>   /*
> @@ -2579,17 +2591,28 @@ int ext4_orphan_add(handle_t *handle, struct inode 
> *inode)
>   err = ext4_reserve_inode_write(handle, inode, );
>   if (err)
>   goto out_unlock;
> + mutex_lock(ondisk_orphan_mutex);
>   /*
>* Due to previous errors inode may be already a part of on-disk
>* orphan list. If so skip on-disk list modification.
>*/
>   if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
> - (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
> + (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) {
> + mutex_unlock(ondisk_orphan_mutex);
>   goto mem_insert;
> + }
> 
>   /* Insert this inode at the head of the on-disk orphan list... */
> - NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
> - EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
> + next_inum = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
> + NEXT_ORPHAN(inode) = next_inum;
> + next_inode = ilookup(sb, next_inum);
> + if (next_inode)
> + EXT4_I(next_inode)->i_prev_orphan = inode->i_ino;
> + ext4_sb->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
> + ext4_inode->i_prev_orphan = 0;
> + mutex_unlock(ondisk_orphan_mutex);
> + if (next_inode)
> + iput(next_inode);
>   err = ext4_handle_dirty_super(handle, sb);
>   rc = ext4_mark_iloc_dirty(handle, inode, );
>   if (!err)
> @@ -2604,14 +2627,17 @@ int ext4_orphan_add(handle_t *handle, struct inode 
> *inode)
>* This is safe: on error we're going to ignore the orphan list
>* anyway on the next recovery. */
> mem_insert:
> - if (!err)
> - list_add(_I(inode)->i_orphan, _SB(sb)->s_orphan);
> + if (!err) {
> + spin_lock(orphan_lock);
> + list_add(_inode->i_orphan, _sb->s_orphan);
> + spin_unlock(orphan_lock);
> + }
> 
>   jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
>   jbd_debug(4, "orphan inode %lu will point to %d\n",
>   inode->i_ino, NEXT_ORPHAN(inode));
> out_unlock:
> - mutex_unlock(_SB(sb)->s_orphan_lock);
> + mutex_unlock(inode_orphan_mutex);
>   ext4_std_error(inode->i_sb, err);
>   return err;
> }
> @@ -2624,71 +2650,106 @@ int ext4_orphan_del(handle_t *handle, struct inode 
> *inode)
> {
>   struct list_head *prev;
>   struct ext4_inode_info *ei = EXT4_I(inode);
> - struct ext4_sb_info *sbi;
> + struct inode *i_next = NULL;
> + struct inode *i_prev = NULL;
> + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>   __u32 ino_next;
> + unsigned int ino_prev;
>   struct ext4_iloc iloc;
>   int err = 0;
> + struct mutex *inode_orphan_mutex;
> + spinlock_t *orphan_lock;
> + struct mutex *ondisk_orphan_mutex;

Same here.

> - if ((!EXT4_SB(inode->i_sb)->s_journal) &&
> - !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS))
> + if ((!sbi->s_journal) &&
> + !(sbi->s_mount_state & EXT4_ORPHAN_FS))
>   return 0;
> 
> - 

Re: [PATCH 2/2] fs/ext4/namei.c: reducing contention on s_orphan_lock mmutex

2013-10-03 Thread Thavatchai Makphaibulchoke
On 10/02/2013 08:05 PM, Zheng Liu wrote:
> Hello,
> 

>> -if (!EXT4_SB(sb)->s_journal)
>> +if (ext4_sb->s_journal)
> 
> typo: !ext4_sb->s_journal
> I am not sure whether or not this will impact the result because when
> journal is enabled the inode will not be added into orphan list.
> 
> Regards,
> - Zheng
> 
Thanks Zheng for pointing out the problem.  Will fix the problem and rerun the 
test.

Thanks,
Mak.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] fs/ext4/namei.c: reducing contention on s_orphan_lock mmutex

2013-10-03 Thread Thavatchai Makphaibulchoke
On 10/02/2013 08:05 PM, Zheng Liu wrote:
 Hello,
 

 -if (!EXT4_SB(sb)-s_journal)
 +if (ext4_sb-s_journal)
 
 typo: !ext4_sb-s_journal
 I am not sure whether or not this will impact the result because when
 journal is enabled the inode will not be added into orphan list.
 
 Regards,
 - Zheng
 
Thanks Zheng for pointing out the problem.  Will fix the problem and rerun the 
test.

Thanks,
Mak.


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] fs/ext4/namei.c: reducing contention on s_orphan_lock mmutex

2013-10-03 Thread Andreas Dilger

On 2013-10-02, at 9:36 AM, T Makphaibulchoke wrote:

 Instead of using a single per super block mutex, s_orphan_lock, to serialize
 all orphan list updates, a separate mutex and spinlock are used to
 protect the on disk and in memory orphan lists respecvitely.
 
 At the same time, a per inode mutex is used to serialize orphan list
 updates of a single inode.
 
 Signed-off-by: T. Makphaibulchoke t...@hp.com
 ---
 fs/ext4/namei.c | 139 
 1 file changed, 100 insertions(+), 39 deletions(-)
 
 diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
 index 35f55a0..d7d3d0f 100644
 --- a/fs/ext4/namei.c
 +++ b/fs/ext4/namei.c
 @@ -2554,12 +2554,24 @@ int ext4_orphan_add(handle_t *handle, struct inode 
 *inode)
   struct super_block *sb = inode-i_sb;
   struct ext4_iloc iloc;
   int err = 0, rc;
 + struct ext4_inode_info *ext4_inode = EXT4_I(inode);
 + struct ext4_sb_info *ext4_sb = EXT4_SB(sb);
 + struct mutex *inode_orphan_mutex;
 + spinlock_t *orphan_lock;
 + struct mutex *ondisk_orphan_mutex;
 + struct list_head *prev;
 + unsigned int next_inum;
 + struct inode *next_inode;

Stack space in the kernel is not so abundant that all (or any?) of these
should get their own local variable.

 
 - if (!EXT4_SB(sb)-s_journal)
 + if (ext4_sb-s_journal)
   return 0;
 
 - mutex_lock(EXT4_SB(sb)-s_orphan_lock);
 - if (!list_empty(EXT4_I(inode)-i_orphan))
 + inode_orphan_mutex = ext4_inode-i_orphan_mutex;
 + orphan_lock = ext4_sb-s_orphan_lock;
 + ondisk_orphan_mutex = ext4_sb-s_ondisk_orphan_lock;
 + mutex_lock(inode_orphan_mutex);
 + prev = ext4_inode-i_orphan.prev;
 + if (prev != ext4_inode-i_orphan)
   goto out_unlock;
 
   /*
 @@ -2579,17 +2591,28 @@ int ext4_orphan_add(handle_t *handle, struct inode 
 *inode)
   err = ext4_reserve_inode_write(handle, inode, iloc);
   if (err)
   goto out_unlock;
 + mutex_lock(ondisk_orphan_mutex);
   /*
* Due to previous errors inode may be already a part of on-disk
* orphan list. If so skip on-disk list modification.
*/
   if (NEXT_ORPHAN(inode)  NEXT_ORPHAN(inode) =
 - (le32_to_cpu(EXT4_SB(sb)-s_es-s_inodes_count)))
 + (le32_to_cpu(EXT4_SB(sb)-s_es-s_inodes_count))) {
 + mutex_unlock(ondisk_orphan_mutex);
   goto mem_insert;
 + }
 
   /* Insert this inode at the head of the on-disk orphan list... */
 - NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)-s_es-s_last_orphan);
 - EXT4_SB(sb)-s_es-s_last_orphan = cpu_to_le32(inode-i_ino);
 + next_inum = le32_to_cpu(EXT4_SB(sb)-s_es-s_last_orphan);
 + NEXT_ORPHAN(inode) = next_inum;
 + next_inode = ilookup(sb, next_inum);
 + if (next_inode)
 + EXT4_I(next_inode)-i_prev_orphan = inode-i_ino;
 + ext4_sb-s_es-s_last_orphan = cpu_to_le32(inode-i_ino);
 + ext4_inode-i_prev_orphan = 0;
 + mutex_unlock(ondisk_orphan_mutex);
 + if (next_inode)
 + iput(next_inode);
   err = ext4_handle_dirty_super(handle, sb);
   rc = ext4_mark_iloc_dirty(handle, inode, iloc);
   if (!err)
 @@ -2604,14 +2627,17 @@ int ext4_orphan_add(handle_t *handle, struct inode 
 *inode)
* This is safe: on error we're going to ignore the orphan list
* anyway on the next recovery. */
 mem_insert:
 - if (!err)
 - list_add(EXT4_I(inode)-i_orphan, EXT4_SB(sb)-s_orphan);
 + if (!err) {
 + spin_lock(orphan_lock);
 + list_add(ext4_inode-i_orphan, ext4_sb-s_orphan);
 + spin_unlock(orphan_lock);
 + }
 
   jbd_debug(4, superblock will point to %lu\n, inode-i_ino);
   jbd_debug(4, orphan inode %lu will point to %d\n,
   inode-i_ino, NEXT_ORPHAN(inode));
 out_unlock:
 - mutex_unlock(EXT4_SB(sb)-s_orphan_lock);
 + mutex_unlock(inode_orphan_mutex);
   ext4_std_error(inode-i_sb, err);
   return err;
 }
 @@ -2624,71 +2650,106 @@ int ext4_orphan_del(handle_t *handle, struct inode 
 *inode)
 {
   struct list_head *prev;
   struct ext4_inode_info *ei = EXT4_I(inode);
 - struct ext4_sb_info *sbi;
 + struct inode *i_next = NULL;
 + struct inode *i_prev = NULL;
 + struct ext4_sb_info *sbi = EXT4_SB(inode-i_sb);
   __u32 ino_next;
 + unsigned int ino_prev;
   struct ext4_iloc iloc;
   int err = 0;
 + struct mutex *inode_orphan_mutex;
 + spinlock_t *orphan_lock;
 + struct mutex *ondisk_orphan_mutex;

Same here.

 - if ((!EXT4_SB(inode-i_sb)-s_journal) 
 - !(EXT4_SB(inode-i_sb)-s_mount_state  EXT4_ORPHAN_FS))
 + if ((!sbi-s_journal) 
 + !(sbi-s_mount_state  EXT4_ORPHAN_FS))
   return 0;
 
 - mutex_lock(EXT4_SB(inode-i_sb)-s_orphan_lock);
 - if (list_empty(ei-i_orphan))
 - goto out;
 -
 - ino_next = NEXT_ORPHAN(inode);
 +

Re: [PATCH 2/2] fs/ext4/namei.c: reducing contention on s_orphan_lock mmutex

2013-10-03 Thread Thavatchai Makphaibulchoke
On 10/03/2013 06:41 PM, Andreas Dilger wrote:

 +struct inode *next_inode;
 
 Stack space in the kernel is not so abundant that all (or any?) of these
 should get their own local variable.
 

 -if (!EXT4_SB(sb)-s_journal)
 
 Same here.
 
 
 Cheers, Andreas

Thanks Andreas for the comments.  On larger machines with processors with lots 
of register, with the compiler optimization I don't think it matters whether 
stack variables or repeated common expressions are used.  On smaller machines 
with processors with limited number of registers, this will be a problem.  I'll 
fix these on my rework.

Thanks,
Mak.


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] fs/ext4/namei.c: reducing contention on s_orphan_lock mmutex

2013-10-02 Thread Zheng Liu
Hello,

On Wed, Oct 02, 2013 at 09:36:59AM -0600, T Makphaibulchoke wrote:
> Instead of using a single per super block mutex, s_orphan_lock, to serialize
> all orphan list updates, a separate mutex and spinlock are used to
> protect the on disk and in memory orphan lists respecvitely.
> 
> At the same time, a per inode mutex is used to serialize orphan list
> updates of a single inode.
> 
> Signed-off-by: T. Makphaibulchoke 
> ---
>  fs/ext4/namei.c | 139 
> 
>  1 file changed, 100 insertions(+), 39 deletions(-)
> 
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index 35f55a0..d7d3d0f 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -2554,12 +2554,24 @@ int ext4_orphan_add(handle_t *handle, struct inode 
> *inode)
>   struct super_block *sb = inode->i_sb;
>   struct ext4_iloc iloc;
>   int err = 0, rc;
> + struct ext4_inode_info *ext4_inode = EXT4_I(inode);
> + struct ext4_sb_info *ext4_sb = EXT4_SB(sb);
> + struct mutex *inode_orphan_mutex;
> + spinlock_t *orphan_lock;
> + struct mutex *ondisk_orphan_mutex;
> + struct list_head *prev;
> + unsigned int next_inum;
> + struct inode *next_inode;
>  
> - if (!EXT4_SB(sb)->s_journal)
> + if (ext4_sb->s_journal)

typo: !ext4_sb->s_journal
I am not sure whether or not this will impact the result because when
journal is enabled the inode will not be added into orphan list.

Regards,
- Zheng

>   return 0;
>  
> - mutex_lock(_SB(sb)->s_orphan_lock);
> - if (!list_empty(_I(inode)->i_orphan))
> + inode_orphan_mutex = _inode->i_orphan_mutex;
> + orphan_lock = _sb->s_orphan_lock;
> + ondisk_orphan_mutex = _sb->s_ondisk_orphan_lock;
> + mutex_lock(inode_orphan_mutex);
> + prev = ext4_inode->i_orphan.prev;
> + if (prev != _inode->i_orphan)
>   goto out_unlock;
>  
>   /*
> @@ -2579,17 +2591,28 @@ int ext4_orphan_add(handle_t *handle, struct inode 
> *inode)
>   err = ext4_reserve_inode_write(handle, inode, );
>   if (err)
>   goto out_unlock;
> + mutex_lock(ondisk_orphan_mutex);
>   /*
>* Due to previous errors inode may be already a part of on-disk
>* orphan list. If so skip on-disk list modification.
>*/
>   if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
> - (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
> + (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) {
> + mutex_unlock(ondisk_orphan_mutex);
>   goto mem_insert;
> + }
>  
>   /* Insert this inode at the head of the on-disk orphan list... */
> - NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
> - EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
> + next_inum = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
> + NEXT_ORPHAN(inode) = next_inum;
> + next_inode = ilookup(sb, next_inum);
> + if (next_inode)
> + EXT4_I(next_inode)->i_prev_orphan = inode->i_ino;
> + ext4_sb->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
> + ext4_inode->i_prev_orphan = 0;
> + mutex_unlock(ondisk_orphan_mutex);
> + if (next_inode)
> + iput(next_inode);
>   err = ext4_handle_dirty_super(handle, sb);
>   rc = ext4_mark_iloc_dirty(handle, inode, );
>   if (!err)
> @@ -2604,14 +2627,17 @@ int ext4_orphan_add(handle_t *handle, struct inode 
> *inode)
>* This is safe: on error we're going to ignore the orphan list
>* anyway on the next recovery. */
>  mem_insert:
> - if (!err)
> - list_add(_I(inode)->i_orphan, _SB(sb)->s_orphan);
> + if (!err) {
> + spin_lock(orphan_lock);
> + list_add(_inode->i_orphan, _sb->s_orphan);
> + spin_unlock(orphan_lock);
> + }
>  
>   jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
>   jbd_debug(4, "orphan inode %lu will point to %d\n",
>   inode->i_ino, NEXT_ORPHAN(inode));
>  out_unlock:
> - mutex_unlock(_SB(sb)->s_orphan_lock);
> + mutex_unlock(inode_orphan_mutex);
>   ext4_std_error(inode->i_sb, err);
>   return err;
>  }
> @@ -2624,71 +2650,106 @@ int ext4_orphan_del(handle_t *handle, struct inode 
> *inode)
>  {
>   struct list_head *prev;
>   struct ext4_inode_info *ei = EXT4_I(inode);
> - struct ext4_sb_info *sbi;
> + struct inode *i_next = NULL;
> + struct inode *i_prev = NULL;
> + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>   __u32 ino_next;
> + unsigned int ino_prev;
>   struct ext4_iloc iloc;
>   int err = 0;
> + struct mutex *inode_orphan_mutex;
> + spinlock_t *orphan_lock;
> + struct mutex *ondisk_orphan_mutex;
>  
> - if ((!EXT4_SB(inode->i_sb)->s_journal) &&
> - !(EXT4_SB(inode->i_sb)->s_mount_state & 

[PATCH 2/2] fs/ext4/namei.c: reducing contention on s_orphan_lock mmutex

2013-10-02 Thread T Makphaibulchoke
Instead of using a single per super block mutex, s_orphan_lock, to serialize
all orphan list updates, a separate mutex and spinlock are used to
protect the on disk and in memory orphan lists respecvitely.

At the same time, a per inode mutex is used to serialize orphan list
updates of a single inode.

Signed-off-by: T. Makphaibulchoke 
---
 fs/ext4/namei.c | 139 
 1 file changed, 100 insertions(+), 39 deletions(-)

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 35f55a0..d7d3d0f 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2554,12 +2554,24 @@ int ext4_orphan_add(handle_t *handle, struct inode 
*inode)
struct super_block *sb = inode->i_sb;
struct ext4_iloc iloc;
int err = 0, rc;
+   struct ext4_inode_info *ext4_inode = EXT4_I(inode);
+   struct ext4_sb_info *ext4_sb = EXT4_SB(sb);
+   struct mutex *inode_orphan_mutex;
+   spinlock_t *orphan_lock;
+   struct mutex *ondisk_orphan_mutex;
+   struct list_head *prev;
+   unsigned int next_inum;
+   struct inode *next_inode;
 
-   if (!EXT4_SB(sb)->s_journal)
+   if (ext4_sb->s_journal)
return 0;
 
-   mutex_lock(_SB(sb)->s_orphan_lock);
-   if (!list_empty(_I(inode)->i_orphan))
+   inode_orphan_mutex = _inode->i_orphan_mutex;
+   orphan_lock = _sb->s_orphan_lock;
+   ondisk_orphan_mutex = _sb->s_ondisk_orphan_lock;
+   mutex_lock(inode_orphan_mutex);
+   prev = ext4_inode->i_orphan.prev;
+   if (prev != _inode->i_orphan)
goto out_unlock;
 
/*
@@ -2579,17 +2591,28 @@ int ext4_orphan_add(handle_t *handle, struct inode 
*inode)
err = ext4_reserve_inode_write(handle, inode, );
if (err)
goto out_unlock;
+   mutex_lock(ondisk_orphan_mutex);
/*
 * Due to previous errors inode may be already a part of on-disk
 * orphan list. If so skip on-disk list modification.
 */
if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
-   (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
+   (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) {
+   mutex_unlock(ondisk_orphan_mutex);
goto mem_insert;
+   }
 
/* Insert this inode at the head of the on-disk orphan list... */
-   NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
-   EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
+   next_inum = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
+   NEXT_ORPHAN(inode) = next_inum;
+   next_inode = ilookup(sb, next_inum);
+   if (next_inode)
+   EXT4_I(next_inode)->i_prev_orphan = inode->i_ino;
+   ext4_sb->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
+   ext4_inode->i_prev_orphan = 0;
+   mutex_unlock(ondisk_orphan_mutex);
+   if (next_inode)
+   iput(next_inode);
err = ext4_handle_dirty_super(handle, sb);
rc = ext4_mark_iloc_dirty(handle, inode, );
if (!err)
@@ -2604,14 +2627,17 @@ int ext4_orphan_add(handle_t *handle, struct inode 
*inode)
 * This is safe: on error we're going to ignore the orphan list
 * anyway on the next recovery. */
 mem_insert:
-   if (!err)
-   list_add(_I(inode)->i_orphan, _SB(sb)->s_orphan);
+   if (!err) {
+   spin_lock(orphan_lock);
+   list_add(_inode->i_orphan, _sb->s_orphan);
+   spin_unlock(orphan_lock);
+   }
 
jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
jbd_debug(4, "orphan inode %lu will point to %d\n",
inode->i_ino, NEXT_ORPHAN(inode));
 out_unlock:
-   mutex_unlock(_SB(sb)->s_orphan_lock);
+   mutex_unlock(inode_orphan_mutex);
ext4_std_error(inode->i_sb, err);
return err;
 }
@@ -2624,71 +2650,106 @@ int ext4_orphan_del(handle_t *handle, struct inode 
*inode)
 {
struct list_head *prev;
struct ext4_inode_info *ei = EXT4_I(inode);
-   struct ext4_sb_info *sbi;
+   struct inode *i_next = NULL;
+   struct inode *i_prev = NULL;
+   struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 ino_next;
+   unsigned int ino_prev;
struct ext4_iloc iloc;
int err = 0;
+   struct mutex *inode_orphan_mutex;
+   spinlock_t *orphan_lock;
+   struct mutex *ondisk_orphan_mutex;
 
-   if ((!EXT4_SB(inode->i_sb)->s_journal) &&
-   !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS))
+   if ((!sbi->s_journal) &&
+   !(sbi->s_mount_state & EXT4_ORPHAN_FS))
return 0;
 
-   mutex_lock(_SB(inode->i_sb)->s_orphan_lock);
-   if (list_empty(>i_orphan))
-   goto out;
-
-   ino_next = NEXT_ORPHAN(inode);
+   inode_orphan_mutex = >i_orphan_mutex;
+   orphan_lock = >s_orphan_lock;
+   ondisk_orphan_mutex = 

[PATCH 2/2] fs/ext4/namei.c: reducing contention on s_orphan_lock mmutex

2013-10-02 Thread T Makphaibulchoke
Instead of using a single per super block mutex, s_orphan_lock, to serialize
all orphan list updates, a separate mutex and spinlock are used to
protect the on disk and in memory orphan lists respecvitely.

At the same time, a per inode mutex is used to serialize orphan list
updates of a single inode.

Signed-off-by: T. Makphaibulchoke t...@hp.com
---
 fs/ext4/namei.c | 139 
 1 file changed, 100 insertions(+), 39 deletions(-)

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 35f55a0..d7d3d0f 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2554,12 +2554,24 @@ int ext4_orphan_add(handle_t *handle, struct inode 
*inode)
struct super_block *sb = inode-i_sb;
struct ext4_iloc iloc;
int err = 0, rc;
+   struct ext4_inode_info *ext4_inode = EXT4_I(inode);
+   struct ext4_sb_info *ext4_sb = EXT4_SB(sb);
+   struct mutex *inode_orphan_mutex;
+   spinlock_t *orphan_lock;
+   struct mutex *ondisk_orphan_mutex;
+   struct list_head *prev;
+   unsigned int next_inum;
+   struct inode *next_inode;
 
-   if (!EXT4_SB(sb)-s_journal)
+   if (ext4_sb-s_journal)
return 0;
 
-   mutex_lock(EXT4_SB(sb)-s_orphan_lock);
-   if (!list_empty(EXT4_I(inode)-i_orphan))
+   inode_orphan_mutex = ext4_inode-i_orphan_mutex;
+   orphan_lock = ext4_sb-s_orphan_lock;
+   ondisk_orphan_mutex = ext4_sb-s_ondisk_orphan_lock;
+   mutex_lock(inode_orphan_mutex);
+   prev = ext4_inode-i_orphan.prev;
+   if (prev != ext4_inode-i_orphan)
goto out_unlock;
 
/*
@@ -2579,17 +2591,28 @@ int ext4_orphan_add(handle_t *handle, struct inode 
*inode)
err = ext4_reserve_inode_write(handle, inode, iloc);
if (err)
goto out_unlock;
+   mutex_lock(ondisk_orphan_mutex);
/*
 * Due to previous errors inode may be already a part of on-disk
 * orphan list. If so skip on-disk list modification.
 */
if (NEXT_ORPHAN(inode)  NEXT_ORPHAN(inode) =
-   (le32_to_cpu(EXT4_SB(sb)-s_es-s_inodes_count)))
+   (le32_to_cpu(EXT4_SB(sb)-s_es-s_inodes_count))) {
+   mutex_unlock(ondisk_orphan_mutex);
goto mem_insert;
+   }
 
/* Insert this inode at the head of the on-disk orphan list... */
-   NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)-s_es-s_last_orphan);
-   EXT4_SB(sb)-s_es-s_last_orphan = cpu_to_le32(inode-i_ino);
+   next_inum = le32_to_cpu(EXT4_SB(sb)-s_es-s_last_orphan);
+   NEXT_ORPHAN(inode) = next_inum;
+   next_inode = ilookup(sb, next_inum);
+   if (next_inode)
+   EXT4_I(next_inode)-i_prev_orphan = inode-i_ino;
+   ext4_sb-s_es-s_last_orphan = cpu_to_le32(inode-i_ino);
+   ext4_inode-i_prev_orphan = 0;
+   mutex_unlock(ondisk_orphan_mutex);
+   if (next_inode)
+   iput(next_inode);
err = ext4_handle_dirty_super(handle, sb);
rc = ext4_mark_iloc_dirty(handle, inode, iloc);
if (!err)
@@ -2604,14 +2627,17 @@ int ext4_orphan_add(handle_t *handle, struct inode 
*inode)
 * This is safe: on error we're going to ignore the orphan list
 * anyway on the next recovery. */
 mem_insert:
-   if (!err)
-   list_add(EXT4_I(inode)-i_orphan, EXT4_SB(sb)-s_orphan);
+   if (!err) {
+   spin_lock(orphan_lock);
+   list_add(ext4_inode-i_orphan, ext4_sb-s_orphan);
+   spin_unlock(orphan_lock);
+   }
 
jbd_debug(4, superblock will point to %lu\n, inode-i_ino);
jbd_debug(4, orphan inode %lu will point to %d\n,
inode-i_ino, NEXT_ORPHAN(inode));
 out_unlock:
-   mutex_unlock(EXT4_SB(sb)-s_orphan_lock);
+   mutex_unlock(inode_orphan_mutex);
ext4_std_error(inode-i_sb, err);
return err;
 }
@@ -2624,71 +2650,106 @@ int ext4_orphan_del(handle_t *handle, struct inode 
*inode)
 {
struct list_head *prev;
struct ext4_inode_info *ei = EXT4_I(inode);
-   struct ext4_sb_info *sbi;
+   struct inode *i_next = NULL;
+   struct inode *i_prev = NULL;
+   struct ext4_sb_info *sbi = EXT4_SB(inode-i_sb);
__u32 ino_next;
+   unsigned int ino_prev;
struct ext4_iloc iloc;
int err = 0;
+   struct mutex *inode_orphan_mutex;
+   spinlock_t *orphan_lock;
+   struct mutex *ondisk_orphan_mutex;
 
-   if ((!EXT4_SB(inode-i_sb)-s_journal) 
-   !(EXT4_SB(inode-i_sb)-s_mount_state  EXT4_ORPHAN_FS))
+   if ((!sbi-s_journal) 
+   !(sbi-s_mount_state  EXT4_ORPHAN_FS))
return 0;
 
-   mutex_lock(EXT4_SB(inode-i_sb)-s_orphan_lock);
-   if (list_empty(ei-i_orphan))
-   goto out;
-
-   ino_next = NEXT_ORPHAN(inode);
+   inode_orphan_mutex = ei-i_orphan_mutex;
+   orphan_lock = sbi-s_orphan_lock;
+   

Re: [PATCH 2/2] fs/ext4/namei.c: reducing contention on s_orphan_lock mmutex

2013-10-02 Thread Zheng Liu
Hello,

On Wed, Oct 02, 2013 at 09:36:59AM -0600, T Makphaibulchoke wrote:
 Instead of using a single per super block mutex, s_orphan_lock, to serialize
 all orphan list updates, a separate mutex and spinlock are used to
 protect the on disk and in memory orphan lists respecvitely.
 
 At the same time, a per inode mutex is used to serialize orphan list
 updates of a single inode.
 
 Signed-off-by: T. Makphaibulchoke t...@hp.com
 ---
  fs/ext4/namei.c | 139 
 
  1 file changed, 100 insertions(+), 39 deletions(-)
 
 diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
 index 35f55a0..d7d3d0f 100644
 --- a/fs/ext4/namei.c
 +++ b/fs/ext4/namei.c
 @@ -2554,12 +2554,24 @@ int ext4_orphan_add(handle_t *handle, struct inode 
 *inode)
   struct super_block *sb = inode-i_sb;
   struct ext4_iloc iloc;
   int err = 0, rc;
 + struct ext4_inode_info *ext4_inode = EXT4_I(inode);
 + struct ext4_sb_info *ext4_sb = EXT4_SB(sb);
 + struct mutex *inode_orphan_mutex;
 + spinlock_t *orphan_lock;
 + struct mutex *ondisk_orphan_mutex;
 + struct list_head *prev;
 + unsigned int next_inum;
 + struct inode *next_inode;
  
 - if (!EXT4_SB(sb)-s_journal)
 + if (ext4_sb-s_journal)

typo: !ext4_sb-s_journal
I am not sure whether or not this will impact the result because when
journal is enabled the inode will not be added into orphan list.

Regards,
- Zheng

   return 0;
  
 - mutex_lock(EXT4_SB(sb)-s_orphan_lock);
 - if (!list_empty(EXT4_I(inode)-i_orphan))
 + inode_orphan_mutex = ext4_inode-i_orphan_mutex;
 + orphan_lock = ext4_sb-s_orphan_lock;
 + ondisk_orphan_mutex = ext4_sb-s_ondisk_orphan_lock;
 + mutex_lock(inode_orphan_mutex);
 + prev = ext4_inode-i_orphan.prev;
 + if (prev != ext4_inode-i_orphan)
   goto out_unlock;
  
   /*
 @@ -2579,17 +2591,28 @@ int ext4_orphan_add(handle_t *handle, struct inode 
 *inode)
   err = ext4_reserve_inode_write(handle, inode, iloc);
   if (err)
   goto out_unlock;
 + mutex_lock(ondisk_orphan_mutex);
   /*
* Due to previous errors inode may be already a part of on-disk
* orphan list. If so skip on-disk list modification.
*/
   if (NEXT_ORPHAN(inode)  NEXT_ORPHAN(inode) =
 - (le32_to_cpu(EXT4_SB(sb)-s_es-s_inodes_count)))
 + (le32_to_cpu(EXT4_SB(sb)-s_es-s_inodes_count))) {
 + mutex_unlock(ondisk_orphan_mutex);
   goto mem_insert;
 + }
  
   /* Insert this inode at the head of the on-disk orphan list... */
 - NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)-s_es-s_last_orphan);
 - EXT4_SB(sb)-s_es-s_last_orphan = cpu_to_le32(inode-i_ino);
 + next_inum = le32_to_cpu(EXT4_SB(sb)-s_es-s_last_orphan);
 + NEXT_ORPHAN(inode) = next_inum;
 + next_inode = ilookup(sb, next_inum);
 + if (next_inode)
 + EXT4_I(next_inode)-i_prev_orphan = inode-i_ino;
 + ext4_sb-s_es-s_last_orphan = cpu_to_le32(inode-i_ino);
 + ext4_inode-i_prev_orphan = 0;
 + mutex_unlock(ondisk_orphan_mutex);
 + if (next_inode)
 + iput(next_inode);
   err = ext4_handle_dirty_super(handle, sb);
   rc = ext4_mark_iloc_dirty(handle, inode, iloc);
   if (!err)
 @@ -2604,14 +2627,17 @@ int ext4_orphan_add(handle_t *handle, struct inode 
 *inode)
* This is safe: on error we're going to ignore the orphan list
* anyway on the next recovery. */
  mem_insert:
 - if (!err)
 - list_add(EXT4_I(inode)-i_orphan, EXT4_SB(sb)-s_orphan);
 + if (!err) {
 + spin_lock(orphan_lock);
 + list_add(ext4_inode-i_orphan, ext4_sb-s_orphan);
 + spin_unlock(orphan_lock);
 + }
  
   jbd_debug(4, superblock will point to %lu\n, inode-i_ino);
   jbd_debug(4, orphan inode %lu will point to %d\n,
   inode-i_ino, NEXT_ORPHAN(inode));
  out_unlock:
 - mutex_unlock(EXT4_SB(sb)-s_orphan_lock);
 + mutex_unlock(inode_orphan_mutex);
   ext4_std_error(inode-i_sb, err);
   return err;
  }
 @@ -2624,71 +2650,106 @@ int ext4_orphan_del(handle_t *handle, struct inode 
 *inode)
  {
   struct list_head *prev;
   struct ext4_inode_info *ei = EXT4_I(inode);
 - struct ext4_sb_info *sbi;
 + struct inode *i_next = NULL;
 + struct inode *i_prev = NULL;
 + struct ext4_sb_info *sbi = EXT4_SB(inode-i_sb);
   __u32 ino_next;
 + unsigned int ino_prev;
   struct ext4_iloc iloc;
   int err = 0;
 + struct mutex *inode_orphan_mutex;
 + spinlock_t *orphan_lock;
 + struct mutex *ondisk_orphan_mutex;
  
 - if ((!EXT4_SB(inode-i_sb)-s_journal) 
 - !(EXT4_SB(inode-i_sb)-s_mount_state  EXT4_ORPHAN_FS))
 + if ((!sbi-s_journal) 
 + !(sbi-s_mount_state  EXT4_ORPHAN_FS))
   return 0;