Re: [PATCH 2/2] fs/ext4/namei.c: reducing contention on s_orphan_lock mmutex
On 10/03/2013 06:41 PM, Andreas Dilger wrote: >> +struct inode *next_inode; > > Stack space in the kernel is not so abundant that all (or any?) of these > should get their own local variable. > >> >> -if (!EXT4_SB(sb)->s_journal) > > Same here. > > > Cheers, Andreas Thanks Andreas for the comments. On larger machines with processors with lots of register, with the compiler optimization I don't think it matters whether stack variables or repeated common expressions are used. On smaller machines with processors with limited number of registers, this will be a problem. I'll fix these on my rework. Thanks, Mak. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/2] fs/ext4/namei.c: reducing contention on s_orphan_lock mmutex
On 2013-10-02, at 9:36 AM, T Makphaibulchoke wrote: > Instead of using a single per super block mutex, s_orphan_lock, to serialize > all orphan list updates, a separate mutex and spinlock are used to > protect the on disk and in memory orphan lists respecvitely. > > At the same time, a per inode mutex is used to serialize orphan list > updates of a single inode. > > Signed-off-by: T. Makphaibulchoke > --- > fs/ext4/namei.c | 139 > 1 file changed, 100 insertions(+), 39 deletions(-) > > diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c > index 35f55a0..d7d3d0f 100644 > --- a/fs/ext4/namei.c > +++ b/fs/ext4/namei.c > @@ -2554,12 +2554,24 @@ int ext4_orphan_add(handle_t *handle, struct inode > *inode) > struct super_block *sb = inode->i_sb; > struct ext4_iloc iloc; > int err = 0, rc; > + struct ext4_inode_info *ext4_inode = EXT4_I(inode); > + struct ext4_sb_info *ext4_sb = EXT4_SB(sb); > + struct mutex *inode_orphan_mutex; > + spinlock_t *orphan_lock; > + struct mutex *ondisk_orphan_mutex; > + struct list_head *prev; > + unsigned int next_inum; > + struct inode *next_inode; Stack space in the kernel is not so abundant that all (or any?) of these should get their own local variable. > > - if (!EXT4_SB(sb)->s_journal) > + if (ext4_sb->s_journal) > return 0; > > - mutex_lock(_SB(sb)->s_orphan_lock); > - if (!list_empty(_I(inode)->i_orphan)) > + inode_orphan_mutex = _inode->i_orphan_mutex; > + orphan_lock = _sb->s_orphan_lock; > + ondisk_orphan_mutex = _sb->s_ondisk_orphan_lock; > + mutex_lock(inode_orphan_mutex); > + prev = ext4_inode->i_orphan.prev; > + if (prev != _inode->i_orphan) > goto out_unlock; > > /* > @@ -2579,17 +2591,28 @@ int ext4_orphan_add(handle_t *handle, struct inode > *inode) > err = ext4_reserve_inode_write(handle, inode, ); > if (err) > goto out_unlock; > + mutex_lock(ondisk_orphan_mutex); > /* >* Due to previous errors inode may be already a part of on-disk >* orphan list. If so skip on-disk list modification. >*/ > if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <= > - (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) > + (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) { > + mutex_unlock(ondisk_orphan_mutex); > goto mem_insert; > + } > > /* Insert this inode at the head of the on-disk orphan list... */ > - NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); > - EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); > + next_inum = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); > + NEXT_ORPHAN(inode) = next_inum; > + next_inode = ilookup(sb, next_inum); > + if (next_inode) > + EXT4_I(next_inode)->i_prev_orphan = inode->i_ino; > + ext4_sb->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); > + ext4_inode->i_prev_orphan = 0; > + mutex_unlock(ondisk_orphan_mutex); > + if (next_inode) > + iput(next_inode); > err = ext4_handle_dirty_super(handle, sb); > rc = ext4_mark_iloc_dirty(handle, inode, ); > if (!err) > @@ -2604,14 +2627,17 @@ int ext4_orphan_add(handle_t *handle, struct inode > *inode) >* This is safe: on error we're going to ignore the orphan list >* anyway on the next recovery. */ > mem_insert: > - if (!err) > - list_add(_I(inode)->i_orphan, _SB(sb)->s_orphan); > + if (!err) { > + spin_lock(orphan_lock); > + list_add(_inode->i_orphan, _sb->s_orphan); > + spin_unlock(orphan_lock); > + } > > jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); > jbd_debug(4, "orphan inode %lu will point to %d\n", > inode->i_ino, NEXT_ORPHAN(inode)); > out_unlock: > - mutex_unlock(_SB(sb)->s_orphan_lock); > + mutex_unlock(inode_orphan_mutex); > ext4_std_error(inode->i_sb, err); > return err; > } > @@ -2624,71 +2650,106 @@ int ext4_orphan_del(handle_t *handle, struct inode > *inode) > { > struct list_head *prev; > struct ext4_inode_info *ei = EXT4_I(inode); > - struct ext4_sb_info *sbi; > + struct inode *i_next = NULL; > + struct inode *i_prev = NULL; > + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); > __u32 ino_next; > + unsigned int ino_prev; > struct ext4_iloc iloc; > int err = 0; > + struct mutex *inode_orphan_mutex; > + spinlock_t *orphan_lock; > + struct mutex *ondisk_orphan_mutex; Same here. > - if ((!EXT4_SB(inode->i_sb)->s_journal) && > - !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) > + if ((!sbi->s_journal) && > + !(sbi->s_mount_state & EXT4_ORPHAN_FS)) > return 0; > > -
Re: [PATCH 2/2] fs/ext4/namei.c: reducing contention on s_orphan_lock mmutex
On 10/02/2013 08:05 PM, Zheng Liu wrote: > Hello, > >> -if (!EXT4_SB(sb)->s_journal) >> +if (ext4_sb->s_journal) > > typo: !ext4_sb->s_journal > I am not sure whether or not this will impact the result because when > journal is enabled the inode will not be added into orphan list. > > Regards, > - Zheng > Thanks Zheng for pointing out the problem. Will fix the problem and rerun the test. Thanks, Mak. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/2] fs/ext4/namei.c: reducing contention on s_orphan_lock mmutex
On 10/02/2013 08:05 PM, Zheng Liu wrote: Hello, -if (!EXT4_SB(sb)-s_journal) +if (ext4_sb-s_journal) typo: !ext4_sb-s_journal I am not sure whether or not this will impact the result because when journal is enabled the inode will not be added into orphan list. Regards, - Zheng Thanks Zheng for pointing out the problem. Will fix the problem and rerun the test. Thanks, Mak. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/2] fs/ext4/namei.c: reducing contention on s_orphan_lock mmutex
On 2013-10-02, at 9:36 AM, T Makphaibulchoke wrote: Instead of using a single per super block mutex, s_orphan_lock, to serialize all orphan list updates, a separate mutex and spinlock are used to protect the on disk and in memory orphan lists respecvitely. At the same time, a per inode mutex is used to serialize orphan list updates of a single inode. Signed-off-by: T. Makphaibulchoke t...@hp.com --- fs/ext4/namei.c | 139 1 file changed, 100 insertions(+), 39 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 35f55a0..d7d3d0f 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2554,12 +2554,24 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) struct super_block *sb = inode-i_sb; struct ext4_iloc iloc; int err = 0, rc; + struct ext4_inode_info *ext4_inode = EXT4_I(inode); + struct ext4_sb_info *ext4_sb = EXT4_SB(sb); + struct mutex *inode_orphan_mutex; + spinlock_t *orphan_lock; + struct mutex *ondisk_orphan_mutex; + struct list_head *prev; + unsigned int next_inum; + struct inode *next_inode; Stack space in the kernel is not so abundant that all (or any?) of these should get their own local variable. - if (!EXT4_SB(sb)-s_journal) + if (ext4_sb-s_journal) return 0; - mutex_lock(EXT4_SB(sb)-s_orphan_lock); - if (!list_empty(EXT4_I(inode)-i_orphan)) + inode_orphan_mutex = ext4_inode-i_orphan_mutex; + orphan_lock = ext4_sb-s_orphan_lock; + ondisk_orphan_mutex = ext4_sb-s_ondisk_orphan_lock; + mutex_lock(inode_orphan_mutex); + prev = ext4_inode-i_orphan.prev; + if (prev != ext4_inode-i_orphan) goto out_unlock; /* @@ -2579,17 +2591,28 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) err = ext4_reserve_inode_write(handle, inode, iloc); if (err) goto out_unlock; + mutex_lock(ondisk_orphan_mutex); /* * Due to previous errors inode may be already a part of on-disk * orphan list. If so skip on-disk list modification. */ if (NEXT_ORPHAN(inode) NEXT_ORPHAN(inode) = - (le32_to_cpu(EXT4_SB(sb)-s_es-s_inodes_count))) + (le32_to_cpu(EXT4_SB(sb)-s_es-s_inodes_count))) { + mutex_unlock(ondisk_orphan_mutex); goto mem_insert; + } /* Insert this inode at the head of the on-disk orphan list... */ - NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)-s_es-s_last_orphan); - EXT4_SB(sb)-s_es-s_last_orphan = cpu_to_le32(inode-i_ino); + next_inum = le32_to_cpu(EXT4_SB(sb)-s_es-s_last_orphan); + NEXT_ORPHAN(inode) = next_inum; + next_inode = ilookup(sb, next_inum); + if (next_inode) + EXT4_I(next_inode)-i_prev_orphan = inode-i_ino; + ext4_sb-s_es-s_last_orphan = cpu_to_le32(inode-i_ino); + ext4_inode-i_prev_orphan = 0; + mutex_unlock(ondisk_orphan_mutex); + if (next_inode) + iput(next_inode); err = ext4_handle_dirty_super(handle, sb); rc = ext4_mark_iloc_dirty(handle, inode, iloc); if (!err) @@ -2604,14 +2627,17 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) * This is safe: on error we're going to ignore the orphan list * anyway on the next recovery. */ mem_insert: - if (!err) - list_add(EXT4_I(inode)-i_orphan, EXT4_SB(sb)-s_orphan); + if (!err) { + spin_lock(orphan_lock); + list_add(ext4_inode-i_orphan, ext4_sb-s_orphan); + spin_unlock(orphan_lock); + } jbd_debug(4, superblock will point to %lu\n, inode-i_ino); jbd_debug(4, orphan inode %lu will point to %d\n, inode-i_ino, NEXT_ORPHAN(inode)); out_unlock: - mutex_unlock(EXT4_SB(sb)-s_orphan_lock); + mutex_unlock(inode_orphan_mutex); ext4_std_error(inode-i_sb, err); return err; } @@ -2624,71 +2650,106 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) { struct list_head *prev; struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi; + struct inode *i_next = NULL; + struct inode *i_prev = NULL; + struct ext4_sb_info *sbi = EXT4_SB(inode-i_sb); __u32 ino_next; + unsigned int ino_prev; struct ext4_iloc iloc; int err = 0; + struct mutex *inode_orphan_mutex; + spinlock_t *orphan_lock; + struct mutex *ondisk_orphan_mutex; Same here. - if ((!EXT4_SB(inode-i_sb)-s_journal) - !(EXT4_SB(inode-i_sb)-s_mount_state EXT4_ORPHAN_FS)) + if ((!sbi-s_journal) + !(sbi-s_mount_state EXT4_ORPHAN_FS)) return 0; - mutex_lock(EXT4_SB(inode-i_sb)-s_orphan_lock); - if (list_empty(ei-i_orphan)) - goto out; - - ino_next = NEXT_ORPHAN(inode); +
Re: [PATCH 2/2] fs/ext4/namei.c: reducing contention on s_orphan_lock mmutex
On 10/03/2013 06:41 PM, Andreas Dilger wrote: +struct inode *next_inode; Stack space in the kernel is not so abundant that all (or any?) of these should get their own local variable. -if (!EXT4_SB(sb)-s_journal) Same here. Cheers, Andreas Thanks Andreas for the comments. On larger machines with processors with lots of register, with the compiler optimization I don't think it matters whether stack variables or repeated common expressions are used. On smaller machines with processors with limited number of registers, this will be a problem. I'll fix these on my rework. Thanks, Mak. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/2] fs/ext4/namei.c: reducing contention on s_orphan_lock mmutex
Hello, On Wed, Oct 02, 2013 at 09:36:59AM -0600, T Makphaibulchoke wrote: > Instead of using a single per super block mutex, s_orphan_lock, to serialize > all orphan list updates, a separate mutex and spinlock are used to > protect the on disk and in memory orphan lists respecvitely. > > At the same time, a per inode mutex is used to serialize orphan list > updates of a single inode. > > Signed-off-by: T. Makphaibulchoke > --- > fs/ext4/namei.c | 139 > > 1 file changed, 100 insertions(+), 39 deletions(-) > > diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c > index 35f55a0..d7d3d0f 100644 > --- a/fs/ext4/namei.c > +++ b/fs/ext4/namei.c > @@ -2554,12 +2554,24 @@ int ext4_orphan_add(handle_t *handle, struct inode > *inode) > struct super_block *sb = inode->i_sb; > struct ext4_iloc iloc; > int err = 0, rc; > + struct ext4_inode_info *ext4_inode = EXT4_I(inode); > + struct ext4_sb_info *ext4_sb = EXT4_SB(sb); > + struct mutex *inode_orphan_mutex; > + spinlock_t *orphan_lock; > + struct mutex *ondisk_orphan_mutex; > + struct list_head *prev; > + unsigned int next_inum; > + struct inode *next_inode; > > - if (!EXT4_SB(sb)->s_journal) > + if (ext4_sb->s_journal) typo: !ext4_sb->s_journal I am not sure whether or not this will impact the result because when journal is enabled the inode will not be added into orphan list. Regards, - Zheng > return 0; > > - mutex_lock(_SB(sb)->s_orphan_lock); > - if (!list_empty(_I(inode)->i_orphan)) > + inode_orphan_mutex = _inode->i_orphan_mutex; > + orphan_lock = _sb->s_orphan_lock; > + ondisk_orphan_mutex = _sb->s_ondisk_orphan_lock; > + mutex_lock(inode_orphan_mutex); > + prev = ext4_inode->i_orphan.prev; > + if (prev != _inode->i_orphan) > goto out_unlock; > > /* > @@ -2579,17 +2591,28 @@ int ext4_orphan_add(handle_t *handle, struct inode > *inode) > err = ext4_reserve_inode_write(handle, inode, ); > if (err) > goto out_unlock; > + mutex_lock(ondisk_orphan_mutex); > /* >* Due to previous errors inode may be already a part of on-disk >* orphan list. If so skip on-disk list modification. >*/ > if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <= > - (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) > + (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) { > + mutex_unlock(ondisk_orphan_mutex); > goto mem_insert; > + } > > /* Insert this inode at the head of the on-disk orphan list... */ > - NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); > - EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); > + next_inum = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); > + NEXT_ORPHAN(inode) = next_inum; > + next_inode = ilookup(sb, next_inum); > + if (next_inode) > + EXT4_I(next_inode)->i_prev_orphan = inode->i_ino; > + ext4_sb->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); > + ext4_inode->i_prev_orphan = 0; > + mutex_unlock(ondisk_orphan_mutex); > + if (next_inode) > + iput(next_inode); > err = ext4_handle_dirty_super(handle, sb); > rc = ext4_mark_iloc_dirty(handle, inode, ); > if (!err) > @@ -2604,14 +2627,17 @@ int ext4_orphan_add(handle_t *handle, struct inode > *inode) >* This is safe: on error we're going to ignore the orphan list >* anyway on the next recovery. */ > mem_insert: > - if (!err) > - list_add(_I(inode)->i_orphan, _SB(sb)->s_orphan); > + if (!err) { > + spin_lock(orphan_lock); > + list_add(_inode->i_orphan, _sb->s_orphan); > + spin_unlock(orphan_lock); > + } > > jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); > jbd_debug(4, "orphan inode %lu will point to %d\n", > inode->i_ino, NEXT_ORPHAN(inode)); > out_unlock: > - mutex_unlock(_SB(sb)->s_orphan_lock); > + mutex_unlock(inode_orphan_mutex); > ext4_std_error(inode->i_sb, err); > return err; > } > @@ -2624,71 +2650,106 @@ int ext4_orphan_del(handle_t *handle, struct inode > *inode) > { > struct list_head *prev; > struct ext4_inode_info *ei = EXT4_I(inode); > - struct ext4_sb_info *sbi; > + struct inode *i_next = NULL; > + struct inode *i_prev = NULL; > + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); > __u32 ino_next; > + unsigned int ino_prev; > struct ext4_iloc iloc; > int err = 0; > + struct mutex *inode_orphan_mutex; > + spinlock_t *orphan_lock; > + struct mutex *ondisk_orphan_mutex; > > - if ((!EXT4_SB(inode->i_sb)->s_journal) && > - !(EXT4_SB(inode->i_sb)->s_mount_state &
[PATCH 2/2] fs/ext4/namei.c: reducing contention on s_orphan_lock mmutex
Instead of using a single per super block mutex, s_orphan_lock, to serialize all orphan list updates, a separate mutex and spinlock are used to protect the on disk and in memory orphan lists respecvitely. At the same time, a per inode mutex is used to serialize orphan list updates of a single inode. Signed-off-by: T. Makphaibulchoke --- fs/ext4/namei.c | 139 1 file changed, 100 insertions(+), 39 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 35f55a0..d7d3d0f 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2554,12 +2554,24 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) struct super_block *sb = inode->i_sb; struct ext4_iloc iloc; int err = 0, rc; + struct ext4_inode_info *ext4_inode = EXT4_I(inode); + struct ext4_sb_info *ext4_sb = EXT4_SB(sb); + struct mutex *inode_orphan_mutex; + spinlock_t *orphan_lock; + struct mutex *ondisk_orphan_mutex; + struct list_head *prev; + unsigned int next_inum; + struct inode *next_inode; - if (!EXT4_SB(sb)->s_journal) + if (ext4_sb->s_journal) return 0; - mutex_lock(_SB(sb)->s_orphan_lock); - if (!list_empty(_I(inode)->i_orphan)) + inode_orphan_mutex = _inode->i_orphan_mutex; + orphan_lock = _sb->s_orphan_lock; + ondisk_orphan_mutex = _sb->s_ondisk_orphan_lock; + mutex_lock(inode_orphan_mutex); + prev = ext4_inode->i_orphan.prev; + if (prev != _inode->i_orphan) goto out_unlock; /* @@ -2579,17 +2591,28 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) err = ext4_reserve_inode_write(handle, inode, ); if (err) goto out_unlock; + mutex_lock(ondisk_orphan_mutex); /* * Due to previous errors inode may be already a part of on-disk * orphan list. If so skip on-disk list modification. */ if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <= - (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) + (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) { + mutex_unlock(ondisk_orphan_mutex); goto mem_insert; + } /* Insert this inode at the head of the on-disk orphan list... */ - NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); - EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); + next_inum = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); + NEXT_ORPHAN(inode) = next_inum; + next_inode = ilookup(sb, next_inum); + if (next_inode) + EXT4_I(next_inode)->i_prev_orphan = inode->i_ino; + ext4_sb->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); + ext4_inode->i_prev_orphan = 0; + mutex_unlock(ondisk_orphan_mutex); + if (next_inode) + iput(next_inode); err = ext4_handle_dirty_super(handle, sb); rc = ext4_mark_iloc_dirty(handle, inode, ); if (!err) @@ -2604,14 +2627,17 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) * This is safe: on error we're going to ignore the orphan list * anyway on the next recovery. */ mem_insert: - if (!err) - list_add(_I(inode)->i_orphan, _SB(sb)->s_orphan); + if (!err) { + spin_lock(orphan_lock); + list_add(_inode->i_orphan, _sb->s_orphan); + spin_unlock(orphan_lock); + } jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); jbd_debug(4, "orphan inode %lu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); out_unlock: - mutex_unlock(_SB(sb)->s_orphan_lock); + mutex_unlock(inode_orphan_mutex); ext4_std_error(inode->i_sb, err); return err; } @@ -2624,71 +2650,106 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) { struct list_head *prev; struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi; + struct inode *i_next = NULL; + struct inode *i_prev = NULL; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 ino_next; + unsigned int ino_prev; struct ext4_iloc iloc; int err = 0; + struct mutex *inode_orphan_mutex; + spinlock_t *orphan_lock; + struct mutex *ondisk_orphan_mutex; - if ((!EXT4_SB(inode->i_sb)->s_journal) && - !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) + if ((!sbi->s_journal) && + !(sbi->s_mount_state & EXT4_ORPHAN_FS)) return 0; - mutex_lock(_SB(inode->i_sb)->s_orphan_lock); - if (list_empty(>i_orphan)) - goto out; - - ino_next = NEXT_ORPHAN(inode); + inode_orphan_mutex = >i_orphan_mutex; + orphan_lock = >s_orphan_lock; + ondisk_orphan_mutex =
[PATCH 2/2] fs/ext4/namei.c: reducing contention on s_orphan_lock mmutex
Instead of using a single per super block mutex, s_orphan_lock, to serialize all orphan list updates, a separate mutex and spinlock are used to protect the on disk and in memory orphan lists respecvitely. At the same time, a per inode mutex is used to serialize orphan list updates of a single inode. Signed-off-by: T. Makphaibulchoke t...@hp.com --- fs/ext4/namei.c | 139 1 file changed, 100 insertions(+), 39 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 35f55a0..d7d3d0f 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2554,12 +2554,24 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) struct super_block *sb = inode-i_sb; struct ext4_iloc iloc; int err = 0, rc; + struct ext4_inode_info *ext4_inode = EXT4_I(inode); + struct ext4_sb_info *ext4_sb = EXT4_SB(sb); + struct mutex *inode_orphan_mutex; + spinlock_t *orphan_lock; + struct mutex *ondisk_orphan_mutex; + struct list_head *prev; + unsigned int next_inum; + struct inode *next_inode; - if (!EXT4_SB(sb)-s_journal) + if (ext4_sb-s_journal) return 0; - mutex_lock(EXT4_SB(sb)-s_orphan_lock); - if (!list_empty(EXT4_I(inode)-i_orphan)) + inode_orphan_mutex = ext4_inode-i_orphan_mutex; + orphan_lock = ext4_sb-s_orphan_lock; + ondisk_orphan_mutex = ext4_sb-s_ondisk_orphan_lock; + mutex_lock(inode_orphan_mutex); + prev = ext4_inode-i_orphan.prev; + if (prev != ext4_inode-i_orphan) goto out_unlock; /* @@ -2579,17 +2591,28 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) err = ext4_reserve_inode_write(handle, inode, iloc); if (err) goto out_unlock; + mutex_lock(ondisk_orphan_mutex); /* * Due to previous errors inode may be already a part of on-disk * orphan list. If so skip on-disk list modification. */ if (NEXT_ORPHAN(inode) NEXT_ORPHAN(inode) = - (le32_to_cpu(EXT4_SB(sb)-s_es-s_inodes_count))) + (le32_to_cpu(EXT4_SB(sb)-s_es-s_inodes_count))) { + mutex_unlock(ondisk_orphan_mutex); goto mem_insert; + } /* Insert this inode at the head of the on-disk orphan list... */ - NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)-s_es-s_last_orphan); - EXT4_SB(sb)-s_es-s_last_orphan = cpu_to_le32(inode-i_ino); + next_inum = le32_to_cpu(EXT4_SB(sb)-s_es-s_last_orphan); + NEXT_ORPHAN(inode) = next_inum; + next_inode = ilookup(sb, next_inum); + if (next_inode) + EXT4_I(next_inode)-i_prev_orphan = inode-i_ino; + ext4_sb-s_es-s_last_orphan = cpu_to_le32(inode-i_ino); + ext4_inode-i_prev_orphan = 0; + mutex_unlock(ondisk_orphan_mutex); + if (next_inode) + iput(next_inode); err = ext4_handle_dirty_super(handle, sb); rc = ext4_mark_iloc_dirty(handle, inode, iloc); if (!err) @@ -2604,14 +2627,17 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) * This is safe: on error we're going to ignore the orphan list * anyway on the next recovery. */ mem_insert: - if (!err) - list_add(EXT4_I(inode)-i_orphan, EXT4_SB(sb)-s_orphan); + if (!err) { + spin_lock(orphan_lock); + list_add(ext4_inode-i_orphan, ext4_sb-s_orphan); + spin_unlock(orphan_lock); + } jbd_debug(4, superblock will point to %lu\n, inode-i_ino); jbd_debug(4, orphan inode %lu will point to %d\n, inode-i_ino, NEXT_ORPHAN(inode)); out_unlock: - mutex_unlock(EXT4_SB(sb)-s_orphan_lock); + mutex_unlock(inode_orphan_mutex); ext4_std_error(inode-i_sb, err); return err; } @@ -2624,71 +2650,106 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) { struct list_head *prev; struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi; + struct inode *i_next = NULL; + struct inode *i_prev = NULL; + struct ext4_sb_info *sbi = EXT4_SB(inode-i_sb); __u32 ino_next; + unsigned int ino_prev; struct ext4_iloc iloc; int err = 0; + struct mutex *inode_orphan_mutex; + spinlock_t *orphan_lock; + struct mutex *ondisk_orphan_mutex; - if ((!EXT4_SB(inode-i_sb)-s_journal) - !(EXT4_SB(inode-i_sb)-s_mount_state EXT4_ORPHAN_FS)) + if ((!sbi-s_journal) + !(sbi-s_mount_state EXT4_ORPHAN_FS)) return 0; - mutex_lock(EXT4_SB(inode-i_sb)-s_orphan_lock); - if (list_empty(ei-i_orphan)) - goto out; - - ino_next = NEXT_ORPHAN(inode); + inode_orphan_mutex = ei-i_orphan_mutex; + orphan_lock = sbi-s_orphan_lock; +
Re: [PATCH 2/2] fs/ext4/namei.c: reducing contention on s_orphan_lock mmutex
Hello, On Wed, Oct 02, 2013 at 09:36:59AM -0600, T Makphaibulchoke wrote: Instead of using a single per super block mutex, s_orphan_lock, to serialize all orphan list updates, a separate mutex and spinlock are used to protect the on disk and in memory orphan lists respecvitely. At the same time, a per inode mutex is used to serialize orphan list updates of a single inode. Signed-off-by: T. Makphaibulchoke t...@hp.com --- fs/ext4/namei.c | 139 1 file changed, 100 insertions(+), 39 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 35f55a0..d7d3d0f 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2554,12 +2554,24 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) struct super_block *sb = inode-i_sb; struct ext4_iloc iloc; int err = 0, rc; + struct ext4_inode_info *ext4_inode = EXT4_I(inode); + struct ext4_sb_info *ext4_sb = EXT4_SB(sb); + struct mutex *inode_orphan_mutex; + spinlock_t *orphan_lock; + struct mutex *ondisk_orphan_mutex; + struct list_head *prev; + unsigned int next_inum; + struct inode *next_inode; - if (!EXT4_SB(sb)-s_journal) + if (ext4_sb-s_journal) typo: !ext4_sb-s_journal I am not sure whether or not this will impact the result because when journal is enabled the inode will not be added into orphan list. Regards, - Zheng return 0; - mutex_lock(EXT4_SB(sb)-s_orphan_lock); - if (!list_empty(EXT4_I(inode)-i_orphan)) + inode_orphan_mutex = ext4_inode-i_orphan_mutex; + orphan_lock = ext4_sb-s_orphan_lock; + ondisk_orphan_mutex = ext4_sb-s_ondisk_orphan_lock; + mutex_lock(inode_orphan_mutex); + prev = ext4_inode-i_orphan.prev; + if (prev != ext4_inode-i_orphan) goto out_unlock; /* @@ -2579,17 +2591,28 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) err = ext4_reserve_inode_write(handle, inode, iloc); if (err) goto out_unlock; + mutex_lock(ondisk_orphan_mutex); /* * Due to previous errors inode may be already a part of on-disk * orphan list. If so skip on-disk list modification. */ if (NEXT_ORPHAN(inode) NEXT_ORPHAN(inode) = - (le32_to_cpu(EXT4_SB(sb)-s_es-s_inodes_count))) + (le32_to_cpu(EXT4_SB(sb)-s_es-s_inodes_count))) { + mutex_unlock(ondisk_orphan_mutex); goto mem_insert; + } /* Insert this inode at the head of the on-disk orphan list... */ - NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)-s_es-s_last_orphan); - EXT4_SB(sb)-s_es-s_last_orphan = cpu_to_le32(inode-i_ino); + next_inum = le32_to_cpu(EXT4_SB(sb)-s_es-s_last_orphan); + NEXT_ORPHAN(inode) = next_inum; + next_inode = ilookup(sb, next_inum); + if (next_inode) + EXT4_I(next_inode)-i_prev_orphan = inode-i_ino; + ext4_sb-s_es-s_last_orphan = cpu_to_le32(inode-i_ino); + ext4_inode-i_prev_orphan = 0; + mutex_unlock(ondisk_orphan_mutex); + if (next_inode) + iput(next_inode); err = ext4_handle_dirty_super(handle, sb); rc = ext4_mark_iloc_dirty(handle, inode, iloc); if (!err) @@ -2604,14 +2627,17 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) * This is safe: on error we're going to ignore the orphan list * anyway on the next recovery. */ mem_insert: - if (!err) - list_add(EXT4_I(inode)-i_orphan, EXT4_SB(sb)-s_orphan); + if (!err) { + spin_lock(orphan_lock); + list_add(ext4_inode-i_orphan, ext4_sb-s_orphan); + spin_unlock(orphan_lock); + } jbd_debug(4, superblock will point to %lu\n, inode-i_ino); jbd_debug(4, orphan inode %lu will point to %d\n, inode-i_ino, NEXT_ORPHAN(inode)); out_unlock: - mutex_unlock(EXT4_SB(sb)-s_orphan_lock); + mutex_unlock(inode_orphan_mutex); ext4_std_error(inode-i_sb, err); return err; } @@ -2624,71 +2650,106 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) { struct list_head *prev; struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi; + struct inode *i_next = NULL; + struct inode *i_prev = NULL; + struct ext4_sb_info *sbi = EXT4_SB(inode-i_sb); __u32 ino_next; + unsigned int ino_prev; struct ext4_iloc iloc; int err = 0; + struct mutex *inode_orphan_mutex; + spinlock_t *orphan_lock; + struct mutex *ondisk_orphan_mutex; - if ((!EXT4_SB(inode-i_sb)-s_journal) - !(EXT4_SB(inode-i_sb)-s_mount_state EXT4_ORPHAN_FS)) + if ((!sbi-s_journal) + !(sbi-s_mount_state EXT4_ORPHAN_FS)) return 0;