The commit is pushed to "branch-rh7-3.10.0-1160.6.1.vz7.171.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1160.6.1.vz7.171.1
------>
commit 33149e68cd1be429dbfc2c4a66019b2b971f7268
Author: Davidlohr Bueso <d...@stgolabs.net>
Date:   Thu Dec 3 11:40:47 2020 +0300
    ms/mm: convert i_mmap_mutex to rwsem

    The i_mmap_mutex is a close cousin of the anon vma lock, both protecting
    similar data, one for file backed pages and the other for anon memory.
    To this end, this lock can also be a rwsem. In addition, there are some
    important opportunities to share the lock when there are no tree
    modifications.

    This conversion is straightforward. For now, all users take the write
    lock.

    [s...@canb.auug.org.au: update fremap.c]
    Signed-off-by: Davidlohr Bueso <dbu...@suse.de>
    Reviewed-by: Rik van Riel <r...@redhat.com>
    Acked-by: "Kirill A. Shutemov" <kir...@shutemov.name>
    Acked-by: Hugh Dickins <hu...@google.com>
    Cc: Oleg Nesterov <o...@redhat.com>
    Acked-by: Peter Zijlstra (Intel) <pet...@infradead.org>
    Cc: Srikar Dronamraju <sri...@linux.vnet.ibm.com>
    Acked-by: Mel Gorman <mgor...@suse.de>
    Signed-off-by: Stephen Rothwell <s...@canb.auug.org.au>
    Signed-off-by: Andrew Morton <a...@linux-foundation.org>
    Signed-off-by: Linus Torvalds <torva...@linux-foundation.org>

    https://jira.sw.ru/browse/PSBM-122663
    (cherry picked from commit c8c06efa8b552608493b7066c234cfa82c47fcea)
    Signed-off-by: Andrey Ryabinin <aryabi...@virtuozzo.com>
---
 Documentation/vm/locking     |  2 +-
 fs/hugetlbfs/inode.c         | 10 +++++-----
 fs/inode.c                   |  2 +-
 include/linux/fs.h           |  7 ++++---
 include/linux/mmu_notifier.h |  2 +-
 kernel/events/uprobes.c      |  2 +-
 mm/filemap.c                 | 10 +++++-----
 mm/hugetlb.c                 | 10 +++++-----
 mm/memory.c                  |  2 +-
 mm/mmap.c                    |  6 +++---
 mm/mremap.c                  |  2 +-
 mm/rmap.c                    |  4 ++--
 12 files changed, 30 insertions(+), 29 deletions(-)

diff --git a/Documentation/vm/locking b/Documentation/vm/locking
index f61228b..fb64028 100644
--- a/Documentation/vm/locking
+++ b/Documentation/vm/locking
@@ -66,7 +66,7 @@ in some cases it is not really needed. Eg, vm_start is modified by
 expand_stack(), it is hard to come up with a destructive scenario without
 having the vmlist protection in this case.
 
-The page_table_lock nests with the inode i_mmap_mutex and the kmem cache
+The page_table_lock nests with the inode i_mmap_rwsem and the kmem cache
 c_spinlock spinlocks. This is okay, since the kmem code asks for pages after
 dropping c_spinlock. The page_table_lock also nests with pagecache_lock and
 pagemap_lru_lock spinlocks, and no code asks for memory with these locks
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 79c34c8..c1eef50 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -757,12 +757,12 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
 }
 
 /*
- * Hugetlbfs is not reclaimable; therefore its i_mmap_mutex will never
+ * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
 * be taken from reclaim -- unlike regular filesystems. This needs an
 * annotation because huge_pmd_share() does an allocation under hugetlb's
- * i_mmap_mutex.
+ * i_mmap_rwsem.
 */
-struct lock_class_key hugetlbfs_i_mmap_mutex_key;
+static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
 
 static struct inode *hugetlbfs_get_inode(struct super_block *sb,
 struct inode *dir,
@@ -779,8 +779,8 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
 if (inode) {
 inode->i_ino = get_next_ino();
 inode_init_owner(inode, dir, mode);
- lockdep_set_class(&inode->i_mapping->i_mmap_mutex,
- &hugetlbfs_i_mmap_mutex_key);
+ lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
+ &hugetlbfs_i_mmap_rwsem_key);
 inode->i_mapping->a_ops = &hugetlbfs_aops;
 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/inode.c b/fs/inode.c
index 3fec88e..72a9073 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -356,7 +356,7 @@ void address_space_init_once(struct address_space *mapping)
 memset(mapping, 0, sizeof(*mapping));
 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
 spin_lock_init(&mapping->tree_lock);
- mutex_init(&mapping->i_mmap_mutex);
+ init_rwsem(&mapping->i_mmap_rwsem);
 INIT_LIST_HEAD(&mapping->private_list);
 spin_lock_init(&mapping->private_lock);
 mapping->i_mmap = RB_ROOT;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d570b02..1ea2a23 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -17,6 +17,7 @@
 #include <linux/pid.h>
 #include <linux/bug.h>
 #include <linux/mutex.h>
+#include <linux/rwsem.h>
 #include <linux/capability.h>
 #include <linux/semaphore.h>
 #include <linux/fiemap.h>
@@ -626,7 +627,7 @@ struct address_space {
 RH_KABI_REPLACE(unsigned int i_mmap_writable,
 atomic_t i_mmap_writable) /* count VM_SHARED mappings */
 struct rb_root i_mmap; /* tree of private and shared mappings */
- struct mutex i_mmap_mutex; /* protect tree, count, list */
+ struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */
 /* Protected by tree_lock together with the radix tree */
 unsigned long nrpages; /* number of total pages */
 /* number of shadow or DAX exceptional entries */
@@ -700,12 +701,12 @@ int mapping_tagged(struct address_space *mapping, int tag);
 
 static inline void i_mmap_lock_write(struct address_space *mapping)
 {
- mutex_lock(&mapping->i_mmap_mutex);
+ down_write(&mapping->i_mmap_rwsem);
 }
 
 static inline void i_mmap_unlock_write(struct address_space *mapping)
 {
- mutex_unlock(&mapping->i_mmap_mutex);
+ up_write(&mapping->i_mmap_rwsem);
 }
 
 /*
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 156b150..ebcd0b4 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -218,7 +218,7 @@ struct mmu_notifier_rhel7 {
 * Therefore notifier chains can only be traversed when either
 *
 * 1. mmap_sem is held.
- * 2. One of the reverse map locks is held (i_mmap_mutex or anon_vma->rwsem).
+ * 2. One of the reverse map locks is held (i_mmap_rwsem or anon_vma->rwsem).
 * 3. No other concurrent thread can access the list (release)
 */
 struct mmu_notifier {
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 6c714b0..a7c557f 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -697,7 +697,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
 
 if (!prev && !more) {
 /*
- * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through
+ * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
 * reclaim. This is optimistic, no harm done if it fails.
 */
 prev = kmalloc(sizeof(struct map_info),
diff --git a/mm/filemap.c b/mm/filemap.c
index 2bd5ca4..950d92b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -66,16 +66,16 @@
 /*
 * Lock ordering:
 *
- * ->i_mmap_mutex (truncate_pagecache)
+ * ->i_mmap_rwsem (truncate_pagecache)
 * ->private_lock (__free_pte->__set_page_dirty_buffers)
 * ->swap_lock (exclusive_swap_page, others)
 * ->mapping->tree_lock
 *
 * ->i_mutex
- * ->i_mmap_mutex (truncate->unmap_mapping_range)
+ * ->i_mmap_rwsem (truncate->unmap_mapping_range)
 *
 * ->mmap_sem
- * ->i_mmap_mutex
+ * ->i_mmap_rwsem
 * ->page_table_lock or pte_lock (various, mainly in memory.c)
 * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
 *
@@ -89,7 +89,7 @@
 * sb_lock (fs/fs-writeback.c)
 * ->mapping->tree_lock (__sync_single_inode)
 *
- * ->i_mmap_mutex
+ * ->i_mmap_rwsem
 * ->anon_vma.lock (vma_adjust)
 *
 * ->anon_vma.lock
@@ -109,7 +109,7 @@
 * ->inode->i_lock (zap_pte_range->set_page_dirty)
 * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
 *
- * ->i_mmap_mutex
+ * ->i_mmap_rwsem
 * ->tasklist_lock (memory_failure, collect_procs_ao)
 */
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dd751e3..8300a7f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3515,9 +3515,9 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 * on its way out. We're lucky that the flag has such an appropriate
 * name, and can in fact be safely cleared here. We could clear it
 * before the __unmap_hugepage_range above, but all that's necessary
- * is to clear it before releasing the i_mmap_mutex. This works
+ * is to clear it before releasing the i_mmap_rwsem. This works
 * because in the context this is called, the VMA is about to be
- * destroyed and the i_mmap_mutex is held.
+ * destroyed and the i_mmap_rwsem is held.
 */
 vma->vm_flags &= ~VM_MAYSHARE;
 }
@@ -4456,9 +4456,9 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 spin_unlock(ptl);
 }
 /*
- * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+ * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
 * may have cleared our pud entry and done put_page on the page table:
- * once we release i_mmap_mutex, another task can do the final put_page
+ * once we release i_mmap_rwsem, another task can do the final put_page
 * and that page table be reused and filled with junk.
 */
 flush_tlb_range(vma, start, end);
@@ -4658,7 +4658,7 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
 * and returns the corresponding pte. While this is not necessary for the
 * !shared pmd case because we can allocate the pmd later as well, it makes the
 * code much cleaner. pmd allocation is essential for the shared case because
- * pud has to be populated inside the same i_mmap_mutex section - otherwise
+ * pud has to be populated inside the same i_mmap_rwsem section - otherwise
 * racing tasks could either miss the sharing (see huge_pte_offset) or select a
 * bad pmd for sharing.
 */
diff --git a/mm/memory.c b/mm/memory.c
index 8f7736d..ac2d668 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4360,7 +4360,7 @@ restart:
 spin_unlock(&inode->i_lock);
 }
 
- mutex_lock_nested(&peer->i_mmap_mutex, SINGLE_DEPTH_NESTING);
+ down_write_nested(&peer->i_mmap_rwsem, SINGLE_DEPTH_NESTING);
 if (!peer->i_peer_file) {
 i_mmap_unlock_write(peer);
 goto restart;
diff --git a/mm/mmap.c b/mm/mmap.c
index 837a1ac..e515507 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -248,7 +248,7 @@ error:
 }
 
 /*
- * Requires inode->i_mapping->i_mmap_mutex
+ * Requires inode->i_mapping->i_mmap_rwsem
 */
 static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 struct file *file, struct address_space *mapping)
@@ -3200,7 +3200,7 @@ void exit_mmap(struct mm_struct *mm)
 
 /* Insert vm structure into process list sorted by address
 * and into the inode's i_mmap tree. If vm_file is non-NULL
- * then i_mmap_mutex is taken here.
+ * then i_mmap_rwsem is taken here.
 */
 int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
 {
@@ -3476,7 +3476,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
 */
 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
 BUG();
- mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem);
+ down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem);
 }
 }
 
diff --git a/mm/mremap.c b/mm/mremap.c
index 3e50a89..b9d487f 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -117,7 +117,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 unsigned long len = old_end - old_addr;
 
 /*
- * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma
+ * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
 * locks to ensure that rmap will always observe either the old or the
 * new ptes. This is the easiest way to avoid races with
 * truncate_pagecache(), page migration, etc...
diff --git a/mm/rmap.c b/mm/rmap.c
index 478f384..e72be32c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1716,14 +1716,14 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
 * The page lock not only makes sure that page->mapping cannot
 * suddenly be NULLified by truncation, it makes sure that the
 * structure at mapping cannot be freed and reused yet,
- * so we can safely take mapping->i_mmap_mutex.
+ * so we can safely take mapping->i_mmap_rwsem.
 */
 VM_BUG_ON(!PageLocked(page));
 
 if (!mapping)
 return ret;
 
 pgoff = page_to_pgoff(page);
- mutex_lock_nested(&mapping->i_mmap_mutex, SINGLE_DEPTH_NESTING);
+ down_write_nested(&mapping->i_mmap_rwsem, SINGLE_DEPTH_NESTING);
 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 unsigned long address = vma_address(page, vma);
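A note on the read side, since the commit message mentions it: the rwsem also
makes it possible to take this lock shared on paths that only walk the i_mmap
interval tree without modifying it. That is intentionally out of scope here --
every caller above stays on the write lock -- but, for context, a read-side
wrapper next to the existing i_mmap_lock_write()/i_mmap_unlock_write() helpers
in include/linux/fs.h would look roughly like the sketch below. This is
illustrative only and not part of this patch; the helper names are simply
mirrored from the write-side wrappers.

/* Sketch only (not in this series): shared locking for read-only i_mmap walks. */
static inline void i_mmap_lock_read(struct address_space *mapping)
{
	down_read(&mapping->i_mmap_rwsem);
}

static inline void i_mmap_unlock_read(struct address_space *mapping)
{
	up_read(&mapping->i_mmap_rwsem);
}

With such helpers, readers of the tree would no longer serialize against each
other, while anything that modifies the interval tree keeps taking the lock
for writing exactly as this patch does.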