When setting SEAL_WRITE, we must make sure nobody has a writable reference
to the pages (via GUP or similar). We currently check references and wait
some time for them to be dropped. This, however, might fail for several
reasons, including:
 - the page is pinned for longer than we wait
 - while we wait, someone takes an already pinned page for read-access

Therefore, this patch introduces page-isolation. When sealing a file with
SEAL_WRITE, we copy all pages that have an elevated ref-count. The new page
is put in place atomically; the old page is detached and left alone. It
will be reclaimed once the last external user has dropped it.

Signed-off-by: David Herrmann <dh.herrm...@gmail.com>
---
 mm/shmem.c | 218 +++++++++++++++++++++++++++++--------------------------------
 1 file changed, 105 insertions(+), 113 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index ddc3998..34b14fb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1237,6 +1237,110 @@ unlock:
        return error;
 }
 
+static int shmem_isolate_page(struct inode *inode, struct page *oldpage)
+{
+       struct address_space *mapping = inode->i_mapping;
+       struct shmem_inode_info *info = SHMEM_I(inode);
+       struct page *newpage;
+       int error;
+
+       if (oldpage->mapping != mapping)
+               return 0;
+       if (page_count(oldpage) - page_mapcount(oldpage) <= 2)
+               return 0;
+
+       if (page_mapped(oldpage))
+               unmap_mapping_range(mapping,
+                                   (loff_t)oldpage->index << PAGE_CACHE_SHIFT,
+                                   PAGE_CACHE_SIZE, 0);
+
+       VM_BUG_ON_PAGE(PageWriteback(oldpage), oldpage);
+       VM_BUG_ON_PAGE(page_has_private(oldpage), oldpage);
+
+       newpage = shmem_alloc_page(mapping_gfp_mask(mapping), info,
+                                  oldpage->index);
+       if (!newpage)
+               return -ENOMEM;
+
+       __set_page_locked(newpage);
+       copy_highpage(newpage, oldpage);
+       flush_dcache_page(newpage);
+
+       page_cache_get(newpage);
+       SetPageUptodate(newpage);
+       SetPageSwapBacked(newpage);
+       newpage->mapping = mapping;
+       newpage->index = oldpage->index;
+
+       cancel_dirty_page(oldpage, PAGE_CACHE_SIZE);
+
+       spin_lock_irq(&mapping->tree_lock);
+       error = shmem_radix_tree_replace(mapping, oldpage->index,
+                                        oldpage, newpage);
+       if (!error) {
+               __inc_zone_page_state(newpage, NR_FILE_PAGES);
+               __dec_zone_page_state(oldpage, NR_FILE_PAGES);
+       }
+       spin_unlock_irq(&mapping->tree_lock);
+
+       if (error) {
+               newpage->mapping = NULL;
+               unlock_page(newpage);
+               page_cache_release(newpage);
+               page_cache_release(newpage);
+               return error;
+       }
+
+       mem_cgroup_replace_page_cache(oldpage, newpage);
+       lru_cache_add_anon(newpage);
+
+       oldpage->mapping = NULL;
+       page_cache_release(oldpage);
+       unlock_page(newpage);
+       page_cache_release(newpage);
+
+       return 1;
+}
+
+static int shmem_isolate_pins(struct inode *inode)
+{
+       struct address_space *mapping = inode->i_mapping;
+       struct pagevec pvec;
+       pgoff_t indices[PAGEVEC_SIZE];
+       pgoff_t index;
+       int i, ret, error;
+
+       pagevec_init(&pvec, 0);
+       index = 0;
+       error = 0;
+       while ((pvec.nr = find_get_entries(mapping, index, PAGEVEC_SIZE,
+                                          pvec.pages, indices))) {
+               for (i = 0; i < pagevec_count(&pvec); i++) {
+                       struct page *page = pvec.pages[i];
+
+                       index = indices[i];
+                       if (radix_tree_exceptional_entry(page))
+                               continue;
+                       if (page->mapping != mapping)
+                               continue;
+                       if (page_count(page) - page_mapcount(page) <= 2)
+                               continue;
+
+                       lock_page(page);
+                       ret = shmem_isolate_page(inode, page);
+                       if (ret < 0)
+                               error = ret;
+                       unlock_page(page);
+               }
+               pagevec_remove_exceptionals(&pvec);
+               pagevec_release(&pvec);
+               cond_resched();
+               index++;
+       }
+
+       return error;
+}
+
 static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
        struct inode *inode = file_inode(vma->vm_file);
@@ -1734,118 +1838,6 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
        return offset;
 }
 
-/*
- * We need a tag: a new tag would expand every radix_tree_node by 8 bytes,
- * so reuse a tag which we firmly believe is never set or cleared on shmem.
- */
-#define SHMEM_TAG_PINNED        PAGECACHE_TAG_TOWRITE
-#define LAST_SCAN               4       /* about 150ms max */
-
-static void shmem_tag_pins(struct address_space *mapping)
-{
-       struct radix_tree_iter iter;
-       void **slot;
-       pgoff_t start;
-       struct page *page;
-
-       start = 0;
-       rcu_read_lock();
-
-restart:
-       radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
-               page = radix_tree_deref_slot(slot);
-               if (!page || radix_tree_exception(page)) {
-                       if (radix_tree_deref_retry(page))
-                               goto restart;
-               } else if (page_count(page) - page_mapcount(page) > 1) {
-                       spin_lock_irq(&mapping->tree_lock);
-                       radix_tree_tag_set(&mapping->page_tree, iter.index,
-                                          SHMEM_TAG_PINNED);
-                       spin_unlock_irq(&mapping->tree_lock);
-               }
-
-               if (need_resched()) {
-                       cond_resched_rcu();
-                       start = iter.index + 1;
-                       goto restart;
-               }
-       }
-       rcu_read_unlock();
-}
-
-/*
- * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
- * via get_user_pages(), drivers might have some pending I/O without any active
- * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages
- * and see whether it has an elevated ref-count. If so, we tag them and wait for
- * them to be dropped.
- * The caller must guarantee that no new user will acquire writable references
- * to those pages to avoid races.
- */
-static int shmem_wait_for_pins(struct address_space *mapping)
-{
-       struct radix_tree_iter iter;
-       void **slot;
-       pgoff_t start;
-       struct page *page;
-       int error, scan;
-
-       shmem_tag_pins(mapping);
-
-       error = 0;
-       for (scan = 0; scan <= LAST_SCAN; scan++) {
-               if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED))
-                       break;
-
-               if (!scan)
-                       lru_add_drain_all();
-               else if (schedule_timeout_killable((HZ << scan) / 200))
-                       scan = LAST_SCAN;
-
-               start = 0;
-               rcu_read_lock();
-restart:
-               radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter,
-                                          start, SHMEM_TAG_PINNED) {
-
-                       page = radix_tree_deref_slot(slot);
-                       if (radix_tree_exception(page)) {
-                               if (radix_tree_deref_retry(page))
-                                       goto restart;
-
-                               page = NULL;
-                       }
-
-                       if (page &&
-                           page_count(page) - page_mapcount(page) != 1) {
-                               if (scan < LAST_SCAN)
-                                       goto continue_resched;
-
-                               /*
-                                * On the last scan, we clean up all those tags
-                                * we inserted; but make a note that we still
-                                * found pages pinned.
-                                */
-                               error = -EBUSY;
-                       }
-
-                       spin_lock_irq(&mapping->tree_lock);
-                       radix_tree_tag_clear(&mapping->page_tree,
-                                            iter.index, SHMEM_TAG_PINNED);
-                       spin_unlock_irq(&mapping->tree_lock);
-continue_resched:
-                       if (need_resched()) {
-                               cond_resched_rcu();
-                               start = iter.index + 1;
-                               goto restart;
-                       }
-               }
-               rcu_read_unlock();
-       }
-
-       return error;
-}
-
 #define F_ALL_SEALS (F_SEAL_SEAL | \
                     F_SEAL_SHRINK | \
                     F_SEAL_GROW | \
@@ -1907,7 +1899,7 @@ int shmem_add_seals(struct file *file, unsigned int seals)
                if (error)
                        goto unlock;
 
-               error = shmem_wait_for_pins(file->f_mapping);
+               error = shmem_isolate_pins(inode);
                if (error) {
                        mapping_allow_writable(file->f_mapping);
                        goto unlock;
-- 
2.0.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to