Rewrite unionfs_writepage to minimize dependence on AOP_WRITEPAGE_ACTIVATE,
handle memory pressure better, and update documentation.  Remove
unionfs_sync_page because it's not needed.

CC: Hugh Dickins <[EMAIL PROTECTED]>
CC: Pekka Enberg <[EMAIL PROTECTED]>

Signed-off-by: Erez Zadok <[EMAIL PROTECTED]>
---
 fs/unionfs/mmap.c |  156 ++++++++++++++++++++--------------------------------
 1 files changed, 60 insertions(+), 96 deletions(-)

diff --git a/fs/unionfs/mmap.c b/fs/unionfs/mmap.c
index bed11c3..4b00b98 100644
--- a/fs/unionfs/mmap.c
+++ b/fs/unionfs/mmap.c
@@ -25,83 +25,91 @@ static int unionfs_writepage(struct page *page, struct writeback_control *wbc)
        struct inode *inode;
        struct inode *lower_inode;
        struct page *lower_page;
-       char *kaddr, *lower_kaddr;
+       struct address_space *lower_mapping; /* lower inode mapping */
+       gfp_t mask;
 
        inode = page->mapping->host;
        lower_inode = unionfs_lower_inode(inode);
+       lower_mapping = lower_inode->i_mapping;
 
        /*
         * find lower page (returns a locked page)
         *
-        * NOTE: we used to call grab_cache_page(), but that was unnecessary
-        * as it would have tried to create a new lower page if it didn't
-        * exist, leading to deadlocks (esp. under memory-pressure
-        * conditions, when it is really a bad idea to *consume* more
-        * memory).  Instead, we assume the lower page exists, and if we can
-        * find it, then we ->writepage on it; if we can't find it, then it
-        * couldn't have disappeared unless the kernel already flushed it,
-        * in which case we're still OK.  This is especially correct if
-        * wbc->sync_mode is WB_SYNC_NONE (as per
-        * Documentation/filesystems/vfs.txt).  If we can't flush our page
-        * because we can't find a lower page, then at least we re-mark our
-        * page as dirty, and return AOP_WRITEPAGE_ACTIVATE as the VFS
-        * expects us to.  (Note, if in the future it'd turn out that we
-        * have to find a lower page no matter what, then we'd have to
-        * resort to RAIF's page pointer flipping trick.)
+        * We turn off __GFP_FS while we look for or create a new lower
+        * page.  This prevents a recursion into the file system code, which
+        * under memory pressure conditions could lead to a deadlock.  This
+        * is similar to how the loop driver behaves (see loop_set_fd in
+        * drivers/block/loop.c).  If we can't find the lower page, we
+        * redirty our page and return "success" so that the VM will call us
+        * again in the (hopefully near) future.
         */
-       lower_page = find_lock_page(lower_inode->i_mapping, page->index);
+       mask = mapping_gfp_mask(lower_mapping) & ~(__GFP_FS);
+       lower_page = find_or_create_page(lower_mapping, page->index, mask);
        if (!lower_page) {
-               err = AOP_WRITEPAGE_ACTIVATE;
+               err = 0;
                set_page_dirty(page);
                goto out;
        }
 
-       /* get page address, and encode it */
-       kaddr = kmap(page);
-       lower_kaddr = kmap(lower_page);
+       /* copy page data from our upper page to the lower page */
+       copy_highpage(lower_page, page);
 
-       memcpy(lower_kaddr, kaddr, PAGE_CACHE_SIZE);
-
-       kunmap(page);
-       kunmap(lower_page);
-
-       BUG_ON(!lower_inode->i_mapping->a_ops->writepage);
-
-       /* call lower writepage (expects locked page) */
-       clear_page_dirty_for_io(lower_page); /* emulate VFS behavior */
-       err = lower_inode->i_mapping->a_ops->writepage(lower_page, wbc);
-
-       /* b/c find_lock_page locked it and ->writepage unlocks on success */
-       if (err)
+       /*
+        * Call lower writepage (expects locked page).  However, if we are
+        * called with wbc->for_reclaim, then the VFS/VM just wants to
+        * reclaim our page.  Therefore, we don't need to call the lower
+        * ->writepage: just copy our data to the lower page (already done
+        * above), then mark the lower page dirty and unlock it, and return
+        * success.
+        */
+       if (wbc->for_reclaim) {
+               set_page_dirty(lower_page);
                unlock_page(lower_page);
-       /* b/c grab_cache_page increased refcnt */
-       page_cache_release(lower_page);
-
+               goto out_release;
+       }
+       BUG_ON(!lower_mapping->a_ops->writepage);
+       clear_page_dirty_for_io(lower_page); /* emulate VFS behavior */
+       err = lower_mapping->a_ops->writepage(lower_page, wbc);
        if (err < 0) {
                ClearPageUptodate(page);
-               goto out;
+               goto out_release;
        }
+
+       /*
+        * Lower file systems such as ramfs and tmpfs, may return
+        * AOP_WRITEPAGE_ACTIVATE so that the VM won't try to (pointlessly)
+        * write the page again for a while.  But those lower file systems
+        * also set the page dirty bit back again.  Since we successfully
+        * copied our page data to the lower page, then the VM will come
+        * back to the lower page (directly) and try to flush it.  So we can
+        * save the VM the hassle of coming back to our page and trying to
+        * flush too.  Therefore, we don't re-dirty our own page, and we
+        * never return AOP_WRITEPAGE_ACTIVATE back to the VM (we consider
+        * this a success).
+        *
+        * We also unlock the lower page if the lower ->writepage returned
+        * AOP_WRITEPAGE_ACTIVATE.  (This "anomalous" behaviour may be
+        * addressed in future shmem/VM code.)
+        */
        if (err == AOP_WRITEPAGE_ACTIVATE) {
-               /*
-                * Lower file systems such as ramfs and tmpfs, may return
-                * AOP_WRITEPAGE_ACTIVATE so that the VM won't try to
-                * (pointlessly) write the page again for a while.  But
-                * those lower file systems also set the page dirty bit back
-                * again.  So we mimic that behaviour here.
-                */
-               if (PageDirty(lower_page))
-                       set_page_dirty(page);
-               goto out;
+               err = 0;
+               unlock_page(lower_page);
        }
 
        /* all is well */
        SetPageUptodate(page);
-       /* lower mtimes has changed: update ours */
+       /* lower mtimes have changed: update ours */
        unionfs_copy_attr_times(inode);
 
-       unlock_page(page);
-
+out_release:
+       /* b/c find_or_create_page increased refcnt */
+       page_cache_release(lower_page);
 out:
+       /*
+        * We unlock our page unconditionally, because we never return
+        * AOP_WRITEPAGE_ACTIVATE.
+        */
+       unlock_page(page);
        return err;
 }
 
@@ -119,9 +127,8 @@ static int unionfs_writepages(struct address_space *mapping,
        if (!mapping_cap_writeback_dirty(lower_inode->i_mapping))
                goto out;
 
-       /* Note: generic_writepages may return AOP_WRITEPAGE_ACTIVATE */
        err = generic_writepages(mapping, wbc);
-       if (err == 0)
+       if (!err)
                unionfs_copy_attr_times(inode);
 out:
        return err;
@@ -313,53 +320,10 @@ out:
        return err;             /* assume all is ok */
 }
 
-static void unionfs_sync_page(struct page *page)
-{
-       struct inode *inode;
-       struct inode *lower_inode;
-       struct page *lower_page;
-       struct address_space *mapping;
-
-       inode = page->mapping->host;
-       lower_inode = unionfs_lower_inode(inode);
-
-       /*
-        * Find lower page (returns a locked page).
-        *
-        * NOTE: we used to call grab_cache_page(), but that was unnecessary
-        * as it would have tried to create a new lower page if it didn't
-        * exist, leading to deadlocks.  All our sync_page method needs to
-        * do is ensure that pending I/O gets done.
-        */
-       lower_page = find_lock_page(lower_inode->i_mapping, page->index);
-       if (!lower_page) {
-               printk(KERN_ERR "unionfs: find_lock_page failed\n");
-               goto out;
-       }
-
-       /* do the actual sync */
-       mapping = lower_page->mapping;
-       /*
-        * XXX: can we optimize ala RAIF and set the lower page to be
-        * discarded after a successful sync_page?
-        */
-       if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
-               mapping->a_ops->sync_page(lower_page);
-
-       /* b/c find_lock_page locked it */
-       unlock_page(lower_page);
-       /* b/c find_lock_page increased refcnt */
-       page_cache_release(lower_page);
-
-out:
-       return;
-}
-
 struct address_space_operations unionfs_aops = {
        .writepage      = unionfs_writepage,
        .writepages     = unionfs_writepages,
        .readpage       = unionfs_readpage,
        .prepare_write  = unionfs_prepare_write,
        .commit_write   = unionfs_commit_write,
-       .sync_page      = unionfs_sync_page,
 };
-- 
1.5.2.2

-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to