On Wed, May 23, 2018 at 04:46:46PM +0200, Christoph Hellwig wrote:
> Switch to using the iomap_page structure for checking sub-page uptodate
> status and track sub-page I/O completion status, and remove large
> quantities of boilerplate code working around buffer heads.
> 
> Signed-off-by: Christoph Hellwig <h...@lst.de>
> ---
>  fs/xfs/xfs_aops.c  | 536 +++++++--------------------------------------
>  fs/xfs/xfs_buf.h   |   1 -
>  fs/xfs/xfs_iomap.c |   3 -
>  fs/xfs/xfs_super.c |   2 +-
>  fs/xfs/xfs_trace.h |  18 +-
>  5 files changed, 79 insertions(+), 481 deletions(-)
> 
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index efa2cbb27d67..d279929e53fb 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
...
> @@ -768,7 +620,7 @@ xfs_aops_discard_page(
>       int                     error;
>  
>       if (XFS_FORCED_SHUTDOWN(mp))
> -             goto out_invalidate;
> +             goto out;
>  
>       xfs_alert(mp,
>               "page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
> @@ -778,15 +630,15 @@ xfs_aops_discard_page(
>                       PAGE_SIZE / i_blocksize(inode));
>       if (error && !XFS_FORCED_SHUTDOWN(mp))
>               xfs_alert(mp, "page discard unable to remove delalloc mapping.");
> -out_invalidate:
> -     xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
> +out:
> +     iomap_invalidatepage(page, 0, PAGE_SIZE);

All this does is lose the tracepoint. I don't think this call needs to
change. The rest looks Ok to me, but I still need to run some tests on
the whole thing.

Brian

>  }
>  
>  /*
>   * We implement an immediate ioend submission policy here to avoid needing to
>   * chain multiple ioends and hence nest mempool allocations which can violate
>   * forward progress guarantees we need to provide. The current ioend we are
> - * adding buffers to is cached on the writepage context, and if the new buffer
> + * adding blocks to is cached on the writepage context, and if the new block
>   * does not append to the cached ioend it will create a new ioend and cache that
>   * instead.
>   *
> @@ -807,41 +659,28 @@ xfs_writepage_map(
>       uint64_t                end_offset)
>  {
>       LIST_HEAD(submit_list);
> +     struct iomap_page       *iop = to_iomap_page(page);
> +     unsigned                len = i_blocksize(inode);
>       struct xfs_ioend        *ioend, *next;
> -     struct buffer_head      *bh = NULL;
> -     ssize_t                 len = i_blocksize(inode);
> -     int                     error = 0;
> -     int                     count = 0;
> -     loff_t                  file_offset;    /* file offset of page */
> -     unsigned                poffset;        /* offset into page */
> +     int                     error = 0, count = 0, i;
> +     u64                     file_offset;    /* file offset of page */
>  
> -     if (page_has_buffers(page))
> -             bh = page_buffers(page);
> +     ASSERT(iop || i_blocksize(inode) == PAGE_SIZE);
> +     ASSERT(!iop || atomic_read(&iop->write_count) == 0);
>  
>       /*
> -      * Walk the blocks on the page, and we we run off then end of the
> -      * current map or find the current map invalid, grab a new one.
> -      * We only use bufferheads here to check per-block state - they no
> -      * longer control the iteration through the page. This allows us to
> -      * replace the bufferhead with some other state tracking mechanism in
> -      * future.
> +      * Walk through the page to find areas to write back. If we run off the
> +      * end of the current map or find the current map invalid, grab a new
> +      * one.
>        */
> -     for (poffset = 0, file_offset = page_offset(page);
> -          poffset < PAGE_SIZE;
> -          poffset += len, file_offset += len) {
> -             /* past the range we are writing, so nothing more to write. */
> -             if (file_offset >= end_offset)
> -                     break;
> -
> +     for (i = 0, file_offset = page_offset(page);
> +          i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset;
> +          i++, file_offset += len) {
>               /*
>                * Block does not contain valid data, skip it.
>                */
> -             if (bh && !buffer_uptodate(bh)) {
> -                     if (PageUptodate(page))
> -                             ASSERT(buffer_mapped(bh));
> -                     bh = bh->b_this_page;
> +             if (iop && !test_bit(i, iop->uptodate))
>                       continue;
> -             }
>  
>               /*
>                * If we don't have a valid map, now it's time to get a new one
> @@ -854,52 +693,33 @@ xfs_writepage_map(
>                       error = xfs_map_blocks(inode, file_offset, &wpc->imap,
>                                            &wpc->io_type);
>                       if (error)
> -                             goto out;
> +                             break;
>               }
>  
> -             if (wpc->io_type == XFS_IO_HOLE) {
> -                     /*
> -                      * set_page_dirty dirties all buffers in a page, independent
> -                      * of their state.  The dirty state however is entirely
> -                      * meaningless for holes (!mapped && uptodate), so check we did
> -                      * have a buffer covering a hole here and continue.
> -                      */
> -                     if (bh)
> -                             bh = bh->b_this_page;
> -                     continue;
> -             }
> -
> -             if (bh) {
> -                     xfs_map_at_offset(inode, bh, &wpc->imap, file_offset);
> -                     bh = bh->b_this_page;
> +             if (wpc->io_type != XFS_IO_HOLE) {
> +                     xfs_add_to_ioend(inode, file_offset, page, iop, wpc,
> +                             wbc, &submit_list);
> +                     count++;
>               }
> -             xfs_add_to_ioend(inode, file_offset, page, wpc, wbc,
> -                             &submit_list);
> -             count++;
>       }
>  
>       ASSERT(wpc->ioend || list_empty(&submit_list));
> -
> -out:
>       ASSERT(PageLocked(page));
>       ASSERT(!PageWriteback(page));
>  
>       /*
> -      * On error, we have to fail the ioend here because we have locked
> -      * buffers in the ioend. If we don't do this, we'll deadlock
> -      * invalidating the page as that tries to lock the buffers on the page.
> -      * Also, because we may have set pages under writeback, we have to make
> -      * sure we run IO completion to mark the error state of the IO
> -      * appropriately, so we can't cancel the ioend directly here. That means
> -      * we have to mark this page as under writeback if we included any
> -      * buffers from it in the ioend chain so that completion treats it
> -      * correctly.
> +      * On error, we have to fail the ioend here because we may have set
> +      * pages under writeback, we have to make sure we run IO completion to
> +      * mark the error state of the IO appropriately, so we can't cancel the
> +      * ioend directly here.  That means we have to mark this page as under
> +      * writeback if we included any blocks from it in the ioend chain so
> +      * that completion treats it correctly.
>        *
>        * If we didn't include the page in the ioend, the on error we can
>        * simply discard and unlock it as there are no other users of the page
> -      * or it's buffers right now. The caller will still need to trigger
> -      * submission of outstanding ioends on the writepage context so they are
> -      * treated correctly on error.
> +      * now.  The caller will still need to trigger submission of outstanding
> +      * ioends on the writepage context so they are treated correctly on
> +      * error.
>        */
>       if (unlikely(error)) {
>               if (!count) {
> @@ -940,8 +760,8 @@ xfs_writepage_map(
>       }
>  
>       /*
> -      * We can end up here with no error and nothing to write if we race with
> -      * a partial page truncate on a sub-page block sized filesystem.
> +      * We can end up here with no error and nothing to write only if we race
> +      * with a partial page truncate on a sub-page block sized filesystem.
>        */
>       if (!count)
>               end_page_writeback(page);
> @@ -956,7 +776,6 @@ xfs_writepage_map(
>   * For delalloc space on the page we need to allocate space and flush it.
>   * For unwritten space on the page we need to start the conversion to
>   * regular allocated space.
> - * For any other dirty buffer heads on the page we should flush them.
>   */
>  STATIC int
>  xfs_do_writepage(
> @@ -1110,168 +929,6 @@ xfs_dax_writepages(
>                       xfs_find_bdev_for_inode(mapping->host), wbc);
>  }
>  
> -/*
> - * Called to move a page into cleanable state - and from there
> - * to be released. The page should already be clean. We always
> - * have buffer heads in this call.
> - *
> - * Returns 1 if the page is ok to release, 0 otherwise.
> - */
> -STATIC int
> -xfs_vm_releasepage(
> -     struct page             *page,
> -     gfp_t                   gfp_mask)
> -{
> -     int                     delalloc, unwritten;
> -
> -     trace_xfs_releasepage(page->mapping->host, page, 0, 0);
> -
> -     /*
> -      * mm accommodates an old ext3 case where clean pages might not have had
> -      * the dirty bit cleared. Thus, it can send actual dirty pages to
> -      * ->releasepage() via shrink_active_list(). Conversely,
> -      * block_invalidatepage() can send pages that are still marked dirty but
> -      * otherwise have invalidated buffers.
> -      *
> -      * We want to release the latter to avoid unnecessary buildup of the
> -      * LRU, so xfs_vm_invalidatepage() clears the page dirty flag on pages
> -      * that are entirely invalidated and need to be released.  Hence the
> -      * only time we should get dirty pages here is through
> -      * shrink_active_list() and so we can simply skip those now.
> -      *
> -      * warn if we've left any lingering delalloc/unwritten buffers on clean
> -      * or invalidated pages we are about to release.
> -      */
> -     if (PageDirty(page))
> -             return 0;
> -
> -     xfs_count_page_state(page, &delalloc, &unwritten);
> -
> -     if (WARN_ON_ONCE(delalloc))
> -             return 0;
> -     if (WARN_ON_ONCE(unwritten))
> -             return 0;
> -
> -     return try_to_free_buffers(page);
> -}
> -
> -/*
> - * If this is O_DIRECT or the mpage code calling tell them how large the mapping
> - * is, so that we can avoid repeated get_blocks calls.
> - *
> - * If the mapping spans EOF, then we have to break the mapping up as the mapping
> - * for blocks beyond EOF must be marked new so that sub block regions can be
> - * correctly zeroed. We can't do this for mappings within EOF unless the mapping
> - * was just allocated or is unwritten, otherwise the callers would overwrite
> - * existing data with zeros. Hence we have to split the mapping into a range up
> - * to and including EOF, and a second mapping for beyond EOF.
> - */
> -static void
> -xfs_map_trim_size(
> -     struct inode            *inode,
> -     sector_t                iblock,
> -     struct buffer_head      *bh_result,
> -     struct xfs_bmbt_irec    *imap,
> -     xfs_off_t               offset,
> -     ssize_t                 size)
> -{
> -     xfs_off_t               mapping_size;
> -
> -     mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
> -     mapping_size <<= inode->i_blkbits;
> -
> -     ASSERT(mapping_size > 0);
> -     if (mapping_size > size)
> -             mapping_size = size;
> -     if (offset < i_size_read(inode) &&
> -         (xfs_ufsize_t)offset + mapping_size >= i_size_read(inode)) {
> -             /* limit mapping to block that spans EOF */
> -             mapping_size = roundup_64(i_size_read(inode) - offset,
> -                                       i_blocksize(inode));
> -     }
> -     if (mapping_size > LONG_MAX)
> -             mapping_size = LONG_MAX;
> -
> -     bh_result->b_size = mapping_size;
> -}
> -
> -static int
> -xfs_get_blocks(
> -     struct inode            *inode,
> -     sector_t                iblock,
> -     struct buffer_head      *bh_result,
> -     int                     create)
> -{
> -     struct xfs_inode        *ip = XFS_I(inode);
> -     struct xfs_mount        *mp = ip->i_mount;
> -     xfs_fileoff_t           offset_fsb, end_fsb;
> -     int                     error = 0;
> -     int                     lockmode = 0;
> -     struct xfs_bmbt_irec    imap;
> -     int                     nimaps = 1;
> -     xfs_off_t               offset;
> -     ssize_t                 size;
> -
> -     BUG_ON(create);
> -
> -     if (XFS_FORCED_SHUTDOWN(mp))
> -             return -EIO;
> -
> -     offset = (xfs_off_t)iblock << inode->i_blkbits;
> -     ASSERT(bh_result->b_size >= i_blocksize(inode));
> -     size = bh_result->b_size;
> -
> -     if (offset >= i_size_read(inode))
> -             return 0;
> -
> -     /*
> -      * Direct I/O is usually done on preallocated files, so try getting
> -      * a block mapping without an exclusive lock first.
> -      */
> -     lockmode = xfs_ilock_data_map_shared(ip);
> -
> -     ASSERT(offset <= mp->m_super->s_maxbytes);
> -     if (offset > mp->m_super->s_maxbytes - size)
> -             size = mp->m_super->s_maxbytes - offset;
> -     end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
> -     offset_fsb = XFS_B_TO_FSBT(mp, offset);
> -
> -     error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
> -                     &nimaps, 0);
> -     if (error)
> -             goto out_unlock;
> -     if (!nimaps) {
> -             trace_xfs_get_blocks_notfound(ip, offset, size);
> -             goto out_unlock;
> -     }
> -
> -     trace_xfs_get_blocks_found(ip, offset, size,
> -             imap.br_state == XFS_EXT_UNWRITTEN ?
> -                     XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap);
> -     xfs_iunlock(ip, lockmode);
> -
> -     /* trim mapping down to size requested */
> -     xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
> -
> -     /*
> -      * For unwritten extents do not report a disk address in the buffered
> -      * read case (treat as if we're reading into a hole).
> -      */
> -     if (xfs_bmap_is_real_extent(&imap))
> -             xfs_map_buffer(inode, bh_result, &imap, offset);
> -
> -     /*
> -      * If this is a realtime file, data may be on a different device.
> -      * to that pointed to from the buffer_head b_bdev currently.
> -      */
> -     bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
> -     return 0;
> -
> -out_unlock:
> -     xfs_iunlock(ip, lockmode);
> -     return error;
> -}
> -
>  STATIC sector_t
>  xfs_vm_bmap(
>       struct address_space    *mapping,
> @@ -1301,9 +958,7 @@ xfs_vm_readpage(
>       struct page             *page)
>  {
>       trace_xfs_vm_readpage(page->mapping->host, 1);
> -     if (i_blocksize(page->mapping->host) == PAGE_SIZE)
> -             return iomap_readpage(page, &xfs_iomap_ops);
> -     return mpage_readpage(page, xfs_get_blocks);
> +     return iomap_readpage(page, &xfs_iomap_ops);
>  }
>  
>  STATIC int
> @@ -1314,65 +969,26 @@ xfs_vm_readpages(
>       unsigned                nr_pages)
>  {
>       trace_xfs_vm_readpages(mapping->host, nr_pages);
> -     if (i_blocksize(mapping->host) == PAGE_SIZE)
> -             return iomap_readpages(mapping, pages, nr_pages, &xfs_iomap_ops);
> -     return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
> +     return iomap_readpages(mapping, pages, nr_pages, &xfs_iomap_ops);
>  }
>  
> -/*
> - * This is basically a copy of __set_page_dirty_buffers() with one
> - * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
> - * dirty, we'll never be able to clean them because we don't write buffers
> - * beyond EOF, and that means we can't invalidate pages that span EOF
> - * that have been marked dirty. Further, the dirty state can leak into
> - * the file interior if the file is extended, resulting in all sorts of
> - * bad things happening as the state does not match the underlying data.
> - *
> - * XXX: this really indicates that bufferheads in XFS need to die. Warts like
> - * this only exist because of bufferheads and how the generic code manages them.
> - */
> -STATIC int
> -xfs_vm_set_page_dirty(
> -     struct page             *page)
> +static int
> +xfs_vm_releasepage(
> +     struct page             *page,
> +     gfp_t                   gfp_mask)
>  {
> -     struct address_space    *mapping = page->mapping;
> -     struct inode            *inode = mapping->host;
> -     loff_t                  end_offset;
> -     loff_t                  offset;
> -     int                     newly_dirty;
> -
> -     if (unlikely(!mapping))
> -             return !TestSetPageDirty(page);
> -
> -     end_offset = i_size_read(inode);
> -     offset = page_offset(page);
> -
> -     spin_lock(&mapping->private_lock);
> -     if (page_has_buffers(page)) {
> -             struct buffer_head *head = page_buffers(page);
> -             struct buffer_head *bh = head;
> +     trace_xfs_releasepage(page->mapping->host, page, 0, 0);
> +     return iomap_releasepage(page, gfp_mask);
> +}
>  
> -             do {
> -                     if (offset < end_offset)
> -                             set_buffer_dirty(bh);
> -                     bh = bh->b_this_page;
> -                     offset += i_blocksize(inode);
> -             } while (bh != head);
> -     }
> -     /*
> -      * Lock out page->mem_cgroup migration to keep PageDirty
> -      * synchronized with per-memcg dirty page counters.
> -      */
> -     lock_page_memcg(page);
> -     newly_dirty = !TestSetPageDirty(page);
> -     spin_unlock(&mapping->private_lock);
> -
> -     if (newly_dirty)
> -             __set_page_dirty(page, mapping, 1);
> -     unlock_page_memcg(page);
> -     if (newly_dirty)
> -             __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
> -     return newly_dirty;
> +static void
> +xfs_vm_invalidatepage(
> +     struct page             *page,
> +     unsigned int            offset,
> +     unsigned int            length)
> +{
> +     trace_xfs_invalidatepage(page->mapping->host, page, offset, length);
> +     iomap_invalidatepage(page, offset, length);
>  }
>  
>  static int
> @@ -1390,13 +1006,13 @@ const struct address_space_operations xfs_address_space_operations = {
>       .readpages              = xfs_vm_readpages,
>       .writepage              = xfs_vm_writepage,
>       .writepages             = xfs_vm_writepages,
> -     .set_page_dirty         = xfs_vm_set_page_dirty,
> +     .set_page_dirty         = iomap_set_page_dirty,
>       .releasepage            = xfs_vm_releasepage,
>       .invalidatepage         = xfs_vm_invalidatepage,
>       .bmap                   = xfs_vm_bmap,
>       .direct_IO              = noop_direct_IO,
> -     .migratepage            = buffer_migrate_page,
> -     .is_partially_uptodate  = block_is_partially_uptodate,
> +     .migratepage            = iomap_migrate_page,
> +     .is_partially_uptodate  = iomap_is_partially_uptodate,
>       .error_remove_page      = generic_error_remove_page,
>       .swap_activate          = xfs_iomap_swapfile_activate,
>  };
> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> index f5f2b71c2fde..f3fa197bd272 100644
> --- a/fs/xfs/xfs_buf.h
> +++ b/fs/xfs/xfs_buf.h
> @@ -24,7 +24,6 @@
>  #include <linux/mm.h>
>  #include <linux/fs.h>
>  #include <linux/dax.h>
> -#include <linux/buffer_head.h>
>  #include <linux/uio.h>
>  #include <linux/list_lru.h>
>  
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 93c40da3378a..c646d84cd55e 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -1031,9 +1031,6 @@ xfs_file_iomap_begin(
>       if (XFS_FORCED_SHUTDOWN(mp))
>               return -EIO;
>  
> -     if (i_blocksize(inode) < PAGE_SIZE)
> -             iomap->flags |= IOMAP_F_BUFFER_HEAD;
> -
>       if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) &&
>                       !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
>               /* Reserve delalloc blocks for regular writeback. */
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 39e5ec3d407f..a9f23ec95216 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -1866,7 +1866,7 @@ MODULE_ALIAS_FS("xfs");
>  STATIC int __init
>  xfs_init_zones(void)
>  {
> -     xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE,
> +     xfs_ioend_bioset = bioset_create(4 * (PAGE_SIZE / SECTOR_SIZE),
>                       offsetof(struct xfs_ioend, io_inline_bio),
>                       BIOSET_NEED_BVECS);
>       if (!xfs_ioend_bioset)
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index ed8f774944ba..e4dc7c7f3da9 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -1165,33 +1165,23 @@ DECLARE_EVENT_CLASS(xfs_page_class,
>               __field(loff_t, size)
>               __field(unsigned long, offset)
>               __field(unsigned int, length)
> -             __field(int, delalloc)
> -             __field(int, unwritten)
>       ),
>       TP_fast_assign(
> -             int delalloc = -1, unwritten = -1;
> -
> -             if (page_has_buffers(page))
> -                     xfs_count_page_state(page, &delalloc, &unwritten);
>               __entry->dev = inode->i_sb->s_dev;
>               __entry->ino = XFS_I(inode)->i_ino;
>               __entry->pgoff = page_offset(page);
>               __entry->size = i_size_read(inode);
>               __entry->offset = off;
>               __entry->length = len;
> -             __entry->delalloc = delalloc;
> -             __entry->unwritten = unwritten;
>       ),
>       TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
> -               "length %x delalloc %d unwritten %d",
> +               "length %x",
>                 MAJOR(__entry->dev), MINOR(__entry->dev),
>                 __entry->ino,
>                 __entry->pgoff,
>                 __entry->size,
>                 __entry->offset,
> -               __entry->length,
> -               __entry->delalloc,
> -               __entry->unwritten)
> +               __entry->length)
>  )
>  
>  #define DEFINE_PAGE_EVENT(name)              \
> @@ -1275,9 +1265,6 @@ DEFINE_EVENT(xfs_imap_class, name,      \
>       TP_ARGS(ip, offset, count, type, irec))
>  DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
>  DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
> -DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
> -DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
> -DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
>  DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
>  DEFINE_IOMAP_EVENT(xfs_iomap_found);
>  
> @@ -1316,7 +1303,6 @@ DEFINE_EVENT(xfs_simple_io_class, name, \
>       TP_ARGS(ip, offset, count))
>  DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
>  DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
> -DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
>  DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize);
>  DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof);
>  DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write);
> -- 
> 2.17.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to