On Mon, Jun 01, 2015 at 08:52:36PM +0530, Chandan Rajendra wrote:
> For the subpagesize-blocksize scenario, a page can contain multiple
> blocks. In such cases, this patch handles reading data from files.
> 
> To track the status of individual blocks of a page, this patch makes use of a
> bitmap pointed to by page->private.

I've started going through the patchset; it's not easy, though.

Several comments follow.
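
Just to make sure I read the bitmap layout right: block i's state s is bit

	(i * BLK_NR_STATE) + s

of pg_private->bstate, so a 64k page with 2k blocks (the worst case) needs
32 * BLK_NR_STATE bits per page, which is what BLK_STATE_NR_LONGS sizes
further down.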

> 
> Signed-off-by: Chandan Rajendra <chan...@linux.vnet.ibm.com>
> ---
>  fs/btrfs/extent_io.c | 301 +++++++++++++++++++++++++++++++++------------------
>  fs/btrfs/extent_io.h |  28 ++++-
>  fs/btrfs/inode.c     |  13 +--
>  3 files changed, 224 insertions(+), 118 deletions(-)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 782f3bc..d37badb 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -1325,6 +1325,88 @@ int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
>                               cached_state, mask);
>  }
>  
> +static int modify_page_blks_state(struct page *page,
> +                             unsigned long blk_states,
> +                             u64 start, u64 end, int set)
> +{
> +     struct inode *inode = page->mapping->host;
> +     unsigned long *bitmap;
> +     unsigned long state;
> +     u64 nr_blks;
> +     u64 blk;
> +
> +     BUG_ON(!PagePrivate(page));
> +
> +     bitmap = ((struct btrfs_page_private *)page->private)->bstate;
> +
> +     blk = (start & (PAGE_CACHE_SIZE - 1)) >> inode->i_blkbits;
> +     nr_blks = (end - start + 1) >> inode->i_blkbits;
> +
> +     while (nr_blks--) {
> +             state = find_next_bit(&blk_states, BLK_NR_STATE, 0);

Looks like we don't need to redo the find_next_bit() scan for every
block -- blk_states is the same on every iteration of the outer loop.
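
The loops could be turned inside out, something like this (untested
sketch, reusing the names above):

	for (state = find_next_bit(&blk_states, BLK_NR_STATE, 0);
	     state < BLK_NR_STATE;
	     state = find_next_bit(&blk_states, BLK_NR_STATE, state + 1)) {
		u64 cur;

		for (cur = blk; cur < blk + nr_blks; cur++) {
			if (set)
				set_bit((cur * BLK_NR_STATE) + state, bitmap);
			else
				clear_bit((cur * BLK_NR_STATE) + state, bitmap);
		}
	}

That way find_next_bit() runs once per requested state instead of once
per state per block.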

> +
> +             while (state < BLK_NR_STATE) {
> +                     if (set)
> +                             set_bit((blk * BLK_NR_STATE) + state, bitmap);
> +                     else
> +                             clear_bit((blk * BLK_NR_STATE) + state, bitmap);
> +
> +                     state = find_next_bit(&blk_states, BLK_NR_STATE,
> +                                     state + 1);
> +             }
> +
> +             ++blk;
> +     }
> +
> +     return 0;
> +}
> +
> +int set_page_blks_state(struct page *page, unsigned long blk_states,
> +                     u64 start, u64 end)
> +{
> +     return modify_page_blks_state(page, blk_states, start, end, 1);
> +}
> +
> +int clear_page_blks_state(struct page *page, unsigned long blk_states,
> +                     u64 start, u64 end)
> +{
> +     return modify_page_blks_state(page, blk_states, start, end, 0);
> +}
> +
> +int test_page_blks_state(struct page *page, enum blk_state blk_state,
> +                     u64 start, u64 end, int check_all)
> +{
> +     struct inode *inode = page->mapping->host;
> +     unsigned long *bitmap;
> +     unsigned long blk;
> +     u64 nr_blks;
> +     int found = 0;
> +
> +     BUG_ON(!PagePrivate(page));
> +
> +     bitmap = ((struct btrfs_page_private *)page->private)->bstate;
> +
> +     blk = (start & (PAGE_CACHE_SIZE - 1)) >> inode->i_blkbits;
> +     nr_blks = (end - start + 1) >> inode->i_blkbits;
> +
> +     while (nr_blks--) {
> +             if (test_bit((blk * BLK_NR_STATE) + blk_state, bitmap)) {
> +                     if (!check_all)
> +                             return 1;
> +                     found = 1;
> +             } else if (check_all) {
> +                     return 0;
> +             }
> +
> +             ++blk;
> +     }
> +
> +     if (!check_all && !found)
> +             return 0;
> +
> +     return 1;
> +}
> +
>  /*
>   * either insert or lock state struct between start and end use mask to tell
>   * us if waiting is desired.
> @@ -1982,14 +2064,22 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
>   * helper function to set a given page up to date if all the
>   * extents in the tree for that page are up to date
>   */
> -static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
> +static void check_page_uptodate(struct page *page)
>  {
>       u64 start = page_offset(page);
>       u64 end = start + PAGE_CACHE_SIZE - 1;
> -     if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
> +     if (test_page_blks_state(page, BLK_STATE_UPTODATE, start, end, 1))
>               SetPageUptodate(page);
>  }
>  
> +static int page_read_complete(struct page *page)
> +{
> +     u64 start = page_offset(page);
> +     u64 end = start + PAGE_CACHE_SIZE - 1;
> +
> +     return !test_page_blks_state(page, BLK_STATE_IO, start, end, 0);
> +}
> +
>  int free_io_failure(struct inode *inode, struct io_failure_record *rec)
>  {
>       int ret;
> @@ -2311,7 +2401,9 @@ int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
>        *      a) deliver good data to the caller
>        *      b) correct the bad sectors on disk
>        */
> -     if (failed_bio->bi_vcnt > 1) {
> +     if ((failed_bio->bi_vcnt > 1)
> +             || (failed_bio->bi_io_vec->bv_len
> +                     > BTRFS_I(inode)->root->sectorsize)) {
>               /*
>                * to fulfill b), we need to know the exact failing sectors, as
>                * we don't want to rewrite any more than the failed ones. thus,
> @@ -2520,18 +2612,6 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
>       bio_put(bio);
>  }
>  
> -static void
> -endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
> -                           int uptodate)
> -{
> -     struct extent_state *cached = NULL;
> -     u64 end = start + len - 1;
> -
> -     if (uptodate && tree->track_uptodate)
> -             set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
> -     unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
> -}
> -
>  /*
>   * after a readpage IO is done, we need to:
>   * clear the uptodate bits on error
> @@ -2548,14 +2628,16 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
>       struct bio_vec *bvec;
>       int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
>       struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
> +     struct extent_state *cached = NULL;
> +     struct btrfs_page_private *pg_private;
>       struct extent_io_tree *tree;
> +     unsigned long flags;
>       u64 offset = 0;
>       u64 start;
>       u64 end;
> -     u64 len;
> -     u64 extent_start = 0;
> -     u64 extent_len = 0;
> +     int nr_sectors;
>       int mirror;
> +     int unlock;
>       int ret;
>       int i;
>  
> @@ -2565,54 +2647,31 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
>       bio_for_each_segment_all(bvec, bio, i) {
>               struct page *page = bvec->bv_page;
>               struct inode *inode = page->mapping->host;
> +             struct btrfs_root *root = BTRFS_I(inode)->root;
>  
>               pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
>                        "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err,
>                        io_bio->mirror_num);
>               tree = &BTRFS_I(inode)->io_tree;
>  
> -             /* We always issue full-page reads, but if some block
> -              * in a page fails to read, blk_update_request() will
> -              * advance bv_offset and adjust bv_len to compensate.
> -              * Print a warning for nonzero offsets, and an error
> -              * if they don't add up to a full page.  */
> -             if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
> -                     if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
> -                             btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
> -                                "partial page read in btrfs with offset %u and length %u",
> -                                     bvec->bv_offset, bvec->bv_len);
> -                     else
> -                             btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
> -                                "incomplete page read in btrfs with offset %u and "
> -                                "length %u",
> -                                     bvec->bv_offset, bvec->bv_len);
> -             }
> -
> -             start = page_offset(page);
> -             end = start + bvec->bv_offset + bvec->bv_len - 1;
> -             len = bvec->bv_len;
> -
> +             start = page_offset(page) + bvec->bv_offset;
> +             end = start + bvec->bv_len - 1;
> +             nr_sectors = bvec->bv_len >> inode->i_sb->s_blocksize_bits;
>               mirror = io_bio->mirror_num;
> -             if (likely(uptodate && tree->ops &&
> -                        tree->ops->readpage_end_io_hook)) {
> +
> +next_block:
> +             if (likely(uptodate)) {

Any reason for killing the (tree->ops && tree->ops->readpage_end_io_hook)
check? If tree->ops can be NULL here, this will oops.
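
If the hook isn't guaranteed to be set for data inodes, the guard should
probably stay, e.g. (untested):

		if (likely(uptodate && tree->ops &&
			   tree->ops->readpage_end_io_hook)) {
			ret = tree->ops->readpage_end_io_hook(io_bio, offset,
							page, start,
							start + root->sectorsize - 1,
							mirror);
			...
		}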

>                       ret = tree->ops->readpage_end_io_hook(io_bio, offset,
> -                                                           page, start, end,
> -                                                           mirror);
> +                                                     page, start,
> +                                                     start + root->sectorsize - 1,
> +                                                     mirror);
>                       if (ret)
>                               uptodate = 0;
>                       else
>                               clean_io_failure(inode, start, page, 0);
>               }
>  
> -             if (likely(uptodate))
> -                     goto readpage_ok;
> -
> -             if (tree->ops && tree->ops->readpage_io_failed_hook) {
> -                     ret = tree->ops->readpage_io_failed_hook(page, mirror);
> -                     if (!ret && !err &&
> -                         test_bit(BIO_UPTODATE, &bio->bi_flags))
> -                             uptodate = 1;
> -             } else {
> +             if (!uptodate) {
>                       /*
>                        * The generic bio_readpage_error handles errors the
>                        * following way: If possible, new read requests are
> @@ -2623,61 +2682,63 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
>                        * can't handle the error it will return -EIO and we
>                        * remain responsible for that page.
>                        */
> -                     ret = bio_readpage_error(bio, offset, page, start, end,
> -                                              mirror);
> +                     ret = bio_readpage_error(bio, offset, page,
> +                                             start, start + root->sectorsize - 1,
> +                                             mirror);
>                       if (ret == 0) {
> -                             uptodate =
> -                                     test_bit(BIO_UPTODATE, &bio->bi_flags);
> +                             uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
>                               if (err)
>                                       uptodate = 0;
> -                             offset += len;
> -                             continue;
> +                             offset += root->sectorsize;
> +                             if (--nr_sectors) {
> +                                     start += root->sectorsize;
> +                                     goto next_block;
> +                             } else {
> +                                     continue;
> +                             }
>                       }
>               }
> -readpage_ok:
> -             if (likely(uptodate)) {
> -                     loff_t i_size = i_size_read(inode);
> -                     pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
> -                     unsigned off;
> -
> -                     /* Zero out the end if this page straddles i_size */
> -                     off = i_size & (PAGE_CACHE_SIZE-1);
> -                     if (page->index == end_index && off)
> -                             zero_user_segment(page, off, PAGE_CACHE_SIZE);
> -                     SetPageUptodate(page);
> +
> +             if (uptodate) {
> +                     set_page_blks_state(page, 1 << BLK_STATE_UPTODATE, start,
> +                                     start + root->sectorsize - 1);
> +                     check_page_uptodate(page);
>               } else {
>                       ClearPageUptodate(page);
>                       SetPageError(page);
>               }
> -             unlock_page(page);
> -             offset += len;
> -
> -             if (unlikely(!uptodate)) {
> -                     if (extent_len) {
> -                             endio_readpage_release_extent(tree,
> -                                                           extent_start,
> -                                                           extent_len, 1);
> -                             extent_start = 0;
> -                             extent_len = 0;
> -                     }
> -                     endio_readpage_release_extent(tree, start,
> -                                                   end - start + 1, 0);
> -             } else if (!extent_len) {
> -                     extent_start = start;
> -                     extent_len = end + 1 - start;
> -             } else if (extent_start + extent_len == start) {
> -                     extent_len += end + 1 - start;
> -             } else {
> -                     endio_readpage_release_extent(tree, extent_start,
> -                                                   extent_len, uptodate);
> -                     extent_start = start;
> -                     extent_len = end + 1 - start;
> +
> +             offset += root->sectorsize;
> +
> +             if (--nr_sectors) {
> +                     clear_page_blks_state(page, 1 << BLK_STATE_IO,
> +                                     start, start + root->sectorsize - 1);

pg_private->io_lock is not acquired around this clear_page_blks_state()
call, but it is in the last-sector path below.

IIUC, this can be protected by EXTENT_LOCKED.
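
If the lock really is needed for the BLK_STATE_IO bits, this path would
have to take it as well, e.g. (untested, with pg_private looked up
earlier in the loop):

		spin_lock_irqsave(&pg_private->io_lock, flags);
		clear_page_blks_state(page, 1 << BLK_STATE_IO,
				start, start + root->sectorsize - 1);
		spin_unlock_irqrestore(&pg_private->io_lock, flags);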

Thanks,

-liubo

> +                     clear_extent_bit(tree, start, start + root->sectorsize - 1,
> +                                     EXTENT_LOCKED, 1, 0, &cached, GFP_ATOMIC);
> +                     start += root->sectorsize;
> +                     goto next_block;
>               }
> +
> +             WARN_ON(!PagePrivate(page));
> +
> +             pg_private = (struct btrfs_page_private *)page->private;
> +
> +             spin_lock_irqsave(&pg_private->io_lock, flags);
> +
> +             clear_page_blks_state(page, 1 << BLK_STATE_IO,
> +                             start, start + root->sectorsize - 1);
> +
> +             unlock = page_read_complete(page);
> +
> +             spin_unlock_irqrestore(&pg_private->io_lock, flags);
> +
> +             clear_extent_bit(tree, start, start + root->sectorsize - 1,
> +                             EXTENT_LOCKED, 1, 0, &cached, GFP_ATOMIC);
> +
> +             if (unlock)
> +                     unlock_page(page);
>       }
>  
> -     if (extent_len)
> -             endio_readpage_release_extent(tree, extent_start, extent_len,
> -                                           uptodate);
>       if (io_bio->end_io)
>               io_bio->end_io(io_bio, err);
>       bio_put(bio);
> @@ -2859,13 +2920,36 @@ static void attach_extent_buffer_page(struct extent_buffer *eb,
>       }
>  }
>  
> -void set_page_extent_mapped(struct page *page)
> +int set_page_extent_mapped(struct page *page)
>  {
> +     struct btrfs_page_private *pg_private;
> +
>       if (!PagePrivate(page)) {
> +             pg_private = kzalloc(sizeof(*pg_private), GFP_NOFS);
> +             if (!pg_private)
> +                     return -ENOMEM;
> +
> +             spin_lock_init(&pg_private->io_lock);
> +
>               SetPagePrivate(page);
>               page_cache_get(page);
> -             set_page_private(page, EXTENT_PAGE_PRIVATE);
> +
> +             set_page_private(page, (unsigned long)pg_private);
> +     }
> +
> +     return 0;
> +}
> +
> +int clear_page_extent_mapped(struct page *page)
> +{
> +     if (PagePrivate(page)) {
> +             kfree((struct btrfs_page_private *)(page->private));
> +             ClearPagePrivate(page);
> +             set_page_private(page, 0);
> +             page_cache_release(page);
>       }
> +
> +     return 0;
>  }
>  
>  static struct extent_map *
> @@ -2909,6 +2993,7 @@ static int __do_readpage(struct extent_io_tree *tree,
>                        unsigned long *bio_flags, int rw)
>  {
>       struct inode *inode = page->mapping->host;
> +     struct extent_state *cached = NULL;
>       u64 start = page_offset(page);
>       u64 page_end = start + PAGE_CACHE_SIZE - 1;
>       u64 end;
> @@ -2964,8 +3049,8 @@ static int __do_readpage(struct extent_io_tree *tree,
>                       memset(userpage + pg_offset, 0, iosize);
>                       flush_dcache_page(page);
>                       kunmap_atomic(userpage);
> -                     set_extent_uptodate(tree, cur, cur + iosize - 1,
> -                                         &cached, GFP_NOFS);
> +                     set_page_blks_state(page, 1 << BLK_STATE_UPTODATE, cur,
> +                                     cur + iosize - 1);
>                       if (!parent_locked)
>                               unlock_extent_cached(tree, cur,
>                                                    cur + iosize - 1,
> @@ -3017,8 +3102,8 @@ static int __do_readpage(struct extent_io_tree *tree,
>                       flush_dcache_page(page);
>                       kunmap_atomic(userpage);
>  
> -                     set_extent_uptodate(tree, cur, cur + iosize - 1,
> -                                         &cached, GFP_NOFS);
> +                     set_page_blks_state(page, 1 << BLK_STATE_UPTODATE, cur,
> +                                     cur + iosize - 1);
>                       unlock_extent_cached(tree, cur, cur + iosize - 1,
>                                            &cached, GFP_NOFS);
>                       cur = cur + iosize;
> @@ -3026,9 +3111,9 @@ static int __do_readpage(struct extent_io_tree *tree,
>                       continue;
>               }
>               /* the get_extent function already copied into the page */
> -             if (test_range_bit(tree, cur, cur_end,
> -                                EXTENT_UPTODATE, 1, NULL)) {
> -                     check_page_uptodate(tree, page);
> +             if (test_page_blks_state(page, BLK_STATE_UPTODATE, cur,
> +                                             cur_end, 1)) {
> +                     check_page_uptodate(page);
>                       if (!parent_locked)
>                               unlock_extent(tree, cur, cur + iosize - 1);
>                       cur = cur + iosize;
> @@ -3048,6 +3133,9 @@ static int __do_readpage(struct extent_io_tree *tree,
>               }
>  
>               pnr -= page->index;
> +
> +             set_page_blks_state(page, 1 << BLK_STATE_IO, cur,
> +                             cur + iosize - 1);
>               ret = submit_extent_page(rw, tree, page,
>                                        sector, disk_io_size, pg_offset,
>                                        bdev, bio, pnr,
> @@ -3059,8 +3147,11 @@ static int __do_readpage(struct extent_io_tree *tree,
>                       *bio_flags = this_bio_flag;
>               } else {
>                       SetPageError(page);
> +                     clear_page_blks_state(page, 1 << BLK_STATE_IO, cur,
> +                                     cur + iosize - 1);
>                       if (!parent_locked)
> -                             unlock_extent(tree, cur, cur + iosize - 1);
> +                             unlock_extent_cached(tree, cur, cur + iosize - 1,
> +                                             &cached, GFP_NOFS);
>               }
>               cur = cur + iosize;
>               pg_offset += iosize;
> diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
> index c668f36..541b40a 100644
> --- a/fs/btrfs/extent_io.h
> +++ b/fs/btrfs/extent_io.h
> @@ -51,11 +51,22 @@
>  #define PAGE_SET_PRIVATE2    (1 << 4)
>  #define PAGE_SET_ERROR               (1 << 5)
>  
> +enum blk_state {
> +     BLK_STATE_UPTODATE,
> +     BLK_STATE_DIRTY,
> +     BLK_STATE_IO,
> +     BLK_NR_STATE,
> +};
> +
>  /*
> - * page->private values.  Every page that is controlled by the extent
> - * map has page->private set to one.
> - */
> -#define EXTENT_PAGE_PRIVATE 1
> +  The maximum number of blocks per page (i.e. 32) occurs when using 2k
> +  as the block size and having 64k as the page size.
> +*/
> +#define BLK_STATE_NR_LONGS DIV_ROUND_UP(BLK_NR_STATE * 32, BITS_PER_LONG)
> +struct btrfs_page_private {
> +     spinlock_t io_lock;
> +     unsigned long bstate[BLK_STATE_NR_LONGS];
> +};
>  
>  struct extent_state;
>  struct btrfs_root;
> @@ -259,7 +270,14 @@ int extent_readpages(struct extent_io_tree *tree,
>  int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>               __u64 start, __u64 len, get_extent_t *get_extent);
>  int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
> -void set_page_extent_mapped(struct page *page);
> +int set_page_extent_mapped(struct page *page);
> +int clear_page_extent_mapped(struct page *page);
> +int set_page_blks_state(struct page *page, unsigned long blk_states,
> +                     u64 start, u64 end);
> +int clear_page_blks_state(struct page *page, unsigned long blk_states,
> +                     u64 start, u64 end);
> +int test_page_blks_state(struct page *page, enum blk_state blk_state,
> +                     u64 start, u64 end, int check_all);
>  
>  struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>                                         u64 start);
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 0020b56..8262f83 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -6622,7 +6622,6 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
>       struct btrfs_key found_key;
>       struct extent_map *em = NULL;
>       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
> -     struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
>       struct btrfs_trans_handle *trans = NULL;
>       const bool new_inline = !page || create;
>  
> @@ -6800,8 +6799,8 @@ next:
>                       kunmap(page);
>                       btrfs_mark_buffer_dirty(leaf);
>               }
> -             set_extent_uptodate(io_tree, em->start,
> -                                 extent_map_end(em) - 1, NULL, GFP_NOFS);
> +             set_page_blks_state(page, 1 << BLK_STATE_UPTODATE, em->start,
> +                             extent_map_end(em) - 1);
>               goto insert;
>       }
>  not_found:
> @@ -8392,11 +8391,9 @@ static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
>       tree = &BTRFS_I(page->mapping->host)->io_tree;
>       map = &BTRFS_I(page->mapping->host)->extent_tree;
>       ret = try_release_extent_mapping(map, tree, page, gfp_flags);
> -     if (ret == 1) {
> -             ClearPagePrivate(page);
> -             set_page_private(page, 0);
> -             page_cache_release(page);
> -     }
> +     if (ret == 1)
> +             clear_page_extent_mapped(page);
> +
>       return ret;
>  }
>  
> -- 
> 2.1.0
> 