On 11/03/2017 11:20 AM, fdman...@kernel.org wrote:
> From: Filipe Manana <fdman...@suse.com>
> 
> This implements support for the zero range operation of fallocate. For
> now, at least, it's kept as simple as possible while reusing most of the
> existing fallocate and hole punching infrastructure.
> 
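For context, this is the path exercised from userspace by fallocate(2) with
the FALLOC_FL_ZERO_RANGE mode. A minimal sketch, with a made-up path and
offsets, purely for illustration:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/btrfs/testfile", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Zero 128K starting at offset 64K; extends i_size if needed. */
	if (fallocate(fd, FALLOC_FL_ZERO_RANGE, 64 * 1024, 128 * 1024) < 0)
		perror("fallocate");
	close(fd);
	return 0;
}
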
> Signed-off-by: Filipe Manana <fdman...@suse.com>
> ---
> 
> V2: Removed double inode unlock on error path from failure to lock range.
> V3: Factored common code to update isize and inode item into a helper
>     function, plus some minor cleanup.
> 
>  fs/btrfs/file.c | 351 +++++++++++++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 285 insertions(+), 66 deletions(-)
> 
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index aafcc785f840..2cc1aed1c564 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -2448,7 +2448,48 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
>       return ret;
>  }
>  
> -static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
> +static int btrfs_punch_hole_lock_range(struct inode *inode,
> +                                    const u64 lockstart,
> +                                    const u64 lockend,
> +                                    struct extent_state **cached_state)
> +{
> +     while (1) {
> +             struct btrfs_ordered_extent *ordered;
> +             int ret;
> +
> +             truncate_pagecache_range(inode, lockstart, lockend);
> +
> +             lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
> +                              cached_state);
> +             ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
> +
> +             /*
> +              * We need to make sure we have no ordered extents in this range
> +              * and nobody raced in and read a page in this range, if we did
> +              * we need to try again.
> +              */
> +             if ((!ordered ||
> +                 (ordered->file_offset + ordered->len <= lockstart ||
> +                  ordered->file_offset > lockend)) &&
> +                  !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
> +                     if (ordered)
> +                             btrfs_put_ordered_extent(ordered);
> +                     break;
> +             }
> +             if (ordered)
> +                     btrfs_put_ordered_extent(ordered);
> +             unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
> +                                  lockend, cached_state, GFP_NOFS);
> +             ret = btrfs_wait_ordered_range(inode, lockstart,
> +                                            lockend - lockstart + 1);
> +             if (ret)
> +                     return ret;
> +     }
> +     return 0;
> +}
> +
> +static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len,
> +                         bool lock_inode)

The lock_inode parameter may no longer be needed, since it looks like it's
always true in this version of the patch.
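
If so, it could just be dropped again, something like (sketch only, not
compile-tested):

-static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len,
-			    bool lock_inode)
+static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)

with the inode_lock()/inode_unlock() calls made unconditional again,
assuming no later patch in the series starts passing false here.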

Ed

>  {
>       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
>       struct btrfs_root *root = BTRFS_I(inode)->root;
> @@ -2477,7 +2518,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>       if (ret)
>               return ret;
>  
> -     inode_lock(inode);
> +     if (lock_inode)
> +             inode_lock(inode);
>       ino_size = round_up(inode->i_size, fs_info->sectorsize);
>       ret = find_first_non_hole(inode, &offset, &len);
>       if (ret < 0)
> @@ -2516,7 +2558,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>               truncated_block = true;
>               ret = btrfs_truncate_block(inode, offset, 0, 0);
>               if (ret) {
> -                     inode_unlock(inode);
> +                     if (lock_inode)
> +                             inode_unlock(inode);
>                       return ret;
>               }
>       }
> @@ -2564,38 +2607,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>               goto out_only_mutex;
>       }
>  
> -     while (1) {
> -             struct btrfs_ordered_extent *ordered;
> -
> -             truncate_pagecache_range(inode, lockstart, lockend);
> -
> -             lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
> -                              &cached_state);
> -             ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
> -
> -             /*
> -              * We need to make sure we have no ordered extents in this range
> -              * and nobody raced in and read a page in this range, if we did
> -              * we need to try again.
> -              */
> -             if ((!ordered ||
> -                 (ordered->file_offset + ordered->len <= lockstart ||
> -                  ordered->file_offset > lockend)) &&
> -                  !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
> -                     if (ordered)
> -                             btrfs_put_ordered_extent(ordered);
> -                     break;
> -             }
> -             if (ordered)
> -                     btrfs_put_ordered_extent(ordered);
> -             unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
> -                                  lockend, &cached_state, GFP_NOFS);
> -             ret = btrfs_wait_ordered_range(inode, lockstart,
> -                                            lockend - lockstart + 1);
> -             if (ret) {
> +     ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
> +                                       &cached_state);
> +     if (ret) {
> +             if (lock_inode)
>                       inode_unlock(inode);
> -                     return ret;
> -             }
> +             return ret;
>       }
>  
>       path = btrfs_alloc_path();
> @@ -2758,7 +2775,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>                       ret = btrfs_end_transaction(trans);
>               }
>       }
> -     inode_unlock(inode);
> +     if (lock_inode)
> +             inode_unlock(inode);
>       if (ret && !err)
>               err = ret;
>       return err;
> @@ -2804,6 +2822,217 @@ static int add_falloc_range(struct list_head *head, u64 start, u64 len)
>       return 0;
>  }
>  
> +static int btrfs_fallocate_update_isize(struct inode *inode,
> +                                     const u64 end,
> +                                     const int mode)
> +{
> +     struct btrfs_trans_handle *trans;
> +     struct btrfs_root *root = BTRFS_I(inode)->root;
> +     int ret;
> +     int ret2;
> +
> +     if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
> +             return 0;
> +
> +     trans = btrfs_start_transaction(root, 1);
> +     if (IS_ERR(trans))
> +             return PTR_ERR(trans);
> +
> +     inode->i_ctime = current_time(inode);
> +     i_size_write(inode, end);
> +     btrfs_ordered_update_i_size(inode, end, NULL);
> +     ret = btrfs_update_inode(trans, root, inode);
> +     ret2 = btrfs_end_transaction(trans);
> +
> +     return ret ? ret : ret2;
> +}
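
The helper itself looks fine. For reference, the user-visible semantics it
implements (i_size only advances when FALLOC_FL_KEEP_SIZE is not set) can be
checked from userspace with a quick sketch like this (hypothetical path,
error handling omitted for brevity):

#define _GNU_SOURCE
#include <assert.h>
#include <fcntl.h>
#include <linux/falloc.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	struct stat st;
	int fd = open("/mnt/btrfs/f", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	/* Zeroing past EOF with KEEP_SIZE: i_size must stay 0. */
	fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE, 0, 1 << 20);
	fstat(fd, &st);
	assert(st.st_size == 0);

	/* Without KEEP_SIZE, i_size is extended to cover the range. */
	fallocate(fd, FALLOC_FL_ZERO_RANGE, 0, 1 << 20);
	fstat(fd, &st);
	assert(st.st_size == 1 << 20);

	close(fd);
	return 0;
}
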
> +
> +static int btrfs_zero_range_check_range_boundary(struct inode *inode,
> +                                              u64 offset)
> +{
> +     const u64 sectorsize = btrfs_inode_sectorsize(inode);
> +     struct extent_map *em = NULL;
> +     int ret = 0;
> +
> +     offset = round_down(offset, sectorsize);
> +     em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
> +     if (IS_ERR(em))
> +             return PTR_ERR(em);
> +
> +     if (em->block_start == EXTENT_MAP_HOLE)
> +             ret = 1;
> +
> +     free_extent_map(em);
> +     return ret;
> +}
> +
> +static int btrfs_zero_range(struct inode *inode,
> +                         loff_t offset,
> +                         loff_t len,
> +                         const int mode)
> +{
> +     struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
> +     struct extent_map *em;
> +     struct extent_changeset *data_reserved = NULL;
> +     int ret;
> +     u64 alloc_hint = 0;
> +     const u64 sectorsize = btrfs_inode_sectorsize(inode);
> +     u64 alloc_start = round_down(offset, sectorsize);
> +     u64 alloc_end = round_up(offset + len, sectorsize);
> +     u64 bytes_to_reserve = 0;
> +     bool space_reserved = false;
> +
> +     inode_dio_wait(inode);
> +
> +     em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
> +                           alloc_start, alloc_end - alloc_start, 0);
> +     if (IS_ERR(em)) {
> +             ret = PTR_ERR(em);
> +             goto out;
> +     }
> +
> +     /*
> +      * Avoid hole punching and extent allocation for some cases. More cases
> +      * could be considered, but these are unlikely to be common and we keep
> +      * things as simple as possible for now. Also, intentionally, if the target
> +      * range contains one or more prealloc extents together with regular
> +      * extents and holes, we drop all the existing extents and allocate a
> +      * new prealloc extent, so that we get a larger contiguous disk extent.
> +      */
> +     if (em->start <= alloc_start &&
> +         test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
> +             const u64 em_end = em->start + em->len;
> +
> +             if (em_end >= offset + len) {
> +                     /*
> +                      * The whole range is already a prealloc extent,
> +                      * do nothing except updating the inode's i_size if
> +                      * needed.
> +                      */
> +                     free_extent_map(em);
> +                     ret = btrfs_fallocate_update_isize(inode, offset + len,
> +                                                        mode);
> +                     goto out;
> +             }
> +             /*
> +              * Part of the range is already a prealloc extent, so operate
> +              * only on the remaining part of the range.
> +              */
> +             alloc_start = em_end;
> +             ASSERT(IS_ALIGNED(alloc_start, sectorsize));
> +             len = offset + len - alloc_start;
> +             offset = alloc_start;
> +             alloc_hint = em->block_start + em->len;
> +     }
> +     free_extent_map(em);
> +
> +     if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
> +         BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
> +             em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
> +                                   alloc_start, sectorsize, 0);
> +             if (IS_ERR(em)) {
> +                     ret = PTR_ERR(em);
> +                     goto out;
> +             }
> +
> +             if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
> +                     free_extent_map(em);
> +                     ret = btrfs_fallocate_update_isize(inode, offset + len,
> +                                                        mode);
> +                     goto out;
> +             }
> +             if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
> +                     free_extent_map(em);
> +                     ret = btrfs_truncate_block(inode, offset, len, 0);
> +                     if (!ret)
> +                             ret = btrfs_fallocate_update_isize(inode,
> +                                                                offset + len,
> +                                                                mode);
> +                     return ret;
> +             }
> +             free_extent_map(em);
> +             alloc_start = round_down(offset, sectorsize);
> +             alloc_end = alloc_start + sectorsize;
> +             goto reserve_space;
> +     }
> +
> +     alloc_start = round_up(offset, sectorsize);
> +     alloc_end = round_down(offset + len, sectorsize);
> +
> +     /*
> +      * For unaligned ranges, check the pages at the boundaries, they might
> +      * map to an extent, in which case we need to partially zero them, or
> +      * they might map to a hole, in which case we need our allocation range
> +      * to cover them.
> +      */
> +     if (!IS_ALIGNED(offset, sectorsize)) {
> +             ret = btrfs_zero_range_check_range_boundary(inode, offset);
> +             if (ret < 0)
> +                     goto out;
> +             if (ret) {
> +                     alloc_start = round_down(offset, sectorsize);
> +                     ret = 0;
> +             } else {
> +                     ret = btrfs_truncate_block(inode, offset, 0, 0);
> +                     if (ret)
> +                             goto out;
> +             }
> +     }
> +
> +     if (!IS_ALIGNED(offset + len, sectorsize)) {
> +             ret = btrfs_zero_range_check_range_boundary(inode,
> +                                                         offset + len);
> +             if (ret < 0)
> +                     goto out;
> +             if (ret) {
> +                     alloc_end = round_up(offset + len, sectorsize);
> +                     ret = 0;
> +             } else {
> +                     ret = btrfs_truncate_block(inode, offset + len, 0, 1);
> +                     if (ret)
> +                             goto out;
> +             }
> +     }
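
The boundary handling took me a moment to follow, so for other reviewers,
here is a tiny standalone sketch of the rounding involved, with made-up
numbers and a 4K sector size:

#include <stdio.h>

#define SECTORSIZE 4096ULL
#define RND_DOWN(x) ((x) & ~(SECTORSIZE - 1))
#define RND_UP(x)   (((x) + SECTORSIZE - 1) & ~(SECTORSIZE - 1))

int main(void)
{
	unsigned long long offset = 6000, len = 20000;

	/* Inner, fully aligned region that is always allocated. */
	printf("inner: [%llu, %llu)\n",
	       RND_UP(offset), RND_DOWN(offset + len));  /* [8192, 24576) */
	/* If a boundary block is a hole, the range grows to cover it... */
	printf("grown: [%llu, %llu)\n",
	       RND_DOWN(offset), RND_UP(offset + len));  /* [4096, 28672) */
	/* ...otherwise the partial block is zeroed via btrfs_truncate_block(). */
	return 0;
}
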
> +
> +reserve_space:
> +     if (alloc_start < alloc_end) {
> +             struct extent_state *cached_state = NULL;
> +             const u64 lockstart = alloc_start;
> +             const u64 lockend = alloc_end - 1;
> +
> +             bytes_to_reserve = alloc_end - alloc_start;
> +             ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
> +                                                   bytes_to_reserve);
> +             if (ret < 0)
> +                     goto out;
> +             space_reserved = true;
> +             ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
> +                                             alloc_start, bytes_to_reserve);
> +             if (ret)
> +                     goto out;
> +             ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
> +                                               &cached_state);
> +             if (ret)
> +                     goto out;
> +             ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
> +                                             alloc_end - alloc_start,
> +                                             i_blocksize(inode),
> +                                             offset + len, &alloc_hint);
> +             unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
> +                                  lockend, &cached_state, GFP_KERNEL);
> +             /* btrfs_prealloc_file_range releases reserved space on error */
> +             if (ret)
> +                     space_reserved = false;
> +     }
> + out:
> +     if (ret && space_reserved)
> +             btrfs_free_reserved_data_space(inode, data_reserved,
> +                                            alloc_start, bytes_to_reserve);
> +     extent_changeset_free(data_reserved);
> +
> +     return ret;
> +}
> +
>  static long btrfs_fallocate(struct file *file, int mode,
>                           loff_t offset, loff_t len)
>  {
> @@ -2829,21 +3058,24 @@ static long btrfs_fallocate(struct file *file, int mode,
>       cur_offset = alloc_start;
>  
>       /* Make sure we aren't being given some crap mode */
> -     if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
> +     if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
> +                  FALLOC_FL_ZERO_RANGE))
>               return -EOPNOTSUPP;
>  
>       if (mode & FALLOC_FL_PUNCH_HOLE)
> -             return btrfs_punch_hole(inode, offset, len);
> +             return btrfs_punch_hole(inode, offset, len, true);
>  
>       /*
>        * Only trigger disk allocation, don't trigger qgroup reserve
>        *
>        * For qgroup space, it will be checked later.
>        */
> -     ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
> -                     alloc_end - alloc_start);
> -     if (ret < 0)
> -             return ret;
> +     if (!(mode & FALLOC_FL_ZERO_RANGE)) {
> +             ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
> +                                                   alloc_end - alloc_start);
> +             if (ret < 0)
> +                     return ret;
> +     }
>  
>       inode_lock(inode);
>  
> @@ -2885,6 +3117,12 @@ static long btrfs_fallocate(struct file *file, int mode,
>       if (ret)
>               goto out;
>  
> +     if (mode & FALLOC_FL_ZERO_RANGE) {
> +             ret = btrfs_zero_range(inode, offset, len, mode);
> +             inode_unlock(inode);
> +             return ret;
> +     }
> +
>       locked_end = alloc_end - 1;
>       while (1) {
>               struct btrfs_ordered_extent *ordered;
> @@ -2980,37 +3218,18 @@ static long btrfs_fallocate(struct file *file, int mode,
>       if (ret < 0)
>               goto out_unlock;
>  
> -     if (actual_end > inode->i_size &&
> -         !(mode & FALLOC_FL_KEEP_SIZE)) {
> -             struct btrfs_trans_handle *trans;
> -             struct btrfs_root *root = BTRFS_I(inode)->root;
> -
> -             /*
> -              * We didn't need to allocate any more space, but we
> -              * still extended the size of the file so we need to
> -              * update i_size and the inode item.
> -              */
> -             trans = btrfs_start_transaction(root, 1);
> -             if (IS_ERR(trans)) {
> -                     ret = PTR_ERR(trans);
> -             } else {
> -                     inode->i_ctime = current_time(inode);
> -                     i_size_write(inode, actual_end);
> -                     btrfs_ordered_update_i_size(inode, actual_end, NULL);
> -                     ret = btrfs_update_inode(trans, root, inode);
> -                     if (ret)
> -                             btrfs_end_transaction(trans);
> -                     else
> -                             ret = btrfs_end_transaction(trans);
> -             }
> -     }
> +     /*
> +      * We didn't need to allocate any more space, but we still extended the
> +      * size of the file so we need to update i_size and the inode item.
> +      */
> +     ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
>  out_unlock:
>       unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
>                            &cached_state, GFP_KERNEL);
>  out:
>       inode_unlock(inode);
>       /* Let go of our reservation. */
> -     if (ret != 0)
> +     if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
>               btrfs_free_reserved_data_space(inode, data_reserved,
>                               alloc_start, alloc_end - cur_offset);
>       extent_changeset_free(data_reserved);
> 