On Fri, Nov 3, 2017 at 10:29 AM, Filipe Manana <fdman...@kernel.org> wrote:
> On Fri, Nov 3, 2017 at 9:30 AM, Nikolay Borisov <nbori...@suse.com> wrote:
>>
>>
>> On 25.10.2017 17:59, fdman...@kernel.org wrote:
>>> From: Filipe Manana <fdman...@suse.com>
>>>
>>> This implements support for the zero range operation of fallocate. For now
>>> at least it's as simple as possible while reusing most of the existing
>>> fallocate and hole punching infrastructure.
>>>
>>> Signed-off-by: Filipe Manana <fdman...@suse.com>
>>> ---
>>>
>>> V2: Removed double inode unlock on error path from failure to lock range.
>>>
>>>  fs/btrfs/file.c | 332 +++++++++++++++++++++++++++++++++++++++++++++++++-------
>>>  1 file changed, 290 insertions(+), 42 deletions(-)
>>>
>>> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
>>> index aafcc785f840..e0d15c0d1641 100644
>>> --- a/fs/btrfs/file.c
>>> +++ b/fs/btrfs/file.c
>>> @@ -2448,7 +2448,48 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
>>>          return ret;
>>>  }
>>>
>>> -static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>>> +static int btrfs_punch_hole_lock_range(struct inode *inode,
>>> +                                       const u64 lockstart,
>>> +                                       const u64 lockend,
>>> +                                       struct extent_state **cached_state)
>>> +{
>>> +        while (1) {
>>> +                struct btrfs_ordered_extent *ordered;
>>> +                int ret;
>>> +
>>> +                truncate_pagecache_range(inode, lockstart, lockend);
>>> +
>>> +                lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
>>> +                                 cached_state);
>>> +                ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
>>> +
>>> +                /*
>>> +                 * We need to make sure we have no ordered extents in this range
>>> +                 * and nobody raced in and read a page in this range, if we did
>>> +                 * we need to try again.
>>> +                 */
>>> +                if ((!ordered ||
>>> +                    (ordered->file_offset + ordered->len <= lockstart ||
>>> +                     ordered->file_offset > lockend)) &&
>>> +                    !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
>>> +                        if (ordered)
>>> +                                btrfs_put_ordered_extent(ordered);
>>> +                        break;
>>> +                }
>>> +                if (ordered)
>>> +                        btrfs_put_ordered_extent(ordered);
>>> +                unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
>>> +                                     lockend, cached_state, GFP_NOFS);
>>> +                ret = btrfs_wait_ordered_range(inode, lockstart,
>>> +                                               lockend - lockstart + 1);
>>> +                if (ret)
>>> +                        return ret;
>>> +        }
>>> +        return 0;
>>> +}
>>> +
>>> +static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len,
>>> +                            bool lock_inode)
>>>  {
>>>          struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
>>>          struct btrfs_root *root = BTRFS_I(inode)->root;
>>> @@ -2477,7 +2518,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>>>          if (ret)
>>>                  return ret;
>>>
>>> -        inode_lock(inode);
>>> +        if (lock_inode)
>>> +                inode_lock(inode);
>>>          ino_size = round_up(inode->i_size, fs_info->sectorsize);
>>>          ret = find_first_non_hole(inode, &offset, &len);
>>>          if (ret < 0)
>>> @@ -2516,7 +2558,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>>>                  truncated_block = true;
>>>                  ret = btrfs_truncate_block(inode, offset, 0, 0);
>>>                  if (ret) {
>>> -                        inode_unlock(inode);
>>> +                        if (lock_inode)
>>> +                                inode_unlock(inode);
>>>                          return ret;
>>>                  }
>>>          }
>>> @@ -2564,38 +2607,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>>>                  goto out_only_mutex;
>>>          }
>>>
>>> -        while (1) {
>>> -                struct btrfs_ordered_extent *ordered;
>>> -
>>> -                truncate_pagecache_range(inode, lockstart, lockend);
>>> -
>>> -                lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
>>> -                                 &cached_state);
>>> -                ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
>>> -
>>> -                /*
>>> -                 * We need to make sure we have no ordered extents in this range
>>> -                 * and nobody raced in and read a page in this range, if we did
>>> -                 * we need to try again.
>>> -                 */
>>> -                if ((!ordered ||
>>> -                    (ordered->file_offset + ordered->len <= lockstart ||
>>> -                     ordered->file_offset > lockend)) &&
>>> -                    !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
>>> -                        if (ordered)
>>> -                                btrfs_put_ordered_extent(ordered);
>>> -                        break;
>>> -                }
>>> -                if (ordered)
>>> -                        btrfs_put_ordered_extent(ordered);
>>> -                unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
>>> -                                     lockend, &cached_state, GFP_NOFS);
>>> -                ret = btrfs_wait_ordered_range(inode, lockstart,
>>> -                                               lockend - lockstart + 1);
>>> -                if (ret) {
>>> +        ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
>>> +                                          &cached_state);
>>> +        if (ret) {
>>> +                if (lock_inode)
>>>                          inode_unlock(inode);
>>> -                        return ret;
>>> -                }
>>> +                return ret;
>>>          }
>>>
>>>          path = btrfs_alloc_path();
>>> @@ -2758,7 +2775,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>>>                          ret = btrfs_end_transaction(trans);
>>>                  }
>>>          }
>>> -        inode_unlock(inode);
>>> +        if (lock_inode)
>>> +                inode_unlock(inode);
>>>          if (ret && !err)
>>>                  err = ret;
>>>          return err;
>>> @@ -2804,6 +2822,227 @@ static int add_falloc_range(struct list_head *head, u64 start, u64 len)
>>>          return 0;
>>>  }
>>>
>>> +static int btrfs_zero_range_update_isize(struct inode *inode,
>>> +                                         const loff_t offset,
>>> +                                         const loff_t len,
>>> +                                         const int mode)
>>> +{
>>> +        struct btrfs_root *root = BTRFS_I(inode)->root;
>>> +        struct btrfs_trans_handle *trans;
>>> +        const u64 end = offset + len;
>>> +        int ret;
>>> +
>>> +        if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
>>> +                return 0;
>>> +
>>> +        i_size_write(inode, end);
>>> +        btrfs_ordered_update_i_size(inode, end, NULL);
>>> +        trans = btrfs_start_transaction(root, 1);
>>> +        if (IS_ERR(trans)) {
>>> +                ret = PTR_ERR(trans);
>>> +        } else {
>>> +                int err;
>>> +
>>> +                ret = btrfs_update_inode(trans, root, inode);
>>> +                err = btrfs_end_transaction(trans);
>>> +                ret = ret ? ret : err;
>>> +        }
>>> +        return ret;
>>> +}
>>> +
>>> +static int btrfs_zero_range_check_range_boundary(struct inode *inode,
>>> +                                                 u64 offset)
>>> +{
>>> +        const u64 sectorsize = btrfs_inode_sectorsize(inode);
>>> +        struct extent_map *em = NULL;
>>> +        int ret = 0;
>>> +
>>> +        offset = round_down(offset, sectorsize);
>>> +        em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
>>> +        if (IS_ERR(em))
>>> +                return PTR_ERR(em);
>>> +
>>> +        if (em->block_start == EXTENT_MAP_HOLE)
>>> +                ret = 1;
>>> +
>>> +        free_extent_map(em);
>>> +        return ret;
>>> +}
>>> +
>>> +static int btrfs_zero_range(struct inode *inode,
>>> +                            loff_t offset,
>>> +                            loff_t len,
>>> +                            const int mode)
>>> +{
>>> +        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
>>> +        struct extent_map *em;
>>> +        struct extent_changeset *data_reserved = NULL;
>>> +        int ret;
>>> +        u64 alloc_hint = 0;
>>> +        const u64 sectorsize = btrfs_inode_sectorsize(inode);
>>> +        u64 alloc_start = round_down(offset, sectorsize);
>>> +        u64 alloc_end = round_up(offset + len, sectorsize);
>>> +        u64 bytes_to_reserve = 0;
>>> +        bool space_reserved = false;
>>> +        bool punch_hole = false;
>>> +
>>> +        inode_dio_wait(inode);
>>> +
>>> +        em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
>>> +                              alloc_start, alloc_end - alloc_start, 0);
>>> +        if (IS_ERR(em)) {
>>> +                ret = PTR_ERR(em);
>>> +                goto out;
>>> +        }
>>> +
>>> +        /*
>>> +         * Avoid hole punching and extent allocation for some cases. More cases
>>> +         * could be considered, but these are unlikely common and we keep things
>>> +         * as simple as possible for now. Also, intentionally, if the target
>>> +         * range contains one or more prealloc extents together with regular
>>> +         * extents and holes, we drop all the existing extents and allocate a
>>> +         * new prealloc extent, so that we get a larger contiguous disk extent.
>>> +         */
>>> +        if (em->start <= alloc_start &&
>>> +            test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
>>> +                const u64 em_end = em->start + em->len;
>>> +
>>> +                if (em_end >= offset + len) {
>>> +                        /*
>>> +                         * The whole range is already a prealloc extent,
>>> +                         * do nothing except updating the inode's i_size if
>>> +                         * needed.
>>> +                         */
>>> +                        free_extent_map(em);
>>> +                        ret = btrfs_zero_range_update_isize(inode, offset,
>>> +                                                            len, mode);
>>> +                        goto out;
>>> +                }
>>> +                /*
>>> +                 * Part of the range is already a prealloc extent, so operate
>>> +                 * only on the remaining part of the range.
>>> +                 */
>>> +                alloc_start = em_end;
>>> +                ASSERT(IS_ALIGNED(alloc_start, sectorsize));
>>> +                len = offset + len - alloc_start;
>>> +                offset = alloc_start;
>>> +                alloc_hint = em->block_start + em->len;
>>> +        }
>>> +        free_extent_map(em);
>>> +
>>> +        if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
>>> +            BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
>>> +                em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
>>> +                                      alloc_start, sectorsize, 0);
>>> +                if (IS_ERR(em)) {
>>> +                        ret = PTR_ERR(em);
>>> +                        goto out;
>>> +                }
>>> +
>>> +                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
>>> +                        free_extent_map(em);
>>> +                        ret = btrfs_zero_range_update_isize(inode, offset,
>>> +                                                            len, mode);
>>> +                        goto out;
>>> +                }
>>> +                if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE)
>>> +                        punch_hole = true;
>>> +                free_extent_map(em);
>>> +                if (punch_hole)
>>> +                        goto punch_hole;
>>
>> This here is correct for a very non-obvious reason.
>> If punch_hole is true, this means we are only ever going to execute the
>> partial truncate code in btrfs_punch_hole and not punch a hole at all;
>> this is a very convoluted way of invoking truncation!
>
> Well, it might be non-obvious for people not experienced with fs
> development, but I don't think it's as terrible as you picture it.
> Every fs developer knows that punching into a range smaller than the
> sector size means zeroing part of a block.
> It's not this sort of thing that makes people unable to contribute or
> understand things.
>
>>
>> Instead, I propose something similar to the attached diff, which just
>> calls btrfs_truncate_block directly. This allows removing one of the
>> labels and simplifies the code flow. If this check triggers:
>> (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) then it's
>> guaranteed that we are within the inode boundaries, so there is no need
>> to update the inode size,
>
> There isn't???
> There is, of course: if the file size is not sector size aligned and
> the target range affects the last block and goes beyond the current
> i_size.
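
To make that case concrete, here is a minimal, purely illustrative
userspace sketch (not part of the patch): the file path, the 4096-byte
sector size and all offsets are invented, while fallocate(2) and
FALLOC_FL_ZERO_RANGE are the real interface this patch implements. The
zero range fits entirely inside the file's last block, so only part of
a block gets zeroed, yet the range ends past i_size, which is exactly
why the i_size update cannot be skipped:

/*
 * Hypothetical illustration: a zero range that lies inside the file's
 * last block but ends beyond i_size. Assumes a 4096-byte sector size;
 * the path and offsets are made up.
 */
#define _GNU_SOURCE
#include <fcntl.h>      /* open(), fallocate(), FALLOC_FL_ZERO_RANGE */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/mnt/btrfs/foobar", O_RDWR | O_CREAT, 0644);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* i_size = 6000, not aligned to the 4096-byte sector size. */
        if (ftruncate(fd, 6000) == -1) {
                perror("ftruncate");
                return 1;
        }
        /*
         * The range [5800, 6300) lies entirely in the file's second
         * block and is smaller than the sector size, but it ends 300
         * bytes past i_size. Without FALLOC_FL_KEEP_SIZE the file must
         * grow to 6300, so zeroing part of the block is not enough on
         * its own.
         */
        if (fallocate(fd, FALLOC_FL_ZERO_RANGE, 5800, 500) == -1) {
                perror("fallocate");
                return 1;
        }
        close(fd);
        return 0;
}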
Plus, your diff introduces another bug when alloc_start == alloc_end,
in which case we shouldn't do anything other than the previous
truncation of the boundary pages; as you left it, it results in locking
a range with an end smaller than its start, plus some other
unpredictable behaviour (see the arithmetic sketch at the end of this
message).

>
>> hence I've omitted it, though I'm not 100%
>> sure; perhaps we want to update the inode's ctime?
>>
>> This passes generic/008 and generic/009
>
> Well, keep in mind those tests don't cover all possible scenarios.
>
> I've integrated those cleanups and will send a v3 later after some
> stress testing.
>
>
>>
>>> +                alloc_start = round_down(offset, sectorsize);
>>> +                alloc_end = alloc_start + sectorsize;
>>> +                goto reserve_space;
>>> +        }
>>> +
>>> +        alloc_start = round_up(offset, sectorsize);
>>> +        alloc_end = round_down(offset + len, sectorsize);
>>> +
>>> +        /*
>>> +         * For unaligned ranges, check the pages at the boundaries, they might
>>> +         * map to an extent, in which case we need to partially zero them, or
>>> +         * they might map to a hole, in which case we need our allocation range
>>> +         * to cover them.
>>> +         */
>>> +        if (!IS_ALIGNED(offset, sectorsize)) {
>>> +                ret = btrfs_zero_range_check_range_boundary(inode, offset);
>>> +                if (ret < 0)
>>> +                        goto out;
>>> +                if (ret) {
>>> +                        alloc_start = round_down(offset, sectorsize);
>>> +                        ret = 0;
>>> +                } else {
>>> +                        ret = btrfs_truncate_block(inode, offset, 0, 0);
>>> +                        if (ret)
>>> +                                goto out;
>>> +                }
>>> +        }
>>> +
>>> +        if (!IS_ALIGNED(offset + len, sectorsize)) {
>>> +                ret = btrfs_zero_range_check_range_boundary(inode,
>>> +                                                            offset + len);
>>> +                if (ret < 0)
>>> +                        goto out;
>>> +                if (ret) {
>>> +                        alloc_end = round_up(offset + len, sectorsize);
>>> +                        ret = 0;
>>> +                } else {
>>> +                        ret = btrfs_truncate_block(inode, offset + len, 0, 1);
>>> +                        if (ret)
>>> +                                goto out;
>>> +                }
>>> +        }
>>> +
>>> +reserve_space:
>>> +        if (alloc_start < alloc_end)
>>> +                bytes_to_reserve += alloc_end - alloc_start;
>>> +
>>> +        if (!punch_hole && bytes_to_reserve > 0) {
>>> +                ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
>>> +                                                      bytes_to_reserve);
>>> +                if (ret < 0)
>>> +                        goto out;
>>> +                space_reserved = true;
>>> +                ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
>>> +                                                alloc_start, bytes_to_reserve);
>>> +                if (ret)
>>> +                        goto out;
>>> +        }
>>> +
>>> +punch_hole:
>>> +        if (punch_hole) {
>>> +                ret = btrfs_punch_hole(inode, offset, len, false);
>>> +                if (ret)
>>> +                        goto out;
>>> +                ret = btrfs_zero_range_update_isize(inode, offset, len, mode);
>>> +        } else {
>>> +                struct extent_state *cached_state = NULL;
>>> +                const u64 lockstart = alloc_start;
>>> +                const u64 lockend = alloc_end - 1;
>>> +
>>> +                ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
>>> +                                                  &cached_state);
>>> +                if (ret)
>>> +                        goto out;
>>> +                ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
>>> +                                                alloc_end - alloc_start,
>>> +                                                i_blocksize(inode),
>>> +                                                offset + len, &alloc_hint);
>>> +                unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
>>> +                                     lockend, &cached_state, GFP_KERNEL);
>>> +                /* btrfs_prealloc_file_range releases reserved space on error */
>>> +                if (ret)
>>> +                        space_reserved = false;
>>> +        }
>>> + out:
>>> +        if (ret && space_reserved)
>>> +                btrfs_free_reserved_data_space(inode, data_reserved,
>>> +                                               alloc_start, bytes_to_reserve);
>>> +        extent_changeset_free(data_reserved);
>>> +
>>> +        return ret;
>>> +}
>>> +
>>>  static long btrfs_fallocate(struct file *file, int mode,
>>>                              loff_t offset, loff_t len)
>>>  {
>>> @@ -2829,21 +3068,24 @@ static long btrfs_fallocate(struct file *file, int mode,
>>>          cur_offset = alloc_start;
>>>
>>>          /* Make sure we aren't being give some crap mode */
>>> -        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
>>> +        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
>>> +                     FALLOC_FL_ZERO_RANGE))
>>>                  return -EOPNOTSUPP;
>>>
>>>          if (mode & FALLOC_FL_PUNCH_HOLE)
>>> -                return btrfs_punch_hole(inode, offset, len);
>>> +                return btrfs_punch_hole(inode, offset, len, true);
>>>
>>>          /*
>>>           * Only trigger disk allocation, don't trigger qgroup reserve
>>>           *
>>>           * For qgroup space, it will be checked later.
>>>           */
>>> -        ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
>>> -                                              alloc_end - alloc_start);
>>> -        if (ret < 0)
>>> -                return ret;
>>> +        if (!(mode & FALLOC_FL_ZERO_RANGE)) {
>>> +                ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
>>> +                                                      alloc_end - alloc_start);
>>> +                if (ret < 0)
>>> +                        return ret;
>>> +        }
>>>
>>>          inode_lock(inode);
>>>
>>> @@ -2885,6 +3127,12 @@ static long btrfs_fallocate(struct file *file, int mode,
>>>          if (ret)
>>>                  goto out;
>>>
>>> +        if (mode & FALLOC_FL_ZERO_RANGE) {
>>> +                ret = btrfs_zero_range(inode, offset, len, mode);
>>> +                inode_unlock(inode);
>>> +                return ret;
>>> +        }
>>> +
>>>          locked_end = alloc_end - 1;
>>>          while (1) {
>>>                  struct btrfs_ordered_extent *ordered;
>>> @@ -3010,7 +3258,7 @@ static long btrfs_fallocate(struct file *file, int mode,
>>>  out:
>>>          inode_unlock(inode);
>>>          /* Let go of our reservation. */
>>> -        if (ret != 0)
>>> +        if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
>>>                  btrfs_free_reserved_data_space(inode, data_reserved,
>>>                                  alloc_start, alloc_end - cur_offset);
>>>          extent_changeset_free(data_reserved);
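
As promised above, here is a minimal sketch (plain C, not btrfs code)
of the rounding arithmetic behind the alloc_start == alloc_end
observation. round_down/round_up are local stand-ins for the kernel
macros of the same names, and the 4096-byte sector size and offsets
are invented for illustration:

/*
 * With both range boundaries unaligned and in adjacent blocks, the
 * boundary blocks get handled by zeroing and the interior left to
 * preallocate is empty, so locking [alloc_start, alloc_end - 1]
 * would produce an inverted range.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t round_down(uint64_t x, uint64_t a) { return x - (x % a); }
static uint64_t round_up(uint64_t x, uint64_t a)
{
        return round_down(x + a - 1, a);
}

int main(void)
{
        const uint64_t sectorsize = 4096;
        const uint64_t offset = 5000, len = 4000;

        /* Interior region left after trimming the unaligned boundaries. */
        uint64_t alloc_start = round_up(offset, sectorsize);       /* 8192 */
        uint64_t alloc_end = round_down(offset + len, sectorsize); /* 8192 */

        printf("alloc_start=%llu alloc_end=%llu\n",
               (unsigned long long)alloc_start,
               (unsigned long long)alloc_end);
        /*
         * alloc_start == alloc_end: nothing to preallocate, and
         * alloc_end - 1 (8191) is smaller than alloc_start (8192).
         */
        return 0;
}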