[PATCH] btrfs: skip file_extent generation check for free_space_inode in run_delalloc_nocow
The btrfs/001 with inode_cache mount option will encounter the following warning: WARNING: CPU: 1 PID: 23700 at fs/btrfs/inode.c:956 cow_file_range.isra.19+0x32b/0x430 [btrfs] CPU: 1 PID: 23700 Comm: btrfs Kdump: loaded Tainted: GW O 4.20.0-rc4-custom+ #30 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:cow_file_range.isra.19+0x32b/0x430 [btrfs] Call Trace: ? free_extent_buffer+0x46/0x90 [btrfs] run_delalloc_nocow+0x455/0x900 [btrfs] btrfs_run_delalloc_range+0x1a7/0x360 [btrfs] writepage_delalloc+0xf9/0x150 [btrfs] __extent_writepage+0x125/0x3e0 [btrfs] extent_write_cache_pages+0x1b6/0x3e0 [btrfs] ? __wake_up_common_lock+0x63/0xc0 extent_writepages+0x50/0x80 [btrfs] do_writepages+0x41/0xd0 ? __filemap_fdatawrite_range+0x9e/0xf0 __filemap_fdatawrite_range+0xbe/0xf0 btrfs_fdatawrite_range+0x1b/0x50 [btrfs] __btrfs_write_out_cache+0x42c/0x480 [btrfs] btrfs_write_out_ino_cache+0x84/0xd0 [btrfs] btrfs_save_ino_cache+0x551/0x660 [btrfs] commit_fs_roots+0xc5/0x190 [btrfs] btrfs_commit_transaction+0x2bf/0x8d0 [btrfs] btrfs_mksubvol+0x48d/0x4d0 [btrfs] btrfs_ioctl_snap_create_transid+0x170/0x180 [btrfs] btrfs_ioctl_snap_create_v2+0x124/0x180 [btrfs] btrfs_ioctl+0x123f/0x3030 [btrfs] The file extent generation of the free space inode is equal to the last snapshot of the file root, so the inode will be passed to cow_file_range. But the inode was created and its extents were preallocated in btrfs_save_ino_cache, there are no cow copies on disk. The preallocated extents are not present on disk, and the btrfs_cross_ref_exist will ignore the -ENOENT returned by check_committed_ref, so we can directly write the inode to the disk. 
Fixes: 78d4295b1eee ("btrfs: lift some btrfs_cross_ref_exist checks in nocow path") Signed-off-by: Lu Fengqi --- fs/btrfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d54bdef16d8d..9c5e9629eb6c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1369,7 +1369,8 @@ static noinline int run_delalloc_nocow(struct inode *inode, * Do the same check as in btrfs_cross_ref_exist but * without the unnecessary search. */ - if (btrfs_file_extent_generation(leaf, fi) <= + if (!nolock && + btrfs_file_extent_generation(leaf, fi) <= btrfs_root_last_snapshot(&root->root_item)) goto out_check; if (extent_type == BTRFS_FILE_EXTENT_REG && !force) -- 2.19.2
[PATCH v2 1/3] btrfs: remove always true if branch in find_delalloc_range
The @found is always false when it comes to the if branch. Besides, the bool type is more suitable for @found. Change the return value of the function and its caller to bool as well. Signed-off-by: Lu Fengqi --- fs/btrfs/extent_io.c | 31 +++ fs/btrfs/extent_io.h | 2 +- fs/btrfs/tests/extent-io-tests.c | 2 +- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b2769e92b556..4b6b87e63b4a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1452,16 +1452,16 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, * find a contiguous range of bytes in the file marked as delalloc, not * more than 'max_bytes'. start and end are used to return the range, * - * 1 is returned if we find something, 0 if nothing was in the tree + * true is returned if we find something, false if nothing was in the tree */ -static noinline u64 find_delalloc_range(struct extent_io_tree *tree, +static noinline bool find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state) { struct rb_node *node; struct extent_state *state; u64 cur_start = *start; - u64 found = 0; + bool found = false; u64 total_bytes = 0; spin_lock(&tree->lock); @@ -1472,8 +1472,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, */ node = tree_search(tree, cur_start); if (!node) { - if (!found) - *end = (u64)-1; + *end = (u64)-1; goto out; } @@ -1493,7 +1492,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, *cached_state = state; refcount_inc(&state->refs); } - found++; + found = true; *end = state->end; cur_start = state->end + 1; node = rb_next(node); @@ -1551,13 +1550,13 @@ static noinline int lock_delalloc_pages(struct inode *inode, } /* - * find a contiguous range of bytes in the file marked as delalloc, not - * more than 'max_bytes'. 
start and end are used to return the range, + * find and lock a contiguous range of bytes in the file marked as delalloc, + * not more than 'max_bytes'. start and end are used to return the range, * - * 1 is returned if we find something, 0 if nothing was in the tree + * true is returned if we find something, false if nothing was in the tree */ EXPORT_FOR_TESTS -noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode, +noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, struct page *locked_page, u64 *start, u64 *end) @@ -1565,7 +1564,7 @@ noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode, u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; u64 delalloc_start; u64 delalloc_end; - u64 found; + bool found; struct extent_state *cached_state = NULL; int ret; int loops = 0; @@ -1580,7 +1579,7 @@ noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode, *start = delalloc_start; *end = delalloc_end; free_extent_state(cached_state); - return 0; + return false; } /* @@ -1612,7 +1611,7 @@ noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode, loops = 1; goto again; } else { - found = 0; + found = false; goto out_failed; } } @@ -3195,7 +3194,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, { struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; u64 page_end = delalloc_start + PAGE_SIZE - 1; - u64 nr_delalloc; + bool found; u64 delalloc_to_write = 0; u64 delalloc_end = 0; int ret; @@ -3203,11 +3202,11 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, while (delalloc_end < page_end) { - nr_delalloc = find_lock_delalloc_range(inode, tree, + found = find_lock_delalloc_range(inode, tree, page, &delalloc_start, &delalloc_end); - if (nr_delalloc == 0) { + if (!found) { delalloc_start = delalloc_end + 1;
Re: [PATCH 1/3] btrfs: remove always true if branch in find_delalloc_range
On Wed, Nov 28, 2018 at 09:01:42AM +0200, Nikolay Borisov wrote: > > >On 28.11.18 г. 5:21 ч., Lu Fengqi wrote: >> The @found is always false when it comes to the if branch. Besides, the >> bool type is more suitable for @found. > >Well if you are ranging the type of found variable it also makes sense >to change the return value of the function to bool as well. Good catch. -- Thanks, Lu > >> >> Signed-off-by: Lu Fengqi >> --- >> fs/btrfs/extent_io.c | 7 +++ >> 1 file changed, 3 insertions(+), 4 deletions(-) >> >> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c >> index 582b4b1c41e0..b4ee3399be96 100644 >> --- a/fs/btrfs/extent_io.c >> +++ b/fs/btrfs/extent_io.c >> @@ -1461,7 +1461,7 @@ static noinline u64 find_delalloc_range(struct >> extent_io_tree *tree, >> struct rb_node *node; >> struct extent_state *state; >> u64 cur_start = *start; >> -u64 found = 0; >> +bool found = false; >> u64 total_bytes = 0; >> >> spin_lock(&tree->lock); >> @@ -1472,8 +1472,7 @@ static noinline u64 find_delalloc_range(struct >> extent_io_tree *tree, >> */ >> node = tree_search(tree, cur_start); >> if (!node) { >> -if (!found) >> -*end = (u64)-1; >> +*end = (u64)-1; >> goto out; >> } >> >> @@ -1493,7 +1492,7 @@ static noinline u64 find_delalloc_range(struct >> extent_io_tree *tree, >> *cached_state = state; >> refcount_inc(&state->refs); >> } >> -found++; >> +found = true; >> *end = state->end; >> cur_start = state->end + 1; >> node = rb_next(node); >> > >
Re: [RFC PATCH] btrfs: drop file privileges in btrfs_clone_files
On Wed, Nov 28, 2018 at 09:48:07AM +0200, Nikolay Borisov wrote: > > >On 28.11.18 г. 9:46 ч., Christoph Hellwig wrote: >> On Wed, Nov 28, 2018 at 09:44:59AM +0200, Nikolay Borisov wrote: >>> >>> >>> On 28.11.18 г. 5:07 ч., Lu Fengqi wrote: >>>> The generic/513 tell that cloning into a file did not strip security >>>> privileges (suid, capabilities) like a regular write would. >>>> >>>> Signed-off-by: Lu Fengqi >>>> --- >>>> The xfs and ocfs2 call generic_remap_file_range_prep to drop file >>>> privileges, I'm not sure whether btrfs should do the same thing. >>> >>> Why do you think btrfs shouldn't do the same thing. Looking at I'm not sure btrfs doesn't use generic check intentionally for some reason. >>> remap_file_range_prep it seems that btrfs is missing a ton of checks >>> that are useful i.e immutable files/aligned offsets etc. It is indeed. In addition, generic_remap_file_range_prep will invoke inode_dio_wait filemap_write_and_wait_range for the source and destination inode/range. For the dedupe case, it will call vfs_dedupe_file_range_compare. I still can't judge whether these operations are welcome by btrfs. I will go deep into the code. >> >> Any chance we could move btrfs over to use remap_file_range_prep so that >> all file systems share the exact same checks? In theory we can call generic_remap_file_range_prep in btrfs_remap_file_range, which give us the opportunity to clean up the duplicate check code in btrfs_extent_same and btrfs_clone_files. > >I'm not very familiar with the, Filipe is more familiar so adding to CC. >But IMO we should do that provided there are no blockers. > >Filipe, what do you think, is it feasible? I'm all ears for the suggestions. -- Thanks, Lu
[PATCH 3/3] btrfs: remove redundant nowait check for buffered_write
The generic_write_checks will check the combination of IOCB_NOWAIT and !IOCB_DIRECT. Signed-off-by: Lu Fengqi --- fs/btrfs/file.c | 4 1 file changed, 4 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 3835bb8c146d..190db9a685a2 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1889,10 +1889,6 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, loff_t oldsize; int clean_page = 0; - if (!(iocb->ki_flags & IOCB_DIRECT) && - (iocb->ki_flags & IOCB_NOWAIT)) - return -EOPNOTSUPP; - if (!inode_trylock(inode)) { if (iocb->ki_flags & IOCB_NOWAIT) return -EAGAIN; -- 2.19.2
[PATCH 2/3] btrfs: cleanup the useless DEFINE_WAIT in cleanup_transaction
The DEFINE_WAIT(wait) has been useless ever since it was introduced in commit f094ac32aba3 ("Btrfs: fix NULL pointer after aborting a transaction"). Signed-off-by: Lu Fengqi --- fs/btrfs/transaction.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f92c0a88c4ad..67e84939b758 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1840,7 +1840,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; - DEFINE_WAIT(wait); WARN_ON(refcount_read(&trans->use_count) > 1); -- 2.19.2
[PATCH 1/3] btrfs: remove always true if branch in find_delalloc_range
The @found is always false when it comes to the if branch. Besides, the bool type is more suitable for @found. Signed-off-by: Lu Fengqi --- fs/btrfs/extent_io.c | 7 +++ 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 582b4b1c41e0..b4ee3399be96 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1461,7 +1461,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, struct rb_node *node; struct extent_state *state; u64 cur_start = *start; - u64 found = 0; + bool found = false; u64 total_bytes = 0; spin_lock(&tree->lock); @@ -1472,8 +1472,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, */ node = tree_search(tree, cur_start); if (!node) { - if (!found) - *end = (u64)-1; + *end = (u64)-1; goto out; } @@ -1493,7 +1492,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, *cached_state = state; refcount_inc(&state->refs); } - found++; + found = true; *end = state->end; cur_start = state->end + 1; node = rb_next(node); -- 2.19.2
[RFC PATCH] btrfs: drop file privileges in btrfs_clone_files
The generic/513 test tells that cloning into a file did not strip security privileges (suid, capabilities) like a regular write would. Signed-off-by: Lu Fengqi --- The xfs and ocfs2 call generic_remap_file_range_prep to drop file privileges, I'm not sure whether btrfs should do the same thing. Any suggestion? fs/btrfs/ioctl.c | 4 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 410c7e007ba8..bc33c480603b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4312,6 +4312,10 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, goto out_unlock; } + ret = file_remove_privs(file); + if (ret) + goto out_unlock; + if (destoff > inode->i_size) { ret = btrfs_cont_expand(inode, inode->i_size, destoff); if (ret) -- 2.19.2
Re: [PATCH 3/6] btrfs: cleanup extent_op handling
On Wed, Nov 21, 2018 at 01:59:09PM -0500, Josef Bacik wrote: >From: Josef Bacik > >The cleanup_extent_op function actually would run the extent_op if it >needed running, which made the name sort of a misnomer. Change it to >run_and_cleanup_extent_op, and move the actual cleanup work to >cleanup_extent_op so it can be used by check_ref_cleanup() in order to >unify the extent op handling. > >Signed-off-by: Josef Bacik One nitpick below. Reviewed-by: Lu Fengqi >--- > fs/btrfs/extent-tree.c | 36 +++- > 1 file changed, 23 insertions(+), 13 deletions(-) > >diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c >index e3ed3507018d..8a776dc9cb38 100644 >--- a/fs/btrfs/extent-tree.c >+++ b/fs/btrfs/extent-tree.c >@@ -2424,19 +2424,33 @@ static void unselect_delayed_ref_head(struct >btrfs_delayed_ref_root *delayed_ref > btrfs_delayed_ref_unlock(head); > } > >-static int cleanup_extent_op(struct btrfs_trans_handle *trans, >- struct btrfs_delayed_ref_head *head) >+static struct btrfs_delayed_extent_op * >+cleanup_extent_op(struct btrfs_trans_handle *trans, The trans parameter seems useless. 
-- Thanks, Lu >+struct btrfs_delayed_ref_head *head) > { > struct btrfs_delayed_extent_op *extent_op = head->extent_op; >- int ret; > > if (!extent_op) >- return 0; >- head->extent_op = NULL; >+ return NULL; >+ > if (head->must_insert_reserved) { >+ head->extent_op = NULL; > btrfs_free_delayed_extent_op(extent_op); >- return 0; >+ return NULL; > } >+ return extent_op; >+} >+ >+static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, >+ struct btrfs_delayed_ref_head *head) >+{ >+ struct btrfs_delayed_extent_op *extent_op = >+ cleanup_extent_op(trans, head); >+ int ret; >+ >+ if (!extent_op) >+ return 0; >+ head->extent_op = NULL; > spin_unlock(&head->lock); > ret = run_delayed_extent_op(trans, head, extent_op); > btrfs_free_delayed_extent_op(extent_op); >@@ -2488,7 +2502,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle >*trans, > > delayed_refs = &trans->transaction->delayed_refs; > >- ret = cleanup_extent_op(trans, head); >+ ret = run_and_cleanup_extent_op(trans, head); > if (ret < 0) { > unselect_delayed_ref_head(delayed_refs, head); > btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); >@@ -6977,12 +6991,8 @@ static noinline int check_ref_cleanup(struct >btrfs_trans_handle *trans, > if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root)) > goto out; > >- if (head->extent_op) { >- if (!head->must_insert_reserved) >- goto out; >- btrfs_free_delayed_extent_op(head->extent_op); >- head->extent_op = NULL; >- } >+ if (cleanup_extent_op(trans, head) != NULL) >+ goto out; > > /* >* waiting for the lock here would deadlock. If someone else has it >-- >2.14.3 > > >
Re: [PATCH] btrfs: Fix suspicious RCU usage warning in device_list_add
On Wed, Nov 14, 2018 at 05:05:48PM +0100, David Sterba wrote: >On Wed, Nov 14, 2018 at 03:24:56PM +0800, Lu Fengqi wrote: >> = >> WARNING: suspicious RCU usage >> 4.20.0-rc2+ #23 Tainted: G O >> - >> fs/btrfs/volumes.c:886 suspicious rcu_dereference_check() usage! >> >> Use btrfs_info_in_rcu instead of pr_info for the required lock/unlock of >> RCU string. >> >> Fixes: 1f265fc6f58b ("btrfs: harden agaist duplicate fsid on scanned >> devices") > >Thanks for the fix. > >Please note that the patch is still in the devel queue (misc-next) so >the commit id is unstable, and such fixups get folded to the patch. > >You may also reply to the original mail with patch, but sending a bare >code change without a full changelog is also fine if the original patch >was sent long time ago and the fixup could get lost. Got it. -- Thanks, Lu > >> Signed-off-by: Lu Fengqi >> --- >> fs/btrfs/volumes.c | 8 >> 1 file changed, 4 insertions(+), 4 deletions(-) >> >> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c >> index 2186300bab91..6039ae5c549e 100644 >> --- a/fs/btrfs/volumes.c >> +++ b/fs/btrfs/volumes.c >> @@ -873,15 +873,15 @@ static noinline struct btrfs_device >> *device_list_add(const char *path, >> if (device->bdev != path_bdev) { >> bdput(path_bdev); >> mutex_unlock(&fs_devices->device_list_mutex); >> -pr_warn( >> -"BTRFS: duplicate device fsid:devid for %pU:%llu old:%s >> new:%s\n", >> +btrfs_warn_in_rcu(device->fs_info, >> +"duplicate device fsid:devid for %pU:%llu old:%s >> new:%s\n", > >The trailing newline is appended by all btrfs_* message helpers, removed >in the commit. > >
[PATCH] btrfs: Fix suspicious RCU usage warning in device_list_add
= WARNING: suspicious RCU usage 4.20.0-rc2+ #23 Tainted: G O - fs/btrfs/volumes.c:886 suspicious rcu_dereference_check() usage! Use btrfs_info_in_rcu instead of pr_info for the required lock/unlock of RCU string. Fixes: 1f265fc6f58b ("btrfs: harden agaist duplicate fsid on scanned devices") Signed-off-by: Lu Fengqi --- fs/btrfs/volumes.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2186300bab91..6039ae5c549e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -873,15 +873,15 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (device->bdev != path_bdev) { bdput(path_bdev); mutex_unlock(&fs_devices->device_list_mutex); - pr_warn( - "BTRFS: duplicate device fsid:devid for %pU:%llu old:%s new:%s\n", + btrfs_warn_in_rcu(device->fs_info, + "duplicate device fsid:devid for %pU:%llu old:%s new:%s\n", disk_super->fsid, devid, rcu_str_deref(device->name), path); return ERR_PTR(-EEXIST); } bdput(path_bdev); - pr_info( - "BTRFS: device fsid %pU devid %llu moved old:%s new:%s\n", + btrfs_info_in_rcu(device->fs_info, + "device fsid %pU devid %llu moved old:%s new:%s\n", disk_super->fsid, devid, rcu_str_deref(device->name), path); } -- 2.19.1
Re: [PATCH v15.1 00/13] Btrfs In-band De-duplication
On Tue, Nov 13, 2018 at 02:45:45PM +0100, David Sterba wrote: >On Tue, Nov 06, 2018 at 02:41:09PM +0800, Lu Fengqi wrote: >> This patchset can be fetched from github: >> https://github.com/littleroad/linux.git dedupe_latest >> >> Now the new base is v4.20-rc1. > >Before anybody spends more time with this patchset: this is a big >feature and quite intrusive to several btrfs subsystems. Currently it's >on hold as it requires finishing the design phase, it's still only the >in-memory backend and before we claim in-band dedupe, the persistent >hash tree needs to be at least drafted or prototyped. Thanks for your explanation. However, I'm not sure why we need to draft a prototype of the persistent hash tree first when we are talking about the memory backend. -- Thanks, Lu > >At this point there are several features that are in a more complete >state so they get preferred when it comes to merging. I would have to >look up what was agreed long time ago as merging plan, but at this point >this series would require a lot of work. > >
[PATCH v10.6 5/5] btrfs-progs: dedupe: introduce reconfigure subcommand
From: Qu Wenruo Introduce reconfigure subcommand to co-operate with new kernel ioctl modification. Signed-off-by: Qu Wenruo Signed-off-by: Lu Fengqi --- Documentation/btrfs-dedupe-inband.asciidoc | 7 +++ btrfs-completion | 2 +- cmds-dedupe-ib.c | 73 +- 3 files changed, 66 insertions(+), 16 deletions(-) diff --git a/Documentation/btrfs-dedupe-inband.asciidoc b/Documentation/btrfs-dedupe-inband.asciidoc index 6096389cb0b4..78c806f772d6 100644 --- a/Documentation/btrfs-dedupe-inband.asciidoc +++ b/Documentation/btrfs-dedupe-inband.asciidoc @@ -86,6 +86,13 @@ And compression has higher priority than in-band de-duplication, means if compression and de-duplication is enabled at the same time, only compression will work. +*reconfigure* [options] :: +Re-configure in-band de-duplication parameters of a filesystem. ++ +In-band de-duplication must be enabled first before re-configuration. ++ +[Options] are the same as 'btrfs dedupe-inband enable'. + *status* :: Show current in-band de-duplication status of a filesystem. 
diff --git a/btrfs-completion b/btrfs-completion index 0808f9a14df9..a3e05b238eda 100644 --- a/btrfs-completion +++ b/btrfs-completion @@ -34,7 +34,7 @@ _btrfs() commands_quota='enable disable rescan' commands_qgroup='assign remove create destroy show limit' commands_replace='start status cancel' - commands_dedupe_inband='enable disable status' + commands_dedupe_inband='enable disable status reconfigure' if [[ "$cur" == -* && $cword -le 3 && "$cmd" != "help" ]]; then COMPREPLY=( $( compgen -W '--help' -- "$cur" ) ) diff --git a/cmds-dedupe-ib.c b/cmds-dedupe-ib.c index e778457e25a8..e52f939c9ced 100644 --- a/cmds-dedupe-ib.c +++ b/cmds-dedupe-ib.c @@ -56,7 +56,6 @@ static const char * const cmd_dedupe_ib_enable_usage[] = { NULL }; - #define report_fatal_parameter(dargs, old, member, type, err_val, fmt) \ ({ \ if (dargs->member != old->member && \ @@ -88,6 +87,12 @@ static void report_parameter_error(struct btrfs_ioctl_dedupe_args *dargs, } report_option_parameter(dargs, old, flags, u8, -1, x); } + + if (dargs->status == 0 && old->cmd == BTRFS_DEDUPE_CTL_RECONF) { + error("must enable dedupe before reconfiguration"); + return; + } + if (report_fatal_parameter(dargs, old, cmd, u16, -1, u) || report_fatal_parameter(dargs, old, blocksize, u64, -1, llu) || report_fatal_parameter(dargs, old, backend, u16, -1, u) || @@ -100,14 +105,17 @@ static void report_parameter_error(struct btrfs_ioctl_dedupe_args *dargs, old->limit_nr, old->limit_mem); } -static int cmd_dedupe_ib_enable(int argc, char **argv) +static int enable_reconfig_dedupe(int argc, char **argv, int reconf) { int ret; int fd = -1; char *path; u64 blocksize = BTRFS_DEDUPE_BLOCKSIZE_DEFAULT; + int blocksize_set = 0; u16 hash_algo = BTRFS_DEDUPE_HASH_SHA256; + int hash_algo_set = 0; u16 backend = BTRFS_DEDUPE_BACKEND_INMEMORY; + int backend_set = 0; u64 limit_nr = 0; u64 limit_mem = 0; u64 sys_mem = 0; @@ -134,15 +142,17 @@ static int cmd_dedupe_ib_enable(int argc, char **argv) break; switch (c) { case 's': - if 
(!strcasecmp("inmemory", optarg)) + if (!strcasecmp("inmemory", optarg)) { backend = BTRFS_DEDUPE_BACKEND_INMEMORY; - else { + backend_set = 1; + } else { error("unsupported dedupe backend: %s", optarg); exit(1); } break; case 'b': blocksize = parse_size(optarg); + blocksize_set = 1; break; case 'a': if (strcmp("sha256", optarg)) { @@ -224,26 +234,40 @@ static int cmd_dedupe_ib_enable(int argc, char **argv) return 1; } memset(&dargs, -1, sizeof(dargs)); - dargs.cmd = BTRFS_DEDUPE_CTL_ENABLE; - dargs.blocksize = blocksize; - dargs.hash_algo = hash_algo; - dargs.limit_nr = limit_nr; - dargs.limit_mem = limit_mem; - dargs.backend = backend; - if (force) - dargs.flags |= BTRFS_DEDUPE_FLAG_FORCE
[PATCH v10.6 2/5] btrfs-progs: dedupe: Add enable command for dedupe command group
From: Qu Wenruo Add enable subcommand for dedupe commmand group. Signed-off-by: Qu Wenruo Signed-off-by: Lu Fengqi --- Documentation/btrfs-dedupe-inband.asciidoc | 114 +- btrfs-completion | 6 +- cmds-dedupe-ib.c | 238 + ioctl.h| 2 + 4 files changed, 358 insertions(+), 2 deletions(-) diff --git a/Documentation/btrfs-dedupe-inband.asciidoc b/Documentation/btrfs-dedupe-inband.asciidoc index 83113f5487e2..d895aafbcf45 100644 --- a/Documentation/btrfs-dedupe-inband.asciidoc +++ b/Documentation/btrfs-dedupe-inband.asciidoc @@ -22,7 +22,119 @@ use with caution. SUBCOMMAND -- -Nothing yet +*enable* [options] :: +Enable in-band de-duplication for a filesystem. ++ +`Options` ++ +-f|--force +Force 'enable' command to be exected. +Will skip memory limit check and allow 'enable' to be executed even in-band +de-duplication is already enabled. ++ +NOTE: If re-enable dedupe with '-f' option, any unspecified parameter will be +reset to its default value. + +-s|--storage-backend +Specify de-duplication hash storage backend. +Only 'inmemory' backend is supported yet. +If not specified, default value is 'inmemory'. ++ +Refer to *BACKENDS* sector for more information. + +-b|--blocksize +Specify dedupe block size. +Supported values are power of 2 from '16K' to '8M'. +Default value is '128K'. ++ +Refer to *BLOCKSIZE* sector for more information. + +-a|--hash-algorithm +Specify hash algorithm. +Only 'sha256' is supported yet. + +-l|--limit-hash +Specify maximum number of hashes stored in memory. +Only works for 'inmemory' backend. +Conflicts with '-m' option. ++ +Only positive values are valid. +Default value is '32K'. + +-m|--limit-memory +Specify maximum memory used for hashes. +Only works for 'inmemory' backend. +Conflicts with '-l' option. ++ +Only value larger than or equal to '1024' is valid. +No default value. ++ +NOTE: Memory limit will be rounded down to kernel internal hash size, +so the memory limit shown in 'btrfs dedupe-inband status' may be different +from the . 
+ +WARNING: Too large value for '-l' or '-m' will easily trigger OOM. +Please use with caution according to system memory. + +NOTE: In-band de-duplication is not compactible with compression yet. +And compression has higher priority than in-band de-duplication, means if +compression and de-duplication is enabled at the same time, only compression +will work. + +BACKENDS + +Btrfs in-band de-duplication will support different storage backends, with +different use case and features. + +In-memory backend:: +This backend provides backward-compatibility, and more fine-tuning options. +But hash pool is non-persistent and may exhaust kernel memory if not setup +properly. ++ +This backend can be used on old btrfs(without '-O dedupe' mkfs option). +When used on old btrfs, this backend needs to be enabled manually after mount. ++ +Designed for fast hash search speed, in-memory backend will keep all dedupe +hashes in memory. (Although overall performance is still much the same with +'ondisk' backend if all 'ondisk' hash can be cached in memory) ++ +And only keeps limited number of hash in memory to avoid exhausting memory. +Hashes over the limit will be dropped following Last-Recent-Use behavior. +So this backend has a consistent overhead for given limit but can\'t ensure +all duplicated blocks will be de-duplicated. ++ +After umount and mount, in-memory backend need to refill its hash pool. + +On-disk backend:: +This backend provides persistent hash pool, with more smart memory management +for hash pool. +But it\'s not backward-compatible, meaning it must be used with '-O dedupe' mkfs +option and older kernel can\'t mount it read-write. ++ +Designed for de-duplication rate, hash pool is stored as btrfs B+ tree on disk. +This behavior may cause extra disk IO for hash search under high memory +pressure. ++ +After umount and mount, on-disk backend still has its hash on disk, no need to +refill its dedupe hash pool. 
+ +Currently, only 'inmemory' backend is supported in btrfs-progs. + +DEDUPE BLOCK SIZE + +In-band de-duplication is done at dedupe block size. +Any data smaller than dedupe block size won\'t go through in-band +de-duplication. + +And dedupe block size affects dedupe rate and fragmentation heavily. + +Smaller block size will cause more fragments, but higher dedupe rate. + +Larger block size will cause less fragments, but lower dedupe rate. + +In-band de-duplication rate is highly related to the workload pattern. +So it\'s highly recommended to align dedupe block size to the workload +block size to make full use of d
[PATCH v10.6 3/5] btrfs-progs: dedupe: Add disable support for inband deduplication
From: Qu Wenruo Add disable subcommand for dedupe command group. Signed-off-by: Qu Wenruo Signed-off-by: Lu Fengqi --- Documentation/btrfs-dedupe-inband.asciidoc | 5 +++ btrfs-completion | 2 +- cmds-dedupe-ib.c | 41 ++ 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/Documentation/btrfs-dedupe-inband.asciidoc b/Documentation/btrfs-dedupe-inband.asciidoc index d895aafbcf45..3452f690e3e5 100644 --- a/Documentation/btrfs-dedupe-inband.asciidoc +++ b/Documentation/btrfs-dedupe-inband.asciidoc @@ -22,6 +22,11 @@ use with caution. SUBCOMMAND -- +*disable* :: +Disable in-band de-duplication for a filesystem. ++ +This will trash all stored dedupe hash. ++ *enable* [options] :: Enable in-band de-duplication for a filesystem. + diff --git a/btrfs-completion b/btrfs-completion index 621801cf12fb..e6ec785bf849 100644 --- a/btrfs-completion +++ b/btrfs-completion @@ -34,7 +34,7 @@ _btrfs() commands_quota='enable disable rescan' commands_qgroup='assign remove create destroy show limit' commands_replace='start status cancel' - commands_dedupe_inband='enable' + commands_dedupe_inband='enable disable' if [[ "$cur" == -* && $cword -le 3 && "$cmd" != "help" ]]; then COMPREPLY=( $( compgen -W '--help' -- "$cur" ) ) diff --git a/cmds-dedupe-ib.c b/cmds-dedupe-ib.c index 4d499677d9ae..91b6fe234043 100644 --- a/cmds-dedupe-ib.c +++ b/cmds-dedupe-ib.c @@ -259,10 +259,51 @@ out: return ret; } +static const char * const cmd_dedupe_ib_disable_usage[] = { + "btrfs dedupe-inband disable ", + "Disable in-band(write time) de-duplication of a btrfs.", + NULL +}; + +static int cmd_dedupe_ib_disable(int argc, char **argv) +{ + struct btrfs_ioctl_dedupe_args dargs; + DIR *dirstream; + char *path; + int fd; + int ret; + + if (check_argc_exact(argc, 2)) + usage(cmd_dedupe_ib_disable_usage); + + path = argv[1]; + fd = open_file_or_dir(path, &dirstream); + if (fd < 0) { + error("failed to open file or directory: %s", path); + return 1; + } + memset(&dargs, 0, sizeof(dargs)); + dargs.cmd = 
BTRFS_DEDUPE_CTL_DISABLE; + + ret = ioctl(fd, BTRFS_IOC_DEDUPE_CTL, &dargs); + if (ret < 0) { + error("failed to disable inband deduplication: %m"); + ret = 1; + goto out; + } + ret = 0; + +out: + close_file_or_dir(fd, dirstream); + return 0; +} + const struct cmd_group dedupe_ib_cmd_group = { dedupe_ib_cmd_group_usage, dedupe_ib_cmd_group_info, { { "enable", cmd_dedupe_ib_enable, cmd_dedupe_ib_enable_usage, NULL, 0}, + { "disable", cmd_dedupe_ib_disable, cmd_dedupe_ib_disable_usage, + NULL, 0}, NULL_CMD_STRUCT } }; -- 2.19.1
[PATCH v10.6 1/5] btrfs-progs: Basic framework for dedupe-inband command group
From: Qu Wenruo Add basic ioctl header and command group framework for later use. Alone with basic man page doc. Signed-off-by: Qu Wenruo Signed-off-by: Lu Fengqi --- Documentation/Makefile.in | 1 + Documentation/btrfs-dedupe-inband.asciidoc | 40 ++ Documentation/btrfs.asciidoc | 4 +++ Makefile | 3 +- btrfs.c| 2 ++ cmds-dedupe-ib.c | 35 +++ commands.h | 2 ++ dedupe-ib.h| 28 +++ ioctl.h| 36 +++ 9 files changed, 150 insertions(+), 1 deletion(-) create mode 100644 Documentation/btrfs-dedupe-inband.asciidoc create mode 100644 cmds-dedupe-ib.c create mode 100644 dedupe-ib.h diff --git a/Documentation/Makefile.in b/Documentation/Makefile.in index afc16980c6d9..c0d797324c25 100644 --- a/Documentation/Makefile.in +++ b/Documentation/Makefile.in @@ -28,6 +28,7 @@ MAN8_TXT += btrfs-qgroup.asciidoc MAN8_TXT += btrfs-replace.asciidoc MAN8_TXT += btrfs-restore.asciidoc MAN8_TXT += btrfs-property.asciidoc +MAN8_TXT += btrfs-dedupe-inband.asciidoc # Category 5 manual page MAN5_TXT += btrfs-man5.asciidoc diff --git a/Documentation/btrfs-dedupe-inband.asciidoc b/Documentation/btrfs-dedupe-inband.asciidoc new file mode 100644 index ..83113f5487e2 --- /dev/null +++ b/Documentation/btrfs-dedupe-inband.asciidoc @@ -0,0 +1,40 @@ +btrfs-dedupe-inband(8) +== + +NAME + +btrfs-dedupe-inband - manage in-band (write time) de-duplication of a btrfs +filesystem + +SYNOPSIS + +*btrfs dedupe-inband* + +DESCRIPTION +--- +*btrfs dedupe-inband* is used to enable/disable or show current in-band de-duplication +status of a btrfs filesystem. + +Kernel support for in-band de-duplication starts from 4.19. + +WARNING: In-band de-duplication is still an experimental feautre of btrfs, +use with caution. + +SUBCOMMAND +-- +Nothing yet + +EXIT STATUS +--- +*btrfs dedupe-inband* returns a zero exit status if it succeeds. Non zero is +returned in case of failure. + +AVAILABILITY + +*btrfs* is part of btrfs-progs. +Please refer to the btrfs wiki http://btrfs.wiki.kernel.org for +further details. 
+ +SEE ALSO + +`mkfs.btrfs`(8), diff --git a/Documentation/btrfs.asciidoc b/Documentation/btrfs.asciidoc index 7316ac094413..1cf5bddec335 100644 --- a/Documentation/btrfs.asciidoc +++ b/Documentation/btrfs.asciidoc @@ -50,6 +50,10 @@ COMMANDS Do off-line check on a btrfs filesystem. + See `btrfs-check`(8) for details. +*dedupe-inband*:: + Control btrfs in-band(write time) de-duplication. + + See `btrfs-dedupe-inband`(8) for details. + *device*:: Manage devices managed by btrfs, including add/delete/scan and so on. + diff --git a/Makefile b/Makefile index f4ab14ea74c8..f155252c91f1 100644 --- a/Makefile +++ b/Makefile @@ -124,7 +124,8 @@ cmds_objects = cmds-subvolume.o cmds-filesystem.o cmds-device.o cmds-scrub.o \ cmds-restore.o cmds-rescue.o chunk-recover.o super-recover.o \ cmds-property.o cmds-fi-usage.o cmds-inspect-dump-tree.o \ cmds-inspect-dump-super.o cmds-inspect-tree-stats.o cmds-fi-du.o \ - mkfs/common.o check/mode-common.o check/mode-lowmem.o + mkfs/common.o check/mode-common.o check/mode-lowmem.o \ + cmds-dedupe-ib.o libbtrfs_objects = send-stream.o send-utils.o kernel-lib/rbtree.o btrfs-list.o \ kernel-lib/crc32c.o messages.o \ uuid-tree.o utils-lib.o rbtree-utils.o diff --git a/btrfs.c b/btrfs.c index 2d39f2ced3e8..2168f5a8bc7f 100644 --- a/btrfs.c +++ b/btrfs.c @@ -255,6 +255,8 @@ static const struct cmd_group btrfs_cmd_group = { { "quota", cmd_quota, NULL, "a_cmd_group, 0 }, { "qgroup", cmd_qgroup, NULL, &qgroup_cmd_group, 0 }, { "replace", cmd_replace, NULL, &replace_cmd_group, 0 }, + { "dedupe-inband", cmd_dedupe_ib, NULL, &dedupe_ib_cmd_group, + 0 }, { "help", cmd_help, cmd_help_usage, NULL, 0 }, { "version", cmd_version, cmd_version_usage, NULL, 0 }, NULL_CMD_STRUCT diff --git a/cmds-dedupe-ib.c b/cmds-dedupe-ib.c new file mode 100644 index ..73c923a797da --- /dev/null +++ b/cmds-dedupe-ib.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2017 Fujitsu. All rights reserved. 
+ */ + +#include +#include +#include + +#include "ctree.h" +#include "ioctl.h" + +#include "commands.h" +#include "utils.h" +#include "kerncompat.h" +#include "dedupe-ib.h" + +s
[PATCH v10.6 0/5] In-band de-duplication for btrfs-progs
Patchset can be fetched from github: https://github.com/littleroad/btrfs-progs.git dedupe_latest Inband dedupe(in-memory backend only) ioctl support for btrfs-progs. v7 changes: Update ctree.h to follow kernel structure change Update print-tree to follow kernel structure change V8 changes: Move dedup props and on-disk backend support out of the patchset Change command group name to "dedupe-inband", to avoid confusion with possible out-of-band dedupe. Suggested by Mark. Rebase to latest devel branch. V9 changes: Follow kernels ioctl change to support FORCE flag, new reconf ioctl, and more precious error reporting. v10 changes: Rebase to v4.10. Add BUILD_ASSERT for btrfs_ioctl_dedupe_args v10.1 changes: Rebase to v4.14. v10.2 changes: Rebase to v4.16.1. v10.3 changes: Rebase to v4.17. v10.4 changes: Deal with offline reviews from Misono Tomohiro. 1. s/btrfs-dedupe/btrfs-dedupe-inband 2. Replace strerror(errno) with %m 3. Use SZ_* instead of intermedia number 4. update btrfs-completion for reconfigure subcommand v10.5 changes: Rebase to v4.17.1. v10.6 changes: Rebase to v4.19. Qu Wenruo (5): btrfs-progs: Basic framework for dedupe-inband command group btrfs-progs: dedupe: Add enable command for dedupe command group btrfs-progs: dedupe: Add disable support for inband dedupelication btrfs-progs: dedupe: Add status subcommand btrfs-progs: dedupe: introduce reconfigure subcommand Documentation/Makefile.in | 1 + Documentation/btrfs-dedupe-inband.asciidoc | 167 Documentation/btrfs.asciidoc | 4 + Makefile | 3 +- btrfs-completion | 6 +- btrfs.c| 2 + cmds-dedupe-ib.c | 437 + commands.h | 2 + dedupe-ib.h| 28 ++ ioctl.h| 38 ++ 10 files changed, 686 insertions(+), 2 deletions(-) create mode 100644 Documentation/btrfs-dedupe-inband.asciidoc create mode 100644 cmds-dedupe-ib.c create mode 100644 dedupe-ib.h -- 2.19.1
[PATCH v10.6 4/5] btrfs-progs: dedupe: Add status subcommand
From: Qu Wenruo Add status subcommand for dedupe command group. Signed-off-by: Qu Wenruo Signed-off-by: Lu Fengqi --- Documentation/btrfs-dedupe-inband.asciidoc | 3 + btrfs-completion | 2 +- cmds-dedupe-ib.c | 80 ++ 3 files changed, 84 insertions(+), 1 deletion(-) diff --git a/Documentation/btrfs-dedupe-inband.asciidoc b/Documentation/btrfs-dedupe-inband.asciidoc index 3452f690e3e5..6096389cb0b4 100644 --- a/Documentation/btrfs-dedupe-inband.asciidoc +++ b/Documentation/btrfs-dedupe-inband.asciidoc @@ -86,6 +86,9 @@ And compression has higher priority than in-band de-duplication, means if compression and de-duplication is enabled at the same time, only compression will work. +*status* :: +Show current in-band de-duplication status of a filesystem. + BACKENDS Btrfs in-band de-duplication will support different storage backends, with diff --git a/btrfs-completion b/btrfs-completion index e6ec785bf849..0808f9a14df9 100644 --- a/btrfs-completion +++ b/btrfs-completion @@ -34,7 +34,7 @@ _btrfs() commands_quota='enable disable rescan' commands_qgroup='assign remove create destroy show limit' commands_replace='start status cancel' - commands_dedupe_inband='enable disable' + commands_dedupe_inband='enable disable status' if [[ "$cur" == -* && $cword -le 3 && "$cmd" != "help" ]]; then COMPREPLY=( $( compgen -W '--help' -- "$cur" ) ) diff --git a/cmds-dedupe-ib.c b/cmds-dedupe-ib.c index 91b6fe234043..e778457e25a8 100644 --- a/cmds-dedupe-ib.c +++ b/cmds-dedupe-ib.c @@ -298,12 +298,92 @@ out: return 0; } +static const char * const cmd_dedupe_ib_status_usage[] = { + "btrfs dedupe-inband status ", + "Show current in-band(write time) de-duplication status of a btrfs.", + NULL +}; + +static int cmd_dedupe_ib_status(int argc, char **argv) +{ + struct btrfs_ioctl_dedupe_args dargs; + DIR *dirstream; + char *path; + int fd; + int ret; + int print_limit = 1; + + if (check_argc_exact(argc, 2)) + usage(cmd_dedupe_ib_status_usage); + + path = argv[1]; + fd = open_file_or_dir(path, 
&dirstream); + if (fd < 0) { + error("failed to open file or directory: %s", path); + ret = 1; + goto out; + } + memset(&dargs, 0, sizeof(dargs)); + dargs.cmd = BTRFS_DEDUPE_CTL_STATUS; + + ret = ioctl(fd, BTRFS_IOC_DEDUPE_CTL, &dargs); + if (ret < 0) { + error("failed to get inband deduplication status: %m"); + ret = 1; + goto out; + } + ret = 0; + if (dargs.status == 0) { + printf("Status: \t\t\tDisabled\n"); + goto out; + } + printf("Status:\t\t\tEnabled\n"); + + if (dargs.hash_algo == BTRFS_DEDUPE_HASH_SHA256) + printf("Hash algorithm:\t\tSHA-256\n"); + else + printf("Hash algorithm:\t\tUnrecognized(%x)\n", + dargs.hash_algo); + + if (dargs.backend == BTRFS_DEDUPE_BACKEND_INMEMORY) { + printf("Backend:\t\tIn-memory\n"); + print_limit = 1; + } else { + printf("Backend:\t\tUnrecognized(%x)\n", + dargs.backend); + } + + printf("Dedup Blocksize:\t%llu\n", dargs.blocksize); + + if (print_limit) { + u64 cur_mem; + + /* Limit nr may be 0 */ + if (dargs.limit_nr) + cur_mem = dargs.current_nr * (dargs.limit_mem / + dargs.limit_nr); + else + cur_mem = 0; + + printf("Number of hash: \t[%llu/%llu]\n", dargs.current_nr, + dargs.limit_nr); + printf("Memory usage: \t\t[%s/%s]\n", + pretty_size(cur_mem), + pretty_size(dargs.limit_mem)); + } +out: + close_file_or_dir(fd, dirstream); + return ret; +} + const struct cmd_group dedupe_ib_cmd_group = { dedupe_ib_cmd_group_usage, dedupe_ib_cmd_group_info, { { "enable", cmd_dedupe_ib_enable, cmd_dedupe_ib_enable_usage, NULL, 0}, { "disable", cmd_dedupe_ib_disable, cmd_dedupe_ib_disable_usage, NULL, 0}, + { "status", cmd_dedupe_ib_status, cmd_dedupe_ib_status_usage, + NULL, 0}, NULL_CMD_STRUCT } }; -- 2.19.1
[PATCH v15.1 09/13] btrfs: introduce type based delalloc metadata reserve
From: Wang Xiaoguang Introduce type based metadata reserve parameter for delalloc space reservation/freeing function. The problem we are going to solve is, btrfs use different max extent size for different mount options. For de-duplication, the max extent size can be set by the dedupe ioctl, while for normal write it's 128M. And furthermore, split/merge extent hook highly depends that max extent size. Such situation contributes to quite a lot of false ENOSPC. So this patch introduces the facility to help solve these false ENOSPC related to different max extent size. Currently, only normal 128M extent size is supported. More types will follow soon. Signed-off-by: Wang Xiaoguang Signed-off-by: Qu Wenruo Signed-off-by: Lu Fengqi --- fs/btrfs/ctree.h | 43 ++--- fs/btrfs/extent-tree.c | 48 --- fs/btrfs/file.c | 30 + fs/btrfs/free-space-cache.c | 6 +- fs/btrfs/inode-map.c | 9 ++- fs/btrfs/inode.c | 115 +-- fs/btrfs/ioctl.c | 23 +++ fs/btrfs/ordered-data.c | 6 +- fs/btrfs/ordered-data.h | 3 +- fs/btrfs/relocation.c| 22 --- fs/btrfs/tests/inode-tests.c | 15 +++-- 11 files changed, 223 insertions(+), 97 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 910050d904ef..b119a19cbeaf 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -92,11 +92,24 @@ static const int btrfs_csum_sizes[] = { 4 }; /* * Count how many BTRFS_MAX_EXTENT_SIZE cover the @size */ -static inline u32 count_max_extents(u64 size) +static inline u32 count_max_extents(u64 size, u64 max_extent_size) { - return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); + return div_u64(size + max_extent_size - 1, max_extent_size); } +/* + * Type based metadata reserve type + * This affects how btrfs reserve metadata space for buffered write. 
+ * + * This is caused by the different max extent size for normal COW + * and further in-band dedupe + */ +enum btrfs_metadata_reserve_type { + BTRFS_RESERVE_NORMAL, +}; + +u64 btrfs_max_extent_size(enum btrfs_metadata_reserve_type reserve_type); + struct btrfs_mapping_tree { struct extent_map_tree map_tree; }; @@ -2732,8 +2745,9 @@ int btrfs_check_data_free_space(struct inode *inode, void btrfs_free_reserved_data_space(struct inode *inode, struct extent_changeset *reserved, u64 start, u64 len); void btrfs_delalloc_release_space(struct inode *inode, - struct extent_changeset *reserved, - u64 start, u64 len, bool qgroup_free); + struct extent_changeset *reserved, + u64 start, u64 len, bool qgroup_free, + enum btrfs_metadata_reserve_type reserve_type); void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, u64 len); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); @@ -2743,13 +2757,17 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, - bool qgroup_free); + bool qgroup_free, + enum btrfs_metadata_reserve_type reserve_type); -int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); +int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, + enum btrfs_metadata_reserve_type reserve_type); void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, -bool qgroup_free); + bool qgroup_free, + enum btrfs_metadata_reserve_type reserve_type); int btrfs_delalloc_reserve_space(struct inode *inode, - struct extent_changeset **reserved, u64 start, u64 len); + struct extent_changeset **reserved, u64 start, u64 len, + enum btrfs_metadata_reserve_type reserve_type); void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); struct btrfs_block_rsv 
*btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, unsigned short type); @@ -3152,7 +3170,11 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, unsigned int extra_bits, - struct extent_state **cached_state, int d
[PATCH v15.1 01/13] btrfs: dedupe: Introduce dedupe framework and its header
From: Wang Xiaoguang Introduce the header for btrfs in-band(write time) de-duplication framework and needed header. The new de-duplication framework is going to support 2 different dedupe methods and 1 dedupe hash. Signed-off-by: Qu Wenruo Signed-off-by: Wang Xiaoguang Signed-off-by: Lu Fengqi --- fs/btrfs/ctree.h | 7 ++ fs/btrfs/dedupe.h | 128 - fs/btrfs/disk-io.c | 1 + include/uapi/linux/btrfs.h | 34 ++ 4 files changed, 168 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 80953528572d..910050d904ef 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1118,6 +1118,13 @@ struct btrfs_fs_info { spinlock_t ref_verify_lock; struct rb_root block_tree; #endif + + /* +* Inband de-duplication related structures +*/ + unsigned long dedupe_enabled:1; + struct btrfs_dedupe_info *dedupe_info; + struct mutex dedupe_ioctl_lock; }; static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h index 90281a7a35a8..222ce7b4d827 100644 --- a/fs/btrfs/dedupe.h +++ b/fs/btrfs/dedupe.h @@ -6,7 +6,131 @@ #ifndef BTRFS_DEDUPE_H #define BTRFS_DEDUPE_H -/* later in-band dedupe will expand this struct */ -struct btrfs_dedupe_hash; +#include +/* 32 bytes for SHA256 */ +static const int btrfs_hash_sizes[] = { 32 }; + +/* + * For caller outside of dedupe.c + * + * Different dedupe backends should have their own hash structure + */ +struct btrfs_dedupe_hash { + u64 bytenr; + u32 num_bytes; + + /* last field is a variable length array of dedupe hash */ + u8 hash[]; +}; + +struct btrfs_dedupe_info { + /* dedupe blocksize */ + u64 blocksize; + u16 backend; + u16 hash_algo; + + struct crypto_shash *dedupe_driver; + + /* +* Use mutex to portect both backends +* Even for in-memory backends, the rb-tree can be quite large, +* so mutex is better for such use case. 
+*/ + struct mutex lock; + + /* following members are only used in in-memory backend */ + struct rb_root hash_root; + struct rb_root bytenr_root; + struct list_head lru_list; + u64 limit_nr; + u64 current_nr; +}; + +static inline int btrfs_dedupe_hash_hit(struct btrfs_dedupe_hash *hash) +{ + return (hash && hash->bytenr); +} + +/* + * Initial inband dedupe info + * Called at dedupe enable time. + * + * Return 0 for success + * Return <0 for any error + * (from unsupported param to tree creation error for some backends) + */ +int btrfs_dedupe_enable(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dedupe_args *dargs); + +/* + * Disable dedupe and invalidate all its dedupe data. + * Called at dedupe disable time. + * + * Return 0 for success + * Return <0 for any error + * (tree operation error for some backends) + */ +int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info); + +/* + * Get current dedupe status. + * Return 0 for success + * No possible error yet + */ +void btrfs_dedupe_status(struct btrfs_fs_info *fs_info, +struct btrfs_ioctl_dedupe_args *dargs); + +/* + * Calculate hash for dedupe. + * Caller must ensure [start, start + dedupe_bs) has valid data. + * + * Return 0 for success + * Return <0 for any error + * (error from hash codes) + */ +int btrfs_dedupe_calc_hash(struct btrfs_fs_info *fs_info, + struct inode *inode, u64 start, + struct btrfs_dedupe_hash *hash); + +/* + * Search for duplicated extents by calculated hash + * Caller must call btrfs_dedupe_calc_hash() first to get the hash. + * + * @inode: the inode for we are writing + * @file_pos: offset inside the inode + * As we will increase extent ref immediately after a hash match, + * we need @file_pos and @inode in this case. + * + * Return > 0 for a hash match, and the extent ref will be + * *INCREASED*, and hash->bytenr/num_bytes will record the existing + * extent data. + * Return 0 for a hash miss. 
Nothing is done + * Return <0 for any error + * (tree operation error for some backends) + */ +int btrfs_dedupe_search(struct btrfs_fs_info *fs_info, + struct inode *inode, u64 file_pos, + struct btrfs_dedupe_hash *hash); + +/* + * Add a dedupe hash into dedupe info + * Return 0 for success + * Return <0 for any error + * (tree operation error for some backends) + */ +int btrfs_dedupe_add(struct btrfs_fs_info *fs_info, +struct btrfs_dedupe_hash *hash); + +/* + * Remove a dedupe hash from dedupe info + * Return 0 for success + * Return <0 for any error + * (tree operation error for some backends) + * + * NOTE: if hash deletion error is not handled well, it will lead + * to corrupted fs, as later dedupe write can points to non-exist
[PATCH v15.1 10/13] btrfs: dedupe: Inband in-memory only de-duplication implementation
From: Qu Wenruo Core implement for inband de-duplication. It reuses the async_cow_start() facility to do the calculate dedupe hash. And use dedupe hash to do inband de-duplication at extent level. The workflow is as below: 1) Run delalloc range for an inode 2) Calculate hash for the delalloc range at the unit of dedupe_bs 3) For hash match(duplicated) case, just increase source extent ref and insert file extent. For hash mismatch case, go through the normal cow_file_range() fallback, and add hash into dedupe_tree. Compress for hash miss case is not supported yet. Current implement restore all dedupe hash in memory rb-tree, with LRU behavior to control the limit. Signed-off-by: Wang Xiaoguang Signed-off-by: Qu Wenruo Signed-off-by: Lu Fengqi --- fs/btrfs/ctree.h | 4 +- fs/btrfs/dedupe.h | 15 ++ fs/btrfs/extent-tree.c | 31 +++- fs/btrfs/extent_io.c | 7 +- fs/btrfs/extent_io.h | 1 + fs/btrfs/file.c| 4 + fs/btrfs/inode.c | 319 ++--- fs/btrfs/ioctl.c | 1 + fs/btrfs/relocation.c | 18 +++ 9 files changed, 343 insertions(+), 57 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b119a19cbeaf..3a8e35b5328a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -106,9 +106,11 @@ static inline u32 count_max_extents(u64 size, u64 max_extent_size) */ enum btrfs_metadata_reserve_type { BTRFS_RESERVE_NORMAL, + BTRFS_RESERVE_DEDUPE, }; -u64 btrfs_max_extent_size(enum btrfs_metadata_reserve_type reserve_type); +u64 btrfs_max_extent_size(struct btrfs_inode *inode, + enum btrfs_metadata_reserve_type reserve_type); struct btrfs_mapping_tree { struct extent_map_tree map_tree; diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h index 87f5b7ce7766..8157b17c4d11 100644 --- a/fs/btrfs/dedupe.h +++ b/fs/btrfs/dedupe.h @@ -7,6 +7,7 @@ #define BTRFS_DEDUPE_H #include +#include "btrfs_inode.h" /* 32 bytes for SHA256 */ static const int btrfs_hash_sizes[] = { 32 }; @@ -47,6 +48,20 @@ struct btrfs_dedupe_info { u64 current_nr; }; +static inline u64 btrfs_dedupe_blocksize(struct 
btrfs_inode *inode) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + return fs_info->dedupe_info->blocksize; +} + +static inline int inode_need_dedupe(struct inode *inode) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + + return fs_info->dedupe_enabled; +} + static inline int btrfs_dedupe_hash_hit(struct btrfs_dedupe_hash *hash) { return (hash && hash->bytenr); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2c8992b919ae..fa3654045ba8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -28,6 +28,7 @@ #include "sysfs.h" #include "qgroup.h" #include "ref-verify.h" +#include "dedupe.h" #undef SCRAMBLE_DELAYED_REFS @@ -2492,6 +2493,17 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, btrfs_pin_extent(fs_info, head->bytenr, head->num_bytes, 1); if (head->is_data) { + /* +* If insert_reserved is given, it means +* a new extent is reserved, then deleted +* in one transaction, and inc/dec get merged to 0. +* +* In this case, we need to remove its dedupe +* hash.
+*/ + ret = btrfs_dedupe_del(fs_info, head->bytenr); + if (ret < 0) + return ret; ret = btrfs_del_csums(trans, fs_info, head->bytenr, head->num_bytes); } @@ -5913,13 +5925,15 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, spin_unlock(&block_rsv->lock); } -u64 btrfs_max_extent_size(enum btrfs_metadata_reserve_type reserve_type) +u64 btrfs_max_extent_size(struct btrfs_inode *inode, + enum btrfs_metadata_reserve_type reserve_type) { if (reserve_type == BTRFS_RESERVE_NORMAL) return BTRFS_MAX_EXTENT_SIZE; - - ASSERT(0); - return BTRFS_MAX_EXTENT_SIZE; + else if (reserve_type == BTRFS_RESERVE_DEDUPE) + return btrfs_dedupe_blocksize(inode); + else + return BTRFS_MAX_EXTENT_SIZE; } int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, @@ -5930,7 +5944,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret = 0; bool delalloc_lock = true; - u64 max_extent_size = btrfs_max_extent_si
[PATCH v15.1 02/13] btrfs: dedupe: Introduce function to initialize dedupe info
From: Wang Xiaoguang Add generic function to initialize dedupe info. Signed-off-by: Qu Wenruo Signed-off-by: Wang Xiaoguang Reviewed-by: Josef Bacik Signed-off-by: Lu Fengqi --- fs/btrfs/Makefile | 2 +- fs/btrfs/dedupe.c | 169 + fs/btrfs/dedupe.h | 12 +++ include/uapi/linux/btrfs.h | 3 + 4 files changed, 185 insertions(+), 1 deletion(-) create mode 100644 fs/btrfs/dedupe.c diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index ca693dd554e9..78fdc87dba39 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -10,7 +10,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ - uuid-tree.o props.o free-space-tree.o tree-checker.o + uuid-tree.o props.o free-space-tree.o tree-checker.o dedupe.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c new file mode 100644 index ..06523162753d --- /dev/null +++ b/fs/btrfs/dedupe.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2016 Fujitsu. All rights reserved. 
+ */ + +#include "ctree.h" +#include "dedupe.h" +#include "btrfs_inode.h" +#include "delayed-ref.h" + +struct inmem_hash { + struct rb_node hash_node; + struct rb_node bytenr_node; + struct list_head lru_list; + + u64 bytenr; + u32 num_bytes; + + u8 hash[]; +}; + +static struct btrfs_dedupe_info * +init_dedupe_info(struct btrfs_ioctl_dedupe_args *dargs) +{ + struct btrfs_dedupe_info *dedupe_info; + + dedupe_info = kzalloc(sizeof(*dedupe_info), GFP_NOFS); + if (!dedupe_info) + return ERR_PTR(-ENOMEM); + + dedupe_info->hash_algo = dargs->hash_algo; + dedupe_info->backend = dargs->backend; + dedupe_info->blocksize = dargs->blocksize; + dedupe_info->limit_nr = dargs->limit_nr; + + /* only support SHA256 yet */ + dedupe_info->dedupe_driver = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(dedupe_info->dedupe_driver)) { + kfree(dedupe_info); + return ERR_CAST(dedupe_info->dedupe_driver); + } + + dedupe_info->hash_root = RB_ROOT; + dedupe_info->bytenr_root = RB_ROOT; + dedupe_info->current_nr = 0; + INIT_LIST_HEAD(&dedupe_info->lru_list); + mutex_init(&dedupe_info->lock); + + return dedupe_info; +} + +/* + * Helper to check if parameters are valid. + * The first invalid field will be set to (-1), to info user which parameter + * is invalid. + * Except dargs->limit_nr or dargs->limit_mem, in that case, 0 will returned + * to info user, since user can specify any value to limit, except 0. + */ +static int check_dedupe_parameter(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dedupe_args *dargs) +{ + u64 blocksize = dargs->blocksize; + u64 limit_nr = dargs->limit_nr; + u64 limit_mem = dargs->limit_mem; + u16 hash_algo = dargs->hash_algo; + u8 backend = dargs->backend; + + /* +* Set all reserved fields to -1, allow user to detect +* unsupported optional parameters. 
+*/ + memset(dargs->__unused, -1, sizeof(dargs->__unused)); + if (blocksize > BTRFS_DEDUPE_BLOCKSIZE_MAX || + blocksize < BTRFS_DEDUPE_BLOCKSIZE_MIN || + blocksize < fs_info->sectorsize || + !is_power_of_2(blocksize) || + blocksize < PAGE_SIZE) { + dargs->blocksize = (u64)-1; + return -EINVAL; + } + if (hash_algo >= ARRAY_SIZE(btrfs_hash_sizes)) { + dargs->hash_algo = (u16)-1; + return -EINVAL; + } + if (backend >= BTRFS_DEDUPE_BACKEND_COUNT) { + dargs->backend = (u8)-1; + return -EINVAL; + } + + /* Backend specific check */ + if (backend == BTRFS_DEDUPE_BACKEND_INMEMORY) { + /* only one limit is accepted for enable*/ + if (dargs->limit_nr && dargs->limit_mem) { + dargs->limit_nr = 0; + dargs->limit_mem = 0; + return -EINVAL; + } + + if (!limit_nr && !limit_mem) + dargs->limit_nr = BTRFS_DEDUPE_LIMIT_NR_DEFAULT; + else { + u64 tmp = (u64)-1; + + if (limit_mem) { + tmp = div_u64(limit_mem, +
[PATCH v15.1 11/13] btrfs: dedupe: Add ioctl for inband deduplication
From: Wang Xiaoguang Add ioctl interface for inband deduplication, which includes: 1) enable 2) disable 3) status And a pseudo RO compat flag, to imply that btrfs now supports inband dedup. However we don't add any ondisk format change, it's just a pseudo RO compat flag. All these ioctl interfaces are state-less, which means caller don't need to bother previous dedupe state before calling them, and only need to care the final desired state. For example, if user want to enable dedupe with specified block size and limit, just fill the ioctl structure and call enable ioctl. No need to check if dedupe is already running. These ioctls will handle things like re-configure or disable quite well. Also, for invalid parameters, enable ioctl interface will set the field of the first encountered invalid parameter to (-1) to inform caller. While for limit_nr/limit_mem, the value will be (0). Signed-off-by: Qu Wenruo Signed-off-by: Wang Xiaoguang Signed-off-by: Lu Fengqi --- fs/btrfs/dedupe.c | 50 ++ fs/btrfs/dedupe.h | 17 +--- fs/btrfs/disk-io.c | 3 ++ fs/btrfs/ioctl.c | 85 ++ fs/btrfs/sysfs.c | 2 + include/uapi/linux/btrfs.h | 12 +- 6 files changed, 163 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c index 6199215022e6..76a967cca68e 100644 --- a/fs/btrfs/dedupe.c +++ b/fs/btrfs/dedupe.c @@ -29,6 +29,35 @@ static inline struct inmem_hash *inmem_alloc_hash(u16 algo) GFP_NOFS); } +void btrfs_dedupe_status(struct btrfs_fs_info *fs_info, +struct btrfs_ioctl_dedupe_args *dargs) +{ + struct btrfs_dedupe_info *dedupe_info = fs_info->dedupe_info; + + if (!fs_info->dedupe_enabled || !dedupe_info) { + dargs->status = 0; + dargs->blocksize = 0; + dargs->backend = 0; + dargs->hash_algo = 0; + dargs->limit_nr = 0; + dargs->current_nr = 0; + memset(dargs->__unused, -1, sizeof(dargs->__unused)); + return; + } + mutex_lock(&dedupe_info->lock); + dargs->status = 1; + dargs->blocksize = dedupe_info->blocksize; + dargs->backend = dedupe_info->backend; + 
dargs->hash_algo = dedupe_info->hash_algo; + dargs->limit_nr = dedupe_info->limit_nr; + dargs->limit_mem = dedupe_info->limit_nr * + (sizeof(struct inmem_hash) + +btrfs_hash_sizes[dedupe_info->hash_algo]); + dargs->current_nr = dedupe_info->current_nr; + mutex_unlock(&dedupe_info->lock); + memset(dargs->__unused, -1, sizeof(dargs->__unused)); +} + static struct btrfs_dedupe_info * init_dedupe_info(struct btrfs_ioctl_dedupe_args *dargs) { @@ -402,6 +431,27 @@ static void unblock_all_writers(struct btrfs_fs_info *fs_info) percpu_up_write(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1); } +int btrfs_dedupe_cleanup(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dedupe_info *dedupe_info; + + fs_info->dedupe_enabled = 0; + /* same as disable */ + smp_wmb(); + dedupe_info = fs_info->dedupe_info; + fs_info->dedupe_info = NULL; + + if (!dedupe_info) + return 0; + + if (dedupe_info->backend == BTRFS_DEDUPE_BACKEND_INMEMORY) + inmem_destroy(dedupe_info); + + crypto_free_shash(dedupe_info->dedupe_driver); + kfree(dedupe_info); + return 0; +} + int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info) { struct btrfs_dedupe_info *dedupe_info; diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h index 8157b17c4d11..fdd00355d6b5 100644 --- a/fs/btrfs/dedupe.h +++ b/fs/btrfs/dedupe.h @@ -90,6 +90,15 @@ static inline struct btrfs_dedupe_hash *btrfs_dedupe_alloc_hash(u16 algo) int btrfs_dedupe_enable(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dedupe_args *dargs); + +/* + * Get inband dedupe info + * Since it needs to access different backends' hash size, which + * is not exported, we need such simple function. + */ +void btrfs_dedupe_status(struct btrfs_fs_info *fs_info, +struct btrfs_ioctl_dedupe_args *dargs); + /* * Disable dedupe and invalidate all its dedupe data. * Called at dedupe disable time. @@ -101,12 +110,10 @@ int btrfs_dedupe_enable(struct btrfs_fs_info *fs_info, int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info); /* - * Get current dedupe status. 
- * Return 0 for success - * No possible error yet + * Cleanup current btrfs_dedupe_info + * Called in umount time */ -void btrfs_dedupe_status(struct btrfs_fs_info *fs_info, -struct btrfs_ioctl_dedupe_args *dargs); +int btrfs_dedupe_cleanup(struct btrfs_fs_info *fs_info); /* * Calc
[PATCH v15.1 06/13] btrfs: dedupe: Introduce function to search for an existing hash
From: Wang Xiaoguang Introduce static function inmem_search() to handle the job for in-memory hash tree. The trick is, we must ensure the delayed ref head is not being run at the time we search for the hash. With inmem_search(), we can implement the btrfs_dedupe_search() interface. Signed-off-by: Qu Wenruo Signed-off-by: Wang Xiaoguang Reviewed-by: Josef Bacik Signed-off-by: Lu Fengqi --- fs/btrfs/dedupe.c | 210 +- 1 file changed, 209 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c index 951fefd19fde..03ad41423c01 100644 --- a/fs/btrfs/dedupe.c +++ b/fs/btrfs/dedupe.c @@ -7,6 +7,8 @@ #include "dedupe.h" #include "btrfs_inode.h" #include "delayed-ref.h" +#include "qgroup.h" +#include "transaction.h" struct inmem_hash { struct rb_node hash_node; @@ -242,7 +244,6 @@ static int inmem_add(struct btrfs_dedupe_info *dedupe_info, struct inmem_hash *ihash; ihash = inmem_alloc_hash(algo); - if (!ihash) return -ENOMEM; @@ -436,3 +437,210 @@ int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info) kfree(dedupe_info); return 0; } + +/* + * Caller must ensure the corresponding ref head is not being run.
+ */ +static struct inmem_hash * +inmem_search_hash(struct btrfs_dedupe_info *dedupe_info, u8 *hash) +{ + struct rb_node **p = &dedupe_info->hash_root.rb_node; + struct rb_node *parent = NULL; + struct inmem_hash *entry = NULL; + u16 hash_algo = dedupe_info->hash_algo; + int hash_len = btrfs_hash_sizes[hash_algo]; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct inmem_hash, hash_node); + + if (memcmp(hash, entry->hash, hash_len) < 0) { + p = &(*p)->rb_left; + } else if (memcmp(hash, entry->hash, hash_len) > 0) { + p = &(*p)->rb_right; + } else { + /* Found, need to re-add it to LRU list head */ + list_del(&entry->lru_list); + list_add(&entry->lru_list, &dedupe_info->lru_list); + return entry; + } + } + return NULL; +} + +static int inmem_search(struct btrfs_dedupe_info *dedupe_info, + struct inode *inode, u64 file_pos, + struct btrfs_dedupe_hash *hash) +{ + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_head *insert_head; + struct btrfs_delayed_data_ref *insert_dref; + struct btrfs_qgroup_extent_record *insert_qrecord = NULL; + struct inmem_hash *found_hash; + int free_insert = 1; + int qrecord_inserted = 0; + u64 ref_root = root->root_key.objectid; + u64 bytenr; + u32 num_bytes; + + insert_head = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); + if (!insert_head) + return -ENOMEM; + insert_head->extent_op = NULL; + + insert_dref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); + if (!insert_dref) { + kmem_cache_free(btrfs_delayed_ref_head_cachep, insert_head); + return -ENOMEM; + } + if (test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) && + is_fstree(ref_root)) { + insert_qrecord = kmalloc(sizeof(*insert_qrecord), GFP_NOFS); + if (!insert_qrecord) { + kmem_cache_free(btrfs_delayed_ref_head_cachep, + insert_head); + 
kmem_cache_free(btrfs_delayed_data_ref_cachep, + insert_dref); + return -ENOMEM; + } + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto free_mem; + } + +again: + mutex_lock(&dedupe_info->lock); + found_hash = inmem_search_hash(dedupe_info, hash->hash); + /* If we don't find a duplicated extent, just return. */ + if (!found_hash) { + ret = 0; + goto out; + } + bytenr = found_hash->bytenr; + num_bytes = found_hash->num_bytes; + + btrfs_init_delayed_ref_head(insert_head, insert_qrecord, bytenr, + num_bytes, ref_root, 0, BTRFS_ADD_DELAYED_REF, true, + false); + + btrfs_init_delayed_ref_common(trans->fs_info, &insert_dref->node, + bytenr, num_bytes, ref_root, BTRFS_ADD_DELAYED_REF,
[PATCH v15.1 00/13] Btrfs In-band De-duplication
This patchset can be fetched from github: https://github.com/littleroad/linux.git dedupe_latest Now the new base is v4.20-rc1. Normal test cases from auto group exposes no regression, and ib-dedupe group can pass without problem. xfstests ib-dedupe group can be fetched from github: https://github.com/littleroad/xfstests-dev.git btrfs_dedupe_latest Changelog: v2: Totally reworked to handle multiple backends v3: Fix a stupid but deadly on-disk backend bug Add handle for multiple hash on same bytenr corner case to fix abort trans error Increase dedup rate by enhancing delayed ref handler for both backend. Move dedup_add() to run_delayed_ref() time, to fix abort trans error. Increase dedup block size up limit to 8M. v4: Add dedup prop for disabling dedup for given files/dirs. Merge inmem_search() and ondisk_search() into generic_search() to save some code Fix another delayed_ref related bug. Use the same mutex for both inmem and ondisk backend. Move dedup_add() back to btrfs_finish_ordered_io() to increase dedup rate. v5: Reuse compress routine for much simpler dedup function. Slightly improved performance due to above modification. Fix race between dedup enable/disable Fix for false ENOSPC report v6: Further enable/disable race window fix. Minor format change according to checkpatch. v7: Fix one concurrency bug with balance. Slightly modify return value from -EINVAL to -EOPNOTSUPP for btrfs_dedup_ioctl() to allow progs to distinguish unsupported commands and wrong parameter. Rebased to integration-4.6. v8: Rename 'dedup' to 'dedupe'. Add support to allow dedupe and compression work at the same time. Fix several balance related bugs. Special thanks to Satoru Takeuchi, who exposed most of them. Small dedupe hit case performance improvement. v9: Re-order the patchset to completely separate pure in-memory and any on-disk format change. Fold bug fixes into its original patch. v10: Adding back missing bug fix patch. Reduce on-disk item size. 
Hide dedupe ioctl under CONFIG_BTRFS_DEBUG. v11: Remove other backend and props support to focus on the framework and in-memory backend. Suggested by David. Better disable and buffered write race protection. Comprehensive fix to dedupe metadata ENOSPC problem. v12: Stateful 'enable' ioctl and new 'reconf' ioctl New FORCE flag for enable ioctl to allow stateless ioctl Precise error report and extendable ioctl structure. v12.1 Rebase to David's for-next-20160704 branch Add co-ordinate patch for subpage and dedupe patchset. v12.2 Rebase to David's for-next-20160715 branch Add co-ordinate patch for other patchset. v13 Rebase to David's for-next-20160906 branch Fix a reserved space leak bug, which only frees quota reserved space but not space_info->byte_may_use. v13.1 Rebase to Chris' for-linux-4.9 branch v14 Use generic ENOSPC fix for both compression and dedupe. v14.1 Further split ENOSPC fix. v14.2 Rebase to v4.11-rc2. Co-operate with count_max_extent() to calculate num_extents. No longer rely on qgroup fixes. v14.3 Rebase to v4.12-rc1. v14.4 Rebase to kdave/for-4.13-part1. v14.5 Rebase to v4.15-rc3. v14.6 Rebase to v4.17-rc5. v14.7 Replace SHASH_DESC_ON_STACK with kmalloc to remove VLA. Fixed the following errors by switching to div_u64. ├── arm-allmodconfig │ └── ERROR:__aeabi_uldivmod-fs-btrfs-btrfs.ko-undefined └── i386-allmodconfig └── ERROR:__udivdi3-fs-btrfs-btrfs.ko-undefined v14.8 Rebase to v4.18-rc4. v15 Rebase to v4.19-rc2. Drop "btrfs: Introduce COMPRESS reserve type to fix false enospc for compression". Remove the ifdef around btrfs inband dedupe ioctl. v15.1 Rebase to v4.20-rc1. 
Qu Wenruo (4): btrfs: delayed-ref: Add support for increasing data ref under spinlock btrfs: dedupe: Inband in-memory only de-duplication implement btrfs: relocation: Enhance error handling to avoid BUG_ON btrfs: dedupe: Introduce new reconfigure ioctl Wang Xiaoguang (9): btrfs: dedupe: Introduce dedupe framework and its header btrfs: dedupe: Introduce function to initialize dedupe info btrfs: dedupe: Introduce function to add hash into in-memory tree btrfs: dedupe: Introduce function to remove hash from in-memory tree btrfs: dedupe: Introduce function to search for an existing hash btrfs: dedupe: Implement btrfs_dedupe_calc_hash interface btrfs: ordered-extent: Add support for dedupe btrfs: introduce type based delalloc metadata reserve btrfs: dedupe: Add ioctl for inband deduplication fs/btrfs/Makefile| 2 +- fs/btrfs/ctree.h | 52 ++- fs/btrfs/dedupe.c| 828 +++ fs/btrfs/dedupe.h| 175 +++- fs/btrfs/delayed-ref.c | 53 ++- fs/btrfs/delayed-ref.h | 15 + fs/btrfs/disk-io.c | 4 + fs/btrfs/extent-tree.c | 67 ++- fs/btrfs/extent_io.c | 7 +- fs/btrfs/extent_io.h | 1 + fs/b
[PATCH v15.1 03/13] btrfs: dedupe: Introduce function to add hash into in-memory tree
From: Wang Xiaoguang Introduce static function inmem_add() to add hash into in-memory tree. And now we can implement the btrfs_dedupe_add() interface. Signed-off-by: Qu Wenruo Signed-off-by: Wang Xiaoguang Reviewed-by: Josef Bacik Signed-off-by: Lu Fengqi --- fs/btrfs/dedupe.c | 150 ++ 1 file changed, 150 insertions(+) diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c index 06523162753d..784bb3a8a5ab 100644 --- a/fs/btrfs/dedupe.c +++ b/fs/btrfs/dedupe.c @@ -19,6 +19,14 @@ struct inmem_hash { u8 hash[]; }; +static inline struct inmem_hash *inmem_alloc_hash(u16 algo) +{ + if (WARN_ON(algo >= ARRAY_SIZE(btrfs_hash_sizes))) + return NULL; + return kzalloc(sizeof(struct inmem_hash) + btrfs_hash_sizes[algo], + GFP_NOFS); +} + static struct btrfs_dedupe_info * init_dedupe_info(struct btrfs_ioctl_dedupe_args *dargs) { @@ -167,3 +175,145 @@ int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info) /* Place holder for bisect, will be implemented in later patches */ return 0; } + +static int inmem_insert_hash(struct rb_root *root, +struct inmem_hash *hash, int hash_len) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct inmem_hash *entry = NULL; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct inmem_hash, hash_node); + if (memcmp(hash->hash, entry->hash, hash_len) < 0) + p = &(*p)->rb_left; + else if (memcmp(hash->hash, entry->hash, hash_len) > 0) + p = &(*p)->rb_right; + else + return 1; + } + rb_link_node(&hash->hash_node, parent, p); + rb_insert_color(&hash->hash_node, root); + return 0; +} + +static int inmem_insert_bytenr(struct rb_root *root, + struct inmem_hash *hash) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct inmem_hash *entry = NULL; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct inmem_hash, bytenr_node); + if (hash->bytenr < entry->bytenr) + p = &(*p)->rb_left; + else if (hash->bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return 1; + } + 
rb_link_node(&hash->bytenr_node, parent, p); + rb_insert_color(&hash->bytenr_node, root); + return 0; +} + +static void __inmem_del(struct btrfs_dedupe_info *dedupe_info, + struct inmem_hash *hash) +{ + list_del(&hash->lru_list); + rb_erase(&hash->hash_node, &dedupe_info->hash_root); + rb_erase(&hash->bytenr_node, &dedupe_info->bytenr_root); + + if (!WARN_ON(dedupe_info->current_nr == 0)) + dedupe_info->current_nr--; + + kfree(hash); +} + +/* + * Insert a hash into in-memory dedupe tree + * Will remove exceeding last recent use hash. + * + * If the hash mathced with existing one, we won't insert it, to + * save memory + */ +static int inmem_add(struct btrfs_dedupe_info *dedupe_info, +struct btrfs_dedupe_hash *hash) +{ + int ret = 0; + u16 algo = dedupe_info->hash_algo; + struct inmem_hash *ihash; + + ihash = inmem_alloc_hash(algo); + + if (!ihash) + return -ENOMEM; + + /* Copy the data out */ + ihash->bytenr = hash->bytenr; + ihash->num_bytes = hash->num_bytes; + memcpy(ihash->hash, hash->hash, btrfs_hash_sizes[algo]); + + mutex_lock(&dedupe_info->lock); + + ret = inmem_insert_bytenr(&dedupe_info->bytenr_root, ihash); + if (ret > 0) { + kfree(ihash); + ret = 0; + goto out; + } + + ret = inmem_insert_hash(&dedupe_info->hash_root, ihash, + btrfs_hash_sizes[algo]); + if (ret > 0) { + /* +* We only keep one hash in tree to save memory, so if +* hash conflicts, free the one to insert. +*/ + rb_erase(&ihash->bytenr_node, &dedupe_info->bytenr_root); + kfree(ihash); + ret = 0; + goto out; + } + + list_add(&ihash->lru_list, &dedupe_info->lru_list); + dedupe_info->current_nr++; + + /* Remove the last dedupe hash if we exceed limit */ + while (dedupe_info->current_nr > dedupe_info->limit_nr) { + struct inmem_hash *last; + + last = list_entry(dedupe
[PATCH v15.1 04/13] btrfs: dedupe: Introduce function to remove hash from in-memory tree
From: Wang Xiaoguang Introduce static function inmem_del() to remove hash from in-memory dedupe tree. And implement btrfs_dedupe_del() and btrfs_dedup_disable() interfaces. Also for btrfs_dedupe_disable(), add new functions to wait existing writer and block incoming writers to eliminate all possible race. Cc: Mark Fasheh Signed-off-by: Qu Wenruo Signed-off-by: Wang Xiaoguang Signed-off-by: Lu Fengqi --- fs/btrfs/dedupe.c | 131 +++--- 1 file changed, 125 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c index 784bb3a8a5ab..951fefd19fde 100644 --- a/fs/btrfs/dedupe.c +++ b/fs/btrfs/dedupe.c @@ -170,12 +170,6 @@ int btrfs_dedupe_enable(struct btrfs_fs_info *fs_info, return ret; } -int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info) -{ - /* Place holder for bisect, will be implemented in later patches */ - return 0; -} - static int inmem_insert_hash(struct rb_root *root, struct inmem_hash *hash, int hash_len) { @@ -317,3 +311,128 @@ int btrfs_dedupe_add(struct btrfs_fs_info *fs_info, return inmem_add(dedupe_info, hash); return -EINVAL; } + +static struct inmem_hash * +inmem_search_bytenr(struct btrfs_dedupe_info *dedupe_info, u64 bytenr) +{ + struct rb_node **p = &dedupe_info->bytenr_root.rb_node; + struct rb_node *parent = NULL; + struct inmem_hash *entry = NULL; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct inmem_hash, bytenr_node); + + if (bytenr < entry->bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return entry; + } + + return NULL; +} + +/* Delete a hash from in-memory dedupe tree */ +static int inmem_del(struct btrfs_dedupe_info *dedupe_info, u64 bytenr) +{ + struct inmem_hash *hash; + + mutex_lock(&dedupe_info->lock); + hash = inmem_search_bytenr(dedupe_info, bytenr); + if (!hash) { + mutex_unlock(&dedupe_info->lock); + return 0; + } + + __inmem_del(dedupe_info, hash); + mutex_unlock(&dedupe_info->lock); + return 0; +} + +/* Remove a dedupe hash from dedupe 
tree */ +int btrfs_dedupe_del(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_dedupe_info *dedupe_info = fs_info->dedupe_info; + + if (!fs_info->dedupe_enabled) + return 0; + + if (WARN_ON(dedupe_info == NULL)) + return -EINVAL; + + if (dedupe_info->backend == BTRFS_DEDUPE_BACKEND_INMEMORY) + return inmem_del(dedupe_info, bytenr); + return -EINVAL; +} + +static void inmem_destroy(struct btrfs_dedupe_info *dedupe_info) +{ + struct inmem_hash *entry, *tmp; + + mutex_lock(&dedupe_info->lock); + list_for_each_entry_safe(entry, tmp, &dedupe_info->lru_list, lru_list) + __inmem_del(dedupe_info, entry); + mutex_unlock(&dedupe_info->lock); +} + +/* + * Helper function to wait and block all incoming writers + * + * Use rw_sem introduced for freeze to wait/block writers. + * So during the block time, no new write will happen, so we can + * do something quite safe, especially helpful for dedupe disable, + * as it affects buffered writes. + */ +static void block_all_writers(struct btrfs_fs_info *fs_info) +{ + struct super_block *sb = fs_info->sb; + + percpu_down_write(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1); + down_write(&sb->s_umount); +} + +static void unblock_all_writers(struct btrfs_fs_info *fs_info) +{ + struct super_block *sb = fs_info->sb; + + up_write(&sb->s_umount); + percpu_up_write(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1); +} + +int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dedupe_info *dedupe_info; + int ret; + + dedupe_info = fs_info->dedupe_info; + + if (!dedupe_info) + return 0; + + /* Don't allow disable status change in RO mount */ + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + /* +* Wait for all unfinished writers and block further writers. +* Then sync the whole fs so all current write will go through +* dedupe, and all later write won't go through dedupe. 
+*/ + block_all_writers(fs_info); + ret = sync_filesystem(fs_info->sb); + fs_info->dedupe_enabled = 0; + fs_info->dedupe_info = NULL; + unblock_all_writers(fs_info); + if (ret < 0) + return ret; + + /* now we are OK to clean up everything */ + if (dedupe_info->backend == BTRFS_DEDUPE_BACKEND_INMEMOR
[PATCH v15.1 13/13] btrfs: dedupe: Introduce new reconfigure ioctl
From: Qu Wenruo Introduce new reconfigure ioctl and new FORCE flag for in-band dedupe ioctls. Now dedupe enable and reconfigure ioctl are stateful. | Current state | Ioctl| Next state | | Disabled | enable| Enabled | | Enabled | enable| Not allowed | | Enabled | reconf| Enabled | | Enabled | disable | Disabled| | Disabled | disable | Disabled| | Disabled | reconf| Not allowed | (While disable is always stateless) While for those who prefer stateless ioctls (myself for example), new FORCE flag is introduced. In FORCE mode, enable/disable is completely stateless. | Current state | Ioctl| Next state | | Disabled | enable| Enabled | | Enabled | enable| Enabled | | Enabled | disable | Disabled| | Disabled | disable | Disabled| Also, re-configure ioctl will only modify specified fields. Unlike enable, un-specified fields will be filled with default value. For example: # btrfs dedupe enable --block-size 64k /mnt # btrfs dedupe reconfigure --limit-hash 1m /mnt Will lead to: dedupe blocksize: 64K dedupe hash limit nr: 1m While for enable: # btrfs dedupe enable --force --block-size 64k /mnt # btrfs dedupe enable --force --limit-hash 1m /mnt Will reset blocksize to default value: dedupe blocksize: 128K << reset dedupe hash limit nr: 1m Suggested-by: David Sterba Signed-off-by: Qu Wenruo Signed-off-by: Lu Fengqi --- fs/btrfs/dedupe.c | 132 ++--- fs/btrfs/dedupe.h | 13 fs/btrfs/ioctl.c | 13 include/uapi/linux/btrfs.h | 11 +++- 4 files changed, 143 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c index 76a967cca68e..92152134d3c0 100644 --- a/fs/btrfs/dedupe.c +++ b/fs/btrfs/dedupe.c @@ -29,6 +29,40 @@ static inline struct inmem_hash *inmem_alloc_hash(u16 algo) GFP_NOFS); } +/* + * Copy from current dedupe info to fill dargs. + * For reconf case, only fill members which are uninitialized. 
+ */ +static void get_dedupe_status(struct btrfs_dedupe_info *dedupe_info, + struct btrfs_ioctl_dedupe_args *dargs) +{ + int reconf = (dargs->cmd == BTRFS_DEDUPE_CTL_RECONF); + + dargs->status = 1; + + if (!reconf || (reconf && dargs->blocksize == (u64)-1)) + dargs->blocksize = dedupe_info->blocksize; + if (!reconf || (reconf && dargs->backend == (u16)-1)) + dargs->backend = dedupe_info->backend; + if (!reconf || (reconf && dargs->hash_algo == (u16)-1)) + dargs->hash_algo = dedupe_info->hash_algo; + + /* +* For re-configure case, if not modifying limit, +* therir limit will be set to 0, unlike other fields +*/ + if (!reconf || !(dargs->limit_nr || dargs->limit_mem)) { + dargs->limit_nr = dedupe_info->limit_nr; + dargs->limit_mem = dedupe_info->limit_nr * + (sizeof(struct inmem_hash) + +btrfs_hash_sizes[dedupe_info->hash_algo]); + } + + /* current_nr doesn't makes sense for reconfig case */ + if (!reconf) + dargs->current_nr = dedupe_info->current_nr; +} + void btrfs_dedupe_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dedupe_args *dargs) { @@ -45,15 +79,7 @@ void btrfs_dedupe_status(struct btrfs_fs_info *fs_info, return; } mutex_lock(&dedupe_info->lock); - dargs->status = 1; - dargs->blocksize = dedupe_info->blocksize; - dargs->backend = dedupe_info->backend; - dargs->hash_algo = dedupe_info->hash_algo; - dargs->limit_nr = dedupe_info->limit_nr; - dargs->limit_mem = dedupe_info->limit_nr * - (sizeof(struct inmem_hash) + -btrfs_hash_sizes[dedupe_info->hash_algo]); - dargs->current_nr = dedupe_info->current_nr; + get_dedupe_status(dedupe_info, dargs); mutex_unlock(&dedupe_info->lock); memset(dargs->__unused, -1, sizeof(dargs->__unused)); } @@ -98,17 +124,50 @@ init_dedupe_info(struct btrfs_ioctl_dedupe_args *dargs) static int check_dedupe_parameter(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dedupe_args *dargs) { - u64 blocksize = dargs->blocksize; - u64 limit_nr = dargs->limit_nr; - u64 limit_mem = dargs->limit_mem; - u16 hash_algo = darg
[PATCH v15.1 12/13] btrfs: relocation: Enhance error handling to avoid BUG_ON
From: Qu Wenruo Since the introduction of btrfs dedupe tree, it's possible that balance can race with dedupe disabling. When this happens, dedupe_enabled will make btrfs_get_fs_root() return PTR_ERR(-ENOENT). But due to a bug in error handling branch, when this happens backref_cache->nr_nodes is increased but the node is neither added to backref_cache or nr_nodes decreased. Causing BUG_ON() in backref_cache_cleanup() [ 2611.668810] [ cut here ] [ 2611.669946] kernel BUG at /home/sat/ktest/linux/fs/btrfs/relocation.c:243! [ 2611.670572] invalid opcode: [#1] SMP [ 2611.686797] Call Trace: [ 2611.687034] [] btrfs_relocate_block_group+0x1b3/0x290 [btrfs] [ 2611.687706] [] btrfs_relocate_chunk.isra.40+0x47/0xd0 [btrfs] [ 2611.688385] [] btrfs_balance+0xb22/0x11e0 [btrfs] [ 2611.688966] [] btrfs_ioctl_balance+0x391/0x3a0 [btrfs] [ 2611.689587] [] btrfs_ioctl+0x1650/0x2290 [btrfs] [ 2611.690145] [] ? lru_cache_add+0x3a/0x80 [ 2611.690647] [] ? lru_cache_add_active_or_unevictable+0x4c/0xc0 [ 2611.691310] [] ? handle_mm_fault+0xcd4/0x17f0 [ 2611.691842] [] ? cp_new_stat+0x153/0x180 [ 2611.692342] [] ? __vma_link_rb+0xfd/0x110 [ 2611.692842] [] ? vma_link+0xb9/0xc0 [ 2611.693303] [] do_vfs_ioctl+0xa1/0x5a0 [ 2611.693781] [] ? __do_page_fault+0x1b4/0x400 [ 2611.694310] [] SyS_ioctl+0x41/0x70 [ 2611.694758] [] entry_SYSCALL_64_fastpath+0x12/0x71 [ 2611.695331] Code: ff 48 8b 45 bf 49 83 af a8 05 00 00 01 49 89 87 a0 05 00 00 e9 2e fd ff ff b8 f4 ff ff ff e9 e4 fb ff ff 0f 0b 0f 0b 0f 0b 0f 0b <0f> 0b 0f 0b 41 89 c6 e9 b8 fb ff ff e8 9e a6 e8 e0 4c 89 e7 44 [ 2611.697870] RIP [] relocate_block_group+0x741/0x7a0 [btrfs] [ 2611.698818] RSP This patch will call remove_backref_node() in error handling branch, and cache the returned -ENOENT in relocate_tree_block() and continue balancing. 
Reported-by: Satoru Takeuchi Signed-off-by: Qu Wenruo Signed-off-by: Lu Fengqi --- fs/btrfs/relocation.c | 19 --- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b7c304c6e741..ee96390d1e42 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -854,6 +854,13 @@ struct backref_node *build_backref_tree(struct reloc_control *rc, root = read_fs_root(rc->extent_root->fs_info, key.offset); if (IS_ERR(root)) { err = PTR_ERR(root); + /* +* Don't forget to cleanup current node. +* As it may not be added to backref_cache but nr_node +* increased. +* This will cause BUG_ON() in backref_cache_cleanup(). +*/ + remove_backref_node(&rc->backref_cache, cur); goto out; } @@ -3021,8 +3028,15 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, node = build_backref_tree(rc, &block->key, block->level, block->bytenr); if (IS_ERR(node)) { + /* +* The root(dedupe tree yet) of the tree block is +* going to be freed and can't be reached. +* Just skip it and continue balancing. +*/ + if (PTR_ERR(node) == -ENOENT) + continue; err = PTR_ERR(node); - goto out; + break; } ret = relocate_tree_block(trans, rc, node, &block->key, @@ -3030,10 +3044,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, if (ret < 0) { if (ret != -EAGAIN || &block->rb_node == rb_first(blocks)) err = ret; - goto out; + break; } } -out: err = finish_pending_nodes(trans, rc, path, err); out_free_path: -- 2.19.1
[PATCH v15.1 05/13] btrfs: delayed-ref: Add support for increasing data ref under spinlock
From: Qu Wenruo For in-band dedupe, btrfs needs to increase data ref with delayed_ref locked, so add a new function btrfs_add_delayed_data_ref_lock() to increase extent ref with delayed_refs already locked. Export init_delayed_ref_head and init_delayed_ref_common for inband dedupe. Signed-off-by: Qu Wenruo Reviewed-by: Josef Bacik Signed-off-by: Lu Fengqi --- fs/btrfs/delayed-ref.c | 53 +- fs/btrfs/delayed-ref.h | 15 2 files changed, 52 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 9301b3ad9217..ae8968f10ce0 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -533,7 +533,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, spin_unlock(&existing->lock); } -static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, +void btrfs_init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, struct btrfs_qgroup_extent_record *qrecord, u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved, int action, bool is_data, @@ -661,7 +661,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, } /* - * init_delayed_ref_common - Initialize the structure which represents a + * btrfs_init_delayed_ref_common - Initialize the structure which represents a * modification to a an extent. * * @fs_info:Internal to the mounted filesystem mount structure. 
@@ -685,7 +685,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, * when recording a metadata extent or BTRFS_SHARED_DATA_REF_KEY/ * BTRFS_EXTENT_DATA_REF_KEY when recording data extent */ -static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, +void btrfs_init_delayed_ref_common(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 ref_root, int action, u8 ref_type) @@ -758,14 +758,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, else ref_type = BTRFS_TREE_BLOCK_REF_KEY; - init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, - ref_root, action, ref_type); + btrfs_init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, + ref_root, action, ref_type); ref->root = ref_root; ref->parent = parent; ref->level = level; - init_delayed_ref_head(head_ref, record, bytenr, num_bytes, - ref_root, 0, action, false, is_system); + btrfs_init_delayed_ref_head(head_ref, record, bytenr, num_bytes, + ref_root, 0, action, false, is_system); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; @@ -794,6 +794,29 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, return 0; } +/* + * Do real delayed data ref insert. + * Caller must hold delayed_refs->lock and allocation memory + * for dref,head_ref and record. + */ +int btrfs_add_delayed_data_ref_locked(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head_ref, + struct btrfs_qgroup_extent_record *qrecord, + struct btrfs_delayed_data_ref *ref, int action, + int *qrecord_inserted_ret, int *old_ref_mod, + int *new_ref_mod) +{ + struct btrfs_delayed_ref_root *delayed_refs; + + head_ref = add_delayed_ref_head(trans, head_ref, qrecord, + action, qrecord_inserted_ret, + old_ref_mod, new_ref_mod); + + delayed_refs = &trans->transaction->delayed_refs; + + return insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); +} + /* * add a delayed data ref. 
it's similar to btrfs_add_delayed_tree_ref. */ @@ -820,7 +843,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, ref_type = BTRFS_SHARED_DATA_REF_KEY; else ref_type = BTRFS_EXTENT_DATA_REF_KEY; - init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, + btrfs_init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, ref_root, action, ref_type); ref->root = ref_root; ref->parent = parent; @@ -845,8 +868,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, } } - init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root, - reserved, actio
[PATCH v15.1 08/13] btrfs: ordered-extent: Add support for dedupe
From: Wang Xiaoguang Add ordered-extent support for dedupe. Note, current ordered-extent support only supports non-compressed source extent. Support for compressed source extent will be added later. Signed-off-by: Qu Wenruo Signed-off-by: Wang Xiaoguang Reviewed-by: Josef Bacik --- fs/btrfs/ordered-data.c | 46 + fs/btrfs/ordered-data.h | 13 2 files changed, 55 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 0c4ef208b8b9..4b112258a79b 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -12,6 +12,7 @@ #include "extent_io.h" #include "disk-io.h" #include "compression.h" +#include "dedupe.h" static struct kmem_cache *btrfs_ordered_extent_cache; @@ -170,7 +171,8 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, */ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, - int type, int dio, int compress_type) + int type, int dio, int compress_type, + struct btrfs_dedupe_hash *hash) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; @@ -191,6 +193,33 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->inode = igrab(inode); entry->compress_type = compress_type; entry->truncated_len = (u64)-1; + entry->hash = NULL; + /* +* A hash hit means we have already incremented the extents delayed +* ref. +* We must handle this even if another process is trying to +* turn off dedupe, otherwise we will leak a reference. 
+*/ + if (hash && (hash->bytenr || root->fs_info->dedupe_enabled)) { + struct btrfs_dedupe_info *dedupe_info; + + dedupe_info = root->fs_info->dedupe_info; + if (WARN_ON(dedupe_info == NULL)) { + kmem_cache_free(btrfs_ordered_extent_cache, + entry); + return -EINVAL; + } + entry->hash = btrfs_dedupe_alloc_hash(dedupe_info->hash_algo); + if (!entry->hash) { + kmem_cache_free(btrfs_ordered_extent_cache, entry); + return -ENOMEM; + } + entry->hash->bytenr = hash->bytenr; + entry->hash->num_bytes = hash->num_bytes; + memcpy(entry->hash->hash, hash->hash, + btrfs_hash_sizes[dedupe_info->hash_algo]); + } + if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) set_bit(type, &entry->flags); @@ -245,15 +274,23 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, { return __btrfs_add_ordered_extent(inode, file_offset, start, len, disk_len, type, 0, - BTRFS_COMPRESS_NONE); + BTRFS_COMPRESS_NONE, NULL); } +int btrfs_add_ordered_extent_dedupe(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type, + struct btrfs_dedupe_hash *hash) +{ + return __btrfs_add_ordered_extent(inode, file_offset, start, len, + disk_len, type, 0, + BTRFS_COMPRESS_NONE, hash); +} int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type) { return __btrfs_add_ordered_extent(inode, file_offset, start, len, disk_len, type, 1, - BTRFS_COMPRESS_NONE); + BTRFS_COMPRESS_NONE, NULL); } int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, @@ -262,7 +299,7 @@ int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, { return __btrfs_add_ordered_extent(inode, file_offset, start, len, disk_len, type, 0, - compress_type); + compress_type, NULL); } /* @@ -444,6 +481,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) list_del(&sum->list); kfree(sum); } + kfree(entry->hash); kmem_cache_free(btrfs_ordered_extent_cache, entry); } } diff --git 
a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 02d813aaa261..08c7ee986bb9 100644 --- a/fs/b
[PATCH v15.1 07/13] btrfs: dedupe: Implement btrfs_dedupe_calc_hash interface
From: Wang Xiaoguang Unlike in-memory or on-disk dedupe method, only SHA256 hash method is supported yet, so implement btrfs_dedupe_calc_hash() interface using SHA256. Signed-off-by: Qu Wenruo Signed-off-by: Wang Xiaoguang Reviewed-by: Josef Bacik Signed-off-by: Lu Fengqi --- fs/btrfs/dedupe.c | 50 +++ 1 file changed, 50 insertions(+) diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c index 03ad41423c01..6199215022e6 100644 --- a/fs/btrfs/dedupe.c +++ b/fs/btrfs/dedupe.c @@ -644,3 +644,53 @@ int btrfs_dedupe_search(struct btrfs_fs_info *fs_info, } return ret; } + +int btrfs_dedupe_calc_hash(struct btrfs_fs_info *fs_info, + struct inode *inode, u64 start, + struct btrfs_dedupe_hash *hash) +{ + int i; + int ret; + struct page *p; + struct shash_desc *shash; + struct btrfs_dedupe_info *dedupe_info = fs_info->dedupe_info; + struct crypto_shash *tfm = dedupe_info->dedupe_driver; + u64 dedupe_bs; + u64 sectorsize = fs_info->sectorsize; + + shash = kmalloc(sizeof(*shash) + crypto_shash_descsize(tfm), GFP_NOFS); + if (!shash) + return -ENOMEM; + + if (!fs_info->dedupe_enabled || !hash) + return 0; + + if (WARN_ON(dedupe_info == NULL)) + return -EINVAL; + + WARN_ON(!IS_ALIGNED(start, sectorsize)); + + dedupe_bs = dedupe_info->blocksize; + + shash->tfm = tfm; + shash->flags = 0; + ret = crypto_shash_init(shash); + if (ret) + return ret; + for (i = 0; sectorsize * i < dedupe_bs; i++) { + char *d; + + p = find_get_page(inode->i_mapping, + (start >> PAGE_SHIFT) + i); + if (WARN_ON(!p)) + return -ENOENT; + d = kmap(p); + ret = crypto_shash_update(shash, d, sectorsize); + kunmap(p); + put_page(p); + if (ret) + return ret; + } + ret = crypto_shash_final(shash, hash->hash); + return ret; +} -- 2.19.1
Re: [PATCH] Btrfs: fix missing delayed iputs on unmount
On Tue, Oct 30, 2018 at 05:14:42PM -0700, Omar Sandoval wrote: >From: Omar Sandoval > >There's a race between close_ctree() and cleaner_kthread(). >close_ctree() sets btrfs_fs_closing(), and the cleaner stops when it >sees it set, but this is racy; the cleaner might have already checked >the bit and could be cleaning stuff. In particular, if it deletes unused >block groups, it will create delayed iputs for the free space cache >inodes. As of "btrfs: don't run delayed_iputs in commit", we're no >longer running delayed iputs after a commit. Therefore, if the cleaner >creates more delayed iputs after delayed iputs are run in >btrfs_commit_super(), we will leak inodes on unmount and get a busy Since the assert added via commit e187831e1875 ("btrfs: assert on non-empty delayed iputs") wasn't triggered, it doesn't seem to be the cause of inode leak. -- Thanks, Lu >inode crash from the VFS. > >Fix it by parking the cleaner before we actually close anything. Then, >any remaining delayed iputs will always be handled in >btrfs_commit_super(). This also ensures that the commit in close_ctree() >is really the last commit, so we can get rid of the commit in >cleaner_kthread(). > >Fixes: 30928e9baac2 ("btrfs: don't run delayed_iputs in commit") >Signed-off-by: Omar Sandoval >--- >We found this with a stress test that our containers team runs. I'm >wondering if this same race could have caused any other issues other >than this new iput thing, but I couldn't identify any. > > fs/btrfs/disk-io.c | 40 +++- > 1 file changed, 7 insertions(+), 33 deletions(-) > >diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c >index b0ab41da91d1..7c17284ae3c2 100644 >--- a/fs/btrfs/disk-io.c >+++ b/fs/btrfs/disk-io.c >@@ -1664,9 +1664,8 @@ static int cleaner_kthread(void *arg) > struct btrfs_root *root = arg; > struct btrfs_fs_info *fs_info = root->fs_info; > int again; >- struct btrfs_trans_handle *trans; > >- do { >+ while (1) { > again = 0; > > /* Make the cleaner go to sleep early. 
*/ >@@ -1715,42 +1714,16 @@ static int cleaner_kthread(void *arg) >*/ > btrfs_delete_unused_bgs(fs_info); > sleep: >+ if (kthread_should_park()) >+ kthread_parkme(); >+ if (kthread_should_stop()) >+ return 0; > if (!again) { > set_current_state(TASK_INTERRUPTIBLE); >- if (!kthread_should_stop()) >- schedule(); >+ schedule(); > __set_current_state(TASK_RUNNING); > } >- } while (!kthread_should_stop()); >- >- /* >- * Transaction kthread is stopped before us and wakes us up. >- * However we might have started a new transaction and COWed some >- * tree blocks when deleting unused block groups for example. So >- * make sure we commit the transaction we started to have a clean >- * shutdown when evicting the btree inode - if it has dirty pages >- * when we do the final iput() on it, eviction will trigger a >- * writeback for it which will fail with null pointer dereferences >- * since work queues and other resources were already released and >- * destroyed by the time the iput/eviction/writeback is made. >- */ >- trans = btrfs_attach_transaction(root); >- if (IS_ERR(trans)) { >- if (PTR_ERR(trans) != -ENOENT) >- btrfs_err(fs_info, >-"cleaner transaction attach returned %ld", >-PTR_ERR(trans)); >- } else { >- int ret; >- >- ret = btrfs_commit_transaction(trans); >- if (ret) >- btrfs_err(fs_info, >-"cleaner open transaction commit returned %d", >-ret); > } >- >- return 0; > } > > static int transaction_kthread(void *arg) >@@ -3931,6 +3904,7 @@ void close_ctree(struct btrfs_fs_info *fs_info) > int ret; > > set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); >+ kthread_park(fs_info->cleaner_kthread); > > /* wait for the qgroup rescan worker to stop */ > btrfs_qgroup_wait_for_completion(fs_info, false); >-- >2.19.1 > > > >
Re: [PATCH] Btrfs: fix missing delayed iputs on unmount
On Tue, Oct 30, 2018 at 05:14:42PM -0700, Omar Sandoval wrote: >From: Omar Sandoval > >There's a race between close_ctree() and cleaner_kthread(). >close_ctree() sets btrfs_fs_closing(), and the cleaner stops when it >sees it set, but this is racy; the cleaner might have already checked >the bit and could be cleaning stuff. In particular, if it deletes unused >block groups, it will create delayed iputs for the free space cache >inodes. As of "btrfs: don't run delayed_iputs in commit", we're no >longer running delayed iputs after a commit. Therefore, if the cleaner >creates more delayed iputs after delayed iputs are run in >btrfs_commit_super(), we will leak inodes on unmount and get a busy >inode crash from the VFS. > >Fix it by parking the cleaner before we actually close anything. Then, >any remaining delayed iputs will always be handled in >btrfs_commit_super(). This also ensures that the commit in close_ctree() >is really the last commit, so we can get rid of the commit in >cleaner_kthread(). > >Fixes: 30928e9baac2 ("btrfs: don't run delayed_iputs in commit") >Signed-off-by: Omar Sandoval >--- >We found this with a stress test that our containers team runs. I'm >wondering if this same race could have caused any other issues other >than this new iput thing, but I couldn't identify any. I noticed an inode leak issue in generic/475, but whether dropping commit 30928e9baac2 ("btrfs: don't run delayed_iputs in commit") or applying this patch, the issue still exists. I have attached the dmesg. 
> > fs/btrfs/disk-io.c | 40 +++- > 1 file changed, 7 insertions(+), 33 deletions(-) > >diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c >index b0ab41da91d1..7c17284ae3c2 100644 >--- a/fs/btrfs/disk-io.c >+++ b/fs/btrfs/disk-io.c >@@ -1664,9 +1664,8 @@ static int cleaner_kthread(void *arg) > struct btrfs_root *root = arg; > struct btrfs_fs_info *fs_info = root->fs_info; > int again; >- struct btrfs_trans_handle *trans; > >- do { >+ while (1) { > again = 0; > > /* Make the cleaner go to sleep early. */ >@@ -1715,42 +1714,16 @@ static int cleaner_kthread(void *arg) >*/ > btrfs_delete_unused_bgs(fs_info); > sleep: >+ if (kthread_should_park()) >+ kthread_parkme(); >+ if (kthread_should_stop()) >+ return 0; > if (!again) { > set_current_state(TASK_INTERRUPTIBLE); >- if (!kthread_should_stop()) >- schedule(); >+ schedule(); > __set_current_state(TASK_RUNNING); > } >- } while (!kthread_should_stop()); >- >- /* >- * Transaction kthread is stopped before us and wakes us up. >- * However we might have started a new transaction and COWed some >- * tree blocks when deleting unused block groups for example. So >- * make sure we commit the transaction we started to have a clean >- * shutdown when evicting the btree inode - if it has dirty pages >- * when we do the final iput() on it, eviction will trigger a >- * writeback for it which will fail with null pointer dereferences >- * since work queues and other resources were already released and >- * destroyed by the time the iput/eviction/writeback is made. 
>- */ >- trans = btrfs_attach_transaction(root); >- if (IS_ERR(trans)) { >- if (PTR_ERR(trans) != -ENOENT) >- btrfs_err(fs_info, >-"cleaner transaction attach returned %ld", >-PTR_ERR(trans)); >- } else { >- int ret; >- >- ret = btrfs_commit_transaction(trans); >- if (ret) >- btrfs_err(fs_info, >-"cleaner open transaction commit returned %d", >-ret); > } >- >- return 0; > } > > static int transaction_kthread(void *arg) >@@ -3931,6 +3904,7 @@ void close_ctree(struct btrfs_fs_info *fs_info) > int ret; > > set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); >+ kthread_park(fs_info->cleaner_kthread); Since we are not going to call kthread_unpark, I am not sure why kthread_park is used instead of kthread_stop here. It looks like there is no significant difference between stopping instantly and delayed stop. -- Thanks, Lu > > /* wait for the qgroup rescan worker to stop */ > btrfs_qgroup_wait_for_completion(fs_info, false); >-- >2.19.1 > > > > [ 366.955193] run fstests generic/475 at 2018-10-31 15:06:43 [ 367.495791] BTRFS: device fsid 812f883c-40b2-4456-9769-b94ddf1cb07e devid 1 transid 5 /dev/nvme0n1p2 [ 367.624469] BTRFS info (device dm-3): disk space caching is enabled [ 367.627305] BTRFS info (device dm-3): has skinny extents [ 367.6287
[PATCH 0/3] fix pinned underflow in generic/475
When running generic/475, pinned underflow may occur. This patchset will fix this problem, but there are still other warnings that need to be addressed in this case. Patch 1-2 introduce a macro and wrappers to help detect underflow Patch 3 the fix patch of pinned underflow Lu Fengqi (2): btrfs: extent-tree: Detect bytes_pinned underflow earlier btrfs: fix pinned underflow after transaction aborted Qu Wenruo (1): btrfs: extent-tree: Detect bytes_may_use underflow earlier fs/btrfs/disk-io.c | 12 +- fs/btrfs/extent-tree.c | 53 ++ 2 files changed, 44 insertions(+), 21 deletions(-) -- 2.19.1
[PATCH 2/3] btrfs: extent-tree: Detect bytes_pinned underflow earlier
Introduce a new wrapper update_bytes_pinned to replace open coded bytes_pinned modifiers. Signed-off-by: Lu Fengqi --- fs/btrfs/extent-tree.c | 9 + 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c0147a1307e7..bb91db944d21 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -65,6 +65,7 @@ static inline void update_##name(struct btrfs_space_info *sinfo, \ } DECLARE_SPACE_INFO_UPDATE(bytes_may_use); +DECLARE_SPACE_INFO_UPDATE(bytes_pinned); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, u64 parent, @@ -6163,7 +6164,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, old_val -= num_bytes; btrfs_set_block_group_used(&cache->item, old_val); cache->pinned += num_bytes; - cache->space_info->bytes_pinned += num_bytes; + update_bytes_pinned(cache->space_info, num_bytes); cache->space_info->bytes_used -= num_bytes; cache->space_info->disk_used -= num_bytes * factor; spin_unlock(&cache->lock); @@ -6234,7 +6235,7 @@ static int pin_down_extent(struct btrfs_fs_info *fs_info, spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned += num_bytes; - cache->space_info->bytes_pinned += num_bytes; + update_bytes_pinned(cache->space_info, num_bytes); if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; @@ -6599,7 +6600,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; - space_info->bytes_pinned -= len; + update_bytes_pinned(space_info, -len); trace_btrfs_space_reservation(fs_info, "pinned", space_info->flags, len, 0); @@ -10710,7 +10711,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&space_info->lock); spin_lock(&block_group->lock); - space_info->bytes_pinned -= block_group->pinned; + update_bytes_pinned(space_info, -block_group->pinned); 
space_info->bytes_readonly += block_group->pinned; percpu_counter_add_batch(&space_info->total_bytes_pinned, -block_group->pinned, -- 2.19.1
[PATCH 1/3] btrfs: extent-tree: Detect bytes_may_use underflow earlier
From: Qu Wenruo Although we have space_info::bytes_may_use underflow detection in btrfs_free_reserved_data_space_noquota(), we have more callers who are subtracting number from space_info::bytes_may_use. So instead of doing underflow detection for every caller, introduce a new wrapper update_bytes_may_use() to replace open coded bytes_may_use modifiers. This also introduce a macro to declare more wrappers, but currently space_info::bytes_may_use is the mostly interesting one. Signed-off-by: Qu Wenruo --- fs/btrfs/extent-tree.c | 44 +++--- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a1febf155747..c0147a1307e7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -51,6 +51,21 @@ enum { CHUNK_ALLOC_FORCE = 2, }; +/* Helper function to detect various space info bytes underflow */ +#define DECLARE_SPACE_INFO_UPDATE(name) \ +static inline void update_##name(struct btrfs_space_info *sinfo, \ +s64 bytes) \ +{ \ + if (bytes < 0 && sinfo->name < -bytes) {\ + WARN_ON(1); \ + sinfo->name = 0;\ + return; \ + } \ + sinfo->name += bytes; \ +} + +DECLARE_SPACE_INFO_UPDATE(bytes_may_use); + static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, @@ -4256,7 +4271,7 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) data_sinfo->flags, bytes, 1); return -ENOSPC; } - data_sinfo->bytes_may_use += bytes; + update_bytes_may_use(data_sinfo, bytes); trace_btrfs_space_reservation(fs_info, "space_info", data_sinfo->flags, bytes, 1); spin_unlock(&data_sinfo->lock); @@ -4309,10 +4324,7 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, data_sinfo = fs_info->data_sinfo; spin_lock(&data_sinfo->lock); - if (WARN_ON(data_sinfo->bytes_may_use < len)) - data_sinfo->bytes_may_use = 0; - else - data_sinfo->bytes_may_use -= len; + update_bytes_may_use(data_sinfo, -len); 
trace_btrfs_space_reservation(fs_info, "space_info", data_sinfo->flags, len, 0); spin_unlock(&data_sinfo->lock); @@ -5108,7 +5120,7 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, list_del_init(&ticket->list); if (ticket->bytes && ticket->bytes < orig_bytes) { u64 num_bytes = orig_bytes - ticket->bytes; - space_info->bytes_may_use -= num_bytes; + update_bytes_may_use(space_info, -num_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, num_bytes, 0); } @@ -5154,13 +5166,13 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, * If not things get more complicated. */ if (used + orig_bytes <= space_info->total_bytes) { - space_info->bytes_may_use += orig_bytes; + update_bytes_may_use(space_info, orig_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, orig_bytes, 1); ret = 0; } else if (can_overcommit(fs_info, space_info, orig_bytes, flush, system_chunk)) { - space_info->bytes_may_use += orig_bytes; + update_bytes_may_use(space_info, orig_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, orig_bytes, 1); ret = 0; @@ -5223,7 +5235,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, if (ticket.bytes) { if (ticket.bytes < orig_bytes) { u64 num_bytes = orig_bytes - ticket.bytes; - space_info->bytes_may_use -= num_bytes; + update_bytes_may_use(space_info, -num_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags,
[PATCH 3/3] btrfs: fix pinned underflow after transaction aborted
When running generic/475, we may get the following warning in the dmesg. [ 6902.102154] WARNING: CPU: 3 PID: 18013 at fs/btrfs/extent-tree.c:9776 btrfs_free_block_groups+0x2af/0x3b0 [btrfs] [ 6902.104886] Modules linked in: btrfs(O) xor zstd_decompress zstd_compress xxhash raid6_pq efivarfs xfs nvme nvme_core [last unloaded: btrfs] [ 6902.109160] CPU: 3 PID: 18013 Comm: umount Tainted: GW O 4.19.0-rc8+ #8 [ 6902.110971] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 [ 6902.112857] RIP: 0010:btrfs_free_block_groups+0x2af/0x3b0 [btrfs] [ 6902.114377] Code: c6 48 89 04 24 48 8b 83 50 17 00 00 48 39 c6 0f 84 ab 00 00 00 4c 8b ab 50 17 00 00 49 83 bd 50 ff ff ff 00 0f 84 b4 00 00 00 <0f> 0b 31 c9 49 8d b5 f8 fe ff ff 31 d2 48 89 df e8 fc 76 ff ff 49 [ 6902.118921] RSP: 0018:c9000459bdb0 EFLAGS: 00010286 [ 6902.120315] RAX: 880175050bb0 RBX: 8801124a8000 RCX: 00170007 [ 6902.121969] RDX: 0002 RSI: 00170007 RDI: 8125fb74 [ 6902.123716] RBP: 880175055d10 R08: R09: [ 6902.125417] R10: R11: R12: 880175055d88 [ 6902.127129] R13: 880175050bb0 R14: R15: dead0100 [ 6902.129060] FS: 7f4507223780() GS:88017ba0() knlGS: [ 6902.130996] CS: 0010 DS: ES: CR0: 80050033 [ 6902.132558] CR2: 5623599cac78 CR3: 00014b71 CR4: 003606e0 [ 6902.134270] DR0: DR1: DR2: [ 6902.135981] DR3: DR6: fffe0ff0 DR7: 0400 [ 6902.137836] Call Trace: [ 6902.138939] close_ctree+0x171/0x330 [btrfs] [ 6902.140181] ? kthread_stop+0x146/0x1f0 [ 6902.141277] generic_shutdown_super+0x6c/0x100 [ 6902.142517] kill_anon_super+0x14/0x30 [ 6902.143554] btrfs_kill_super+0x13/0x100 [btrfs] [ 6902.144790] deactivate_locked_super+0x2f/0x70 [ 6902.146014] cleanup_mnt+0x3b/0x70 [ 6902.147020] task_work_run+0x9e/0xd0 [ 6902.148036] do_syscall_64+0x470/0x600 [ 6902.149142] ? 
trace_hardirqs_off_thunk+0x1a/0x1c [ 6902.150375] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 6902.151640] RIP: 0033:0x7f45077a6a7b [ 6902.152782] Code: 23 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 90 f3 0f 1e fa 31 f6 e9 05 00 00 00 90 0f 1f 40 00 f3 0f 1e fa b8 a6 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d b5 23 0c 00 f7 d8 64 89 01 48 [ 6902.157324] RSP: 002b:7ffd589f3e68 EFLAGS: 0246 ORIG_RAX: 00a6 [ 6902.159187] RAX: RBX: 55e8eec732b0 RCX: 7f45077a6a7b [ 6902.160834] RDX: 0001 RSI: RDI: 55e8eec73490 [ 6902.162526] RBP: R08: 55e8eec734b0 R09: 7ffd589f26c0 [ 6902.164141] R10: R11: 0246 R12: 55e8eec73490 [ 6902.165815] R13: 7f4507ac61a4 R14: R15: 7ffd589f40d8 [ 6902.167553] irq event stamp: 0 [ 6902.168998] hardirqs last enabled at (0): [<>] (null) [ 6902.170731] hardirqs last disabled at (0): [] copy_process.part.55+0x3b0/0x1f00 [ 6902.172773] softirqs last enabled at (0): [] copy_process.part.55+0x3b0/0x1f00 [ 6902.174671] softirqs last disabled at (0): [<>] (null) [ 6902.176407] ---[ end trace 463138c2986b275c ]--- [ 6902.177636] BTRFS info (device dm-3): space_info 4 has 273465344 free, is not full [ 6902.179453] BTRFS info (device dm-3): space_info total=276824064, used=4685824, pinned=18446744073708158976, reserved=0, may_use=0, readonly=65536 ^^^ obviously underflow When transaction_kthread is running cleanup_transaction(), another fsstress is running btrfs_commit_transaction(). The btrfs_finish_extent_commit() may get the same range as btrfs_destroy_pinned_extent() got, which causes the pinned underflow. 
Signed-off-by: Lu Fengqi --- fs/btrfs/disk-io.c | 12 +++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b0ab41da91d1..00ee5e37e989 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4359,13 +4359,23 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, unpin = pinned_extents; again: while (1) { + /* +* The btrfs_finish_extent_commit() may get the same range as +* ours between find_first_extent_bit and clear_extent_dirty. +* Hence, hold the unused_bg_unpin_mutex to avoid double unpin +* the same extent range. +*/ + mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end,
[PATCH] btrfs: delayed-ref: extract find_first_ref_head from find_ref_head
The find_ref_head shouldn't return the first entry even if no exact match is found. So move the hidden behavior to higher level. Besides, remove the useless local variables in the btrfs_select_ref_head. Signed-off-by: Lu Fengqi --- fs/btrfs/delayed-ref.c | 45 +++--- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 6e8be384398e..a92f104cf06f 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -164,14 +164,28 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, return NULL; } +static struct btrfs_delayed_ref_head *find_first_ref_head( + struct btrfs_delayed_ref_root *dr) +{ + struct rb_node *n; + struct btrfs_delayed_ref_head *entry; + + n = rb_first_cached(&dr->href_root); + if (!n) + return NULL; + + entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); + + return entry; +} + /* * find an head entry based on bytenr. This returns the delayed ref * head if it was able to find one, or NULL if nothing was in that spot. * If return_bigger is given, the next bigger entry is returned if no exact - * match is found. But if no bigger one is found then the first node of the - * ref head tree will be returned. + * match is found. 
*/ -static struct btrfs_delayed_ref_head* find_ref_head( +static struct btrfs_delayed_ref_head *find_ref_head( struct btrfs_delayed_ref_root *dr, u64 bytenr, bool return_bigger) { @@ -195,10 +209,9 @@ static struct btrfs_delayed_ref_head* find_ref_head( if (bytenr > entry->bytenr) { n = rb_next(&entry->href_node); if (!n) - n = rb_first_cached(&dr->href_root); + return NULL; entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); - return entry; } return entry; } @@ -358,33 +371,25 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head( struct btrfs_delayed_ref_root *delayed_refs) { struct btrfs_delayed_ref_head *head; - u64 start; - bool loop = false; again: - start = delayed_refs->run_delayed_start; - head = find_ref_head(delayed_refs, start, true); - if (!head && !loop) { + head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start, +true); + if (!head && delayed_refs->run_delayed_start != 0) { delayed_refs->run_delayed_start = 0; - start = 0; - loop = true; - head = find_ref_head(delayed_refs, start, true); - if (!head) - return NULL; - } else if (!head && loop) { - return NULL; + head = find_first_ref_head(delayed_refs); } + if (!head) + return NULL; while (head->processing) { struct rb_node *node; node = rb_next(&head->href_node); if (!node) { - if (loop) + if (delayed_refs->run_delayed_start == 0) return NULL; delayed_refs->run_delayed_start = 0; - start = 0; - loop = true; goto again; } head = rb_entry(node, struct btrfs_delayed_ref_head, -- 2.19.1
Re: [PATCH 0/6] Some trivial cleanup about delayed-refs
On Thu, Oct 11, 2018 at 01:51:37PM +0200, David Sterba wrote: >On Thu, Oct 11, 2018 at 01:40:32PM +0800, Lu Fengqi wrote: >> There is no functional change. Just improve readablity. >> >> PATCH 1-4 parameter cleanup patches >> PATCH 5 cleanup about btrfs_select_ref_head >> PATCH 6 switch int to bool; add some comment >> >> Lu Fengqi (6): >> btrfs: delayed-ref: pass delayed_refs directly to >> btrfs_select_ref_head() >> btrfs: delayed-ref: pass delayed_refs directly to >> btrfs_delayed_ref_lock() >> btrfs: remove fs_info from btrfs_check_space_for_delayed_refs >> btrfs: remove fs_info from btrfs_should_throttle_delayed_refs >> btrfs: simplify btrfs_select_ref_head and cleanup some local variables >> btrfs: switch return_bigger to bool in find_ref_head >1-4 and 6 added to misc-next, thanks. There is no patch 2 on the misc-next branch. Was it forgotten? -- Thanks, Lu
Re: [PATCH 5/6] btrfs: simplify btrfs_select_ref_head and cleanup some local variables
On Thu, Oct 11, 2018 at 02:45:04PM +0200, David Sterba wrote: >On Thu, Oct 11, 2018 at 03:28:15PM +0300, Nikolay Borisov wrote: >> > I noticed that there is a macro called SCRAMBLE_DELAYED_REFS in the >> > extent-tree.c. I am a bit curious whether it has been forgotten by >> > everyone, I have not found any test results about its performance impact. >> >> I guess it was used during testing but nothing currently sets it. I.e it >> might make sense to enable it if BTRFS_DEBUG is set. > >Agreed, the way the scrambling is supposed to be used does not align >very well with the typical testing workflow so adding to ti the >BTRFS_DEBUG set is ok, unless there are severe performance problems. I will add it to the BTRFS_DEBUG set, and test if it has severe performance problems. > >The part in btrfs_run_delayed_refs would be better hidden in a function >similar to btrfs_debug_check_extent_io_range or btrfs_leak_debug_check. Got it. -- Thanks, Lu
Re: [PATCH 5/6] btrfs: simplify btrfs_select_ref_head and cleanup some local variables
On Thu, Oct 11, 2018 at 03:28:15PM +0300, Nikolay Borisov wrote: > > >On 11.10.2018 15:15, Lu Fengqi wrote: >> On Thu, Oct 11, 2018 at 09:40:52AM +0300, Nikolay Borisov wrote: >>> >>> >>> On 11.10.2018 08:40, Lu Fengqi wrote: >>>> If the return value of find_ref_head() is NULL, the only possibility is >>>> that delayed_refs' head ref rbtree is empty. Hence, the second >>>> find_ref_head() is pointless. >>>>> Besides, the local variables loop and start are unnecessary, just remove >>>> them. >>> >>> So the objective of that function is to get a reference to the first >>> delayed head which is not processed. This is done by essentially keeping >>> track of the last range that was processed in >>> delayed_refs->run_delayed_start >>>> >>>> Signed-off-by: Lu Fengqi >>>> --- >>>> fs/btrfs/delayed-ref.c | 17 +++-- >>>> 1 file changed, 3 insertions(+), 14 deletions(-) >>>> >>>> diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c >>>> index 885581852bea..2726d2fb4bbe 100644 >>>> --- a/fs/btrfs/delayed-ref.c >>>> +++ b/fs/btrfs/delayed-ref.c >>>> @@ -354,20 +354,11 @@ struct btrfs_delayed_ref_head * >>>> btrfs_select_ref_head(struct btrfs_delayed_ref_root *delayed_refs) >>>> { >>>>struct btrfs_delayed_ref_head *head; >>>> - u64 start; >>>> - bool loop = false; >>>> >>>> again: >>>> - start = delayed_refs->run_delayed_start; >>>> - head = find_ref_head(delayed_refs, start, 1); >>>> - if (!head && !loop) { >>>> + head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start, 1); >>>> + if (!head) { >>>>delayed_refs->run_delayed_start = 0; >>>> - start = 0; >>>> - loop = true; >>>> - head = find_ref_head(delayed_refs, start, 1); >>>> - if (!head) >>>> - return NULL; >>>> - } else if (!head && loop) { >>> >>> I believe this will have a negative impact since it actually will >>> prevent finding a head which was added BEFORE the last processed head. 
>>> So when a ref head is selected in btrfs_obtain_ref_head then the >>> delayed_refs->lock is dropped and the given head is locked and >>> delayed_refs->run_delayed_start points to the end of the selected range >>> that the head represents. At this point it's possible that another >>> thread modifies a different range which is before the one we have >>> selected so graphically it will be something like: >>> >>> >>> ---[HEAD2]->[HEAD1]-- >>> 0N >>> >>> Where HEAD1 is the head returned from first invocation of >>> btrfs_obtain_ref_head. Once btrfs_obtain_ref_head is called the 2nd >>> time it will not find HEAD2 so will just reset run_delayed_start to 0 >>> and return. So it will be up to another run of the delayed refs to >>> actually find head2. Essentially you made btrfs_obtain_ref_head less >> >> Not exactly. In fact, find_ref_head hides such a logic. When >> return_bigger is set, if there is no larger entry to return, the first >> entry will be returned. Please see the comment I add in the PATCH 6. >> >> Hence, the 2nd invocation of btrfs_obtain_ref_head still will return >> HEAD2. There is no functional change here. >> >> However, your question makes me consider whether such hidden logic >> should be extracted from find_ref_head to btrfs_select_ref_head. > >Right I agree with your. As it stands I will expect that if >return_bigger is true to specifically return a bigger entry or if >nothing is found to return null. IMO this behavior is higher level and This is also exactly what I want. The patch is on the way. >belongs to btrfs_delayed_ref_head. > >> >>> greedy. Have you characterized what kind of performance impact this have? >> >> I noticed that there is a macro called SCRAMBLE_DELAYED_REFS in the >> extent-tree.c. I am a bit curious whether it has been forgotten by >> everyone, I have not found any test results about its performance impact. > >I guess it was used during testing but nothing currently sets it. 
I.e it >might make sense to enable it if BTRFS_DEBUG is set. > Make sense. -- Thanks, Lu
Re: [PATCH 5/6] btrfs: simplify btrfs_select_ref_head and cleanup some local variables
On Thu, Oct 11, 2018 at 09:40:52AM +0300, Nikolay Borisov wrote: > > >On 11.10.2018 08:40, Lu Fengqi wrote: >> If the return value of find_ref_head() is NULL, the only possibility is >> that delayed_refs' head ref rbtree is empty. Hence, the second >> find_ref_head() is pointless. >> > Besides, the local variables loop and start are unnecessary, just remove >> them. > >So the objective of that function is to get a reference to the first >delayed head which is not processed. This is done by essentially keeping >track of the last range that was processed in >delayed_refs->run_delayed_start >> >> Signed-off-by: Lu Fengqi >> --- >> fs/btrfs/delayed-ref.c | 17 +++-- >> 1 file changed, 3 insertions(+), 14 deletions(-) >> >> diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c >> index 885581852bea..2726d2fb4bbe 100644 >> --- a/fs/btrfs/delayed-ref.c >> +++ b/fs/btrfs/delayed-ref.c >> @@ -354,20 +354,11 @@ struct btrfs_delayed_ref_head * >> btrfs_select_ref_head(struct btrfs_delayed_ref_root *delayed_refs) >> { >> struct btrfs_delayed_ref_head *head; >> -u64 start; >> -bool loop = false; >> >> again: >> -start = delayed_refs->run_delayed_start; >> -head = find_ref_head(delayed_refs, start, 1); >> -if (!head && !loop) { >> +head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start, 1); >> +if (!head) { >> delayed_refs->run_delayed_start = 0; >> -start = 0; >> -loop = true; >> -head = find_ref_head(delayed_refs, start, 1); >> -if (!head) >> -return NULL; >> -} else if (!head && loop) { > >I believe this will have a negative impact since it actually will >prevent finding a head which was added BEFORE the last processed head. >So when a ref head is selected in btrfs_obtain_ref_head then the >delayed_refs->lock is dropped and the given head is locked and >delayed_refs->run_delayed_start points to the end of the selected range >that the head represents. 
At this point it's possible that another >thread modifies a different range which is before the one we have >selected so graphically it will be something like: > > >---[HEAD2]->[HEAD1]-- >0N > >Where HEAD1 is the head returned from first invocation of >btrfs_obtain_ref_head. Once btrfs_obtain_ref_head is called the 2nd >time it will not find HEAD2 so will just reset run_delayed_start to 0 >and return. So it will be up to another run of the delayed refs to >actually find head2. Essentially you made btrfs_obtain_ref_head less Not exactly. In fact, find_ref_head hides such a logic. When return_bigger is set, if there is no larger entry to return, the first entry will be returned. Please see the comment I add in the PATCH 6. Hence, the 2nd invocation of btrfs_obtain_ref_head still will return HEAD2. There is no functional change here. However, your question makes me consider whether such hidden logic should be extracted from find_ref_head to btrfs_select_ref_head. >greedy. Have you characterized what kind of performance impact this have? I noticed that there is a macro called SCRAMBLE_DELAYED_REFS in the extent-tree.c. I am a bit curious whether it has been forgotten by everyone, I have not found any test results about its performance impact. -- Thanks, Lu > > > > >> return NULL; >> } >> >> @@ -376,11 +367,9 @@ btrfs_select_ref_head(struct btrfs_delayed_ref_root >> *delayed_refs) >> >> node = rb_next(&head->href_node); >> if (!node) { >> -if (loop) >> +if (delayed_refs->run_delayed_start == 0) >> return NULL; >> delayed_refs->run_delayed_start = 0; >> -start = 0; >> -loop = true; >> goto again; >> } >> head = rb_entry(node, struct btrfs_delayed_ref_head, >> > >
[PATCH] btrfs: qgroup: move the qgroup->members check out from (!qgroup)'s else branch
There is no reason to put this check in (!qgroup)'s else branch because if qgroup is null, it will goto out directly. So move it out to reduce indent. No Functional Change. Signed-off-by: Lu Fengqi --- fs/btrfs/qgroup.c | 13 +++-- 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 27f517315388..af65ab1640b0 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1416,13 +1416,14 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) if (!qgroup) { ret = -ENOENT; goto out; - } else { - /* check if there are no children of this qgroup */ - if (!list_empty(&qgroup->members)) { - ret = -EBUSY; - goto out; - } } + + /* check if there are no children of this qgroup */ + if (!list_empty(&qgroup->members)) { + ret = -EBUSY; + goto out; + } + ret = del_qgroup_item(trans, qgroupid); if (ret && ret != -ENOENT) goto out; -- 2.19.1
[PATCH 0/6] Some trivial cleanup about delayed-refs
There is no functional change. Just improves readability. PATCH 1-4 parameter cleanup patches PATCH 5 cleanup about btrfs_select_ref_head PATCH 6 switch int to bool; add some comment Lu Fengqi (6): btrfs: delayed-ref: pass delayed_refs directly to btrfs_select_ref_head() btrfs: delayed-ref: pass delayed_refs directly to btrfs_delayed_ref_lock() btrfs: remove fs_info from btrfs_check_space_for_delayed_refs btrfs: remove fs_info from btrfs_should_throttle_delayed_refs btrfs: simplify btrfs_select_ref_head and cleanup some local variables btrfs: switch return_bigger to bool in find_ref_head fs/btrfs/ctree.h | 6 ++ fs/btrfs/delayed-ref.c | 35 ++- fs/btrfs/delayed-ref.h | 4 ++-- fs/btrfs/extent-tree.c | 15 +++ fs/btrfs/inode.c | 7 +++ fs/btrfs/transaction.c | 4 ++-- 6 files changed, 26 insertions(+), 45 deletions(-) -- 2.19.1
[PATCH 2/6] btrfs: delayed-ref: pass delayed_refs directly to btrfs_delayed_ref_lock()
Since trans is only used for referring to delayed_refs, there is no need to pass it instead of delayed_refs to btrfs_delayed_ref_lock(). No functional change. Signed-off-by: Lu Fengqi --- fs/btrfs/delayed-ref.c | 5 + fs/btrfs/delayed-ref.h | 2 +- fs/btrfs/extent-tree.c | 2 +- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 13ae86252c4c..885581852bea 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -204,12 +204,9 @@ static struct btrfs_delayed_ref_head* find_ref_head( return NULL; } -int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, +int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { - struct btrfs_delayed_ref_root *delayed_refs; - - delayed_refs = &trans->transaction->delayed_refs; lockdep_assert_held(&delayed_refs->lock); if (mutex_trylock(&head->mutex)) return 0; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index aa66ad6919ab..ef6f5cf75b3e 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -255,7 +255,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr); -int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, +int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 22acc1545147..77156bd2a9a7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2521,7 +2521,7 @@ static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( * Grab the lock that says we are going to process all the refs for * this head */ - ret = btrfs_delayed_ref_lock(trans, head); + ret = btrfs_delayed_ref_lock(delayed_refs, head); 
spin_unlock(&delayed_refs->lock); /* -- 2.19.1
[PATCH 1/6] btrfs: delayed-ref: pass delayed_refs directly to btrfs_select_ref_head()
Since trans is only used for referring to delayed_refs, there is no need to pass it instead of delayed_refs to btrfs_select_ref_head(). No functional change. Signed-off-by: Lu Fengqi --- fs/btrfs/delayed-ref.c | 5 + fs/btrfs/delayed-ref.h | 2 +- fs/btrfs/extent-tree.c | 2 +- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 7f567c944fec..13ae86252c4c 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -354,15 +354,12 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) } struct btrfs_delayed_ref_head * -btrfs_select_ref_head(struct btrfs_trans_handle *trans) +btrfs_select_ref_head(struct btrfs_delayed_ref_root *delayed_refs) { - struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_head *head; u64 start; bool loop = false; - delayed_refs = &trans->transaction->delayed_refs; - again: start = delayed_refs->run_delayed_start; head = find_ref_head(delayed_refs, start, 1); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index c3e3486a126c..aa66ad6919ab 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -264,7 +264,7 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) struct btrfs_delayed_ref_head * -btrfs_select_ref_head(struct btrfs_trans_handle *trans); +btrfs_select_ref_head(struct btrfs_delayed_ref_root *delayed_refs); int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 22b9269ae84c..22acc1545147 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2511,7 +2511,7 @@ static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( int ret; spin_lock(&delayed_refs->lock); - head = btrfs_select_ref_head(trans); + head = btrfs_select_ref_head(delayed_refs); if (!head) { spin_unlock(&delayed_refs->lock); return head; -- 2.19.1
[PATCH 5/6] btrfs: simplify btrfs_select_ref_head and cleanup some local variables
If the return value of find_ref_head() is NULL, the only possibility is that delayed_refs' head ref rbtree is empty. Hence, the second find_ref_head() is pointless. Besides, the local variables loop and start are unnecessary, just remove them. Signed-off-by: Lu Fengqi --- fs/btrfs/delayed-ref.c | 17 +++-- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 885581852bea..2726d2fb4bbe 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -354,20 +354,11 @@ struct btrfs_delayed_ref_head * btrfs_select_ref_head(struct btrfs_delayed_ref_root *delayed_refs) { struct btrfs_delayed_ref_head *head; - u64 start; - bool loop = false; again: - start = delayed_refs->run_delayed_start; - head = find_ref_head(delayed_refs, start, 1); - if (!head && !loop) { + head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start, 1); + if (!head) { delayed_refs->run_delayed_start = 0; - start = 0; - loop = true; - head = find_ref_head(delayed_refs, start, 1); - if (!head) - return NULL; - } else if (!head && loop) { return NULL; } @@ -376,11 +367,9 @@ btrfs_select_ref_head(struct btrfs_delayed_ref_root *delayed_refs) node = rb_next(&head->href_node); if (!node) { - if (loop) + if (delayed_refs->run_delayed_start == 0) return NULL; delayed_refs->run_delayed_start = 0; - start = 0; - loop = true; goto again; } head = rb_entry(node, struct btrfs_delayed_ref_head, -- 2.19.1
[PATCH 4/6] btrfs: remove fs_info from btrfs_should_throttle_delayed_refs
The avg_delayed_ref_runtime can be referenced from the transaction handle. Signed-off-by: Lu Fengqi --- fs/btrfs/ctree.h | 3 +-- fs/btrfs/extent-tree.c | 5 ++--- fs/btrfs/inode.c | 5 ++--- fs/btrfs/transaction.c | 2 +- 4 files changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4002c9fd924b..68ca41dbbef3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2598,8 +2598,7 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info, return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; } -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans); int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans); void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 831dc2ac1942..241de034ba09 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2826,8 +2826,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans) return ret; } -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) { u64 num_entries = atomic_read(&trans->transaction->delayed_refs.num_entries); @@ -2835,7 +2834,7 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, u64 val; smp_mb(); - avg_runtime = fs_info->avg_delayed_ref_runtime; + avg_runtime = trans->fs_info->avg_delayed_ref_runtime; val = num_entries * avg_runtime; if (val >= NSEC_PER_SEC) return 1; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6a5557e8909d..f22f77172c5f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4713,7 +4713,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); break; } 
- if (btrfs_should_throttle_delayed_refs(trans, fs_info)) + if (btrfs_should_throttle_delayed_refs(trans)) btrfs_async_run_delayed_refs(fs_info, trans->delayed_ref_updates * 2, trans->transid, 0); @@ -4722,8 +4722,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, extent_num_bytes)) { should_end = true; } - if (btrfs_should_throttle_delayed_refs(trans, - fs_info)) + if (btrfs_should_throttle_delayed_refs(trans)) should_throttle = true; } } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c5015458c5c8..5686290a50e1 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -835,7 +835,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, trans->delayed_ref_updates = 0; if (!trans->sync) { must_run_delayed_refs = - btrfs_should_throttle_delayed_refs(trans, info); + btrfs_should_throttle_delayed_refs(trans); cur = max_t(unsigned long, cur, 32); /* -- 2.19.1
[PATCH 3/6] btrfs: remove fs_info from btrfs_check_space_for_delayed_refs
It can be referenced from the transaction handle. Signed-off-by: Lu Fengqi --- fs/btrfs/ctree.h | 3 +-- fs/btrfs/extent-tree.c | 6 +++--- fs/btrfs/inode.c | 2 +- fs/btrfs/transaction.c | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 15c659f23411..4002c9fd924b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2600,8 +2600,7 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info, int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans); void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start); void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 77156bd2a9a7..831dc2ac1942 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2789,9 +2789,9 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes) return num_csums; } -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_rsv *global_rsv; u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; u64 csum_bytes = trans->transaction->delayed_refs.pending_csums; @@ -2842,7 +2842,7 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, if (val >= NSEC_PER_SEC / 2) return 2; - return btrfs_check_space_for_delayed_refs(trans, fs_info); + return btrfs_check_space_for_delayed_refs(trans); } struct async_delayed_refs { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6c476dc81b8e..6a5557e8909d 100644 --- 
a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5319,7 +5319,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, * Try to steal from the global reserve if there is space for * it. */ - if (!btrfs_check_space_for_delayed_refs(trans, fs_info) && + if (!btrfs_check_space_for_delayed_refs(trans) && !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, false)) return trans; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e7f618b17b07..c5015458c5c8 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -760,7 +760,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - if (btrfs_check_space_for_delayed_refs(trans, fs_info)) + if (btrfs_check_space_for_delayed_refs(trans)) return 1; return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5); -- 2.19.1
[PATCH 6/6] btrfs: switch return_bigger to bool in find_ref_head
Using bool is more suitable than int here, and add the comment about the return_bigger. Signed-off-by: Lu Fengqi --- fs/btrfs/delayed-ref.c | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 2726d2fb4bbe..61a19376239e 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -168,11 +168,12 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, * find an head entry based on bytenr. This returns the delayed ref * head if it was able to find one, or NULL if nothing was in that spot. * If return_bigger is given, the next bigger entry is returned if no exact - * match is found. + * match is found. But if no bigger one is found then the first node of the + * ref head tree will be returned. */ static struct btrfs_delayed_ref_head* find_ref_head( struct btrfs_delayed_ref_root *dr, u64 bytenr, - int return_bigger) + bool return_bigger) { struct rb_root *root = &dr->href_root.rb_root; struct rb_node *n; @@ -356,7 +357,8 @@ btrfs_select_ref_head(struct btrfs_delayed_ref_root *delayed_refs) struct btrfs_delayed_ref_head *head; again: - head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start, 1); + head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start, +true); if (!head) { delayed_refs->run_delayed_start = 0; return NULL; @@ -894,7 +896,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr) { - return find_ref_head(delayed_refs, bytenr, 0); + return find_ref_head(delayed_refs, bytenr, false); } void __cold btrfs_delayed_ref_exit(void) -- 2.19.1
Re: [PATCH 0/3] Misc refactoring of check_file_extent
On Thu, Sep 13, 2018 at 03:05:04PM +0300, Nikolay Borisov wrote: >While looking at check_file_extent I thought that the code might be a bit >cleaner than it actually is and cleaner as well. The first patch factors out >the code dealing with inline extents into a separate function aptly named >check_file_extent_inline. This allows to remove some inline-specific variable >from check_file_extent. Patch 2 just moves the final check in the new function >into the already existing branch handling the !compressed case. Finally >the check which detects unknown extent types is moved first in >check_file_extent, >followed by the code to handle inline extents and finally the existing code to >handle regular/prealloc extents is left intact. > >This patchset brings no functional changes. For the series, Reviewed-by: Lu Fengqi -- Thanks, Lu > >Nikolay Borisov (3): > btrfs-progs: check: lowmem: Factor out inline extent checking code in >its own function > btrfs-progs: check: lowmem: Refactor extent len test in >check_file_extent_inline > btrfs-progs: check: lowmem: Refactor extent type checks in >check_file_extent > > check/mode-lowmem.c | 151 ++-- > 1 file changed, 89 insertions(+), 62 deletions(-) > >-- >2.17.1 > > >
Re: [PATCH] btrfs-progs: lowmem: fix false alert about the existence of gaps in the check_file_extent
On Thu, Sep 13, 2018 at 12:12:27PM +0300, Nikolay Borisov wrote: > > >On 13.09.2018 11:20, Lu Fengqi wrote: >> In the check_inode_item function, the extent_end variable used to store the >> end of the last file extent that has checked. When it passes to >> check_file_extent, if the offset of the next file extent is not equal to >> it, there is a gap between the two file extents. > >The 'end' parameter of check_file_extent tracks the ending offset of the >last checked extent. This is used to detect gaps between adjacent extents. > >> >> In the case of a gap existing, it is wrong that only add the >> extent_num_bytes of this file extent to the invalid extent_end variable as >> before. Therefore, lowmem check will false alert that there are gaps >> between the subsequent file extents of this inode due to the wrong >> extent_end variable. > >Currently such gaps are wrongly detected since for regular extents only >the size of the extent is added to the 'end' parameter. This results in >wrongly considering all extents of a file as having gaps between them >when only 2 of them really have a gap as seen in the example below. Thank you for refining the commit message for me. > >> >> Solution: >> The extent_end variable should set to the sum of the offset and the >> extent_num_bytes of the file extent. >> >> Example: >> Suppose that lowmem check the following file extent of inode 257. 
>> >> item 6 key (257 EXTENT_DATA 0) itemoff 15813 itemsize 53 >> generation 6 type 1 (regular) >> extent data disk byte 13631488 nr 4096 >> extent data offset 0 nr 4096 ram 4096 >> extent compression 0 (none) >> item 7 key (257 EXTENT_DATA 8192) itemoff 15760 itemsize 53 >> generation 6 type 1 (regular) >> extent data disk byte 13631488 nr 4096 >> extent data offset 0 nr 4096 ram 4096 >> extent compression 0 (none) >> item 8 key (257 EXTENT_DATA 12288) itemoff 15707 itemsize 53 >> generation 6 type 1 (regular) >> extent data disk byte 13631488 nr 4096 >> extent data offset 0 nr 4096 ram 4096 >> extent compression 0 (none) >> >> For inode 257, check_inode_item set extent_end to 0, then call >> check_file_extent to check item {6,7,8}. >> item 6) >> offset(0) == extent_end(0) >> extent_end = extent_end(0) + extent_num_bytes(4096) >> item 7) >> offset(8192) != extent_end(4096) >> extent_end = extent_end(4096) + extent_num_bytes(4096) >> ^^^ >> The old extent_end should replace by offset(8192). >> item 8) >> offset(12288) != extent_end(8192) >> ^^^ >> But there is no gap between item {7,8}. > >The example makes sense. But can the same thing happen with the inline >extents, ie should the same adjustments be made for the code in if >(extent_type == BTRFS_FILE_EXTENT_INLINE) ? > IIRC, generally there is only one inline extent per file. Although there will be other regular extents, the inline extent must be the first one. So it seems that there is no need to change the code in if (extent_type == BTRFS_FILE_EXTENT_INLINE). 
-- Thanks, Lu >> >> Fixes: d88da10ddd42 ("btrfs-progs: check: introduce function to check file >> extent") >> Signed-off-by: Lu Fengqi >> --- >> check/mode-lowmem.c | 2 +- >> 1 file changed, 1 insertion(+), 1 deletion(-) >> >> diff --git a/check/mode-lowmem.c b/check/mode-lowmem.c >> index 1bce44f5658a..370318f0e631 100644 >> --- a/check/mode-lowmem.c >> +++ b/check/mode-lowmem.c >> @@ -1974,7 +1974,7 @@ static int check_file_extent(struct btrfs_root *root, >> struct btrfs_path *path, >> } >> } >> >> -*end += extent_num_bytes; >> +*end = fkey.offset + extent_num_bytes; >> if (!is_hole) >> *size += extent_num_bytes; >> >> > >
Re: [PATCH] btrfs: Handle error of get_old_root
On Thu, Sep 13, 2018 at 11:35:10AM +0300, Nikolay Borisov wrote: >In btrfs_search_old_slot get_old_root is always used with the >assumption it cannot fail. However, this is not true in rare >circumstance it can fail and return null. This will lead to null >point dereference when the header is read. Fix this by checking the >return value and properly handling NULL by setting ret to -EIO and >returning gracefully. > >CID: 1087503 >Signed-off-by: Nikolay Borisov Reviewed-by: Lu Fengqi -- Thanks, Lu >--- > fs/btrfs/ctree.c | 4 > 1 file changed, 4 insertions(+) > >diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c >index 1124d236291d..a5399fd49c17 100644 >--- a/fs/btrfs/ctree.c >+++ b/fs/btrfs/ctree.c >@@ -2961,6 +2961,10 @@ int btrfs_search_old_slot(struct btrfs_root *root, >const struct btrfs_key *key, > > again: > b = get_old_root(root, time_seq); >+ if (!b) { >+ ret = -EIO; >+ goto done; >+ } > level = btrfs_header_level(b); > p->locks[level] = BTRFS_READ_LOCK; > >-- >2.7.4 > > >
Re: [PATCH] btrfs: Remove logically dead code from btrfs_orphan_cleanup
On Thu, Sep 13, 2018 at 11:35:00AM +0300, Nikolay Borisov wrote: >In btrfs_orphan_cleanup the final 'if (ret) goto out' cannot ever be >executed. This is due to the last assignment to 'ret' depending on >the return value of btrfs_iget. If an error other than -ENOENT is >returned then the loop is prematurely terminated by 'goto out'. >On the other hand, if the error value is ENOENT then a subsequent >if branch is executed that always re-assigns 'ret' and in case it's >an error just terminates the loop. No functional changes. > >CID: 1437392 >Signed-off-by: Nikolay Borisov Reviewed-by: Lu Fengqi >--- > fs/btrfs/inode.c | 2 -- > 1 file changed, 2 deletions(-) > >diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c >index 3f03fec06a3a..64df0378a22f 100644 >--- a/fs/btrfs/inode.c >+++ b/fs/btrfs/inode.c >@@ -3471,8 +3471,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) > > /* this will do delete_inode and everything for us */ > iput(inode); >- if (ret) >- goto out; > } > /* release the path since we're done with it */ > btrfs_release_path(path); >-- >2.7.4 > > > -- Thanks, Lu
Re: [PATCH] btrfs-progs: calibrate extent_end when found a gap
On Thu, Sep 13, 2018 at 04:30:28PM +0800, Lu Fengqi wrote: >On Tue, Sep 11, 2018 at 04:41:21PM +0200, David Sterba wrote: >>On Tue, Sep 04, 2018 at 08:42:01PM +0800, Lu Fengqi wrote: >>> The extent_end will be used to check whether there is gap between this >>> extent and next extent. If it is not calibrated, check_file_extent will >> >>Do you mean 'synchronized' or 'matching'. > >I apologize for this incomprehensible commit message, and I have updated >the commit message. > >[PATCH] btrfs-progs: lowmem: fix false alert about the existence of gaps in >the check_file_extent > >> >>> mistake that there are gaps between the remaining extents. >> >>If this is a bugfix, do you have a testcase? Thanks. >> > >The testcase requires some check repair's fixes (including originl and lowmem) >that my colleagues are working on. After they get it, I will send the >testcase. > >The attached is the image which can trigger the false alert. Sorry, I missed the attachment. -- Thanks, Lu > >Without the patch mentioned before, lowmem check will false alert that expect >the hole extent [257 EXTENT_DATA 8192]. > >ERROR: root 5 EXTENT_DATA[257 12288] gap exists, expected: EXTENT_DATA[257 >8192] > >-- >Thanks, >Lu > > file_extent_with_gap.img Description: Binary data
Re: [PATCH] btrfs-progs: calibrate extent_end when found a gap
On Tue, Sep 11, 2018 at 04:41:21PM +0200, David Sterba wrote: >On Tue, Sep 04, 2018 at 08:42:01PM +0800, Lu Fengqi wrote: >> The extent_end will be used to check whether there is gap between this >> extent and next extent. If it is not calibrated, check_file_extent will > >Do you mean 'synchronized' or 'matching'. I apologize for this incomprehensible commit message, and I have updated the commit message. [PATCH] btrfs-progs: lowmem: fix false alert about the existence of gaps in the check_file_extent > >> mistake that there are gaps between the remaining extents. > >If this is a bugfix, do you have a testcase? Thanks. > The testcase requires some check repair fixes (including original and lowmem) that my colleagues are working on. After they get them, I will send the testcase. The attached is the image which can trigger the false alert. Without the patch mentioned before, lowmem check will falsely report that it expects the hole extent [257 EXTENT_DATA 8192]. ERROR: root 5 EXTENT_DATA[257 12288] gap exists, expected: EXTENT_DATA[257 8192] -- Thanks, Lu
[PATCH] btrfs-progs: lowmem: fix false alert about the existence of gaps in the check_file_extent
In the check_inode_item function, the extent_end variable is used to store the end of the last file extent that has been checked. When it is passed to check_file_extent, if the offset of the next file extent is not equal to it, there is a gap between the two file extents. In the case of a gap existing, it is wrong to only add the extent_num_bytes of this file extent to the invalid extent_end variable as before. Therefore, lowmem check will falsely report that there are gaps between the subsequent file extents of this inode due to the wrong extent_end variable. Solution: The extent_end variable should be set to the sum of the offset and the extent_num_bytes of the file extent. Example: Suppose that lowmem check examines the following file extents of inode 257. item 6 key (257 EXTENT_DATA 0) itemoff 15813 itemsize 53 generation 6 type 1 (regular) extent data disk byte 13631488 nr 4096 extent data offset 0 nr 4096 ram 4096 extent compression 0 (none) item 7 key (257 EXTENT_DATA 8192) itemoff 15760 itemsize 53 generation 6 type 1 (regular) extent data disk byte 13631488 nr 4096 extent data offset 0 nr 4096 ram 4096 extent compression 0 (none) item 8 key (257 EXTENT_DATA 12288) itemoff 15707 itemsize 53 generation 6 type 1 (regular) extent data disk byte 13631488 nr 4096 extent data offset 0 nr 4096 ram 4096 extent compression 0 (none) For inode 257, check_inode_item sets extent_end to 0, then calls check_file_extent to check items {6,7,8}. item 6) offset(0) == extent_end(0) extent_end = extent_end(0) + extent_num_bytes(4096) item 7) offset(8192) != extent_end(4096) extent_end = extent_end(4096) + extent_num_bytes(4096) ^^^ The old extent_end should be replaced by offset(8192). item 8) offset(12288) != extent_end(8192) ^^^ But there is no gap between items {7,8}. 
Fixes: d88da10ddd42 ("btrfs-progs: check: introduce function to check file extent") Signed-off-by: Lu Fengqi --- check/mode-lowmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/check/mode-lowmem.c b/check/mode-lowmem.c index 1bce44f5658a..370318f0e631 100644 --- a/check/mode-lowmem.c +++ b/check/mode-lowmem.c @@ -1974,7 +1974,7 @@ static int check_file_extent(struct btrfs_root *root, struct btrfs_path *path, } } - *end += extent_num_bytes; + *end = fkey.offset + extent_num_bytes; if (!is_hole) *size += extent_num_bytes; -- 2.18.0
Re: [RFC PATCH v2 1/4] btrfs: factor out btrfs_link_subvol from create_subvol
On Tue, Sep 11, 2018 at 07:57:03PM +0800, Qu Wenruo wrote: > > >On 2018/9/11 下午7:29, Lu Fengqi wrote: >> The function btrfs_link_subvol is responsible to link the subvolume to >> the specified directory, which is the opposite of what >> btrfs_unlink_subvol does. >> >> No functional change. >> >> Signed-off-by: Lu Fengqi > >The patch itself is OK. > >Just small nitpicks inlined below. > >> --- >> fs/btrfs/ioctl.c | 64 +++- >> 1 file changed, 41 insertions(+), 23 deletions(-) >> >> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c >> index 4905d13dee0a..1b03d07acde2 100644 >> --- a/fs/btrfs/ioctl.c >> +++ b/fs/btrfs/ioctl.c >> @@ -542,6 +542,45 @@ int btrfs_is_empty_uuid(u8 *uuid) >> return 1; >> } >> >> +static int btrfs_link_subvol(struct btrfs_trans_handle *trans, >> + struct inode *dir, u64 objectid, const char *name, >> + int namelen) >> +{ >> +struct btrfs_root *root = BTRFS_I(dir)->root; >> +struct btrfs_key key; >> +u64 index = 0; >> +int ret; >> + >> +/* >> + * insert the directory item >> + */ >> +ret = btrfs_set_inode_index(BTRFS_I(dir), &index); >> +if (ret) { >> +btrfs_abort_transaction(trans, ret); >> +return ret; >> +} >> + >> +key.objectid = objectid; >> +key.type = BTRFS_ROOT_ITEM_KEY; >> +key.offset = -1; >> +ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key, >> +BTRFS_FT_DIR, index); >> +if (ret) { >> +btrfs_abort_transaction(trans, ret); >> +return ret; >> +} >> + >> +btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2); >> +ret = btrfs_update_inode(trans, root, dir); >> +BUG_ON(ret); > >What about clean up this BUG_ON()? > >> + >> +ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid, >> + btrfs_ino(BTRFS_I(dir)), index, name, namelen); >> +BUG_ON(ret); > >And this one? Sorry, this makes you confused. This is exactly the cleanup done by Patch 2, because I want to just move the code in Patch 1. 
Thanks, Lu > >Thanks, >Qu > >> + >> +return ret; >> +} >> + >> static noinline int create_subvol(struct inode *dir, >>struct dentry *dentry, >>const char *name, int namelen, >> @@ -563,7 +602,6 @@ static noinline int create_subvol(struct inode *dir, >> int err; >> u64 objectid; >> u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; >> -u64 index = 0; >> uuid_le new_uuid; >> >> root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); >> @@ -677,29 +715,9 @@ static noinline int create_subvol(struct inode *dir, >> new_root->highest_objectid = new_dirid; >> mutex_unlock(&new_root->objectid_mutex); >> >> -/* >> - * insert the directory item >> - */ >> -ret = btrfs_set_inode_index(BTRFS_I(dir), &index); >> -if (ret) { >> -btrfs_abort_transaction(trans, ret); >> -goto fail; >> -} >> - >> -ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key, >> -BTRFS_FT_DIR, index); >> -if (ret) { >> -btrfs_abort_transaction(trans, ret); >> +ret = btrfs_link_subvol(trans, dir, objectid, name, namelen); >> +if (ret) >> goto fail; >> -} >> - >> -btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2); >> -ret = btrfs_update_inode(trans, root, dir); >> -BUG_ON(ret); >> - >> -ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid, >> - btrfs_ino(BTRFS_I(dir)), index, name, namelen); >> -BUG_ON(ret); >> >> ret = btrfs_uuid_tree_add(trans, root_item->uuid, >>BTRFS_UUID_KEY_SUBVOL, objectid); >> >
Re: [PATCH v3 00/10] undelete subvolume offline version
On Mon, May 07, 2018 at 11:10:23AM +0800, Lu Fengqi wrote: >This patchset will add undelete-subvol subcommand for btrfs rescue. > Hi David Although there are some disagreements about undeleting subvolumes online implementation, the offline version is considered more acceptable. Would you like to spend some time talking about your opinions? -- Thanks, Lu >Patchset can be fetched from github: >https://github.com/littleroad/btrfs-progs.git undelete > >v2->v3: fixed some issues pointed out by Qu. >v1->v2: add -s option to allow user specify the subvolume which will be >recovered. > >The first patch are not modified. >For the rest, please see the changelog in the patches. > >Lu Fengqi (10): > btrfs-progs: copy btrfs_del_orphan_item from kernel > btrfs-progs: extract btrfs_link_subvol from btrfs_mksubvol > btrfs-progs: use btrfs_find_free_dir_index to find free inode index > btrfs-progs: undelete-subvol: introduce is_subvol_intact > btrfs-progs: undelete-subvol: introduce recover_dead_root > btrfs-progs: undelete-subvol: introduce link_subvol_to_lostfound > btrfs-progs: undelete-subvol: introduce btrfs_undelete_subvols > btrfs-progs: undelete-subvol: add undelete-subvol subcommand > btrfs-progs: tests: add testcase for undelete-subvol > btrfs-progs: undelete-subvol: update completion and documentation > > Documentation/btrfs-rescue.asciidoc | 12 + > Makefile | 3 +- > btrfs-completion | 2 +- > cmds-rescue.c | 69 ++ > convert/main.c| 59 - > ctree.h | 8 +- > inode.c | 119 + > .../031-undelete-subvol/intact_subvolume.img | Bin 0 -> 4096 bytes > .../subvolume_in_drop_progress.raw.xz | Bin 0 -> 23452 bytes > tests/misc-tests/031-undelete-subvol/test.sh | 38 +++ > undelete-subvol.c | 227 ++ > undelete-subvol.h | 11 + > 12 files changed, 501 insertions(+), 47 deletions(-) > create mode 100644 tests/misc-tests/031-undelete-subvol/intact_subvolume.img > create mode 100644 > tests/misc-tests/031-undelete-subvol/subvolume_in_drop_progress.raw.xz > create mode 100755 
tests/misc-tests/031-undelete-subvol/test.sh > create mode 100644 undelete-subvol.c > create mode 100644 undelete-subvol.h > >-- >2.17.0 > > > >-- >To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in >the body of a message to majord...@vger.kernel.org >More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v2 2/2] btrfs-progs: subvolume: undelete: add btrfs subvolume undelete subcommand
Add the undelete subcommand, this is depend on the BTRFS_IOC_SUBVOL_UNDELETE ioctl. Signed-off-by: Lu Fengqi --- btrfs-completion | 2 +- cmds-subvolume.c | 70 2 files changed, 71 insertions(+), 1 deletion(-) diff --git a/btrfs-completion b/btrfs-completion index ae683f4ecf61..2b43fbd63023 100644 --- a/btrfs-completion +++ b/btrfs-completion @@ -30,7 +30,7 @@ _btrfs() local cmd=${words[1]} commands='subvolume filesystem balance device scrub check rescue restore inspect-internal property send receive quota qgroup replace help version' - commands_subvolume='create delete list snapshot find-new get-default set-default show sync' + commands_subvolume='create delete list snapshot find-new get-default set-default show sync undelete' commands_filesystem='defragment sync resize show df du label usage' commands_balance='start pause cancel resume status' commands_device='scan add delete remove ready stats usage' diff --git a/cmds-subvolume.c b/cmds-subvolume.c index e7a884af1f5d..228d0b9e9b34 100644 --- a/cmds-subvolume.c +++ b/cmds-subvolume.c @@ -1219,6 +1219,74 @@ out: return !!ret; } +static const char * const cmd_subvol_undelete_usage[] = { + "btrfs subvolume undelete [-n ] ", + "Undelete the subvolume of the given to .", + "", + "-n recover the subvolume with .", + NULL +}; + +static int cmd_subvol_undelete(int argc, char **argv) +{ + struct btrfs_ioctl_subvol_undelete_args args; + bool need_assign_name = true; + DIR *dirstream = NULL; + char *dest; + int fd = -1; + int ret; + + memset(&args, 0, sizeof(args)); + + while (1) { + int c = getopt(argc, argv, "n:"); + + if (c < 0) + break; + + switch (c) { + case 'n': + strncpy_null(args.name, optarg); + need_assign_name = false; + break; + default: + usage(cmd_subvol_undelete_usage); + } + } + if (!need_assign_name) { + if (!test_issubvolname(args.name)) { + error("invalid subvolume name: %s", args.name); + return -EINVAL; + } else if (strlen(args.name) > BTRFS_VOL_NAME_MAX) { + error("subvolume name too long: %s", 
args.name); + return -EINVAL; + } + } + + if (check_argc_exact(argc - optind, 2)) + usage(cmd_subvol_undelete_usage); + + args.subvol_id = arg_strtou64(argv[optind]); + if (need_assign_name) + snprintf(args.name, BTRFS_VOL_NAME_MAX, "sub_%llu", + args.subvol_id); + + dest = argv[optind + 1]; + fd = btrfs_open_dir(dest, &dirstream, 1); + if (fd < 0) { + error("can't access '%s'", dest); + return -1; + } + + ret = ioctl(fd, BTRFS_IOC_SUBVOL_UNDELETE, &args); + if (ret) + perror("BTRFS_IOC_SUBVOL_UNDELETE"); + + close_file_or_dir(fd, dirstream); + + return ret; +} + static const char subvolume_cmd_group_info[] = "manage subvolumes: create, delete, list, etc"; @@ -1237,6 +1305,8 @@ const struct cmd_group subvolume_cmd_group = { NULL, 0 }, { "show", cmd_subvol_show, cmd_subvol_show_usage, NULL, 0 }, { "sync", cmd_subvol_sync, cmd_subvol_sync_usage, NULL, 0 }, + { "undelete", cmd_subvol_undelete, cmd_subvol_undelete_usage, + NULL, 0 }, NULL_CMD_STRUCT } }; -- 2.18.0
[RFC PATCH v2 1/2] btrfs-progs: ioctl: add BTRFS_IOC_SUBVOL_UNDELETE to ioctl.h
Copied from uapi/linux/btrfs.h. Signed-off-by: Lu Fengqi --- ioctl.h | 7 +++ 1 file changed, 7 insertions(+) diff --git a/ioctl.h b/ioctl.h index 709e996f401c..75978a4e8265 100644 --- a/ioctl.h +++ b/ioctl.h @@ -670,6 +670,11 @@ struct btrfs_ioctl_send_args_64 { } __attribute__((packed)); BUILD_ASSERT(sizeof(struct btrfs_ioctl_send_args_64) == 72); +struct btrfs_ioctl_subvol_undelete_args { + __u64 subvol_id; + char name[BTRFS_PATH_NAME_MAX + 1]; +}; + #define BTRFS_IOC_SEND_64_COMPAT_DEFINED 1 /* Error codes as returned by the kernel */ @@ -828,6 +833,8 @@ static inline char *btrfs_err_str(enum btrfs_err_code err_code) struct btrfs_ioctl_feature_flags[3]) #define BTRFS_IOC_RM_DEV_V2_IOW(BTRFS_IOCTL_MAGIC, 58, \ struct btrfs_ioctl_vol_args_v2) +#define BTRFS_IOC_SUBVOL_UNDELETE _IOWR(BTRFS_IOCTL_MAGIC, 63, \ + struct btrfs_ioctl_subvol_undelete_args) #ifdef __cplusplus } #endif -- 2.18.0
[RFC PATCH v2 4/4] btrfs: undelete: Add BTRFS_IOC_SUBVOL_UNDELETE ioctl
This ioctl will provide user the ability to recover the subvolume of the given id to the given directory. Note: It will lock fs_info->cleaner_mutex to keep the cleaner kthread from deleting the subvolume which we want to recover. Signed-off-by: Lu Fengqi --- fs/btrfs/ioctl.c | 64 ++ include/uapi/linux/btrfs.h | 7 + 2 files changed, 71 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f088dea53c16..3ddf6e1c117b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1993,6 +1993,68 @@ static int btrfs_undelete_subvolume(struct btrfs_root *root, return ret; } +static int btrfs_ioctl_undelete(struct file *file, void __user *argp) +{ + struct btrfs_ioctl_subvol_undelete_args *args; + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_root *root; + int ret = 0; + + if (!S_ISDIR(inode->i_mode)) + return -ENOTDIR; + + args = memdup_user(argp, sizeof(*args)); + if (IS_ERR(args)) + return PTR_ERR(args); + + args->name[BTRFS_PATH_NAME_MAX] = '\0'; + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto free; + } + + ret = mnt_want_write_file(file); + if (ret) + goto free; + + ret = -ENOENT; + spin_lock(&fs_info->trans_lock); + list_for_each_entry(root, &fs_info->dead_roots, root_list) { + if (root->root_key.objectid == args->subvol_id) { + list_del_init(&root->root_list); + ret = 0; + break; + } + } + spin_unlock(&fs_info->trans_lock); + if (ret) + goto drop_write; + + /* +* Lock cleaner_mutex to prevent the cleaner kthread from deleting the +* subvolume we want to recover so that we can perform the next rescue +* in a relaxed manner. 
+*/ + mutex_lock(&fs_info->cleaner_mutex); + + ret = btrfs_undelete_subvolume(root, file->f_path.dentry, args->name, + strlen(args->name)); + if (ret) { + btrfs_add_dead_root(root); + goto unlock; + } + +unlock: + mutex_unlock(&fs_info->cleaner_mutex); +drop_write: + mnt_drop_write_file(file); +free: + kfree(args); + return ret; +} + static noinline int btrfs_ioctl_subvol_getflags(struct file *file, void __user *arg) { @@ -6118,6 +6180,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_get_subvol_rootref(file, argp); case BTRFS_IOC_INO_LOOKUP_USER: return btrfs_ioctl_ino_lookup_user(file, argp); + case BTRFS_IOC_SUBVOL_UNDELETE: + return btrfs_ioctl_undelete(file, argp); } return -ENOTTY; diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 5ca1d21fc4a7..e6d3c8e24bb8 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -816,6 +816,11 @@ struct btrfs_ioctl_get_subvol_rootref_args { __u8 align[7]; }; +struct btrfs_ioctl_subvol_undelete_args { + __u64 subvol_id; + char name[BTRFS_PATH_NAME_MAX + 1]; +}; + /* Error codes as returned by the kernel */ enum btrfs_err_code { BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET = 1, @@ -940,5 +945,7 @@ enum btrfs_err_code { struct btrfs_ioctl_get_subvol_rootref_args) #define BTRFS_IOC_INO_LOOKUP_USER _IOWR(BTRFS_IOCTL_MAGIC, 62, \ struct btrfs_ioctl_ino_lookup_user_args) +#define BTRFS_IOC_SUBVOL_UNDELETE _IOWR(BTRFS_IOCTL_MAGIC, 63, \ + struct btrfs_ioctl_subvol_undelete_args) #endif /* _UAPI_LINUX_BTRFS_H */ -- 2.18.0
[RFC PATCH v2 3/4] btrfs: undelete: introduce btrfs_undelete_subvolume
The function will do the following things which are almost the opposite of what btrfs_delete_subvolume() does: 1. link the subvolume to the parent specified; 2. clear root flag and set root_refs to 1; 3. add the subvol to the uuid_tree; 4. delete the orphan_item. Signed-off-by: Lu Fengqi --- fs/btrfs/ioctl.c | 113 +++ 1 file changed, 113 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f6173d4e7ced..f088dea53c16 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1880,6 +1880,119 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, return ret; } +static int btrfs_undelete_subvolume(struct btrfs_root *root, + struct dentry *parent, const char *name, + int namelen) +{ + struct inode *dir = d_inode(parent); + struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_root_item *root_item = &root->root_item; + struct btrfs_trans_handle *trans; + struct btrfs_block_rsv block_rsv; + struct dentry *dentry; + struct inode *inode; + u64 root_flags; + int ret; + + btrfs_debug(fs_info, "Undelete subvolume %llu", + root->root_key.objectid); + + /* only care about the intact subvolume */ + if (btrfs_disk_key_objectid(&root_item->drop_progress) != 0) + return 0; + + ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); + if (ret == -EINTR) + return ret; + + dentry = lookup_one_len(name, parent, namelen); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + goto out_unlock; + } + + down_write(&fs_info->subvol_sem); + + ret = btrfs_may_create(dir, dentry); + if (ret) + goto out_up_write; + + ret = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, dir->i_ino, +name, namelen); + if (ret) + goto out_up_write; + + btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); + /* +* 1 - parent dir inode +* 2 - dir entries +* 2 - root ref/backref +* 1 - UUID item +*/ + ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 6, false); + if (ret) + goto out_up_write; + + trans = btrfs_start_transaction(BTRFS_I(dir)->root, 0); + if 
(IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_subvolume_release_metadata(fs_info, &block_rsv); + goto out_up_write; + } + + trans->block_rsv = &block_rsv; + trans->bytes_reserved = block_rsv.size; + + ret = btrfs_link_subvol(trans, dir, root->root_key.objectid, name, + namelen); + if (ret) + goto fail; + + /* clear BTRFS_ROOT_SUBVOL_DEAD root flag and set root_refs to 1*/ + root_flags = btrfs_root_flags(root_item); + btrfs_set_root_flags(root_item, +root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); + btrfs_set_root_refs(root_item, 1); + ret = btrfs_update_root(trans, fs_info->tree_root, + &root->root_key, &root->root_item); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } + + ret = btrfs_uuid_tree_add(trans, root_item->uuid, BTRFS_UUID_KEY_SUBVOL, + root->root_key.objectid); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } + + ret = btrfs_del_orphan_item(trans, fs_info->tree_root, + root->root_key.objectid); + if (ret && ret != -ENOENT) { + btrfs_abort_transaction(trans, ret); + goto fail; + } +fail: + trans->block_rsv = NULL; + trans->bytes_reserved = 0; + btrfs_subvolume_release_metadata(fs_info, &block_rsv); + ret = btrfs_commit_transaction(trans); + if (!ret) { + inode = btrfs_lookup_dentry(dir, dentry); + if (IS_ERR(inode)) + return PTR_ERR(inode); + d_instantiate(dentry, inode); + fsnotify_mkdir(dir, dentry); + } +out_up_write: + up_write(&fs_info->subvol_sem); + dput(dentry); +out_unlock: + inode_unlock(dir); + return ret; +} + static noinline int btrfs_ioctl_subvol_getflags(struct file *file, void __user *arg) { -- 2.18.0
[RFC PATCH v2 0/4] undelete subvolume online version
This patchset will add the BTRFS_IOC_SUBVOL_UNDELETE ioctl for online btrfs subvolume undelete. And using the online_undelete version of btrfs-progs, user can recover the subvolume given by to the directory given by . The optional parameter [-n ] can be used to set the name of the recovered subvolume. # btrfs subvolume undelete [-n ] btrfs online undelete version: https://github.com/littleroad/linux.git undelete btrfs-progs online undelete version: https://github.com/littleroad/btrfs-progs.git online_undelete Issue: #82 Lu Fengqi (4): btrfs: factor out btrfs_link_subvol from create_subvol btrfs: don't BUG_ON() in btrfs_link_subvol() btrfs: undelete: introduce btrfs_undelete_subvolume btrfs: undelete: Add BTRFS_IOCTL_SUBVOL_UNDELETE ioctl fs/btrfs/ioctl.c | 247 + include/uapi/linux/btrfs.h | 7 ++ 2 files changed, 231 insertions(+), 23 deletions(-) -- 2.18.0
[RFC PATCH v2 1/4] btrfs: factor out btrfs_link_subvol from create_subvol
The function btrfs_link_subvol is responsible to link the subvolume to the specified directory, which is the opposite of what btrfs_unlink_subvol does. No functional change. Signed-off-by: Lu Fengqi --- fs/btrfs/ioctl.c | 64 +++- 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4905d13dee0a..1b03d07acde2 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -542,6 +542,45 @@ int btrfs_is_empty_uuid(u8 *uuid) return 1; } +static int btrfs_link_subvol(struct btrfs_trans_handle *trans, +struct inode *dir, u64 objectid, const char *name, +int namelen) +{ + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_key key; + u64 index = 0; + int ret; + + /* +* insert the directory item +*/ + ret = btrfs_set_inode_index(BTRFS_I(dir), &index); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + key.objectid = objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = -1; + ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key, + BTRFS_FT_DIR, index); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2); + ret = btrfs_update_inode(trans, root, dir); + BUG_ON(ret); + + ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid, +btrfs_ino(BTRFS_I(dir)), index, name, namelen); + BUG_ON(ret); + + return ret; +} + static noinline int create_subvol(struct inode *dir, struct dentry *dentry, const char *name, int namelen, @@ -563,7 +602,6 @@ static noinline int create_subvol(struct inode *dir, int err; u64 objectid; u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; - u64 index = 0; uuid_le new_uuid; root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); @@ -677,29 +715,9 @@ static noinline int create_subvol(struct inode *dir, new_root->highest_objectid = new_dirid; mutex_unlock(&new_root->objectid_mutex); - /* -* insert the directory item -*/ - ret = btrfs_set_inode_index(BTRFS_I(dir), &index); - if (ret) 
{ - btrfs_abort_transaction(trans, ret); - goto fail; - } - - ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key, - BTRFS_FT_DIR, index); - if (ret) { - btrfs_abort_transaction(trans, ret); + ret = btrfs_link_subvol(trans, dir, objectid, name, namelen); + if (ret) goto fail; - } - - btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2); - ret = btrfs_update_inode(trans, root, dir); - BUG_ON(ret); - - ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid, -btrfs_ino(BTRFS_I(dir)), index, name, namelen); - BUG_ON(ret); ret = btrfs_uuid_tree_add(trans, root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); -- 2.18.0
[RFC PATCH v2 2/4] btrfs: don't BUG_ON() in btrfs_link_subvol()
Both of btrfs_update_inode() and btrfs_add_root_ref() may fail because of ENOMEM. So there's no reason to panic here, we can replace BUG_ON() with btrfs_abort_transaction() here. Signed-off-by: Lu Fengqi --- fs/btrfs/ioctl.c | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 1b03d07acde2..f6173d4e7ced 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -572,11 +572,17 @@ static int btrfs_link_subvol(struct btrfs_trans_handle *trans, btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2); ret = btrfs_update_inode(trans, root, dir); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid, btrfs_ino(BTRFS_I(dir)), index, name, namelen); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } return ret; } -- 2.18.0
[PATCH] btrfs-progs: calibrate extent_end when a gap is found
The extent_end will be used to check whether there is gap between this extent and next extent. If it is not calibrated, check_file_extent will mistake that there are gaps between the remaining extents. Signed-off-by: Lu Fengqi --- check/mode-lowmem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/check/mode-lowmem.c b/check/mode-lowmem.c index 1bce44f5658a..0f14a4968e84 100644 --- a/check/mode-lowmem.c +++ b/check/mode-lowmem.c @@ -1972,6 +1972,7 @@ static int check_file_extent(struct btrfs_root *root, struct btrfs_path *path, root->objectid, fkey.objectid, fkey.offset, fkey.objectid, *end); } + *end = fkey.offset; } *end += extent_num_bytes; -- 2.18.0
[PATCH v10.5 2/5] btrfs-progs: dedupe: Add enable command for dedupe command group
From: Qu Wenruo Add enable subcommand for dedupe commmand group. Signed-off-by: Qu Wenruo Signed-off-by: Lu Fengqi --- Documentation/btrfs-dedupe-inband.asciidoc | 114 +- btrfs-completion | 6 +- cmds-dedupe-ib.c | 238 + ioctl.h| 2 + 4 files changed, 358 insertions(+), 2 deletions(-) diff --git a/Documentation/btrfs-dedupe-inband.asciidoc b/Documentation/btrfs-dedupe-inband.asciidoc index 83113f5487e2..d895aafbcf45 100644 --- a/Documentation/btrfs-dedupe-inband.asciidoc +++ b/Documentation/btrfs-dedupe-inband.asciidoc @@ -22,7 +22,119 @@ use with caution. SUBCOMMAND -- -Nothing yet +*enable* [options] :: +Enable in-band de-duplication for a filesystem. ++ +`Options` ++ +-f|--force +Force 'enable' command to be exected. +Will skip memory limit check and allow 'enable' to be executed even in-band +de-duplication is already enabled. ++ +NOTE: If re-enable dedupe with '-f' option, any unspecified parameter will be +reset to its default value. + +-s|--storage-backend +Specify de-duplication hash storage backend. +Only 'inmemory' backend is supported yet. +If not specified, default value is 'inmemory'. ++ +Refer to *BACKENDS* sector for more information. + +-b|--blocksize +Specify dedupe block size. +Supported values are power of 2 from '16K' to '8M'. +Default value is '128K'. ++ +Refer to *BLOCKSIZE* sector for more information. + +-a|--hash-algorithm +Specify hash algorithm. +Only 'sha256' is supported yet. + +-l|--limit-hash +Specify maximum number of hashes stored in memory. +Only works for 'inmemory' backend. +Conflicts with '-m' option. ++ +Only positive values are valid. +Default value is '32K'. + +-m|--limit-memory +Specify maximum memory used for hashes. +Only works for 'inmemory' backend. +Conflicts with '-l' option. ++ +Only value larger than or equal to '1024' is valid. +No default value. ++ +NOTE: Memory limit will be rounded down to kernel internal hash size, +so the memory limit shown in 'btrfs dedupe-inband status' may be different +from the . 
+ +WARNING: Too large value for '-l' or '-m' will easily trigger OOM. +Please use with caution according to system memory. + +NOTE: In-band de-duplication is not compactible with compression yet. +And compression has higher priority than in-band de-duplication, means if +compression and de-duplication is enabled at the same time, only compression +will work. + +BACKENDS + +Btrfs in-band de-duplication will support different storage backends, with +different use case and features. + +In-memory backend:: +This backend provides backward-compatibility, and more fine-tuning options. +But hash pool is non-persistent and may exhaust kernel memory if not setup +properly. ++ +This backend can be used on old btrfs(without '-O dedupe' mkfs option). +When used on old btrfs, this backend needs to be enabled manually after mount. ++ +Designed for fast hash search speed, in-memory backend will keep all dedupe +hashes in memory. (Although overall performance is still much the same with +'ondisk' backend if all 'ondisk' hash can be cached in memory) ++ +And only keeps limited number of hash in memory to avoid exhausting memory. +Hashes over the limit will be dropped following Last-Recent-Use behavior. +So this backend has a consistent overhead for given limit but can\'t ensure +all duplicated blocks will be de-duplicated. ++ +After umount and mount, in-memory backend need to refill its hash pool. + +On-disk backend:: +This backend provides persistent hash pool, with more smart memory management +for hash pool. +But it\'s not backward-compatible, meaning it must be used with '-O dedupe' mkfs +option and older kernel can\'t mount it read-write. ++ +Designed for de-duplication rate, hash pool is stored as btrfs B+ tree on disk. +This behavior may cause extra disk IO for hash search under high memory +pressure. ++ +After umount and mount, on-disk backend still has its hash on disk, no need to +refill its dedupe hash pool. 
+ +Currently, only 'inmemory' backend is supported in btrfs-progs. + +DEDUPE BLOCK SIZE + +In-band de-duplication is done at dedupe block size. +Any data smaller than dedupe block size won\'t go through in-band +de-duplication. + +And dedupe block size affects dedupe rate and fragmentation heavily. + +Smaller block size will cause more fragments, but higher dedupe rate. + +Larger block size will cause less fragments, but lower dedupe rate. + +In-band de-duplication rate is highly related to the workload pattern. +So it\'s highly recommended to align dedupe block size to the workload +block size to make full use of d
[PATCH v10.5 5/5] btrfs-progs: dedupe: introduce reconfigure subcommand
From: Qu Wenruo Introduce reconfigure subcommand to co-operate with new kernel ioctl modification. Signed-off-by: Qu Wenruo Signed-off-by: Lu Fengqi --- Documentation/btrfs-dedupe-inband.asciidoc | 7 +++ btrfs-completion | 2 +- cmds-dedupe-ib.c | 73 +- 3 files changed, 66 insertions(+), 16 deletions(-) diff --git a/Documentation/btrfs-dedupe-inband.asciidoc b/Documentation/btrfs-dedupe-inband.asciidoc index 6096389cb0b4..78c806f772d6 100644 --- a/Documentation/btrfs-dedupe-inband.asciidoc +++ b/Documentation/btrfs-dedupe-inband.asciidoc @@ -86,6 +86,13 @@ And compression has higher priority than in-band de-duplication, means if compression and de-duplication is enabled at the same time, only compression will work. +*reconfigure* [options] :: +Re-configure in-band de-duplication parameters of a filesystem. ++ +In-band de-duplication must be enbaled first before re-configuration. ++ +[Options] are the same with 'btrfs dedupe-inband enable'. + *status* :: Show current in-band de-duplication status of a filesystem. 
diff --git a/btrfs-completion b/btrfs-completion index 62a7bdd4d0d5..6ff48e4c2f6a 100644 --- a/btrfs-completion +++ b/btrfs-completion @@ -41,7 +41,7 @@ _btrfs() commands_quota='enable disable rescan' commands_qgroup='assign remove create destroy show limit' commands_replace='start status cancel' - commands_dedupe_inband='enable disable status' + commands_dedupe_inband='enable disable status reconfigure' if [[ "$cur" == -* && $cword -le 3 && "$cmd" != "help" ]]; then COMPREPLY=( $( compgen -W '--help' -- "$cur" ) ) diff --git a/cmds-dedupe-ib.c b/cmds-dedupe-ib.c index e778457e25a8..e52f939c9ced 100644 --- a/cmds-dedupe-ib.c +++ b/cmds-dedupe-ib.c @@ -56,7 +56,6 @@ static const char * const cmd_dedupe_ib_enable_usage[] = { NULL }; - #define report_fatal_parameter(dargs, old, member, type, err_val, fmt) \ ({ \ if (dargs->member != old->member && \ @@ -88,6 +87,12 @@ static void report_parameter_error(struct btrfs_ioctl_dedupe_args *dargs, } report_option_parameter(dargs, old, flags, u8, -1, x); } + + if (dargs->status == 0 && old->cmd == BTRFS_DEDUPE_CTL_RECONF) { + error("must enable dedupe before reconfiguration"); + return; + } + if (report_fatal_parameter(dargs, old, cmd, u16, -1, u) || report_fatal_parameter(dargs, old, blocksize, u64, -1, llu) || report_fatal_parameter(dargs, old, backend, u16, -1, u) || @@ -100,14 +105,17 @@ static void report_parameter_error(struct btrfs_ioctl_dedupe_args *dargs, old->limit_nr, old->limit_mem); } -static int cmd_dedupe_ib_enable(int argc, char **argv) +static int enable_reconfig_dedupe(int argc, char **argv, int reconf) { int ret; int fd = -1; char *path; u64 blocksize = BTRFS_DEDUPE_BLOCKSIZE_DEFAULT; + int blocksize_set = 0; u16 hash_algo = BTRFS_DEDUPE_HASH_SHA256; + int hash_algo_set = 0; u16 backend = BTRFS_DEDUPE_BACKEND_INMEMORY; + int backend_set = 0; u64 limit_nr = 0; u64 limit_mem = 0; u64 sys_mem = 0; @@ -134,15 +142,17 @@ static int cmd_dedupe_ib_enable(int argc, char **argv) break; switch (c) { case 's': - if 
(!strcasecmp("inmemory", optarg)) + if (!strcasecmp("inmemory", optarg)) { backend = BTRFS_DEDUPE_BACKEND_INMEMORY; - else { + backend_set = 1; + } else { error("unsupported dedupe backend: %s", optarg); exit(1); } break; case 'b': blocksize = parse_size(optarg); + blocksize_set = 1; break; case 'a': if (strcmp("sha256", optarg)) { @@ -224,26 +234,40 @@ static int cmd_dedupe_ib_enable(int argc, char **argv) return 1; } memset(&dargs, -1, sizeof(dargs)); - dargs.cmd = BTRFS_DEDUPE_CTL_ENABLE; - dargs.blocksize = blocksize; - dargs.hash_algo = hash_algo; - dargs.limit_nr = limit_nr; - dargs.limit_mem = limit_mem; - dargs.backend = backend; - if (force) - dargs.flags |= BTRFS_DEDUPE_FLAG_FORCE
[PATCH v10.5 3/5] btrfs-progs: dedupe: Add disable support for inband deduplication
From: Qu Wenruo Add disable subcommand for dedupe command group. Signed-off-by: Qu Wenruo Signed-off-by: Lu Fengqi --- Documentation/btrfs-dedupe-inband.asciidoc | 5 +++ btrfs-completion | 2 +- cmds-dedupe-ib.c | 41 ++ 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/Documentation/btrfs-dedupe-inband.asciidoc b/Documentation/btrfs-dedupe-inband.asciidoc index d895aafbcf45..3452f690e3e5 100644 --- a/Documentation/btrfs-dedupe-inband.asciidoc +++ b/Documentation/btrfs-dedupe-inband.asciidoc @@ -22,6 +22,11 @@ use with caution. SUBCOMMAND -- +*disable* :: +Disable in-band de-duplication for a filesystem. ++ +This will trash all stored dedupe hash. ++ *enable* [options] :: Enable in-band de-duplication for a filesystem. + diff --git a/btrfs-completion b/btrfs-completion index cfdf70966e47..a74a23f42022 100644 --- a/btrfs-completion +++ b/btrfs-completion @@ -41,7 +41,7 @@ _btrfs() commands_quota='enable disable rescan' commands_qgroup='assign remove create destroy show limit' commands_replace='start status cancel' - commands_dedupe_inband='enable' + commands_dedupe_inband='enable disable' if [[ "$cur" == -* && $cword -le 3 && "$cmd" != "help" ]]; then COMPREPLY=( $( compgen -W '--help' -- "$cur" ) ) diff --git a/cmds-dedupe-ib.c b/cmds-dedupe-ib.c index 4d499677d9ae..91b6fe234043 100644 --- a/cmds-dedupe-ib.c +++ b/cmds-dedupe-ib.c @@ -259,10 +259,51 @@ out: return ret; } +static const char * const cmd_dedupe_ib_disable_usage[] = { + "btrfs dedupe-inband disable ", + "Disable in-band(write time) de-duplication of a btrfs.", + NULL +}; + +static int cmd_dedupe_ib_disable(int argc, char **argv) +{ + struct btrfs_ioctl_dedupe_args dargs; + DIR *dirstream; + char *path; + int fd; + int ret; + + if (check_argc_exact(argc, 2)) + usage(cmd_dedupe_ib_disable_usage); + + path = argv[1]; + fd = open_file_or_dir(path, &dirstream); + if (fd < 0) { + error("failed to open file or directory: %s", path); + return 1; + } + memset(&dargs, 0, sizeof(dargs)); + dargs.cmd = 
BTRFS_DEDUPE_CTL_DISABLE; + + ret = ioctl(fd, BTRFS_IOC_DEDUPE_CTL, &dargs); + if (ret < 0) { + error("failed to disable inband deduplication: %m"); + ret = 1; + goto out; + } + ret = 0; + +out: + close_file_or_dir(fd, dirstream); + return 0; +} + const struct cmd_group dedupe_ib_cmd_group = { dedupe_ib_cmd_group_usage, dedupe_ib_cmd_group_info, { { "enable", cmd_dedupe_ib_enable, cmd_dedupe_ib_enable_usage, NULL, 0}, + { "disable", cmd_dedupe_ib_disable, cmd_dedupe_ib_disable_usage, + NULL, 0}, NULL_CMD_STRUCT } }; -- 2.18.0
[PATCH v10.5 4/5] btrfs-progs: dedupe: Add status subcommand
From: Qu Wenruo Add status subcommand for dedupe command group. Signed-off-by: Qu Wenruo Signed-off-by: Lu Fengqi --- Documentation/btrfs-dedupe-inband.asciidoc | 3 + btrfs-completion | 2 +- cmds-dedupe-ib.c | 80 ++ 3 files changed, 84 insertions(+), 1 deletion(-) diff --git a/Documentation/btrfs-dedupe-inband.asciidoc b/Documentation/btrfs-dedupe-inband.asciidoc index 3452f690e3e5..6096389cb0b4 100644 --- a/Documentation/btrfs-dedupe-inband.asciidoc +++ b/Documentation/btrfs-dedupe-inband.asciidoc @@ -86,6 +86,9 @@ And compression has higher priority than in-band de-duplication, means if compression and de-duplication is enabled at the same time, only compression will work. +*status* :: +Show current in-band de-duplication status of a filesystem. + BACKENDS Btrfs in-band de-duplication will support different storage backends, with diff --git a/btrfs-completion b/btrfs-completion index a74a23f42022..62a7bdd4d0d5 100644 --- a/btrfs-completion +++ b/btrfs-completion @@ -41,7 +41,7 @@ _btrfs() commands_quota='enable disable rescan' commands_qgroup='assign remove create destroy show limit' commands_replace='start status cancel' - commands_dedupe_inband='enable disable' + commands_dedupe_inband='enable disable status' if [[ "$cur" == -* && $cword -le 3 && "$cmd" != "help" ]]; then COMPREPLY=( $( compgen -W '--help' -- "$cur" ) ) diff --git a/cmds-dedupe-ib.c b/cmds-dedupe-ib.c index 91b6fe234043..e778457e25a8 100644 --- a/cmds-dedupe-ib.c +++ b/cmds-dedupe-ib.c @@ -298,12 +298,92 @@ out: return 0; } +static const char * const cmd_dedupe_ib_status_usage[] = { + "btrfs dedupe-inband status ", + "Show current in-band(write time) de-duplication status of a btrfs.", + NULL +}; + +static int cmd_dedupe_ib_status(int argc, char **argv) +{ + struct btrfs_ioctl_dedupe_args dargs; + DIR *dirstream; + char *path; + int fd; + int ret; + int print_limit = 1; + + if (check_argc_exact(argc, 2)) + usage(cmd_dedupe_ib_status_usage); + + path = argv[1]; + fd = open_file_or_dir(path, 
&dirstream); + if (fd < 0) { + error("failed to open file or directory: %s", path); + ret = 1; + goto out; + } + memset(&dargs, 0, sizeof(dargs)); + dargs.cmd = BTRFS_DEDUPE_CTL_STATUS; + + ret = ioctl(fd, BTRFS_IOC_DEDUPE_CTL, &dargs); + if (ret < 0) { + error("failed to get inband deduplication status: %m"); + ret = 1; + goto out; + } + ret = 0; + if (dargs.status == 0) { + printf("Status: \t\t\tDisabled\n"); + goto out; + } + printf("Status:\t\t\tEnabled\n"); + + if (dargs.hash_algo == BTRFS_DEDUPE_HASH_SHA256) + printf("Hash algorithm:\t\tSHA-256\n"); + else + printf("Hash algorithm:\t\tUnrecognized(%x)\n", + dargs.hash_algo); + + if (dargs.backend == BTRFS_DEDUPE_BACKEND_INMEMORY) { + printf("Backend:\t\tIn-memory\n"); + print_limit = 1; + } else { + printf("Backend:\t\tUnrecognized(%x)\n", + dargs.backend); + } + + printf("Dedup Blocksize:\t%llu\n", dargs.blocksize); + + if (print_limit) { + u64 cur_mem; + + /* Limit nr may be 0 */ + if (dargs.limit_nr) + cur_mem = dargs.current_nr * (dargs.limit_mem / + dargs.limit_nr); + else + cur_mem = 0; + + printf("Number of hash: \t[%llu/%llu]\n", dargs.current_nr, + dargs.limit_nr); + printf("Memory usage: \t\t[%s/%s]\n", + pretty_size(cur_mem), + pretty_size(dargs.limit_mem)); + } +out: + close_file_or_dir(fd, dirstream); + return ret; +} + const struct cmd_group dedupe_ib_cmd_group = { dedupe_ib_cmd_group_usage, dedupe_ib_cmd_group_info, { { "enable", cmd_dedupe_ib_enable, cmd_dedupe_ib_enable_usage, NULL, 0}, { "disable", cmd_dedupe_ib_disable, cmd_dedupe_ib_disable_usage, NULL, 0}, + { "status", cmd_dedupe_ib_status, cmd_dedupe_ib_status_usage, + NULL, 0}, NULL_CMD_STRUCT } }; -- 2.18.0
[PATCH v10.5 0/5] In-band de-duplication for btrfs-progs
Patchset can be fetched from github: https://github.com/littleroad/btrfs-progs.git dedupe_latest Inband dedupe(in-memory backend only) ioctl support for btrfs-progs. v7 changes: Update ctree.h to follow kernel structure change Update print-tree to follow kernel structure change V8 changes: Move dedup props and on-disk backend support out of the patchset Change command group name to "dedupe-inband", to avoid confusion with possible out-of-band dedupe. Suggested by Mark. Rebase to latest devel branch. V9 changes: Follow kernels ioctl change to support FORCE flag, new reconf ioctl, and more precious error reporting. v10 changes: Rebase to v4.10. Add BUILD_ASSERT for btrfs_ioctl_dedupe_args v10.1 changes: Rebase to v4.14. v10.2 changes: Rebase to v4.16.1. v10.3 changes: Rebase to v4.17. v10.4 changes: Deal with offline reviews from Misono Tomohiro. 1. s/btrfs-dedupe/btrfs-dedupe-inband 2. Replace strerror(errno) with %m 3. Use SZ_* instead of intermedia number 4. update btrfs-completion for reconfigure subcommand v10.5 changes: Rebase to v4.17.1. Qu Wenruo (5): btrfs-progs: Basic framework for dedupe-inband command group btrfs-progs: dedupe: Add enable command for dedupe command group btrfs-progs: dedupe: Add disable support for inband dedupelication btrfs-progs: dedupe: Add status subcommand btrfs-progs: dedupe: introduce reconfigure subcommand Documentation/Makefile.in | 1 + Documentation/btrfs-dedupe-inband.asciidoc | 167 Documentation/btrfs.asciidoc | 4 + Makefile | 3 +- btrfs-completion | 6 +- btrfs.c| 2 + cmds-dedupe-ib.c | 437 + commands.h | 2 + dedupe-ib.h| 28 ++ ioctl.h| 38 ++ 10 files changed, 686 insertions(+), 2 deletions(-) create mode 100644 Documentation/btrfs-dedupe-inband.asciidoc create mode 100644 cmds-dedupe-ib.c create mode 100644 dedupe-ib.h -- 2.18.0
[PATCH v10.5 1/5] btrfs-progs: Basic framework for dedupe-inband command group
From: Qu Wenruo Add basic ioctl header and command group framework for later use. Alone with basic man page doc. Signed-off-by: Qu Wenruo Signed-off-by: Lu Fengqi --- Documentation/Makefile.in | 1 + Documentation/btrfs-dedupe-inband.asciidoc | 40 ++ Documentation/btrfs.asciidoc | 4 +++ Makefile | 3 +- btrfs.c| 2 ++ cmds-dedupe-ib.c | 35 +++ commands.h | 2 ++ dedupe-ib.h| 28 +++ ioctl.h| 36 +++ 9 files changed, 150 insertions(+), 1 deletion(-) create mode 100644 Documentation/btrfs-dedupe-inband.asciidoc create mode 100644 cmds-dedupe-ib.c create mode 100644 dedupe-ib.h diff --git a/Documentation/Makefile.in b/Documentation/Makefile.in index 184647c41940..402155fae001 100644 --- a/Documentation/Makefile.in +++ b/Documentation/Makefile.in @@ -28,6 +28,7 @@ MAN8_TXT += btrfs-qgroup.asciidoc MAN8_TXT += btrfs-replace.asciidoc MAN8_TXT += btrfs-restore.asciidoc MAN8_TXT += btrfs-property.asciidoc +MAN8_TXT += btrfs-dedupe-inband.asciidoc # Category 5 manual page MAN5_TXT += btrfs-man5.asciidoc diff --git a/Documentation/btrfs-dedupe-inband.asciidoc b/Documentation/btrfs-dedupe-inband.asciidoc new file mode 100644 index ..83113f5487e2 --- /dev/null +++ b/Documentation/btrfs-dedupe-inband.asciidoc @@ -0,0 +1,40 @@ +btrfs-dedupe-inband(8) +== + +NAME + +btrfs-dedupe-inband - manage in-band (write time) de-duplication of a btrfs +filesystem + +SYNOPSIS + +*btrfs dedupe-inband* + +DESCRIPTION +--- +*btrfs dedupe-inband* is used to enable/disable or show current in-band de-duplication +status of a btrfs filesystem. + +Kernel support for in-band de-duplication starts from 4.19. + +WARNING: In-band de-duplication is still an experimental feautre of btrfs, +use with caution. + +SUBCOMMAND +-- +Nothing yet + +EXIT STATUS +--- +*btrfs dedupe-inband* returns a zero exit status if it succeeds. Non zero is +returned in case of failure. + +AVAILABILITY + +*btrfs* is part of btrfs-progs. +Please refer to the btrfs wiki http://btrfs.wiki.kernel.org for +further details. 
+ +SEE ALSO + +`mkfs.btrfs`(8), diff --git a/Documentation/btrfs.asciidoc b/Documentation/btrfs.asciidoc index 7316ac094413..1cf5bddec335 100644 --- a/Documentation/btrfs.asciidoc +++ b/Documentation/btrfs.asciidoc @@ -50,6 +50,10 @@ COMMANDS Do off-line check on a btrfs filesystem. + See `btrfs-check`(8) for details. +*dedupe-inband*:: + Control btrfs in-band(write time) de-duplication. + + See `btrfs-dedupe-inband`(8) for details. + *device*:: Manage devices managed by btrfs, including add/delete/scan and so on. + diff --git a/Makefile b/Makefile index fcfc815a2a5b..4052cecfae4d 100644 --- a/Makefile +++ b/Makefile @@ -123,7 +123,8 @@ cmds_objects = cmds-subvolume.o cmds-filesystem.o cmds-device.o cmds-scrub.o \ cmds-restore.o cmds-rescue.o chunk-recover.o super-recover.o \ cmds-property.o cmds-fi-usage.o cmds-inspect-dump-tree.o \ cmds-inspect-dump-super.o cmds-inspect-tree-stats.o cmds-fi-du.o \ - mkfs/common.o check/mode-common.o check/mode-lowmem.o + mkfs/common.o check/mode-common.o check/mode-lowmem.o \ + cmds-dedupe-ib.o libbtrfs_objects = send-stream.o send-utils.o kernel-lib/rbtree.o btrfs-list.o \ kernel-lib/crc32c.o messages.o \ uuid-tree.o utils-lib.o rbtree-utils.o diff --git a/btrfs.c b/btrfs.c index 2d39f2ced3e8..2168f5a8bc7f 100644 --- a/btrfs.c +++ b/btrfs.c @@ -255,6 +255,8 @@ static const struct cmd_group btrfs_cmd_group = { { "quota", cmd_quota, NULL, "a_cmd_group, 0 }, { "qgroup", cmd_qgroup, NULL, &qgroup_cmd_group, 0 }, { "replace", cmd_replace, NULL, &replace_cmd_group, 0 }, + { "dedupe-inband", cmd_dedupe_ib, NULL, &dedupe_ib_cmd_group, + 0 }, { "help", cmd_help, cmd_help_usage, NULL, 0 }, { "version", cmd_version, cmd_version_usage, NULL, 0 }, NULL_CMD_STRUCT diff --git a/cmds-dedupe-ib.c b/cmds-dedupe-ib.c new file mode 100644 index ..73c923a797da --- /dev/null +++ b/cmds-dedupe-ib.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2017 Fujitsu. All rights reserved. 
+ */ + +#include +#include +#include + +#include "ctree.h" +#include "ioctl.h" + +#include "commands.h" +#include "utils.h" +#include "kerncompat.h" +#include "dedupe-ib.h" + +s
[PATCH v15 09/13] btrfs: introduce type based delalloc metadata reserve
From: Wang Xiaoguang Introduce type based metadata reserve parameter for delalloc space reservation/freeing function. The problem we are going to solve is, btrfs use different max extent size for different mount options. For de-duplication, the max extent size can be set by the dedupe ioctl, while for normal write it's 128M. And furthermore, split/merge extent hook highly depends that max extent size. Such situation contributes to quite a lot of false ENOSPC. So this patch introduces the facility to help solve these false ENOSPC related to different max extent size. Currently, only normal 128M extent size is supported. More types will follow soon. Signed-off-by: Wang Xiaoguang Signed-off-by: Qu Wenruo Signed-off-by: Lu Fengqi --- fs/btrfs/ctree.h | 43 ++--- fs/btrfs/extent-tree.c | 48 --- fs/btrfs/file.c | 30 + fs/btrfs/free-space-cache.c | 6 +- fs/btrfs/inode-map.c | 9 ++- fs/btrfs/inode.c | 115 +-- fs/btrfs/ioctl.c | 23 +++ fs/btrfs/ordered-data.c | 6 +- fs/btrfs/ordered-data.h | 3 +- fs/btrfs/relocation.c| 22 --- fs/btrfs/tests/inode-tests.c | 15 +++-- 11 files changed, 223 insertions(+), 97 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 741ef21a6185..4f0b6a12ecb1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -98,11 +98,24 @@ static const int btrfs_csum_sizes[] = { 4 }; /* * Count how many BTRFS_MAX_EXTENT_SIZE cover the @size */ -static inline u32 count_max_extents(u64 size) +static inline u32 count_max_extents(u64 size, u64 max_extent_size) { - return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); + return div_u64(size + max_extent_size - 1, max_extent_size); } +/* + * Type based metadata reserve type + * This affects how btrfs reserve metadata space for buffered write. 
+ * + * This is caused by the different max extent size for normal COW + * and further in-band dedupe + */ +enum btrfs_metadata_reserve_type { + BTRFS_RESERVE_NORMAL, +}; + +u64 btrfs_max_extent_size(enum btrfs_metadata_reserve_type reserve_type); + struct btrfs_mapping_tree { struct extent_map_tree map_tree; }; @@ -2742,8 +2755,9 @@ int btrfs_check_data_free_space(struct inode *inode, void btrfs_free_reserved_data_space(struct inode *inode, struct extent_changeset *reserved, u64 start, u64 len); void btrfs_delalloc_release_space(struct inode *inode, - struct extent_changeset *reserved, - u64 start, u64 len, bool qgroup_free); + struct extent_changeset *reserved, + u64 start, u64 len, bool qgroup_free, + enum btrfs_metadata_reserve_type reserve_type); void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, u64 len); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); @@ -2753,13 +2767,17 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, - bool qgroup_free); + bool qgroup_free, + enum btrfs_metadata_reserve_type reserve_type); -int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); +int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, + enum btrfs_metadata_reserve_type reserve_type); void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, -bool qgroup_free); + bool qgroup_free, + enum btrfs_metadata_reserve_type reserve_type); int btrfs_delalloc_reserve_space(struct inode *inode, - struct extent_changeset **reserved, u64 start, u64 len); + struct extent_changeset **reserved, u64 start, u64 len, + enum btrfs_metadata_reserve_type reserve_type); void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); struct btrfs_block_rsv 
*btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, unsigned short type); @@ -3165,7 +3183,11 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, unsigned int extra_bits, - struct extent_state **cached_state, int d
[PATCH v15 03/13] btrfs: dedupe: Introduce function to add hash into in-memory tree
From: Wang Xiaoguang Introduce static function inmem_add() to add hash into in-memory tree. And now we can implement the btrfs_dedupe_add() interface. Signed-off-by: Qu Wenruo Signed-off-by: Wang Xiaoguang Reviewed-by: Josef Bacik Signed-off-by: Lu Fengqi --- fs/btrfs/dedupe.c | 150 ++ 1 file changed, 150 insertions(+) diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c index 06523162753d..784bb3a8a5ab 100644 --- a/fs/btrfs/dedupe.c +++ b/fs/btrfs/dedupe.c @@ -19,6 +19,14 @@ struct inmem_hash { u8 hash[]; }; +static inline struct inmem_hash *inmem_alloc_hash(u16 algo) +{ + if (WARN_ON(algo >= ARRAY_SIZE(btrfs_hash_sizes))) + return NULL; + return kzalloc(sizeof(struct inmem_hash) + btrfs_hash_sizes[algo], + GFP_NOFS); +} + static struct btrfs_dedupe_info * init_dedupe_info(struct btrfs_ioctl_dedupe_args *dargs) { @@ -167,3 +175,145 @@ int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info) /* Place holder for bisect, will be implemented in later patches */ return 0; } + +static int inmem_insert_hash(struct rb_root *root, +struct inmem_hash *hash, int hash_len) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct inmem_hash *entry = NULL; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct inmem_hash, hash_node); + if (memcmp(hash->hash, entry->hash, hash_len) < 0) + p = &(*p)->rb_left; + else if (memcmp(hash->hash, entry->hash, hash_len) > 0) + p = &(*p)->rb_right; + else + return 1; + } + rb_link_node(&hash->hash_node, parent, p); + rb_insert_color(&hash->hash_node, root); + return 0; +} + +static int inmem_insert_bytenr(struct rb_root *root, + struct inmem_hash *hash) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct inmem_hash *entry = NULL; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct inmem_hash, bytenr_node); + if (hash->bytenr < entry->bytenr) + p = &(*p)->rb_left; + else if (hash->bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return 1; + } + 
rb_link_node(&hash->bytenr_node, parent, p); + rb_insert_color(&hash->bytenr_node, root); + return 0; +} + +static void __inmem_del(struct btrfs_dedupe_info *dedupe_info, + struct inmem_hash *hash) +{ + list_del(&hash->lru_list); + rb_erase(&hash->hash_node, &dedupe_info->hash_root); + rb_erase(&hash->bytenr_node, &dedupe_info->bytenr_root); + + if (!WARN_ON(dedupe_info->current_nr == 0)) + dedupe_info->current_nr--; + + kfree(hash); +} + +/* + * Insert a hash into in-memory dedupe tree + * Will remove exceeding last recent use hash. + * + * If the hash mathced with existing one, we won't insert it, to + * save memory + */ +static int inmem_add(struct btrfs_dedupe_info *dedupe_info, +struct btrfs_dedupe_hash *hash) +{ + int ret = 0; + u16 algo = dedupe_info->hash_algo; + struct inmem_hash *ihash; + + ihash = inmem_alloc_hash(algo); + + if (!ihash) + return -ENOMEM; + + /* Copy the data out */ + ihash->bytenr = hash->bytenr; + ihash->num_bytes = hash->num_bytes; + memcpy(ihash->hash, hash->hash, btrfs_hash_sizes[algo]); + + mutex_lock(&dedupe_info->lock); + + ret = inmem_insert_bytenr(&dedupe_info->bytenr_root, ihash); + if (ret > 0) { + kfree(ihash); + ret = 0; + goto out; + } + + ret = inmem_insert_hash(&dedupe_info->hash_root, ihash, + btrfs_hash_sizes[algo]); + if (ret > 0) { + /* +* We only keep one hash in tree to save memory, so if +* hash conflicts, free the one to insert. +*/ + rb_erase(&ihash->bytenr_node, &dedupe_info->bytenr_root); + kfree(ihash); + ret = 0; + goto out; + } + + list_add(&ihash->lru_list, &dedupe_info->lru_list); + dedupe_info->current_nr++; + + /* Remove the last dedupe hash if we exceed limit */ + while (dedupe_info->current_nr > dedupe_info->limit_nr) { + struct inmem_hash *last; + + last = list_entry(dedupe
[PATCH v15 00/13] Btrfs In-band De-duplication
This patchset can be fetched from github: https://github.com/littleroad/linux.git dedupe_latest Now the new base is v4.19-rc2, and drop the patch about compression which conflict with compression heuristic. Normal test cases from auto group exposes no regression, and ib-dedupe group can pass without problem. xfstests ib-dedupe group can be fetched from github: https://github.com/littleroad/xfstests-dev.git btrfs_dedupe_latest Changelog: v2: Totally reworked to handle multiple backends v3: Fix a stupid but deadly on-disk backend bug Add handle for multiple hash on same bytenr corner case to fix abort trans error Increase dedup rate by enhancing delayed ref handler for both backend. Move dedup_add() to run_delayed_ref() time, to fix abort trans error. Increase dedup block size up limit to 8M. v4: Add dedup prop for disabling dedup for given files/dirs. Merge inmem_search() and ondisk_search() into generic_search() to save some code Fix another delayed_ref related bug. Use the same mutex for both inmem and ondisk backend. Move dedup_add() back to btrfs_finish_ordered_io() to increase dedup rate. v5: Reuse compress routine for much simpler dedup function. Slightly improved performance due to above modification. Fix race between dedup enable/disable Fix for false ENOSPC report v6: Further enable/disable race window fix. Minor format change according to checkpatch. v7: Fix one concurrency bug with balance. Slightly modify return value from -EINVAL to -EOPNOTSUPP for btrfs_dedup_ioctl() to allow progs to distinguish unsupported commands and wrong parameter. Rebased to integration-4.6. v8: Rename 'dedup' to 'dedupe'. Add support to allow dedupe and compression work at the same time. Fix several balance related bugs. Special thanks to Satoru Takeuchi, who exposed most of them. Small dedupe hit case performance improvement. v9: Re-order the patchset to completely separate pure in-memory and any on-disk format change. Fold bug fixes into its original patch. 
v10: Adding back missing bug fix patch. Reduce on-disk item size. Hide dedupe ioctl under CONFIG_BTRFS_DEBUG. v11: Remove other backend and props support to focus on the framework and in-memory backend. Suggested by David. Better disable and buffered write race protection. Comprehensive fix to dedupe metadata ENOSPC problem. v12: Stateful 'enable' ioctl and new 'reconf' ioctl New FORCE flag for enable ioctl to allow stateless ioctl Precise error report and extendable ioctl structure. v12.1 Rebase to David's for-next-20160704 branch Add co-ordinate patch for subpage and dedupe patchset. v12.2 Rebase to David's for-next-20160715 branch Add co-ordinate patch for other patchset. v13 Rebase to David's for-next-20160906 branch Fix a reserved space leak bug, which only frees quota reserved space but not space_info->byte_may_use. v13.1 Rebase to Chris' for-linux-4.9 branch v14 Use generic ENOSPC fix for both compression and dedupe. v14.1 Further split ENOSPC fix. v14.2 Rebase to v4.11-rc2. Co-operate with count_max_extent() to calculate num_extents. No longer rely on qgroup fixes. v14.3 Rebase to v4.12-rc1. v14.4 Rebase to kdave/for-4.13-part1. v14.5 Rebase to v4.15-rc3. v14.6 Rebase to v4.17-rc5. v14.7 Replace SHASH_DESC_ON_STACK with kmalloc to remove VLA. Fixed the following errors by switching to div_u64. ├── arm-allmodconfig │ └── ERROR:__aeabi_uldivmod-fs-btrfs-btrfs.ko-undefined └── i386-allmodconfig └── ERROR:__udivdi3-fs-btrfs-btrfs.ko-undefined v14.8 Rebase to v4.18-rc4. v15 Rebase to v4.19-rc2. Drop "btrfs: Introduce COMPRESS reserve type to fix false enospc for compression". Remove the ifdef around btrfs inband dedupe ioctl. 
Qu Wenruo (4): btrfs: delayed-ref: Add support for increasing data ref under spinlock btrfs: dedupe: Inband in-memory only de-duplication implement btrfs: relocation: Enhance error handling to avoid BUG_ON btrfs: dedupe: Introduce new reconfigure ioctl Wang Xiaoguang (9): btrfs: dedupe: Introduce dedupe framework and its header btrfs: dedupe: Introduce function to initialize dedupe info btrfs: dedupe: Introduce function to add hash into in-memory tree btrfs: dedupe: Introduce function to remove hash from in-memory tree btrfs: dedupe: Introduce function to search for an existing hash btrfs: dedupe: Implement btrfs_dedupe_calc_hash interface btrfs: ordered-extent: Add support for dedupe btrfs: introduce type based delalloc metadata reserve btrfs: dedupe: Add ioctl for inband deduplication fs/btrfs/Makefile| 2 +- fs/btrfs/ctree.h | 52 ++- fs/btrfs/dedupe.c| 828 +++ fs/btrfs/dedupe.h| 175 +++- fs/btrfs/delayed-ref.c | 53 ++- fs/btrfs/delayed-ref.h | 15 + fs/btrfs/disk-io.c | 4 + fs/btrfs/extent-tree.c | 67 ++- fs/btrfs/extent_io.c |
[PATCH v15 13/13] btrfs: dedupe: Introduce new reconfigure ioctl
From: Qu Wenruo Introduce new reconfigure ioctl and new FORCE flag for in-band dedupe ioctls. Now dedupe enable and reconfigure ioctl are stateful. | Current state | Ioctl| Next state | | Disabled | enable| Enabled | | Enabled | enable| Not allowed | | Enabled | reconf| Enabled | | Enabled | disable | Disabled| | Disabled | disable | Disabled| | Disabled | reconf| Not allowed | (While disable is always stateless) While for those who prefer stateless ioctls (myself for example), new FORCE flag is introduced. In FORCE mode, enable/disable is completely stateless. | Current state | Ioctl| Next state | | Disabled | enable| Enabled | | Enabled | enable| Enabled | | Enabled | disable | Disabled| | Disabled | disable | Disabled| Also, re-configure ioctl will only modify specified fields. Unlike enable, un-specified fields will be filled with default value. For example: # btrfs dedupe enable --block-size 64k /mnt # btrfs dedupe reconfigure --limit-hash 1m /mnt Will lead to: dedupe blocksize: 64K dedupe hash limit nr: 1m While for enable: # btrfs dedupe enable --force --block-size 64k /mnt # btrfs dedupe enable --force --limit-hash 1m /mnt Will reset blocksize to default value: dedupe blocksize: 128K << reset dedupe hash limit nr: 1m Suggested-by: David Sterba Signed-off-by: Qu Wenruo Signed-off-by: Lu Fengqi --- fs/btrfs/dedupe.c | 132 ++--- fs/btrfs/dedupe.h | 13 fs/btrfs/ioctl.c | 13 include/uapi/linux/btrfs.h | 11 +++- 4 files changed, 143 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c index a147e148bbb8..2be3e53acc6a 100644 --- a/fs/btrfs/dedupe.c +++ b/fs/btrfs/dedupe.c @@ -29,6 +29,40 @@ static inline struct inmem_hash *inmem_alloc_hash(u16 algo) GFP_NOFS); } +/* + * Copy from current dedupe info to fill dargs. + * For reconf case, only fill members which are uninitialized. 
+ */ +static void get_dedupe_status(struct btrfs_dedupe_info *dedupe_info, + struct btrfs_ioctl_dedupe_args *dargs) + { + int reconf = (dargs->cmd == BTRFS_DEDUPE_CTL_RECONF); + + dargs->status = 1; + + if (!reconf || (reconf && dargs->blocksize == (u64)-1)) + dargs->blocksize = dedupe_info->blocksize; + if (!reconf || (reconf && dargs->backend == (u16)-1)) + dargs->backend = dedupe_info->backend; + if (!reconf || (reconf && dargs->hash_algo == (u16)-1)) + dargs->hash_algo = dedupe_info->hash_algo; + + /* +* For re-configure case, if not modifying limit, +* their limit will be set to 0, unlike other fields +*/ + if (!reconf || !(dargs->limit_nr || dargs->limit_mem)) { + dargs->limit_nr = dedupe_info->limit_nr; + dargs->limit_mem = dedupe_info->limit_nr * + (sizeof(struct inmem_hash) + +btrfs_hash_sizes[dedupe_info->hash_algo]); + } + + /* current_nr doesn't make sense for reconfig case */ + if (!reconf) + dargs->current_nr = dedupe_info->current_nr; +} + void btrfs_dedupe_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dedupe_args *dargs) { @@ -45,15 +79,7 @@ void btrfs_dedupe_status(struct btrfs_fs_info *fs_info, return; } mutex_lock(&dedupe_info->lock); - dargs->status = 1; - dargs->blocksize = dedupe_info->blocksize; - dargs->backend = dedupe_info->backend; - dargs->hash_algo = dedupe_info->hash_algo; - dargs->limit_nr = dedupe_info->limit_nr; - dargs->limit_mem = dedupe_info->limit_nr * - (sizeof(struct inmem_hash) + -btrfs_hash_sizes[dedupe_info->hash_algo]); - dargs->current_nr = dedupe_info->current_nr; + get_dedupe_status(dedupe_info, dargs); mutex_unlock(&dedupe_info->lock); memset(dargs->__unused, -1, sizeof(dargs->__unused)); } @@ -98,17 +124,50 @@ init_dedupe_info(struct btrfs_ioctl_dedupe_args *dargs) static int check_dedupe_parameter(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dedupe_args *dargs) { - u64 blocksize = dargs->blocksize; - u64 limit_nr = dargs->limit_nr; - u64 limit_mem = dargs->limit_mem; - u16 hash_algo = darg
[PATCH v15 12/13] btrfs: relocation: Enhance error handling to avoid BUG_ON
From: Qu Wenruo Since the introduction of btrfs dedupe tree, it's possible that balance can race with dedupe disabling. When this happens, dedupe_enabled will make btrfs_get_fs_root() return PTR_ERR(-ENOENT). But due to a bug in error handling branch, when this happens backref_cache->nr_nodes is increased but the node is neither added to backref_cache or nr_nodes decreased. Causing BUG_ON() in backref_cache_cleanup() [ 2611.668810] [ cut here ] [ 2611.669946] kernel BUG at /home/sat/ktest/linux/fs/btrfs/relocation.c:243! [ 2611.670572] invalid opcode: [#1] SMP [ 2611.686797] Call Trace: [ 2611.687034] [] btrfs_relocate_block_group+0x1b3/0x290 [btrfs] [ 2611.687706] [] btrfs_relocate_chunk.isra.40+0x47/0xd0 [btrfs] [ 2611.688385] [] btrfs_balance+0xb22/0x11e0 [btrfs] [ 2611.688966] [] btrfs_ioctl_balance+0x391/0x3a0 [btrfs] [ 2611.689587] [] btrfs_ioctl+0x1650/0x2290 [btrfs] [ 2611.690145] [] ? lru_cache_add+0x3a/0x80 [ 2611.690647] [] ? lru_cache_add_active_or_unevictable+0x4c/0xc0 [ 2611.691310] [] ? handle_mm_fault+0xcd4/0x17f0 [ 2611.691842] [] ? cp_new_stat+0x153/0x180 [ 2611.692342] [] ? __vma_link_rb+0xfd/0x110 [ 2611.692842] [] ? vma_link+0xb9/0xc0 [ 2611.693303] [] do_vfs_ioctl+0xa1/0x5a0 [ 2611.693781] [] ? __do_page_fault+0x1b4/0x400 [ 2611.694310] [] SyS_ioctl+0x41/0x70 [ 2611.694758] [] entry_SYSCALL_64_fastpath+0x12/0x71 [ 2611.695331] Code: ff 48 8b 45 bf 49 83 af a8 05 00 00 01 49 89 87 a0 05 00 00 e9 2e fd ff ff b8 f4 ff ff ff e9 e4 fb ff ff 0f 0b 0f 0b 0f 0b 0f 0b <0f> 0b 0f 0b 41 89 c6 e9 b8 fb ff ff e8 9e a6 e8 e0 4c 89 e7 44 [ 2611.697870] RIP [] relocate_block_group+0x741/0x7a0 [btrfs] [ 2611.698818] RSP This patch will call remove_backref_node() in error handling branch, and cache the returned -ENOENT in relocate_tree_block() and continue balancing. 
Reported-by: Satoru Takeuchi Signed-off-by: Qu Wenruo --- fs/btrfs/relocation.c | 22 +- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 59a9c22ebf51..5f4b138fcb35 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -845,6 +845,13 @@ struct backref_node *build_backref_tree(struct reloc_control *rc, root = read_fs_root(rc->extent_root->fs_info, key.offset); if (IS_ERR(root)) { err = PTR_ERR(root); + /* +* Don't forget to cleanup current node. +* As it may not be added to backref_cache but nr_node +* increased. +* This will cause BUG_ON() in backref_cache_cleanup(). +*/ + remove_backref_node(&rc->backref_cache, cur); goto out; } @@ -3018,14 +3025,21 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, } rb_node = rb_first(blocks); - while (rb_node) { + for (rb_node = rb_first(blocks); rb_node; rb_node = rb_next(rb_node)) { block = rb_entry(rb_node, struct tree_block, rb_node); node = build_backref_tree(rc, &block->key, block->level, block->bytenr); if (IS_ERR(node)) { + /* +* The root(dedupe tree yet) of the tree block is +* going to be freed and can't be reached. +* Just skip it and continue balancing. +*/ + if (PTR_ERR(node) == -ENOENT) + continue; err = PTR_ERR(node); - goto out; + break; } ret = relocate_tree_block(trans, rc, node, &block->key, @@ -3033,11 +3047,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, if (ret < 0) { if (ret != -EAGAIN || rb_node == rb_first(blocks)) err = ret; - goto out; + break; } - rb_node = rb_next(rb_node); } -out: err = finish_pending_nodes(trans, rc, path, err); out_free_path: -- 2.18.0
[PATCH v15 07/13] btrfs: dedupe: Implement btrfs_dedupe_calc_hash interface
From: Wang Xiaoguang Unlike in-memory or on-disk dedupe method, only SHA256 hash method is supported yet, so implement btrfs_dedupe_calc_hash() interface using SHA256. Signed-off-by: Qu Wenruo Signed-off-by: Wang Xiaoguang Reviewed-by: Josef Bacik Signed-off-by: Lu Fengqi --- fs/btrfs/dedupe.c | 50 +++ 1 file changed, 50 insertions(+) diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c index 9c6152b7f0eb..9b0a90dd8e42 100644 --- a/fs/btrfs/dedupe.c +++ b/fs/btrfs/dedupe.c @@ -644,3 +644,53 @@ int btrfs_dedupe_search(struct btrfs_fs_info *fs_info, } return ret; } + +int btrfs_dedupe_calc_hash(struct btrfs_fs_info *fs_info, + struct inode *inode, u64 start, + struct btrfs_dedupe_hash *hash) +{ + int i; + int ret; + struct page *p; + struct shash_desc *shash; + struct btrfs_dedupe_info *dedupe_info = fs_info->dedupe_info; + struct crypto_shash *tfm = dedupe_info->dedupe_driver; + u64 dedupe_bs; + u64 sectorsize = fs_info->sectorsize; + + shash = kmalloc(sizeof(*shash) + crypto_shash_descsize(tfm), GFP_NOFS); + if (!shash) + return -ENOMEM; + + if (!fs_info->dedupe_enabled || !hash) + return 0; + + if (WARN_ON(dedupe_info == NULL)) + return -EINVAL; + + WARN_ON(!IS_ALIGNED(start, sectorsize)); + + dedupe_bs = dedupe_info->blocksize; + + shash->tfm = tfm; + shash->flags = 0; + ret = crypto_shash_init(shash); + if (ret) + return ret; + for (i = 0; sectorsize * i < dedupe_bs; i++) { + char *d; + + p = find_get_page(inode->i_mapping, + (start >> PAGE_SHIFT) + i); + if (WARN_ON(!p)) + return -ENOENT; + d = kmap(p); + ret = crypto_shash_update(shash, d, sectorsize); + kunmap(p); + put_page(p); + if (ret) + return ret; + } + ret = crypto_shash_final(shash, hash->hash); + return ret; +} -- 2.18.0
[PATCH v15 11/13] btrfs: dedupe: Add ioctl for inband deduplication
From: Wang Xiaoguang Add ioctl interface for inband deduplication, which includes: 1) enable 2) disable 3) status And a pseudo RO compat flag, to imply that btrfs now supports inband dedup. However we don't add any ondisk format change, it's just a pseudo RO compat flag. All these ioctl interfaces are state-less, which means caller don't need to bother previous dedupe state before calling them, and only need to care the final desired state. For example, if user want to enable dedupe with specified block size and limit, just fill the ioctl structure and call enable ioctl. No need to check if dedupe is already running. These ioctls will handle things like re-configure or disable quite well. Also, for invalid parameters, enable ioctl interface will set the field of the first encountered invalid parameter to (-1) to inform caller. While for limit_nr/limit_mem, the value will be (0). Signed-off-by: Qu Wenruo Signed-off-by: Wang Xiaoguang Signed-off-by: Lu Fengqi --- fs/btrfs/dedupe.c | 50 + fs/btrfs/dedupe.h | 17 +++--- fs/btrfs/disk-io.c | 3 ++ fs/btrfs/ioctl.c | 65 ++ fs/btrfs/sysfs.c | 2 ++ include/uapi/linux/btrfs.h | 12 ++- 6 files changed, 143 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c index 9b0a90dd8e42..a147e148bbb8 100644 --- a/fs/btrfs/dedupe.c +++ b/fs/btrfs/dedupe.c @@ -29,6 +29,35 @@ static inline struct inmem_hash *inmem_alloc_hash(u16 algo) GFP_NOFS); } +void btrfs_dedupe_status(struct btrfs_fs_info *fs_info, +struct btrfs_ioctl_dedupe_args *dargs) +{ + struct btrfs_dedupe_info *dedupe_info = fs_info->dedupe_info; + + if (!fs_info->dedupe_enabled || !dedupe_info) { + dargs->status = 0; + dargs->blocksize = 0; + dargs->backend = 0; + dargs->hash_algo = 0; + dargs->limit_nr = 0; + dargs->current_nr = 0; + memset(dargs->__unused, -1, sizeof(dargs->__unused)); + return; + } + mutex_lock(&dedupe_info->lock); + dargs->status = 1; + dargs->blocksize = dedupe_info->blocksize; + dargs->backend = dedupe_info->backend; + 
dargs->hash_algo = dedupe_info->hash_algo; + dargs->limit_nr = dedupe_info->limit_nr; + dargs->limit_mem = dedupe_info->limit_nr * + (sizeof(struct inmem_hash) + +btrfs_hash_sizes[dedupe_info->hash_algo]); + dargs->current_nr = dedupe_info->current_nr; + mutex_unlock(&dedupe_info->lock); + memset(dargs->__unused, -1, sizeof(dargs->__unused)); +} + static struct btrfs_dedupe_info * init_dedupe_info(struct btrfs_ioctl_dedupe_args *dargs) { @@ -402,6 +431,27 @@ static void unblock_all_writers(struct btrfs_fs_info *fs_info) percpu_up_write(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1); } +int btrfs_dedupe_cleanup(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dedupe_info *dedupe_info; + + fs_info->dedupe_enabled = 0; + /* same as disable */ + smp_wmb(); + dedupe_info = fs_info->dedupe_info; + fs_info->dedupe_info = NULL; + + if (!dedupe_info) + return 0; + + if (dedupe_info->backend == BTRFS_DEDUPE_BACKEND_INMEMORY) + inmem_destroy(dedupe_info); + + crypto_free_shash(dedupe_info->dedupe_driver); + kfree(dedupe_info); + return 0; +} + int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info) { struct btrfs_dedupe_info *dedupe_info; diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h index 8157b17c4d11..fdd00355d6b5 100644 --- a/fs/btrfs/dedupe.h +++ b/fs/btrfs/dedupe.h @@ -90,6 +90,15 @@ static inline struct btrfs_dedupe_hash *btrfs_dedupe_alloc_hash(u16 algo) int btrfs_dedupe_enable(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dedupe_args *dargs); + +/* + * Get inband dedupe info + * Since it needs to access different backends' hash size, which + * is not exported, we need such simple function. + */ +void btrfs_dedupe_status(struct btrfs_fs_info *fs_info, +struct btrfs_ioctl_dedupe_args *dargs); + /* * Disable dedupe and invalidate all its dedupe data. * Called at dedupe disable time. @@ -101,12 +110,10 @@ int btrfs_dedupe_enable(struct btrfs_fs_info *fs_info, int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info); /* - * Get current dedupe status. 
- * Return 0 for success - * No possible error yet + * Cleanup current btrfs_dedupe_info + * Called in umount time */ -void btrfs_dedupe_status(struct btrfs_fs_info *fs_info, -struct btrfs_ioctl_dedupe_args *dargs); +int btrfs_dedupe_cleanup(struct btrfs_fs_info *fs_info);
[PATCH v15 04/13] btrfs: dedupe: Introduce function to remove hash from in-memory tree
From: Wang Xiaoguang Introduce static function inmem_del() to remove hash from in-memory dedupe tree. And implement btrfs_dedupe_del() and btrfs_dedup_disable() interfaces. Also for btrfs_dedupe_disable(), add new functions to wait existing writer and block incoming writers to eliminate all possible race. Cc: Mark Fasheh Signed-off-by: Qu Wenruo Signed-off-by: Wang Xiaoguang Signed-off-by: Lu Fengqi --- fs/btrfs/dedupe.c | 131 +++--- 1 file changed, 125 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c index 784bb3a8a5ab..951fefd19fde 100644 --- a/fs/btrfs/dedupe.c +++ b/fs/btrfs/dedupe.c @@ -170,12 +170,6 @@ int btrfs_dedupe_enable(struct btrfs_fs_info *fs_info, return ret; } -int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info) -{ - /* Place holder for bisect, will be implemented in later patches */ - return 0; -} - static int inmem_insert_hash(struct rb_root *root, struct inmem_hash *hash, int hash_len) { @@ -317,3 +311,128 @@ int btrfs_dedupe_add(struct btrfs_fs_info *fs_info, return inmem_add(dedupe_info, hash); return -EINVAL; } + +static struct inmem_hash * +inmem_search_bytenr(struct btrfs_dedupe_info *dedupe_info, u64 bytenr) +{ + struct rb_node **p = &dedupe_info->bytenr_root.rb_node; + struct rb_node *parent = NULL; + struct inmem_hash *entry = NULL; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct inmem_hash, bytenr_node); + + if (bytenr < entry->bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return entry; + } + + return NULL; +} + +/* Delete a hash from in-memory dedupe tree */ +static int inmem_del(struct btrfs_dedupe_info *dedupe_info, u64 bytenr) +{ + struct inmem_hash *hash; + + mutex_lock(&dedupe_info->lock); + hash = inmem_search_bytenr(dedupe_info, bytenr); + if (!hash) { + mutex_unlock(&dedupe_info->lock); + return 0; + } + + __inmem_del(dedupe_info, hash); + mutex_unlock(&dedupe_info->lock); + return 0; +} + +/* Remove a dedupe hash from dedupe 
tree */ +int btrfs_dedupe_del(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_dedupe_info *dedupe_info = fs_info->dedupe_info; + + if (!fs_info->dedupe_enabled) + return 0; + + if (WARN_ON(dedupe_info == NULL)) + return -EINVAL; + + if (dedupe_info->backend == BTRFS_DEDUPE_BACKEND_INMEMORY) + return inmem_del(dedupe_info, bytenr); + return -EINVAL; +} + +static void inmem_destroy(struct btrfs_dedupe_info *dedupe_info) +{ + struct inmem_hash *entry, *tmp; + + mutex_lock(&dedupe_info->lock); + list_for_each_entry_safe(entry, tmp, &dedupe_info->lru_list, lru_list) + __inmem_del(dedupe_info, entry); + mutex_unlock(&dedupe_info->lock); +} + +/* + * Helper function to wait and block all incoming writers + * + * Use rw_sem introduced for freeze to wait/block writers. + * So during the block time, no new write will happen, so we can + * do something quite safe, especially helpful for dedupe disable, + * as it affects buffered writes. + */ +static void block_all_writers(struct btrfs_fs_info *fs_info) +{ + struct super_block *sb = fs_info->sb; + + percpu_down_write(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1); + down_write(&sb->s_umount); +} + +static void unblock_all_writers(struct btrfs_fs_info *fs_info) +{ + struct super_block *sb = fs_info->sb; + + up_write(&sb->s_umount); + percpu_up_write(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1); +} + +int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dedupe_info *dedupe_info; + int ret; + + dedupe_info = fs_info->dedupe_info; + + if (!dedupe_info) + return 0; + + /* Don't allow disable status change in RO mount */ + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + /* +* Wait for all unfinished writers and block further writers. +* Then sync the whole fs so all current write will go through +* dedupe, and all later write won't go through dedupe. 
+*/ + block_all_writers(fs_info); + ret = sync_filesystem(fs_info->sb); + fs_info->dedupe_enabled = 0; + fs_info->dedupe_info = NULL; + unblock_all_writers(fs_info); + if (ret < 0) + return ret; + + /* now we are OK to clean up everything */ + if (dedupe_info->backend == BTRFS_DEDUPE_BACKEND_INMEMOR
[PATCH v15 02/13] btrfs: dedupe: Introduce function to initialize dedupe info
From: Wang Xiaoguang Add generic function to initialize dedupe info. Signed-off-by: Qu Wenruo Signed-off-by: Wang Xiaoguang Reviewed-by: Josef Bacik Signed-off-by: Lu Fengqi --- fs/btrfs/Makefile | 2 +- fs/btrfs/dedupe.c | 169 + fs/btrfs/dedupe.h | 12 +++ include/uapi/linux/btrfs.h | 3 + 4 files changed, 185 insertions(+), 1 deletion(-) create mode 100644 fs/btrfs/dedupe.c diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index ca693dd554e9..78fdc87dba39 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -10,7 +10,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ - uuid-tree.o props.o free-space-tree.o tree-checker.o + uuid-tree.o props.o free-space-tree.o tree-checker.o dedupe.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c new file mode 100644 index ..06523162753d --- /dev/null +++ b/fs/btrfs/dedupe.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2016 Fujitsu. All rights reserved. 
+ */ + +#include "ctree.h" +#include "dedupe.h" +#include "btrfs_inode.h" +#include "delayed-ref.h" + +struct inmem_hash { + struct rb_node hash_node; + struct rb_node bytenr_node; + struct list_head lru_list; + + u64 bytenr; + u32 num_bytes; + + u8 hash[]; +}; + +static struct btrfs_dedupe_info * +init_dedupe_info(struct btrfs_ioctl_dedupe_args *dargs) +{ + struct btrfs_dedupe_info *dedupe_info; + + dedupe_info = kzalloc(sizeof(*dedupe_info), GFP_NOFS); + if (!dedupe_info) + return ERR_PTR(-ENOMEM); + + dedupe_info->hash_algo = dargs->hash_algo; + dedupe_info->backend = dargs->backend; + dedupe_info->blocksize = dargs->blocksize; + dedupe_info->limit_nr = dargs->limit_nr; + + /* only support SHA256 yet */ + dedupe_info->dedupe_driver = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(dedupe_info->dedupe_driver)) { + kfree(dedupe_info); + return ERR_CAST(dedupe_info->dedupe_driver); + } + + dedupe_info->hash_root = RB_ROOT; + dedupe_info->bytenr_root = RB_ROOT; + dedupe_info->current_nr = 0; + INIT_LIST_HEAD(&dedupe_info->lru_list); + mutex_init(&dedupe_info->lock); + + return dedupe_info; +} + +/* + * Helper to check if parameters are valid. + * The first invalid field will be set to (-1), to info user which parameter + * is invalid. + * Except dargs->limit_nr or dargs->limit_mem, in that case, 0 will returned + * to info user, since user can specify any value to limit, except 0. + */ +static int check_dedupe_parameter(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dedupe_args *dargs) +{ + u64 blocksize = dargs->blocksize; + u64 limit_nr = dargs->limit_nr; + u64 limit_mem = dargs->limit_mem; + u16 hash_algo = dargs->hash_algo; + u8 backend = dargs->backend; + + /* +* Set all reserved fields to -1, allow user to detect +* unsupported optional parameters. 
+*/ + memset(dargs->__unused, -1, sizeof(dargs->__unused)); + if (blocksize > BTRFS_DEDUPE_BLOCKSIZE_MAX || + blocksize < BTRFS_DEDUPE_BLOCKSIZE_MIN || + blocksize < fs_info->sectorsize || + !is_power_of_2(blocksize) || + blocksize < PAGE_SIZE) { + dargs->blocksize = (u64)-1; + return -EINVAL; + } + if (hash_algo >= ARRAY_SIZE(btrfs_hash_sizes)) { + dargs->hash_algo = (u16)-1; + return -EINVAL; + } + if (backend >= BTRFS_DEDUPE_BACKEND_COUNT) { + dargs->backend = (u8)-1; + return -EINVAL; + } + + /* Backend specific check */ + if (backend == BTRFS_DEDUPE_BACKEND_INMEMORY) { + /* only one limit is accepted for enable*/ + if (dargs->limit_nr && dargs->limit_mem) { + dargs->limit_nr = 0; + dargs->limit_mem = 0; + return -EINVAL; + } + + if (!limit_nr && !limit_mem) + dargs->limit_nr = BTRFS_DEDUPE_LIMIT_NR_DEFAULT; + else { + u64 tmp = (u64)-1; + + if (limit_mem) { + tmp = div_u64(limit_mem, +
[PATCH v15 05/13] btrfs: delayed-ref: Add support for increasing data ref under spinlock
From: Qu Wenruo For in-band dedupe, btrfs needs to increase data ref with delayed_ref locked, so add a new function btrfs_add_delayed_data_ref_lock() to increase extent ref with delayed_refs already locked. Export init_delayed_ref_head and init_delayed_ref_common for inband dedupe. Signed-off-by: Qu Wenruo Reviewed-by: Josef Bacik Signed-off-by: Lu Fengqi --- fs/btrfs/delayed-ref.c | 53 +- fs/btrfs/delayed-ref.h | 15 2 files changed, 52 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 62ff545ba1f7..faca30b334ee 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -526,7 +526,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, spin_unlock(&existing->lock); } -static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, +void btrfs_init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, struct btrfs_qgroup_extent_record *qrecord, u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved, int action, bool is_data, @@ -654,7 +654,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, } /* - * init_delayed_ref_common - Initialize the structure which represents a + * btrfs_init_delayed_ref_common - Initialize the structure which represents a * modification to a an extent. * * @fs_info:Internal to the mounted filesystem mount structure. 
@@ -678,7 +678,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, * when recording a metadata extent or BTRFS_SHARED_DATA_REF_KEY/ * BTRFS_EXTENT_DATA_REF_KEY when recording data extent */ -static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, +void btrfs_init_delayed_ref_common(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 ref_root, int action, u8 ref_type) @@ -751,14 +751,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, else ref_type = BTRFS_TREE_BLOCK_REF_KEY; - init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, - ref_root, action, ref_type); + btrfs_init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, + ref_root, action, ref_type); ref->root = ref_root; ref->parent = parent; ref->level = level; - init_delayed_ref_head(head_ref, record, bytenr, num_bytes, - ref_root, 0, action, false, is_system); + btrfs_init_delayed_ref_head(head_ref, record, bytenr, num_bytes, + ref_root, 0, action, false, is_system); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; @@ -787,6 +787,29 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, return 0; } +/* + * Do real delayed data ref insert. + * Caller must hold delayed_refs->lock and allocation memory + * for dref,head_ref and record. + */ +int btrfs_add_delayed_data_ref_locked(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head_ref, + struct btrfs_qgroup_extent_record *qrecord, + struct btrfs_delayed_data_ref *ref, int action, + int *qrecord_inserted_ret, int *old_ref_mod, + int *new_ref_mod) +{ + struct btrfs_delayed_ref_root *delayed_refs; + + head_ref = add_delayed_ref_head(trans, head_ref, qrecord, + action, qrecord_inserted_ret, + old_ref_mod, new_ref_mod); + + delayed_refs = &trans->transaction->delayed_refs; + + return insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); +} + /* * add a delayed data ref. 
it's similar to btrfs_add_delayed_tree_ref. */ @@ -813,7 +836,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, ref_type = BTRFS_SHARED_DATA_REF_KEY; else ref_type = BTRFS_EXTENT_DATA_REF_KEY; - init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, + btrfs_init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, ref_root, action, ref_type); ref->root = ref_root; ref->parent = parent; @@ -838,8 +861,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, } } - init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root, - reserved, actio
[PATCH v15 10/13] btrfs: dedupe: Inband in-memory only de-duplication implement
From: Qu Wenruo Core implement for inband de-duplication. It reuses the async_cow_start() facility to do the calculate dedupe hash. And use dedupe hash to do inband de-duplication at extent level. The workflow is as below: 1) Run delalloc range for an inode 2) Calculate hash for the delalloc range at the unit of dedupe_bs 3) For hash match(duplicated) case, just increase source extent ref and insert file extent. For hash mismatch case, go through the normal cow_file_range() fallback, and add hash into dedupe_tree. Compress for hash miss case is not supported yet. Current implement restore all dedupe hash in memory rb-tree, with LRU behavior to control the limit. Signed-off-by: Wang Xiaoguang Signed-off-by: Qu Wenruo Signed-off-by: Lu Fengqi --- fs/btrfs/ctree.h | 4 +- fs/btrfs/dedupe.h | 15 ++ fs/btrfs/extent-tree.c | 31 +++- fs/btrfs/extent_io.c | 7 +- fs/btrfs/extent_io.h | 1 + fs/btrfs/file.c| 4 + fs/btrfs/inode.c | 316 +++-- fs/btrfs/ioctl.c | 1 + fs/btrfs/relocation.c | 18 +++ 9 files changed, 341 insertions(+), 56 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4f0b6a12ecb1..627d617e3265 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -112,9 +112,11 @@ static inline u32 count_max_extents(u64 size, u64 max_extent_size) */ enum btrfs_metadata_reserve_type { BTRFS_RESERVE_NORMAL, + BTRFS_RESERVE_DEDUPE, }; -u64 btrfs_max_extent_size(enum btrfs_metadata_reserve_type reserve_type); +u64 btrfs_max_extent_size(struct btrfs_inode *inode, + enum btrfs_metadata_reserve_type reserve_type); struct btrfs_mapping_tree { struct extent_map_tree map_tree; diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h index 87f5b7ce7766..8157b17c4d11 100644 --- a/fs/btrfs/dedupe.h +++ b/fs/btrfs/dedupe.h @@ -7,6 +7,7 @@ #define BTRFS_DEDUPE_H #include +#include "btrfs_inode.h" /* 32 bytes for SHA256 */ static const int btrfs_hash_sizes[] = { 32 }; @@ -47,6 +48,20 @@ struct btrfs_dedupe_info { u64 current_nr; }; +static inline u64 btrfs_dedupe_blocksize(struct 
btrfs_inode *inode) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + return fs_info->dedupe_info->blocksize; +} + +static inline int inode_need_dedupe(struct inode *inode) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + + return fs_info->dedupe_enabled; +} + static inline int btrfs_dedupe_hash_hit(struct btrfs_dedupe_hash *hash) { return (hash && hash->bytenr); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f90233ffcb27..131d48487c84 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -28,6 +28,7 @@ #include "sysfs.h" #include "qgroup.h" #include "ref-verify.h" +#include "dedupe.h" #undef SCRAMBLE_DELAYED_REFS @@ -2489,6 +2490,17 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, btrfs_pin_extent(fs_info, head->bytenr, head->num_bytes, 1); if (head->is_data) { + /* +* If insert_reserved is given, it means +* a new extent is revered, then deleted +* in one tran, and inc/dec get merged to 0. +* +* In this case, we need to remove its dedupe +* hash. 
+*/ + ret = btrfs_dedupe_del(fs_info, head->bytenr); + if (ret < 0) + return ret; ret = btrfs_del_csums(trans, fs_info, head->bytenr, head->num_bytes); } @@ -5882,13 +5894,15 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, spin_unlock(&block_rsv->lock); } -u64 btrfs_max_extent_size(enum btrfs_metadata_reserve_type reserve_type) +u64 btrfs_max_extent_size(struct btrfs_inode *inode, + enum btrfs_metadata_reserve_type reserve_type) { if (reserve_type == BTRFS_RESERVE_NORMAL) return BTRFS_MAX_EXTENT_SIZE; - - ASSERT(0); - return BTRFS_MAX_EXTENT_SIZE; + else if (reserve_type == BTRFS_RESERVE_DEDUPE) + return btrfs_dedupe_blocksize(inode); + else + return BTRFS_MAX_EXTENT_SIZE; } int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, @@ -5899,7 +5913,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret = 0; bool delalloc_lock = true; - u64 max_extent_size = btrfs_max_extent_si
[PATCH v15 08/13] btrfs: ordered-extent: Add support for dedupe
From: Wang Xiaoguang Add ordered-extent support for dedupe. Note, current ordered-extent support only supports non-compressed source extent. Support for compressed source extent will be added later. Signed-off-by: Qu Wenruo Signed-off-by: Wang Xiaoguang Reviewed-by: Josef Bacik --- fs/btrfs/ordered-data.c | 46 + fs/btrfs/ordered-data.h | 13 2 files changed, 55 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 0c4ef208b8b9..4b112258a79b 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -12,6 +12,7 @@ #include "extent_io.h" #include "disk-io.h" #include "compression.h" +#include "dedupe.h" static struct kmem_cache *btrfs_ordered_extent_cache; @@ -170,7 +171,8 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, */ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, - int type, int dio, int compress_type) + int type, int dio, int compress_type, + struct btrfs_dedupe_hash *hash) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; @@ -191,6 +193,33 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->inode = igrab(inode); entry->compress_type = compress_type; entry->truncated_len = (u64)-1; + entry->hash = NULL; + /* +* A hash hit means we have already incremented the extents delayed +* ref. +* We must handle this even if another process is trying to +* turn off dedupe, otherwise we will leak a reference. 
+*/ + if (hash && (hash->bytenr || root->fs_info->dedupe_enabled)) { + struct btrfs_dedupe_info *dedupe_info; + + dedupe_info = root->fs_info->dedupe_info; + if (WARN_ON(dedupe_info == NULL)) { + kmem_cache_free(btrfs_ordered_extent_cache, + entry); + return -EINVAL; + } + entry->hash = btrfs_dedupe_alloc_hash(dedupe_info->hash_algo); + if (!entry->hash) { + kmem_cache_free(btrfs_ordered_extent_cache, entry); + return -ENOMEM; + } + entry->hash->bytenr = hash->bytenr; + entry->hash->num_bytes = hash->num_bytes; + memcpy(entry->hash->hash, hash->hash, + btrfs_hash_sizes[dedupe_info->hash_algo]); + } + if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) set_bit(type, &entry->flags); @@ -245,15 +274,23 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, { return __btrfs_add_ordered_extent(inode, file_offset, start, len, disk_len, type, 0, - BTRFS_COMPRESS_NONE); + BTRFS_COMPRESS_NONE, NULL); } +int btrfs_add_ordered_extent_dedupe(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type, + struct btrfs_dedupe_hash *hash) +{ + return __btrfs_add_ordered_extent(inode, file_offset, start, len, + disk_len, type, 0, + BTRFS_COMPRESS_NONE, hash); +} int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type) { return __btrfs_add_ordered_extent(inode, file_offset, start, len, disk_len, type, 1, - BTRFS_COMPRESS_NONE); + BTRFS_COMPRESS_NONE, NULL); } int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, @@ -262,7 +299,7 @@ int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, { return __btrfs_add_ordered_extent(inode, file_offset, start, len, disk_len, type, 0, - compress_type); + compress_type, NULL); } /* @@ -444,6 +481,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) list_del(&sum->list); kfree(sum); } + kfree(entry->hash); kmem_cache_free(btrfs_ordered_extent_cache, entry); } } diff --git 
a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 02d813aaa261..08c7ee986bb9 100644 --- a/fs/b
[PATCH v15 01/13] btrfs: dedupe: Introduce dedupe framework and its header
From: Wang Xiaoguang Introduce the header for btrfs in-band(write time) de-duplication framework and needed header. The new de-duplication framework is going to support 2 different dedupe methods and 1 dedupe hash. Signed-off-by: Qu Wenruo Signed-off-by: Wang Xiaoguang Signed-off-by: Lu Fengqi --- fs/btrfs/ctree.h | 7 ++ fs/btrfs/dedupe.h | 128 - fs/btrfs/disk-io.c | 1 + include/uapi/linux/btrfs.h | 34 ++ 4 files changed, 168 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 53af9f5253f4..741ef21a6185 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1125,6 +1125,13 @@ struct btrfs_fs_info { spinlock_t ref_verify_lock; struct rb_root block_tree; #endif + + /* +* Inband de-duplication related structures +*/ + unsigned long dedupe_enabled:1; + struct btrfs_dedupe_info *dedupe_info; + struct mutex dedupe_ioctl_lock; }; static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h index 90281a7a35a8..222ce7b4d827 100644 --- a/fs/btrfs/dedupe.h +++ b/fs/btrfs/dedupe.h @@ -6,7 +6,131 @@ #ifndef BTRFS_DEDUPE_H #define BTRFS_DEDUPE_H -/* later in-band dedupe will expand this struct */ -struct btrfs_dedupe_hash; +#include +/* 32 bytes for SHA256 */ +static const int btrfs_hash_sizes[] = { 32 }; + +/* + * For caller outside of dedupe.c + * + * Different dedupe backends should have their own hash structure + */ +struct btrfs_dedupe_hash { + u64 bytenr; + u32 num_bytes; + + /* last field is a variable length array of dedupe hash */ + u8 hash[]; +}; + +struct btrfs_dedupe_info { + /* dedupe blocksize */ + u64 blocksize; + u16 backend; + u16 hash_algo; + + struct crypto_shash *dedupe_driver; + + /* +* Use mutex to portect both backends +* Even for in-memory backends, the rb-tree can be quite large, +* so mutex is better for such use case. 
+*/ + struct mutex lock; + + /* following members are only used in in-memory backend */ + struct rb_root hash_root; + struct rb_root bytenr_root; + struct list_head lru_list; + u64 limit_nr; + u64 current_nr; +}; + +static inline int btrfs_dedupe_hash_hit(struct btrfs_dedupe_hash *hash) +{ + return (hash && hash->bytenr); +} + +/* + * Initial inband dedupe info + * Called at dedupe enable time. + * + * Return 0 for success + * Return <0 for any error + * (from unsupported param to tree creation error for some backends) + */ +int btrfs_dedupe_enable(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dedupe_args *dargs); + +/* + * Disable dedupe and invalidate all its dedupe data. + * Called at dedupe disable time. + * + * Return 0 for success + * Return <0 for any error + * (tree operation error for some backends) + */ +int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info); + +/* + * Get current dedupe status. + * Return 0 for success + * No possible error yet + */ +void btrfs_dedupe_status(struct btrfs_fs_info *fs_info, +struct btrfs_ioctl_dedupe_args *dargs); + +/* + * Calculate hash for dedupe. + * Caller must ensure [start, start + dedupe_bs) has valid data. + * + * Return 0 for success + * Return <0 for any error + * (error from hash codes) + */ +int btrfs_dedupe_calc_hash(struct btrfs_fs_info *fs_info, + struct inode *inode, u64 start, + struct btrfs_dedupe_hash *hash); + +/* + * Search for duplicated extents by calculated hash + * Caller must call btrfs_dedupe_calc_hash() first to get the hash. + * + * @inode: the inode for we are writing + * @file_pos: offset inside the inode + * As we will increase extent ref immediately after a hash match, + * we need @file_pos and @inode in this case. + * + * Return > 0 for a hash match, and the extent ref will be + * *INCREASED*, and hash->bytenr/num_bytes will record the existing + * extent data. + * Return 0 for a hash miss. 
Nothing is done + * Return <0 for any error + * (tree operation error for some backends) + */ +int btrfs_dedupe_search(struct btrfs_fs_info *fs_info, + struct inode *inode, u64 file_pos, + struct btrfs_dedupe_hash *hash); + +/* + * Add a dedupe hash into dedupe info + * Return 0 for success + * Return <0 for any error + * (tree operation error for some backends) + */ +int btrfs_dedupe_add(struct btrfs_fs_info *fs_info, +struct btrfs_dedupe_hash *hash); + +/* + * Remove a dedupe hash from dedupe info + * Return 0 for success + * Return <0 for any error + * (tree operation error for some backends) + * + * NOTE: if hash deletion error is not handled well, it will lead + * to corrupted fs, as later dedupe write can points to non-exist
[PATCH v15 06/13] btrfs: dedupe: Introduce function to search for an existing hash
From: Wang Xiaoguang Introduce static function inmem_search() to handle the job for in-memory hash tree. The trick is, we must ensure the delayed ref head is not being run at the time we search the for the hash. With inmem_search(), we can implement the btrfs_dedupe_search() interface. Signed-off-by: Qu Wenruo Signed-off-by: Wang Xiaoguang Reviewed-by: Josef Bacik Signed-off-by: Lu Fengqi --- fs/btrfs/dedupe.c | 210 +- 1 file changed, 209 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c index 951fefd19fde..9c6152b7f0eb 100644 --- a/fs/btrfs/dedupe.c +++ b/fs/btrfs/dedupe.c @@ -7,6 +7,8 @@ #include "dedupe.h" #include "btrfs_inode.h" #include "delayed-ref.h" +#include "qgroup.h" +#include "transaction.h" struct inmem_hash { struct rb_node hash_node; @@ -242,7 +244,6 @@ static int inmem_add(struct btrfs_dedupe_info *dedupe_info, struct inmem_hash *ihash; ihash = inmem_alloc_hash(algo); - if (!ihash) return -ENOMEM; @@ -436,3 +437,210 @@ int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info) kfree(dedupe_info); return 0; } + +/* + * Caller must ensure the corresponding ref head is not being run. 
+ */ +static struct inmem_hash * +inmem_search_hash(struct btrfs_dedupe_info *dedupe_info, u8 *hash) +{ + struct rb_node **p = &dedupe_info->hash_root.rb_node; + struct rb_node *parent = NULL; + struct inmem_hash *entry = NULL; + u16 hash_algo = dedupe_info->hash_algo; + int hash_len = btrfs_hash_sizes[hash_algo]; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct inmem_hash, hash_node); + + if (memcmp(hash, entry->hash, hash_len) < 0) { + p = &(*p)->rb_left; + } else if (memcmp(hash, entry->hash, hash_len) > 0) { + p = &(*p)->rb_right; + } else { + /* Found, need to re-add it to LRU list head */ + list_del(&entry->lru_list); + list_add(&entry->lru_list, &dedupe_info->lru_list); + return entry; + } + } + return NULL; +} + +static int inmem_search(struct btrfs_dedupe_info *dedupe_info, + struct inode *inode, u64 file_pos, + struct btrfs_dedupe_hash *hash) +{ + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_head *insert_head; + struct btrfs_delayed_data_ref *insert_dref; + struct btrfs_qgroup_extent_record *insert_qrecord = NULL; + struct inmem_hash *found_hash; + int free_insert = 1; + int qrecord_inserted = 0; + u64 ref_root = root->root_key.objectid; + u64 bytenr; + u32 num_bytes; + + insert_head = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); + if (!insert_head) + return -ENOMEM; + insert_head->extent_op = NULL; + + insert_dref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); + if (!insert_dref) { + kmem_cache_free(btrfs_delayed_ref_head_cachep, insert_head); + return -ENOMEM; + } + if (test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) && + is_fstree(ref_root)) { + insert_qrecord = kmalloc(sizeof(*insert_qrecord), GFP_NOFS); + if (!insert_qrecord) { + kmem_cache_free(btrfs_delayed_ref_head_cachep, + insert_head); + 
kmem_cache_free(btrfs_delayed_data_ref_cachep, + insert_dref); + return -ENOMEM; + } + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto free_mem; + } + +again: + mutex_lock(&dedupe_info->lock); + found_hash = inmem_search_hash(dedupe_info, hash->hash); + /* If we don't find a duplicated extent, just return. */ + if (!found_hash) { + ret = 0; + goto out; + } + bytenr = found_hash->bytenr; + num_bytes = found_hash->num_bytes; + + btrfs_init_delayed_ref_head(insert_head, insert_qrecord, bytenr, + num_bytes, ref_root, 0, BTRFS_ADD_DELAYED_REF, true, + false); + + btrfs_init_delayed_ref_common(trans->fs_info, &insert_dref->node, + bytenr, num_bytes, ref_root, BTRFS_ADD_DELAYED_REF,
[PATCH v3] btrfs: fix qgroup_free wrong num_bytes in btrfs_subvolume_reserve_metadata()
After btrfs_qgroup_reserve_meta_prealloc(), num_bytes will be assigned again by btrfs_calc_trans_metadata_size(). Once block_rsv fails, we can't properly free the num_bytes of the previous qgroup_reserve. Use a separate variable to store the num_bytes of the qgroup_reserve. Delete the comment for the qgroup_reserved that does not exist and add a comment about use_global_rsv. Fixes: c4c129db5da8 ("btrfs: drop unused parameter qgroup_reserved") Signed-off-by: Lu Fengqi --- Changelog: v2->v3: update the subject and commit message to reflect this is a fixes v1->v2: break the line that exceed 80 char fs/btrfs/extent-tree.c | 17 - 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index de6f75f5547b..2d9074295d7f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5800,7 +5800,7 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) * root: the root of the parent directory * rsv: block reservation * items: the number of items that we need do reservation - * qgroup_reserved: used to return the reserved size in qgroup + * use_global_rsv: allow fallback to the global block reservation * * This function is used to reserve the space for snapshot/subvolume * creation and deletion. Those operations are different with the @@ -5810,10 +5810,10 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) * the space reservation mechanism in start_transaction(). 
*/ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, -struct btrfs_block_rsv *rsv, -int items, +struct btrfs_block_rsv *rsv, int items, bool use_global_rsv) { + u64 qgroup_num_bytes = 0; u64 num_bytes; int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -5821,12 +5821,11 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { /* One for parent inode, two for dir entries */ - num_bytes = 3 * fs_info->nodesize; - ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true); + qgroup_num_bytes = 3 * fs_info->nodesize; + ret = btrfs_qgroup_reserve_meta_prealloc(root, + qgroup_num_bytes, true); if (ret) return ret; - } else { - num_bytes = 0; } num_bytes = btrfs_calc_trans_metadata_size(fs_info, items); @@ -5838,8 +5837,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (ret == -ENOSPC && use_global_rsv) ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1); - if (ret && num_bytes) - btrfs_qgroup_free_meta_prealloc(root, num_bytes); + if (ret && qgroup_num_bytes) + btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); return ret; } -- 2.18.0 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/5] btrfs: use a separate variable to store the num_bytes of the qgroup_reserve
David Sterba 于2018年8月8日周三 下午9:57写道: > > On Wed, Aug 08, 2018 at 11:04:37AM +0800, Lu Fengqi wrote: > > On Tue, Aug 07, 2018 at 06:19:12PM +0200, David Sterba wrote: > > >On Sat, Aug 04, 2018 at 09:10:54PM +0800, Lu Fengqi wrote: > > >> After btrfs_qgroup_reserve_meta_prealloc(), num_bytes will be assigned > > >> again by btrfs_calc_trans_metadata_size(). Therefore, once block_rsv > > >> fails, we cannot properly free the num_bytes of the previous > > >> qgroup_reserve. > > > > > >This does not look like a trivial cleanup at all. There was an unused > > >parameter, removed in c4c129db5da8f070147f175 ("btrfs: drop unused > > >parameter qgroup_reserved"), that introduced the bug. This was in this > > >rc1 so it's a regression and I'll consider pushing it to the 4.18 final. > > > > I apologize for the inconvenience. I should add the Fixes tag, and really > > shouldn't mix it into the trivial cleanup patch set. > > As the bug does not qualify as urgent regression, I'm not going to > forward it to 4.18. Please update the subject and changelog so it > reflects that's an actual fix. I'll add it to the 4.19 queue then. > Thanks. No problem. I will send it tomorrow. - Thanks, Lu > -- > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 0/4] undelete subvolume online version
On Wed, Aug 08, 2018 at 02:11:24PM +0800, Qu Wenruo wrote: > > >On 2018年08月08日 00:39, David Sterba wrote: >> On Sun, Aug 05, 2018 at 06:39:57PM +0800, Lu Fengqi wrote: >>> This patchset will add the BTRFS_IOC_SUBVOL_UNDELETE ioctl for online >>> btrfs subvolume undelete. >>> >>> And btrfs subvolume undelete subcommand was added to btrfs-progs. >>> >>> So user can use the following command to recover all the subolume that >>> is left on the device. The recovered subvolume will be link to dir >>> named to . >> >> Hm, I don't agree with the proposed interface - to recover all deleted >> subvolumes. IMO this should recover just one subvolume of a given id a >> to given directory. >> >> The ioctl structure has to be reworked, I've skimmed the code and saw >> some suspicious things but will have a look after the interface is >> settled. > >My concern is, is such purpose really needed? > >Yes, it's possible user made some mistake and want to get back the data. >But putting an ioctl for 'undelete', then user may consider btrfs is so >powerful that can undelete everything. >In short, this undelete feature gives user too high expectation. > >And don't expect user really to read man pages. There are already tons There is no more way about the too high expectation of users. If we provide a feature with a sufficiently detailed man page, but users do not read the man page when using this feature, I can only think that they are not responsible for their own data. So, this seems to be a problem they need to consider. >of reports where user execute btrfs check --repair without realizing >--repair is pretty dangerous (and thanks to the work done to repair, it >normally doesn't cause catastrophic result, but sometimes it indeed >causes extra damage) The good news is that online undelete is not as dangerous as btrfs check --repair. In fact, I think it is safe enough. > >And when user tried and failed due to deleted tree blocks, they will get >even more frustrated or even betrayed. 
As mentioned previously, maybe we should do what we think is right, such as giving the user more ability to protect/recover their data, rather than trying to take care of every sensitive user? > >I prefer to put such undelete as an off-line rescue tool, instead of >making it online with an ioctl interface. I also think that the offline undelete is more useful. After all, unmounting immediately to prevent further data loss is always the most effective action after a mistake. However, since we can give the ability of online undelete to a user who cannot easily umount the filesystem, and it doesn't have any side effect on existing features, IMHO there is no reason to reject this. -- Thanks, Lu -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/5] btrfs: use a separate variable to store the num_bytes of the qgroup_reserve
On Tue, Aug 07, 2018 at 06:19:12PM +0200, David Sterba wrote: >On Sat, Aug 04, 2018 at 09:10:54PM +0800, Lu Fengqi wrote: >> After btrfs_qgroup_reserve_meta_prealloc(), num_bytes will be assigned >> again by btrfs_calc_trans_metadata_size(). Therefore, once block_rsv >> fails, we cannot properly free the num_bytes of the previous >> qgroup_reserve. > >This does not look like a trivial cleanup at all. There was an unused >parameter, removed in c4c129db5da8f070147f175 ("btrfs: drop unused >parameter qgroup_reserved"), that introduced the bug. This was in this >rc1 so it's a regression and I'll consider pushing it to the 4.18 final. > > I apologize for the inconvenience. I should add the Fixes tag, and really shouldn't mix it into the trivial cleanup patch set. -- Thanks, Lu -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 0/4] undelete subvolume online version
On Tue, Aug 07, 2018 at 06:39:50PM +0200, David Sterba wrote: >On Sun, Aug 05, 2018 at 06:39:57PM +0800, Lu Fengqi wrote: >> This patchset will add the BTRFS_IOC_SUBVOL_UNDELETE ioctl for online >> btrfs subvolume undelete. >> >> And btrfs subvolume undelete subcommand was added to btrfs-progs. >> >> So user can use the following command to recover all the subolume that >> is left on the device. The recovered subvolume will be link to dir >> named to . > >Hm, I don't agree with the proposed interface - to recover all deleted >subvolumes. IMO this should recover just one subvolume of a given id a >to given directory. Thank you for taking the time to respond. I may have thought too much about the interface before. In my imagination, the cleaner kthread is like a monster that devours user data at any time, so the user must perform an online undelete operation as soon as possible, so there is no time to determine the subvol_id that should be passed. However, I have to admit that I don't know much about the user's actual usage scenarios, I will accept the interface you provided. Of course, I really like this because it greatly simplifies the ioctl structure. > >The ioctl structure has to be reworked, I've skimmed the code and saw >some suspicious things but will have a look after the interface is >settled. When I rework the ioctl structure, I will carefully recheck the incorrect place in the code. -- Thanks, Lu -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3] btrfs: qgroup: Remove qgroup items along with subvolume deletion
On Mon, Aug 06, 2018 at 01:53:28PM +0900, Misono Tomohiro wrote: >When qgroup is on, subvolume deletion does not remove qgroup items >of the subvolume (qgroup info, limit, relation) from quota tree and >they need to get removed manually by "btrfs qgroup destroy". > >Since level 0 qgroup cannot be used/inherited by any other subvolume, >let's remove them automatically when subvolume is deleted >(to be precise, when the subvolume root is dropped). > >Reviewed-by: Lu Fengqi >Reviewed-by: Qu Wenruo >Signed-off-by: Misono Tomohiro >--- >v2 -> v3: > Use root->root_key.objectid instead of root->objectid > Add Reviewed-by tag > >v1 -> v2: > Move call of btrfs_remove_qgroup() from btrfs_delete_subvolume() > to btrfs_snapshot_destroy() so that it will be called after the > subvolume root is really dropped > > fs/btrfs/extent-tree.c | 16 > 1 file changed, 12 insertions(+), 4 deletions(-) > >diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c >index 9e7b237b9547..48edf839ed2c 100644 >--- a/fs/btrfs/extent-tree.c >+++ b/fs/btrfs/extent-tree.c >@@ -8871,12 +8871,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root, > struct btrfs_root_item *root_item = &root->root_item; > struct walk_control *wc; > struct btrfs_key key; >+ u64 objectid = root->root_key.objectid; > int err = 0; > int ret; > int level; > bool root_dropped = false; > >- btrfs_debug(fs_info, "Drop subvolume %llu", root->objectid); >+ btrfs_debug(fs_info, "Drop subvolume %llu", objectid); > > path = btrfs_alloc_path(); > if (!path) { >@@ -9030,7 +9031,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, > goto out_end_trans; > } > >- if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { >+ if (objectid != BTRFS_TREE_RELOC_OBJECTID) { > ret = btrfs_find_root(tree_root, &root->root_key, path, > NULL, NULL); > if (ret < 0) { >@@ -9043,8 +9044,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, >* >* The most common failure here is just -ENOENT. 
>*/ >- btrfs_del_orphan_item(trans, tree_root, >-root->root_key.objectid); >+ btrfs_del_orphan_item(trans, tree_root, objectid); > } > } > >@@ -9056,6 +9056,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, > btrfs_put_fs_root(root); > } > root_dropped = true; >+ >+ /* Remove level-0 qgroup items since no other subvolume can use them */ >+ ret = btrfs_remove_qgroup(trans, objectid); >+ if (ret && ret != -EINVAL && ret != -ENOENT) { I'm sorry for missing the snapshot case. If it is a snapshot, then when we remove the relevant qgroup, we will not be able to perform quick_update_accounting(), and it will return 1. So we shouldn't abort the transaction when the return value = 1. btrfs_remove_qgroup -> __del_qgroup_relation -> quick_update_accounting << if qgroup->excl != qgroup->rfer; return 1 -- Thanks, Lu >+ btrfs_abort_transaction(trans, ret); >+ err = ret; >+ } >+ > out_end_trans: > btrfs_end_transaction_throttle(trans); > out_free: >-- >2.14.4 > > >-- >To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in >the body of a message to majord...@vger.kernel.org >More majordomo info at http://vger.kernel.org/majordomo-info.html > > -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 00/12] mkfs: Quota support through -R|--runtime quota
On Mon, Jul 30, 2018 at 01:03:00PM +0800, Qu Wenruo wrote: >Ping the 3rd time? > >Or should I just rebase the patchset? Hi Qu Could you rebase this patchset? Because I want to test existing test cases with enabled quota, mkfs_qgroup seems to ease my workload. -- Thanks, Lu > >Thanks, >Qu > >On 2018年03月08日 09:17, Qu Wenruo wrote: >> Ping again. >> >> Since David is planning to merge qgroup patchset, this feature would >> greatly improve test coverage. >> >> Thanks, >> Qu >> >> On 2018年01月11日 14:04, Qu Wenruo wrote: >>> Ping? >>> >>> Or do I need to rebase the patchset? >>> >>> Thanks, >>> Qu >>> >>> On 2017年11月07日 16:42, Qu Wenruo wrote: Can be fetched from github: https://github.com/adam900710/btrfs-progs/tree/mkfs_qgroup This patchset adds quota support, which means the result fs will have quota enabled by default, and its accounting is already consistent, no manually rescan or quota enable is needed. The overall design of such support is: 1) Create needed tree Both btrfs_root and real root item and tree root leaf. For this, a new infrastructure, btrfs_create_tree(), is added for this. 2) Fill quota root with basic skeleton Only 3 items are really needed a) global quota status item b) quota info for specified qgroup c) quota limit for specified qgroup Currently only 0/5 qgroup is passed. If we're going to support extra subvolume at mkfs time, just pass the subvolume id into insert_qgroup_items(). The content doesn't matter at all. 3) Repair qgroups using infrastructure from qgroup-verify In fact, qgroup repair is just offline rescan. Although the original qgroup-verify infrastructure is mostly noisy, modify it a little to make it silent to function as offline quota rescan. And such support is mainly designed for developers and QA guys. As to enable quota, before we must normally mount the fs, enable quota (and rescan if needed). This ioctl based procedure is not common, and fstests doesn't provide such support. 
There are several attempts to make fstests to support it, but due to different reasons, all these attempts failed. To make it easier to test all existing test cases with btrfs quota enabled, the current best method is to support quota at mkfs time, and here comes the patchset. BTW with -R|--runtime-features, we have several possible target to add. Not limited to such ioctl based operation, but also mount option based ones. Like space-cache-tree (space_cache=v2). Qu Wenruo (12): btrfs-progs: qgroup-verify: Also repair qgroup status version btrfs-progs: qgroup-verify: Use fs_info->readonly to check if we should repair qgroups btrfs-progs: qgroup-verify: Move qgroup classification out of report_qgroups btrfs-progs: qgroup-verify: Allow repair_qgroups function to do silent repair btrfs-progs: ctree: Introduce function to create an empty tree btrfs-progs: mkfs: Introduce function to insert qgroup info and limit items ^^^ Above patches are not modified at all ^^^ vvv Modification starts below vvv btrfs-progs: mkfs: Introduce function to setup quota root and rescan btrfs-progs: fsfeatures: Introduce a new set of features, runtime_features btrfs-progs: mkfs: Introduce --runtime-features option btrfs-progs: mkfs: Introduce quota runtime feature btrfs-progs: test/mkfs: Add test case for -R quota option btrfs-progs: test/mkfs: Add test case for --rootdir and -R quota Documentation/mkfs.btrfs.asciidoc | 23 +++ cmds-check.c | 2 +- convert/main.c | 4 +- ctree.c| 109 ++ ctree.h| 3 + fsfeatures.c | 131 ++--- fsfeatures.h | 10 +- mkfs/main.c| 194 ++--- qgroup-verify.c| 51 +-- qgroup-verify.h| 2 +- tests/mkfs-tests/001-basic-profiles/test.sh| 10 ++ tests/mkfs-tests/010-rootdir-and-quota/test.sh | 51 +++ 12 files changed, 529 insertions(+), 61 deletions(-) create mode 100755 tests/mkfs-tests/010-rootdir-and-quota/test.sh >>> >> > -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at 
http://vger.kernel.org/majordomo-info.html
[RFC PATCH 4/4] btrfs: undelete: Add the btrfs_ioctl_undelete
The function will traverse the root from the fs_info->dead_roots and try to call btrfs_undelete_subvolume() to recover them. Note: It will lock fs_info->cleaner_mutex to keep the cleaner kthread from deleting the subvolume which we want to recover. Signed-off-by: Lu Fengqi --- fs/btrfs/ioctl.c | 83 ++ include/uapi/linux/btrfs.h | 9 + 2 files changed, 92 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7a11c4f8e450..83b9839799d0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1980,6 +1980,87 @@ static int btrfs_undelete_subvolume(const struct path *parent, return ret; } +static int btrfs_ioctl_undelete(struct file *file, void __user *argp) +{ + struct btrfs_ioctl_undelete_args __user *uarg; + struct btrfs_ioctl_undelete_args *args; + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_root *root, *tmp; + char *name; + u64 count = 0; + u64 objectid; + int err = 0, ret; + + /* copy search header and buffer size */ + uarg = (struct btrfs_ioctl_undelete_args __user *)argp; + args = memdup_user(uarg, sizeof(*args)); + if (IS_ERR(args)) + return PTR_ERR(args); + args->name[BTRFS_PATH_NAME_MAX] = '\0'; + + name = kzalloc(BTRFS_PATH_NAME_MAX + 1, GFP_KERNEL); + if (IS_ERR(name)) { + err = PTR_ERR(name); + goto free_args; + } + + if (!capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto free; + } + + err = mnt_want_write_file(file); + if (err) + goto free; + + /* Lock cleaner_mutex to prevent the cleaner kthread from deleting the +* subvolume we want to recover so that we can perform the next rescue +* in a relaxed manner. 
+*/ + mutex_lock(&fs_info->cleaner_mutex); + + list_for_each_entry_safe(root, tmp, &fs_info->dead_roots, root_list) { + objectid = root->root_key.objectid; + snprintf(name, BTRFS_PATH_NAME_MAX, "%s%llu", args->name, + objectid); + ret = btrfs_undelete_subvolume(&file->f_path, root, name, + strlen(name)); + if (ret) + continue; + + /* +* Feel free to remove this root from dead_root list since we +* have recover it successfully. +*/ + spin_lock(&fs_info->trans_lock); + list_del_init(&root->root_list); + spin_unlock(&fs_info->trans_lock); + + if ((count + 1) * sizeof(objectid) > args->buf_size) + continue; + + /* copy the subvolume id to user space */ + ret = copy_to_user(&uarg->buf[count], &objectid, + sizeof(objectid)); + if (ret) + err = -EFAULT; + count++; + } + + mutex_unlock(&fs_info->cleaner_mutex); + mnt_drop_write_file(file); + + /* copy the count to user space */ + if (copy_to_user(&uarg->count, &count, sizeof(count))) + err = -EFAULT; +free: + kfree(name); +free_args: + kfree(args); + return err; +} + static noinline int btrfs_ioctl_subvol_getflags(struct file *file, void __user *arg) { @@ -6089,6 +6170,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_get_subvol_rootref(file, argp); case BTRFS_IOC_INO_LOOKUP_USER: return btrfs_ioctl_ino_lookup_user(file, argp); + case BTRFS_IOC_SUBVOL_UNDELETE: + return btrfs_ioctl_undelete(file, argp); } return -ENOTTY; diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 5ca1d21fc4a7..25d030687b27 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -816,6 +816,13 @@ struct btrfs_ioctl_get_subvol_rootref_args { __u8 align[7]; }; +struct btrfs_ioctl_undelete_args { + char name[BTRFS_PATH_NAME_MAX + 1]; /* in - subvolume name prefix */ + __u64 buf_size; /* in - size of buffer */ + __u64 count;/* out - store number of recoverd subvolumes */ + __u64 buf[0]; /* out - store ids of recoverd subolumes */ +}; + /* Error codes as returned by the kernel */ enum 
btrfs_err_code { BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET = 1, @@ -940,5 +947,7 @@ enum btrfs_err_code { struct btrfs_ioctl_get_subvol_rootref_args) #define BTRFS_IOC_INO_LOOKUP_USER _IOWR(BTRFS_IOCTL_MAGIC, 62,
[RFC PATCH 0/4] undelete subvolume online version
This patchset will add the BTRFS_IOC_SUBVOL_UNDELETE ioctl for online btrfs subvolume undelete. And a btrfs subvolume undelete subcommand was added to btrfs-progs. So the user can use the following command to recover all the subvolumes that are left on the device. The recovered subvolumes will be linked to a dir named <name_prefix><subvol_id>. # btrfs subvolume undelete [-p <name_prefix>] <path> btrfs online undelete version: https://github.com/littleroad/linux.git undelete btrfs-progs online undelete version: https://github.com/littleroad/btrfs-progs.git online_undelete Issue: #82 Lu Fengqi (4): btrfs: factor out btrfs_link_subvol from create_subvol btrfs: don't BUG_ON() in btrfs_link_subvol() btrfs: undelete: introduce btrfs_undelete_subvolume btrfs: undelete: Add the btrfs_ioctl_undelete fs/btrfs/ioctl.c | 270 + include/uapi/linux/btrfs.h | 9 ++ 2 files changed, 255 insertions(+), 24 deletions(-) -- 2.18.0 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 1/4] btrfs: factor out btrfs_link_subvol from create_subvol
The function btrfs_link_subvol is responsible to link the subvolume to the specified directory, which is the opposite of what btrfs_unlink_subvol does. No functional change. Signed-off-by: Lu Fengqi --- fs/btrfs/ioctl.c | 65 ++-- 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d3a5d2a41e5f..d37c26f69112 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -542,6 +542,45 @@ int btrfs_is_empty_uuid(u8 *uuid) return 1; } +static int btrfs_link_subvol(struct btrfs_trans_handle *trans, +struct inode *dir, u64 objectid, const char *name, +int namelen) +{ + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_key key; + u64 index = 0; + int ret; + + /* +* insert the directory item +*/ + ret = btrfs_set_inode_index(BTRFS_I(dir), &index); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + key.objectid = objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = -1; + ret = btrfs_insert_dir_item(trans, root, name, namelen, BTRFS_I(dir), + &key, BTRFS_FT_DIR, index); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2); + ret = btrfs_update_inode(trans, root, dir); + BUG_ON(ret); + + ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid, +btrfs_ino(BTRFS_I(dir)), index, name, namelen); + BUG_ON(ret); + + return ret; +} + static noinline int create_subvol(struct inode *dir, struct dentry *dentry, const char *name, int namelen, @@ -563,7 +602,6 @@ static noinline int create_subvol(struct inode *dir, int err; u64 objectid; u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; - u64 index = 0; uuid_le new_uuid; root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); @@ -677,30 +715,9 @@ static noinline int create_subvol(struct inode *dir, new_root->highest_objectid = new_dirid; mutex_unlock(&new_root->objectid_mutex); - /* -* insert the directory item -*/ - ret = btrfs_set_inode_index(BTRFS_I(dir), &index); - if 
(ret) { - btrfs_abort_transaction(trans, ret); - goto fail; - } - - ret = btrfs_insert_dir_item(trans, root, - name, namelen, BTRFS_I(dir), &key, - BTRFS_FT_DIR, index); - if (ret) { - btrfs_abort_transaction(trans, ret); + ret = btrfs_link_subvol(trans, dir, objectid, name, namelen); + if (ret) goto fail; - } - - btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2); - ret = btrfs_update_inode(trans, root, dir); - BUG_ON(ret); - - ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid, -btrfs_ino(BTRFS_I(dir)), index, name, namelen); - BUG_ON(ret); ret = btrfs_uuid_tree_add(trans, root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); -- 2.18.0 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 3/4] btrfs: undelete: introduce btrfs_undelete_subvolume
The function will do the following things which are almost the opposite of what btrfs_delete_subvolume() does: 1. link the subvolume to the parent specified; 2. clear root flag and set root_refs to 1; 3. add the subvol to the uuid_tree; 4. delete the orphan_item. Signed-off-by: Lu Fengqi --- fs/btrfs/ioctl.c | 116 +++ 1 file changed, 116 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e0b5a8fb15e7..7a11c4f8e450 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1864,6 +1864,122 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, return ret; } +static int btrfs_undelete_subvolume(const struct path *parent, + struct btrfs_root *root, + const char *name, int namelen) +{ + struct inode *dir = d_inode(parent->dentry); + struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_root_item *root_item = &root->root_item; + struct btrfs_trans_handle *trans; + struct btrfs_block_rsv block_rsv; + struct dentry *dentry; + struct inode *inode; + u64 root_flags; + int ret; + + btrfs_debug(fs_info, "Undelete subvolume %llu", + root->root_key.objectid); + + /* only care about the intact subvolume */ + if (btrfs_disk_key_objectid(&root_item->drop_progress) != 0) + return 0; + + /* root_refs of destination parent root must not be 0 */ + if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) + return -ENOENT; + + ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); + if (ret == -EINTR) + return ret; + + dentry = lookup_one_len(name, parent->dentry, namelen); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + goto out_unlock; + } + + down_write(&fs_info->subvol_sem); + + ret = btrfs_may_create(dir, dentry); + if (ret) + goto out_up_write; + + ret = btrfs_check_dir_item_collision(root, dir->i_ino, name, namelen); + if (ret) + goto out_up_write; + + btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); + /* +* 1 - parent dir inode +* 2 - dir entries +* 2 - root ref/backref +* 1 - UUID item +*/ + ret = 
btrfs_subvolume_reserve_metadata(root, &block_rsv, 6, false); + if (ret) + goto out_up_write; + + trans = btrfs_start_transaction(BTRFS_I(dir)->root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_subvolume_release_metadata(fs_info, &block_rsv); + goto out_up_write; + } + + trans->block_rsv = &block_rsv; + trans->bytes_reserved = block_rsv.size; + + ret = btrfs_link_subvol(trans, dir, root->root_key.objectid, name, + namelen); + if (ret) + goto fail; + + /* clear BTRFS_ROOT_SUBVOL_DEAD root flag and set root_refs to 1*/ + root_flags = btrfs_root_flags(root_item); + btrfs_set_root_flags(root_item, +root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); + btrfs_set_root_refs(root_item, 1); + ret = btrfs_update_root(trans, fs_info->tree_root, + &root->root_key, &root->root_item); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } + + ret = btrfs_uuid_tree_add(trans, root_item->uuid, BTRFS_UUID_KEY_SUBVOL, + root->root_key.objectid); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } + + ret = btrfs_del_orphan_item(trans, fs_info->tree_root, + root->root_key.objectid); + if (ret && ret != -ENOENT) { + btrfs_abort_transaction(trans, ret); + goto fail; + } +fail: + trans->block_rsv = NULL; + trans->bytes_reserved = 0; + btrfs_subvolume_release_metadata(fs_info, &block_rsv); + ret = btrfs_commit_transaction(trans); + if (!ret) { + inode = btrfs_lookup_dentry(dir, dentry); + if (IS_ERR(inode)) + return PTR_ERR(inode); + d_instantiate(dentry, inode); + fsnotify_mkdir(dir, dentry); + } +out_up_write: + up_write(&fs_info->subvol_sem); + dput(dentry); +out_unlock: + inode_unlock(dir); + return ret; +} + static noinline int btrfs_ioctl_subvol_getflags(struct file *file, void __user *arg) { -- 2.18.0 -- To unsubscribe from th
[RFC PATCH 2/4] btrfs: don't BUG_ON() in btrfs_link_subvol()
Both of btrfs_update_inode() and btrfs_add_root_ref() may fail because of ENOMEM. So there's no reason to panic here, we can replace BUG_ON() with btrfs_abort_transaction() here. Signed-off-by: Lu Fengqi --- fs/btrfs/ioctl.c | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d37c26f69112..e0b5a8fb15e7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -572,11 +572,17 @@ static int btrfs_link_subvol(struct btrfs_trans_handle *trans, btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2); ret = btrfs_update_inode(trans, root, dir); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid, btrfs_ino(BTRFS_I(dir)), index, name, namelen); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } return ret; } -- 2.18.0 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html