[PATCH] Btrfs: fix use-after-free when cloning a trailing file hole
The transaction handle was being used after being freed. Cc: Chris Mason c...@fb.com Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/ioctl.c | 6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 02dc64b..2562dc7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3565,12 +3565,10 @@ process_slot: btrfs_end_transaction(trans, root); goto out; } - ret = clone_finish_inode_update(trans, inode, destoff + len, - destoff, olen); - if (ret) - goto out; clone_update_extent_map(inode, trans, path, NULL, last_dest_end, destoff + len - last_dest_end); + ret = clone_finish_inode_update(trans, inode, destoff + len, + destoff, olen); } out: -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2] Btrfs: fix use-after-free when cloning a trailing file hole
The transaction handle was being used after being freed. Cc: Chris Mason c...@fb.com Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Removed file extent item argument to clone_update_extent_map() for more clarity. fs/btrfs/ioctl.c | 20 +--- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 02dc64b..2a99f49 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3142,7 +3142,6 @@ out: static void clone_update_extent_map(struct inode *inode, const struct btrfs_trans_handle *trans, const struct btrfs_path *path, - struct btrfs_file_extent_item *fi, const u64 hole_offset, const u64 hole_len) { @@ -3157,7 +3156,11 @@ static void clone_update_extent_map(struct inode *inode, return; } - if (fi) { + if (path) { + struct btrfs_file_extent_item *fi; + + fi = btrfs_item_ptr(path-nodes[0], path-slots[0], + struct btrfs_file_extent_item); btrfs_extent_item_to_extent_map(inode, path, fi, false, em); em-generation = -1; if (btrfs_file_extent_type(path-nodes[0], fi) == @@ -3511,18 +3514,15 @@ process_slot: btrfs_item_ptr_offset(leaf, slot), size); inode_add_bytes(inode, datal); - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); } /* If we have an implicit hole (NO_HOLES feature). 
*/ if (drop_start new_key.offset) clone_update_extent_map(inode, trans, - path, NULL, drop_start, + NULL, drop_start, new_key.offset - drop_start); - clone_update_extent_map(inode, trans, path, - extent, 0, 0); + clone_update_extent_map(inode, trans, path, 0, 0); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); @@ -3565,12 +3565,10 @@ process_slot: btrfs_end_transaction(trans, root); goto out; } + clone_update_extent_map(inode, trans, NULL, last_dest_end, + destoff + len - last_dest_end); ret = clone_finish_inode_update(trans, inode, destoff + len, destoff, olen); - if (ret) - goto out; - clone_update_extent_map(inode, trans, path, NULL, last_dest_end, - destoff + len - last_dest_end); } out: -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: atomically set inode->i_flags in btrfs_update_iflags
This change is based on the corresponding recent change for ext4: ext4: atomically set inode->i_flags in ext4_set_inode_flags() That has the following commit message that applies to btrfs as well: Use cmpxchg() to atomically set i_flags instead of clearing out the S_IMMUTABLE, S_APPEND, etc. flags and then setting them from the EXT4_IMMUTABLE_FL, EXT4_APPEND_FL flags, since this opens up a race where an immutable file has the immutable flag cleared for a brief window of time. Replacing EXT4_IMMUTABLE_FL and EXT4_APPEND_FL with BTRFS_INODE_IMMUTABLE and BTRFS_INODE_APPEND, respectively. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/ioctl.c | 17 ++--- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6ea1546..02dc64b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -136,19 +136,22 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags) void btrfs_update_iflags(struct inode *inode) { struct btrfs_inode *ip = BTRFS_I(inode); - - inode-i_flags = ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + unsigned int new_fl = 0; if (ip-flags BTRFS_INODE_SYNC) - inode-i_flags |= S_SYNC; + new_fl |= S_SYNC; if (ip-flags BTRFS_INODE_IMMUTABLE) - inode-i_flags |= S_IMMUTABLE; + new_fl |= S_IMMUTABLE; if (ip-flags BTRFS_INODE_APPEND) - inode-i_flags |= S_APPEND; + new_fl |= S_APPEND; if (ip-flags BTRFS_INODE_NOATIME) - inode-i_flags |= S_NOATIME; + new_fl |= S_NOATIME; if (ip-flags BTRFS_INODE_DIRSYNC) - inode-i_flags |= S_DIRSYNC; + new_fl |= S_DIRSYNC; + + set_mask_bits(inode-i_flags, + S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC, + new_fl); } /* -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2] Btrfs: fix crash when starting transaction
Often when starting a transaction we commit the currently running transaction, which can end up writing block group caches when the current process has its journal_info set to NULL (and not to a transaction). This makes our assertion at btrfs_check_data_free_space() (current_journal != NULL) fail, resulting in a crash/hang. Therefore fix it by setting journal_info. Two different traces of this issue follow below. 1) [51502.241936] BTRFS: assertion failed: current-journal_info, file: fs/btrfs/extent-tree.c, line: 3670 [51502.242213] [ cut here ] [51502.242493] kernel BUG at fs/btrfs/ctree.h:3964! [51502.242669] invalid opcode: [#1] SMP DEBUG_PAGEALLOC (...) [51502.244010] Call Trace: [51502.244010] [a02bc025] btrfs_check_data_free_space+0x395/0x3a0 [btrfs] [51502.244010] [a02c3bdc] btrfs_write_dirty_block_groups+0x4ac/0x640 [btrfs] [51502.244010] [a0357a6a] commit_cowonly_roots+0x164/0x226 [btrfs] [51502.244010] [a02d53cd] btrfs_commit_transaction+0x4ed/0xab0 [btrfs] [51502.244010] [8168ec7b] ? _raw_spin_unlock+0x2b/0x40 [51502.244010] [a02d6259] start_transaction+0x459/0x620 [btrfs] [51502.244010] [a02d67ab] btrfs_start_transaction+0x1b/0x20 [btrfs] [51502.244010] [a02d73e1] __unlink_start_trans+0x31/0xe0 [btrfs] [51502.244010] [a02dea67] btrfs_unlink+0x37/0xc0 [btrfs] [51502.244010] [811bb054] ? do_unlinkat+0x114/0x2a0 [51502.244010] [811baebc] vfs_unlink+0xcc/0x150 [51502.244010] [811bb1a0] do_unlinkat+0x260/0x2a0 [51502.244010] [811a9ef4] ? filp_close+0x64/0x90 [51502.244010] [810aaea6] ? trace_hardirqs_on_caller+0x16/0x1e0 [51502.244010] [81349cab] ? 
trace_hardirqs_on_thunk+0x3a/0x3f [51502.244010] [811be9eb] SyS_unlinkat+0x1b/0x40 [51502.244010] [81698452] system_call_fastpath+0x16/0x1b [51502.244010] Code: 0b 55 48 89 e5 0f 0b 55 48 89 e5 0f 0b 55 89 f1 48 c7 c2 71 13 36 a0 48 89 fe 31 c0 48 c7 c7 b8 43 36 a0 48 89 e5 e8 5d b0 32 e1 0f 0b 0f 1f 44 00 00 55 b9 11 00 00 00 48 89 e5 41 55 49 89 f5 [51502.244010] RIP [a03575da] assfail.constprop.88+0x1e/0x20 [btrfs] 2) [25405.097230] BTRFS: assertion failed: current-journal_info, file: fs/btrfs/extent-tree.c, line: 3670 [25405.097488] [ cut here ] [25405.097767] kernel BUG at fs/btrfs/ctree.h:3964! [25405.097940] invalid opcode: [#1] SMP DEBUG_PAGEALLOC (...) [25405.18] Call Trace: [25405.18] [a02bc025] btrfs_check_data_free_space+0x395/0x3a0 [btrfs] [25405.18] [a02c3bdc] btrfs_write_dirty_block_groups+0x4ac/0x640 [btrfs] [25405.18] [a035755a] commit_cowonly_roots+0x164/0x226 [btrfs] [25405.18] [a02d53cd] btrfs_commit_transaction+0x4ed/0xab0 [btrfs] [25405.18] [8109c170] ? bit_waitqueue+0xc0/0xc0 [25405.18] [a02d6259] start_transaction+0x459/0x620 [btrfs] [25405.18] [a02d67ab] btrfs_start_transaction+0x1b/0x20 [btrfs] [25405.18] [a02e3407] btrfs_create+0x47/0x210 [btrfs] [25405.18] [a02d74cc] ? btrfs_permission+0x3c/0x80 [btrfs] [25405.18] [811bc63b] vfs_create+0x9b/0x130 [25405.18] [811bcf19] do_last+0x849/0xe20 [25405.18] [811b9409] ? link_path_walk+0x79/0x820 [25405.18] [811bd5b5] path_openat+0xc5/0x690 [25405.18] [810ab07d] ? trace_hardirqs_on+0xd/0x10 [25405.18] [811cdcd2] ? __alloc_fd+0x32/0x1d0 [25405.18] [811be2a3] do_filp_open+0x43/0xa0 [25405.18] [811cddf1] ? __alloc_fd+0x151/0x1d0 [25405.18] [811abcfc] do_sys_open+0x13c/0x230 [25405.18] [810aaea6] ? 
trace_hardirqs_on_caller+0x16/0x1e0 [25405.18] [811abe12] SyS_open+0x22/0x30 [25405.18] [81698452] system_call_fastpath+0x16/0x1b [25405.18] Code: 0b 55 48 89 e5 0f 0b 55 48 89 e5 0f 0b 55 89 f1 48 c7 c2 51 13 36 a0 48 89 fe 31 c0 48 c7 c7 d0 43 36 a0 48 89 e5 e8 6d b5 32 e1 0f 0b 0f 1f 44 00 00 55 b9 11 00 00 00 48 89 e5 41 55 49 89 f5 [25405.18] RIP [a03570ca] assfail.constprop.88+0x1e/0x20 [btrfs] Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Removed test for current-journal_info == NULL. At this point it's always expected to be NULL. fs/btrfs/transaction.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index ac984a3..614eac3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -491,6 +491,7 @@ again: smp_mb(); if (cur_trans-state = TRANS_STATE_BLOCKED may_wait_transaction(root, type)) { + current-journal_info = h
[PATCH] Btrfs: assert send doesn't attempt to start transactions
When starting a transaction just assert that current->journal_info doesn't contain a send transaction stub, since send isn't supposed to start transactions and when it finishes (either successfully or not) it's supposed to set current->journal_info to NULL. This is motivated by the change titled: Btrfs: fix crash when starting transaction Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/transaction.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 614eac3..47870ca 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -386,11 +386,13 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, bool reloc_reserved = false; int ret; + /* Send isn't supposed to start transactions. */ + ASSERT(current-journal_info != (void *)BTRFS_SEND_TRANS_STUB); + if (test_bit(BTRFS_FS_STATE_ERROR, root-fs_info-fs_state)) return ERR_PTR(-EROFS); - if (current-journal_info - current-journal_info != (void *)BTRFS_SEND_TRANS_STUB) { + if (current-journal_info) { WARN_ON(type TRANS_EXTWRITERS); h = current-journal_info; h-use_count++; -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: implement support for fallocate collapse range
This implements fallocate's FALLOC_FL_COLLAPSE_RANGE operation for BTRFS. This fallocate operation was introduced in the linux kernel version 3.15. Existing tests in xfstests already test this operation explicitly and implicitly via fsstress. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/ctree.c | 42 - fs/btrfs/ctree.h | 2 + fs/btrfs/extent-tree.c | 30 +-- fs/btrfs/file.c| 486 + 4 files changed, 453 insertions(+), 107 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index aeab453..8f1a371 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2825,12 +2825,12 @@ cow_done: * It is safe to drop the lock on our parent before we * go through the expensive btree search on b. * -* If we're inserting or deleting (ins_len != 0), then we might -* be changing slot zero, which may require changing the parent. -* So, we can't drop the lock until after we know which slot -* we're operating on. +* If we're inserting, deleting or updating a key (cow != 0), +* then we might be changing slot zero, which may require +* changing the parent. So, we can't drop the lock until after +* we know which slot we're operating on. */ - if (!ins_len !p-keep_locks) { + if (!cow !p-keep_locks) { int u = level + 1; if (u BTRFS_MAX_LEVEL p-locks[u]) { @@ -2865,7 +2865,7 @@ cow_done: * which means we must have a write lock * on the parent */ - if (slot == 0 ins_len + if (slot == 0 cow write_lock_level level + 1) { write_lock_level = level + 1; btrfs_release_path(p); @@ -5660,6 +5660,36 @@ next: } /* + * This differs from btrfs_find_next_key in that it ignores leaf/node + * generations and it doesn't unlock and re-lock nodes/leaves nor does + * any subsequent searches (calls to btrfs_search_slot), preserving the + * locks in the given path. + * + * Returns 0 if a next key exists, 1 otherwise. 
+ */ +int btrfs_find_next_current_key(struct btrfs_path *path, int level, + struct btrfs_key *key) + +{ + for (; level BTRFS_MAX_LEVEL; level++) { + if (!path-nodes[level]) + break; + if (path-slots[level] + 1 = + btrfs_header_nritems(path-nodes[level])) + continue; + if (level == 0) + btrfs_item_key_to_cpu(path-nodes[level], key, + path-slots[level] + 1); + else + btrfs_node_key_to_cpu(path-nodes[level], key, + path-slots[level] + 1); + return 0; + } + return 1; +} + + +/* * search the tree again to find a leaf with greater keys * returns 0 if it found something or 1 if there are no greater leaves. * returns 0 on io errors. diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b7e2c1c..166a35f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3446,6 +3446,8 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, int lowest_level, u64 min_trans); +int btrfs_find_next_current_key(struct btrfs_path *path, int level, + struct btrfs_key *key); int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, struct btrfs_path *path, u64 min_trans); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fafb3e5..a6d0ec7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -100,8 +100,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 flags, int force); -static int find_next_key(struct btrfs_path *path, int level, -struct btrfs_key *key); static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups); static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, @@ -440,7 +438,7 @@ next: if (path-slots[0] nritems) { btrfs_item_key_to_cpu(leaf, key, path-slots[0]); } else { - ret = find_next_key(path, 0
[PATCH] generic/017: skip invalid block sizes for btrfs
In btrfs the block size (called sector size in btrfs) can not be smaller than the page size. Therefore skip block sizes smaller than page size if the fs is btrfs, so that the test can succeed on btrfs (testing only with block sizes of 4kb on systems with a page size of 4Kb). Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- tests/generic/017 | 8 1 file changed, 8 insertions(+) diff --git a/tests/generic/017 b/tests/generic/017 index 13b7254..6495be5 100755 --- a/tests/generic/017 +++ b/tests/generic/017 @@ -51,6 +51,14 @@ BLOCKS=10240 for (( BSIZE = 1024; BSIZE = 4096; BSIZE *= 2 )); do + # btrfs doesn't support block size smaller then page size + if [ $FSTYP == btrfs ]; then + if (( $BSIZE `getconf PAGE_SIZE` )); then + echo 80 + continue + fi + fi + length=$(($BLOCKS * $BSIZE)) case $FSTYP in xfs) -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/6 v5] Btrfs: send, implement total data size command to allow for progress estimation
This new send flag makes send calculate first the amount of new file data (in bytes) the send root has relative to the parent root, or for the case of a non-incremental send, the total amount of file data the stream will create (including holes and prealloc extents). In other words, it computes the sum of the lengths of all write, clone and fallocate operations that will be sent through the send stream. This data size value is sent in a new command, named BTRFS_SEND_C_TOTAL_DATA_SIZE, that immediately follows a BTRFS_SEND_C_SUBVOL or BTRFS_SEND_C_SNAPSHOT command, and precedes any command that changes a file or the filesystem hierarchy. Upon receiving a write, clone or fallocate command, the receiving end can increment a counter by the data length of that command and therefore report progress by comparing the counter's value with the data size value received in the BTRFS_SEND_C_TOTAL_DATA_SIZE command. The approach is simple, before the normal operation of send, do a scan in the file system tree for new inodes and new/changed file extent items, just like in send's normal operation, and keep incrementing a counter with new inodes' size and the size of file extents (and file holes) that are going to be written, cloned or fallocated. This is actually a simpler and more lightweight tree scan/processing than the one we do when sending the changes, as it doesn't process inode references nor does any lookups in the extent tree for example. 
After modifying btrfs-progs to understand this new command and report progress, here's an example (the -o flag tells btrfs send to pass the new flag to the kernel's send ioctl): $ btrfs send -s --stream-version 2 /mnt/sdd/snap_base | btrfs receive /mnt/sdc At subvol /mnt/sdd/snap_base At subvol snap_base About to receive 9212392667 bytes Subvolume /mnt/sdc//snap_base, 4059722426 / 9212392667 bytes received, 44.07%, 40.32MB/s $ btrfs send -s --stream-version 2 -p /mnt/sdd/snap_base /mnt/sdd/snap_incr | btrfs receive /mnt/sdc At subvol /mnt/sdd/snap_incr At subvol snap_incr About to receive 9571342213 bytes Subvolume /mnt/sdc//snap_incr, 6557345221 / 9571342213 bytes received, 68.51%, 51.04MB/s Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: A v2 stream is now only produced if the send ioctl caller passes in one of the new flags (BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE | BTRFS_SEND_FLAG_SUPPORT_FALLOCATE) to avoid breaking old clients. V3: Removed BTRFS_SEND_FLAG_SUPPORT_FALLOCATE and added BTRFS_SEND_FLAG_STREAM_V2, added commands for inode set flags and otime. V4: There's no v4, bumped directly to v5 to make all patches in the series have the same version. V5: Rebased against latest chris/integration branch. 
fs/btrfs/send.c | 194 ++-- 1 file changed, 162 insertions(+), 32 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d7ef14b..dd6f5ec 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -81,7 +81,13 @@ struct clone_root { #define SEND_CTX_MAX_NAME_CACHE_SIZE 128 #define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2) +enum btrfs_send_phase { + SEND_PHASE_STREAM_CHANGES, + SEND_PHASE_COMPUTE_DATA_SIZE, +}; + struct send_ctx { + enum btrfs_send_phase phase; struct file *send_filp; loff_t send_off; char *send_buf; @@ -116,6 +122,7 @@ struct send_ctx { u64 cur_inode_last_extent; u64 send_progress; + u64 total_data_size; struct list_head new_refs; struct list_head deleted_refs; @@ -696,6 +703,8 @@ static int send_rename(struct send_ctx *sctx, { int ret; + ASSERT(sctx-phase != SEND_PHASE_COMPUTE_DATA_SIZE); + verbose_printk(btrfs: send_rename %s - %s\n, from-start, to-start); ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME); @@ -720,6 +729,8 @@ static int send_link(struct send_ctx *sctx, { int ret; + ASSERT(sctx-phase != SEND_PHASE_COMPUTE_DATA_SIZE); + verbose_printk(btrfs: send_link %s - %s\n, path-start, lnk-start); ret = begin_cmd(sctx, BTRFS_SEND_C_LINK); @@ -743,6 +754,8 @@ static int send_unlink(struct send_ctx *sctx, struct fs_path *path) { int ret; + ASSERT(sctx-phase != SEND_PHASE_COMPUTE_DATA_SIZE); + verbose_printk(btrfs: send_unlink %s\n, path-start); ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK); @@ -765,6 +778,8 @@ static int send_rmdir(struct send_ctx *sctx, struct fs_path *path) { int ret; + ASSERT(sctx-phase != SEND_PHASE_COMPUTE_DATA_SIZE); + verbose_printk(btrfs: send_rmdir %s\n, path-start); ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR); @@ -2325,6 +2340,9 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) int ret = 0; struct fs_path *p; + if (sctx-phase == SEND_PHASE_COMPUTE_DATA_SIZE) + return 0
[PATCH 3/6 v5] Btrfs: send, use fallocate command to punch holes
Instead of sending a write command with a data buffer filled with 0 value bytes, use the fallocate command, introduced in the send stream version 2, to tell the receiver to punch a file hole using the fallocate system call. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: A v2 stream is now only produced if the send ioctl caller passes in one of the new flags (BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE | BTRFS_SEND_FLAG_SUPPORT_FALLOCATE) to avoid breaking old clients. V3: Added missing path allocation, messed up rebase. V4: Removed BTRFS_SEND_FLAG_SUPPORT_FALLOCATE and added BTRFS_SEND_FLAG_STREAM_V2, added commands for inode set flags and otime. V5: Rebased against latest chris/integration branch. fs/btrfs/send.c | 55 --- fs/btrfs/send.h | 4 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index dd6f5ec..300eaee 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -569,6 +569,7 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) return tlv_put(sctx, attr, __tmp, sizeof(__tmp)); \ } +TLV_PUT_DEFINE_INT(32) TLV_PUT_DEFINE_INT(64) static int tlv_put_string(struct send_ctx *sctx, u16 attr, @@ -4500,18 +4501,59 @@ out: return ret; } +static int send_fallocate(struct send_ctx *sctx, u32 flags, + u64 offset, u64 len) +{ + struct fs_path *p = NULL; + int ret = 0; + + ASSERT(sctx-flags BTRFS_SEND_FLAG_STREAM_V2); + + if (sctx-phase == SEND_PHASE_COMPUTE_DATA_SIZE) { + sctx-total_data_size += len; + return 0; + } + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + ret = get_cur_path(sctx, sctx-cur_ino, sctx-cur_inode_gen, p); + if (ret 0) + goto out; + + ret = begin_cmd(sctx, BTRFS_SEND_C_FALLOCATE); + if (ret 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U32(sctx, BTRFS_SEND_A_FALLOCATE_FLAGS, flags); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len); + ret = send_cmd(sctx); + +tlv_put_failure: +out: + 
fs_path_free(p); + return ret; +} + static int send_hole(struct send_ctx *sctx, u64 end) { struct fs_path *p = NULL; u64 offset = sctx-cur_inode_last_extent; - u64 len; + u64 len = end - offset; int ret = 0; if (sctx-phase == SEND_PHASE_COMPUTE_DATA_SIZE) { - sctx-total_data_size += end - offset; + sctx-total_data_size += len; return 0; } + if (sctx-flags BTRFS_SEND_FLAG_STREAM_V2) + return send_fallocate(sctx, + BTRFS_SEND_PUNCH_HOLE_FALLOC_FLAGS, + offset, + len); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -4568,7 +4610,8 @@ static int send_write_or_clone(struct send_ctx *sctx, len = btrfs_file_extent_num_bytes(path-nodes[0], ei); } - if (offset + len sctx-cur_inode_size) + if (offset sctx-cur_inode_size + offset + len sctx-cur_inode_size) len = sctx-cur_inode_size - offset; if (len == 0) { ret = 0; @@ -4585,6 +4628,12 @@ static int send_write_or_clone(struct send_ctx *sctx, ret = send_clone(sctx, offset, len, clone_root); } else if (sctx-flags BTRFS_SEND_FLAG_NO_FILE_DATA) { ret = send_update_extent(sctx, offset, len); + } else if (btrfs_file_extent_disk_bytenr(path-nodes[0], ei) == 0 + type != BTRFS_FILE_EXTENT_INLINE + (sctx-flags BTRFS_SEND_FLAG_STREAM_V2) + offset sctx-cur_inode_size) { + ret = send_fallocate(sctx, BTRFS_SEND_PUNCH_HOLE_FALLOC_FLAGS, +offset, len); } else { while (pos len) { l = len - pos; diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 96f583c..987936c 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -148,6 +148,10 @@ enum { #define BTRFS_SEND_A_FALLOCATE_FLAG_KEEP_SIZE (1 0) #define BTRFS_SEND_A_FALLOCATE_FLAG_PUNCH_HOLE (1 1) +#define BTRFS_SEND_PUNCH_HOLE_FALLOC_FLAGS\ + (BTRFS_SEND_A_FALLOCATE_FLAG_KEEP_SIZE | \ +BTRFS_SEND_A_FALLOCATE_FLAG_PUNCH_HOLE) + #ifdef __KERNEL__ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg); #endif -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at 
http://vger.kernel.org/majordomo-info.html
[PATCH 4/6 v5] Btrfs: send, use fallocate command to allocate extents
The send stream version 2 adds the fallocate command, which can be used to allocate extents for a file or punch holes in a file. Previously we were ignoring file prealloc extents or treating them as extents filled with 0 bytes and sending a regular write command to the stream. After this change, together with my previous change titled: Btrfs: send, use fallocate command to punch holes an incremental send preserves the hole and data structure of files, which can be seen via calls to lseek with the whence parameter set to SEEK_DATA or SEEK_HOLE, as the example below shows: mkfs.btrfs -f /dev/sdc mount /dev/sdc /mnt xfs_io -f -c pwrite -S 0x01 -b 30 0 30 /mnt/foo btrfs subvolume snapshot -r /mnt /mnt/mysnap1 xfs_io -c fpunch 10 5 /mnt/foo xfs_io -c falloc 10 5 /mnt/foo xfs_io -c pwrite -S 0xff -b 1000 12 1000 /mnt/foo xfs_io -c fpunch 25 2 /mnt/foo # prealloc extents that start beyond the inode's size xfs_io -c falloc -k 30 100 /mnt/foo xfs_io -c falloc -k 900 200 /mnt/foo btrfs subvolume snapshot -r /mnt /mnt/mysnap2 btrfs send /mnt/mysnap1 -f /tmp/1.snap btrfs send -p /mnt/mysnap1 /mnt/mysnap2 -f /tmp/2.snap mkfs.btrfs -f /dev/sdd mount /dev/sdd /mnt2 btrfs receive /mnt2 -f /tmp/1.snap btrfs receive /mnt2 -f /tmp/2.snap Before this change the hole/data structure differed between both filesystems: $ xfs_io -r -c 'seek -r -a 0' /mnt/mysnap2/foo Whence Result DATA0 HOLE102400 DATA118784 HOLE122880 DATA147456 HOLE253952 DATA266240 HOLE30 $ xfs_io -r -c 'seek -r -a 0' /mnt2/mysnap2/foo Whence Result DATA0 HOLE30 After this change the second filesystem (/dev/sdd) ends up with the same hole/data structure as the first filesystem. Also, after this change, prealloc extents that lie beyond the inode's size (were allocated with fallocate + keep size flag) are also replicated by an incremental send. 
For the above test, it can be observed via fiemap (or btrfs-debug-tree): $ xfs_io -r -c 'fiemap -l' /mnt2/mysnap2/foo 0: [0..191]: 25096..25287 192 blocks 1: [192..199]: 24672..24679 8 blocks 2: [200..231]: 24584..24615 32 blocks 3: [232..239]: 24680..24687 8 blocks 4: [240..287]: 24616..24663 48 blocks 5: [288..295]: 24688..24695 8 blocks 6: [296..487]: 25392..25583 192 blocks 7: [488..495]: 24696..24703 8 blocks 8: [496..519]: hole 24 blocks 9: [520..527]: 24704..24711 8 blocks 10: [528..583]: 25624..25679 56 blocks 11: [584..591]: 24712..24719 8 blocks 12: [592..2543]: 26192..28143 1952 blocks 13: [2544..17575]: hole 15032 blocks 14: [17576..21487]: 28144..32055 3912 blocks The test for xfstests was already merged (btrfs/047) that verifies that a send stream version 2 does space pre-allocation and hole punching. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Added new send ioctl flag BTRFS_SEND_FLAG_SUPPORT_FALLOCATE. A version 2 stream is now only produced if the ioctl caller specifies at least one of the new send flags (BTRFS_SEND_FLAG_SUPPORT_FALLOCATE or BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE). V3: Fixed rebase, removed some duplicate logic on truncate + falloc -k. V4: Removed BTRFS_SEND_FLAG_SUPPORT_FALLOCATE and added BTRFS_SEND_FLAG_STREAM_V2, added commands for inode set flags and otime. V5: Rebased against latest chris/integration branch and updated commit message. 
fs/btrfs/send.c | 78 + 1 file changed, 57 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 300eaee..873eeb1 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -113,9 +113,10 @@ struct send_ctx { */ u64 cur_ino; u64 cur_inode_gen; - int cur_inode_new; - int cur_inode_new_gen; - int cur_inode_deleted; + u8 cur_inode_new:1; + u8 cur_inode_new_gen:1; + u8 cur_inode_skip_truncate:1; + u8 cur_inode_deleted:1; u64 cur_inode_size; u64 cur_inode_mode; u64 cur_inode_rdev; @@ -4580,6 +4581,19 @@ tlv_put_failure: return ret; } +static int truncate_before_falloc(struct send_ctx *sctx) +{ + int ret = 0; + + if (!sctx-cur_inode_skip_truncate) { + ret = send_truncate(sctx, sctx-cur_ino, + sctx-cur_inode_gen, + sctx-cur_inode_size); + sctx-cur_inode_skip_truncate = 1; + } + return ret; +} + static int send_write_or_clone(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key, @@ -4619,8 +4633,7 @@ static int send_write_or_clone(struct send_ctx *sctx
[PATCH 1/6 v5] Btrfs: send, bump stream version
This increases the send stream version from version 1 to version 2, adding new commands: 1) total data size - used to tell the receiver how much file data the stream will add or update; 2) fallocate - used to pre-allocate space for files and to punch holes in files; 3) inode set flags; 4) set inode otime. This is preparation work for subsequent changes that implement the new features. A version 2 stream is only produced if the send ioctl caller passes in one of the new flags (BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE | BTRFS_SEND_FLAG_STREAM_V2), meaning old clients are unaffected. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: A v2 stream is now only produced if the send ioctl caller passes in one of the new flags (BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE | BTRFS_SEND_FLAG_SUPPORT_FALLOCATE) to avoid breaking old clients. V3: Removed BTRFS_SEND_FLAG_SUPPORT_FALLOCATE and added BTRFS_SEND_FLAG_STREAM_V2, added commands for inode set flags and otime. V4: There's no v4, bumped directly to v5 to make all patches in the series have the same version. V5: Rebased against latest chris/integration branch. 
fs/btrfs/send.c| 7 ++- fs/btrfs/send.h| 21 - include/uapi/linux/btrfs.h | 21 - 3 files changed, 46 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 6528aa6..d7ef14b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -637,7 +637,10 @@ static int send_header(struct send_ctx *sctx) struct btrfs_stream_header hdr; strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); - hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION); + if (sctx-flags BTRFS_SEND_FLAG_STREAM_V2) + hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION_2); + else + hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION_1); return write_buf(sctx-send_filp, hdr, sizeof(hdr), sctx-send_off); @@ -5572,6 +5575,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) INIT_LIST_HEAD(sctx-name_cache_list); sctx-flags = arg-flags; + if (sctx-flags BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE) + sctx-flags |= BTRFS_SEND_FLAG_STREAM_V2; sctx-send_filp = fget(arg-send_fd); if (!sctx-send_filp) { diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 48d425a..96f583c 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -20,7 +20,8 @@ #include ctree.h #define BTRFS_SEND_STREAM_MAGIC btrfs-stream -#define BTRFS_SEND_STREAM_VERSION 1 +#define BTRFS_SEND_STREAM_VERSION_1 1 +#define BTRFS_SEND_STREAM_VERSION_2 2 #define BTRFS_SEND_BUF_SIZE (1024 * 64) #define BTRFS_SEND_READ_SIZE (1024 * 48) @@ -87,6 +88,15 @@ enum btrfs_send_cmd { BTRFS_SEND_C_END, BTRFS_SEND_C_UPDATE_EXTENT, + + /* +* The following commands were added in stream version 2. +*/ + BTRFS_SEND_C_TOTAL_DATA_SIZE, + BTRFS_SEND_C_FALLOCATE, + BTRFS_SEND_C_INODE_SET_FLAGS, + BTRFS_SEND_C_UTIMES2, /* Same as UTIMES, but it includes OTIME too. */ + __BTRFS_SEND_C_MAX, }; #define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1) @@ -125,10 +135,19 @@ enum { BTRFS_SEND_A_CLONE_OFFSET, BTRFS_SEND_A_CLONE_LEN, + /* +* The following attributes were added in stream version 2. 
+*/ + BTRFS_SEND_A_FALLOCATE_FLAGS, /* 32 bits */ + BTRFS_SEND_A_INODE_FLAGS, /* 32 bits */ + __BTRFS_SEND_A_MAX, }; #define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1) +#define BTRFS_SEND_A_FALLOCATE_FLAG_KEEP_SIZE (1 0) +#define BTRFS_SEND_A_FALLOCATE_FLAG_PUNCH_HOLE (1 1) + #ifdef __KERNEL__ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg); #endif diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 6f9c38c..62440d8 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -477,10 +477,29 @@ struct btrfs_ioctl_received_subvol_args { */ #define BTRFS_SEND_FLAG_OMIT_END_CMD 0x4 +/* + * Calculate the amount (in bytes) of new file data between the send and + * parent snapshots, or in case of a full send, the total amount of file data + * we will send. + * This corresponds to the sum of the data lengths of each write, clone and + * fallocate commands that are sent through the send stream. The receiving end + * can use this information to compute progress. + * + * Added in send stream version 2, and implies producing a version 2 stream. + */ +#define BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE0x8 + +/* + * Used by a client to request a version 2 of the send stream. + */ +#define BTRFS_SEND_FLAG_STREAM_V2 0x10 + #define BTRFS_SEND_FLAG_MASK \ (BTRFS_SEND_FLAG_NO_FILE_DATA | \ BTRFS_SEND_FLAG_OMIT_STREAM_HEADER | \ -BTRFS_SEND_FLAG_OMIT_END_CMD) +BTRFS_SEND_FLAG_OMIT_END_CMD
[PATCH 5/6 v5] Btrfs: add missing cleanup on sysfs init failure
If we failed during initialization of sysfs, we weren't unregistering the top level btrfs sysfs entry nor the debugfs stuff. Not unregistering the top level sysfs entry makes future attempts to reload the btrfs module impossible and the following is reported in dmesg: [ 2246.451296] WARNING: CPU: 3 PID: 10999 at fs/sysfs/dir.c:486 sysfs_warn_dup+0x91/0xb0() [ 2246.451298] sysfs: cannot create duplicate filename '/fs/btrfs' [ 2246.451298] Modules linked in: btrfs(+) raid6_pq xor bnep rfcomm bluetooth binfmt_misc nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache sunrpc parport_pc parport psmouse serio_raw pcspkr evbug i2c_piix4 e1000 floppy [last unloaded: btrfs] [ 2246.451310] CPU: 3 PID: 10999 Comm: modprobe Tainted: GW 3.13.0-fdm-btrfs-next-24+ #7 [ 2246.451311] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 2246.451312] 0009 8800d353fa08 816f1da6 0410 [ 2246.451314] 8800d353fa58 8800d353fa48 8104a32c 88020821a290 [ 2246.451316] 88020821a290 88020821a290 8802148f 8800d353fb80 [ 2246.451318] Call Trace: [ 2246.451322] [816f1da6] dump_stack+0x4e/0x68 [ 2246.451324] [8104a32c] warn_slowpath_common+0x8c/0xc0 [ 2246.451325] [8104a416] warn_slowpath_fmt+0x46/0x50 [ 2246.451328] [81367dc5] ? strlcat+0x65/0x90 () Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V1..V4: There's no v1, v2, v3 and v4. Bumped directly to v5 to make all patches in the series have the same version. V5: Rebased against latest chris/integration branch. 
fs/btrfs/sysfs.c | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index df39458..06ad529 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -710,10 +710,18 @@ int btrfs_init_sysfs(void) ret = btrfs_init_debugfs(); if (ret) - return ret; + goto out1; init_feature_attrs(); ret = sysfs_create_group(btrfs_kset-kobj, btrfs_feature_attr_group); + if (ret) + goto out2; + + return 0; +out2: + debugfs_remove_recursive(btrfs_debugfs_root_dentry); +out1: + kset_unregister(btrfs_kset); return ret; } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 6/6 v5] Btrfs: add send_stream_version attribute to sysfs
So that applications can find out what's the highest send stream version supported/implemented by the running kernel: $ cat /sys/fs/btrfs/send/stream_version 2 Signed-off-by: Filipe David Borba Manana fdman...@gmail.com Reviewed-by: David Sterba dste...@suse.cz --- V1..V4: There's no v1, v2, v3 and v4. Bumped directly to v5 to make all patches in the series have the same version. V5: Rebased against latest chris/integration branch. fs/btrfs/send.h | 1 + fs/btrfs/sysfs.c | 27 +++ 2 files changed, 28 insertions(+) diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 987936c..047fd6d 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -22,6 +22,7 @@ #define BTRFS_SEND_STREAM_MAGIC btrfs-stream #define BTRFS_SEND_STREAM_VERSION_1 1 #define BTRFS_SEND_STREAM_VERSION_2 2 +#define BTRFS_SEND_STREAM_VERSION_LATEST BTRFS_SEND_STREAM_VERSION_2 #define BTRFS_SEND_BUF_SIZE (1024 * 64) #define BTRFS_SEND_READ_SIZE (1024 * 48) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 06ad529..9869d94 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -31,6 +31,7 @@ #include transaction.h #include sysfs.h #include volumes.h +#include send.h static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); @@ -700,6 +701,26 @@ static int btrfs_init_debugfs(void) return 0; } +static ssize_t send_stream_version_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, %d\n, + BTRFS_SEND_STREAM_VERSION_LATEST); +} + +BTRFS_ATTR(stream_version, 0444, send_stream_version_show); + +static struct attribute *btrfs_send_attrs[] = { + BTRFS_ATTR_PTR(stream_version), + NULL +}; + +static const struct attribute_group btrfs_send_attr_group = { + .name = send, + .attrs = btrfs_send_attrs, +}; + int btrfs_init_sysfs(void) { int ret; @@ -716,8 +737,13 @@ int btrfs_init_sysfs(void) ret = sysfs_create_group(btrfs_kset-kobj, btrfs_feature_attr_group); if (ret) goto out2; + ret = sysfs_create_group(btrfs_kset-kobj, btrfs_send_attr_group); + if 
(ret) + goto out3; return 0; +out3: + sysfs_remove_group(btrfs_kset-kobj, btrfs_feature_attr_group); out2: debugfs_remove_recursive(btrfs_debugfs_root_dentry); out1: @@ -729,6 +755,7 @@ out1: void btrfs_exit_sysfs(void) { sysfs_remove_group(btrfs_kset-kobj, btrfs_feature_attr_group); + sysfs_remove_group(btrfs_kset-kobj, btrfs_send_attr_group); kset_unregister(btrfs_kset); debugfs_remove_recursive(btrfs_debugfs_root_dentry); } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: fix crash when starting transaction
Often when starting a transaction we commit the currently running transaction, which can end up writing block group caches when the current process has its journal_info set to NULL (and not to a transaction). This makes our assertion at btrfs_check_data_free_space() (current_journal != NULL) fail, resulting in a crash/hang. Therefore fix it by setting journal_info. Two different traces of this issue follow below. 1) [51502.241936] BTRFS: assertion failed: current-journal_info, file: fs/btrfs/extent-tree.c, line: 3670 [51502.242213] [ cut here ] [51502.242493] kernel BUG at fs/btrfs/ctree.h:3964! [51502.242669] invalid opcode: [#1] SMP DEBUG_PAGEALLOC (...) [51502.244010] Call Trace: [51502.244010] [a02bc025] btrfs_check_data_free_space+0x395/0x3a0 [btrfs] [51502.244010] [a02c3bdc] btrfs_write_dirty_block_groups+0x4ac/0x640 [btrfs] [51502.244010] [a0357a6a] commit_cowonly_roots+0x164/0x226 [btrfs] [51502.244010] [a02d53cd] btrfs_commit_transaction+0x4ed/0xab0 [btrfs] [51502.244010] [8168ec7b] ? _raw_spin_unlock+0x2b/0x40 [51502.244010] [a02d6259] start_transaction+0x459/0x620 [btrfs] [51502.244010] [a02d67ab] btrfs_start_transaction+0x1b/0x20 [btrfs] [51502.244010] [a02d73e1] __unlink_start_trans+0x31/0xe0 [btrfs] [51502.244010] [a02dea67] btrfs_unlink+0x37/0xc0 [btrfs] [51502.244010] [811bb054] ? do_unlinkat+0x114/0x2a0 [51502.244010] [811baebc] vfs_unlink+0xcc/0x150 [51502.244010] [811bb1a0] do_unlinkat+0x260/0x2a0 [51502.244010] [811a9ef4] ? filp_close+0x64/0x90 [51502.244010] [810aaea6] ? trace_hardirqs_on_caller+0x16/0x1e0 [51502.244010] [81349cab] ? 
trace_hardirqs_on_thunk+0x3a/0x3f [51502.244010] [811be9eb] SyS_unlinkat+0x1b/0x40 [51502.244010] [81698452] system_call_fastpath+0x16/0x1b [51502.244010] Code: 0b 55 48 89 e5 0f 0b 55 48 89 e5 0f 0b 55 89 f1 48 c7 c2 71 13 36 a0 48 89 fe 31 c0 48 c7 c7 b8 43 36 a0 48 89 e5 e8 5d b0 32 e1 0f 0b 0f 1f 44 00 00 55 b9 11 00 00 00 48 89 e5 41 55 49 89 f5 [51502.244010] RIP [a03575da] assfail.constprop.88+0x1e/0x20 [btrfs] 2) [25405.097230] BTRFS: assertion failed: current-journal_info, file: fs/btrfs/extent-tree.c, line: 3670 [25405.097488] [ cut here ] [25405.097767] kernel BUG at fs/btrfs/ctree.h:3964! [25405.097940] invalid opcode: [#1] SMP DEBUG_PAGEALLOC (...) [25405.18] Call Trace: [25405.18] [a02bc025] btrfs_check_data_free_space+0x395/0x3a0 [btrfs] [25405.18] [a02c3bdc] btrfs_write_dirty_block_groups+0x4ac/0x640 [btrfs] [25405.18] [a035755a] commit_cowonly_roots+0x164/0x226 [btrfs] [25405.18] [a02d53cd] btrfs_commit_transaction+0x4ed/0xab0 [btrfs] [25405.18] [8109c170] ? bit_waitqueue+0xc0/0xc0 [25405.18] [a02d6259] start_transaction+0x459/0x620 [btrfs] [25405.18] [a02d67ab] btrfs_start_transaction+0x1b/0x20 [btrfs] [25405.18] [a02e3407] btrfs_create+0x47/0x210 [btrfs] [25405.18] [a02d74cc] ? btrfs_permission+0x3c/0x80 [btrfs] [25405.18] [811bc63b] vfs_create+0x9b/0x130 [25405.18] [811bcf19] do_last+0x849/0xe20 [25405.18] [811b9409] ? link_path_walk+0x79/0x820 [25405.18] [811bd5b5] path_openat+0xc5/0x690 [25405.18] [810ab07d] ? trace_hardirqs_on+0xd/0x10 [25405.18] [811cdcd2] ? __alloc_fd+0x32/0x1d0 [25405.18] [811be2a3] do_filp_open+0x43/0xa0 [25405.18] [811cddf1] ? __alloc_fd+0x151/0x1d0 [25405.18] [811abcfc] do_sys_open+0x13c/0x230 [25405.18] [810aaea6] ? 
trace_hardirqs_on_caller+0x16/0x1e0 [25405.18] [811abe12] SyS_open+0x22/0x30 [25405.18] [81698452] system_call_fastpath+0x16/0x1b [25405.18] Code: 0b 55 48 89 e5 0f 0b 55 48 89 e5 0f 0b 55 89 f1 48 c7 c2 51 13 36 a0 48 89 fe 31 c0 48 c7 c7 d0 43 36 a0 48 89 e5 e8 6d b5 32 e1 0f 0b 0f 1f 44 00 00 55 b9 11 00 00 00 48 89 e5 41 55 49 89 f5 [25405.18] RIP [a03570ca] assfail.constprop.88+0x1e/0x20 [btrfs] Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/transaction.c | 4 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index ac984a3..fe4abe9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -491,7 +491,11 @@ again: smp_mb(); if (cur_trans-state = TRANS_STATE_BLOCKED may_wait_transaction(root, type)) { + void *journal_info = current-journal_info; + if (!journal_info) + current-journal_info = h
[PATCH] Btrfs: remove unused wait queue in struct extent_buffer
The lock_wq wait queue is not used anywhere, therefore just remove it. On an x86_64 system, this reduced sizeof(struct extent_buffer) from 320 bytes down to 296 bytes, which means a 4KB page can now be used for 13 extent buffers instead of 12. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/extent_io.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 8b63f2d..dbbea4f 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -158,7 +158,6 @@ struct extent_buffer { * to unlock */ wait_queue_head_t read_lock_wq; - wait_queue_head_t lock_wq; struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; #ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: fix qgroups sanity test crash or hang
Often when running the qgroups sanity test, a crash or a hang happened. This is because the extent buffer the test uses for the root node doesn't have an header level explicitly set, making it have a random level value. This is a problem when it's not zero for the btrfs_search_slot() calls the test ends up doing, resulting in crashes or hangs such as the following: [ 6454.127192] Btrfs loaded, debug=on, assert=on, integrity-checker=on (...) [ 6454.127760] BTRFS: selftest: Running qgroup tests [ 6454.127964] BTRFS: selftest: Running test_test_no_shared_qgroup [ 6454.127966] BTRFS: selftest: Qgroup basic add [ 6480.152005] BUG: soft lockup - CPU#0 stuck for 23s! [modprobe:5383] [ 6480.152005] Modules linked in: btrfs(+) xor raid6_pq binfmt_misc nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache sunrpc i2c_piix4 i2c_core pcspkr evbug psmouse serio_raw e1000 [last unloaded: btrfs] [ 6480.152005] irq event stamp: 188448 [ 6480.152005] hardirqs last enabled at (188447): [8168ef5c] restore_args+0x0/0x30 [ 6480.152005] hardirqs last disabled at (188448): [81698e6a] apic_timer_interrupt+0x6a/0x80 [ 6480.152005] softirqs last enabled at (188446): [810516cf] __do_softirq+0x1cf/0x450 [ 6480.152005] softirqs last disabled at (188441): [81051c25] irq_exit+0xb5/0xc0 [ 6480.152005] CPU: 0 PID: 5383 Comm: modprobe Not tainted 3.15.0-rc8-fdm-btrfs-next-33+ #4 [ 6480.152005] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 6480.152005] task: 8802146125a0 ti: 8800d0d0 task.ti: 8800d0d0 [ 6480.152005] RIP: 0010:[81349a63] [81349a63] __write_lock_failed+0x13/0x20 [ 6480.152005] RSP: 0018:8800d0d038e8 EFLAGS: 0287 [ 6480.152005] RAX: RBX: 8168ef5c RCX: 05deb8525852 [ 6480.152005] RDX: RSI: 1d45 RDI: 8802105000b8 [ 6480.152005] RBP: 8800d0d038e8 R08: fe12710f63db R09: a03196fb [ 6480.152005] R10: 8802146125a0 R11: 880214612e28 R12: 8800d0d03858 [ 6480.152005] R13: R14: 8800d0d0 R15: 8802146125a0 [ 6480.152005] FS: 7f14ff804700() GS:880215e0() knlGS: [ 6480.152005] CS: 0010 DS: ES: 
CR0: 8005003b [ 6480.152005] CR2: 7fff4df0dac8 CR3: d1796000 CR4: 06f0 [ 6480.152005] Stack: [ 6480.152005] 8800d0d03908 810ae967 0001 8802105000b8 [ 6480.152005] 8800d0d03938 8168e57e a0319c16 0007 [ 6480.152005] 88021050 880210500100 8800d0d039b8 a0319c16 [ 6480.152005] Call Trace: [ 6480.152005] [810ae967] do_raw_write_lock+0x47/0xa0 [ 6480.152005] [8168e57e] _raw_write_lock+0x5e/0x80 [ 6480.152005] [a0319c16] ? btrfs_tree_lock+0x116/0x270 [btrfs] [ 6480.152005] [a0319c16] btrfs_tree_lock+0x116/0x270 [btrfs] [ 6480.152005] [a02b2acb] btrfs_lock_root_node+0x3b/0x50 [btrfs] [ 6480.152005] [a02b81a6] btrfs_search_slot+0x916/0xa20 [btrfs] [ 6480.152005] [811a727f] ? create_object+0x23f/0x300 [ 6480.152005] [a02b9958] btrfs_insert_empty_items+0x78/0xd0 [btrfs] [ 6480.152005] [a036041a] insert_normal_tree_ref.constprop.4+0xa2/0x19a [btrfs] [ 6480.152005] [a03605c3] test_no_shared_qgroup+0xb1/0x1ca [btrfs] [ 6480.152005] [8108cad6] ? local_clock+0x16/0x30 [ 6480.152005] [a035ef8e] btrfs_test_qgroups+0x1ae/0x1d7 [btrfs] [ 6480.152005] [a03a69d2] ? ftrace_define_fields_btrfs_space_reservation+0xfd/0xfd [btrfs] [ 6480.152005] [a03a6a86] init_btrfs_fs+0xb4/0x153 [btrfs] [ 6480.152005] [81000352] do_one_initcall+0x102/0x150 [ 6480.152005] [8103d223] ? set_memory_nx+0x43/0x50 [ 6480.152005] [81682668] ? set_section_ro_nx+0x6d/0x74 [ 6480.152005] [810d91cc] load_module+0x1cdc/0x2630 (...) Therefore initialize the extent buffer as an empty leaf (level 0). 
The issue is easy to reproduce when btrfs is built as a module via: $ for ((i = 1; i <= 100; i++)); do rmmod btrfs; modprobe btrfs; done Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/tests/qgroup-tests.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index fa691b7..0e69c8e 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -410,6 +410,8 @@ int btrfs_test_qgroups(void) * *cough*backref walking code*cough* */ root-node = alloc_test_extent_buffer(root-fs_info, 4096, 4096); + btrfs_set_header_level(root-node, 0); + btrfs_set_header_nritems(root-node, 0); if (!root-node) { test_msg(Couldn't allocate dummy buffer\n); ret = -ENOMEM; -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More
[PATCH v2] Btrfs: fix qgroups sanity test crash or hang
Often when running the qgroups sanity test, a crash or a hang happened. This is because the extent buffer the test uses for the root node doesn't have an header level explicitly set, making it have a random level value. This is a problem when it's not zero for the btrfs_search_slot() calls the test ends up doing, resulting in crashes or hangs such as the following: [ 6454.127192] Btrfs loaded, debug=on, assert=on, integrity-checker=on (...) [ 6454.127760] BTRFS: selftest: Running qgroup tests [ 6454.127964] BTRFS: selftest: Running test_test_no_shared_qgroup [ 6454.127966] BTRFS: selftest: Qgroup basic add [ 6480.152005] BUG: soft lockup - CPU#0 stuck for 23s! [modprobe:5383] [ 6480.152005] Modules linked in: btrfs(+) xor raid6_pq binfmt_misc nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache sunrpc i2c_piix4 i2c_core pcspkr evbug psmouse serio_raw e1000 [last unloaded: btrfs] [ 6480.152005] irq event stamp: 188448 [ 6480.152005] hardirqs last enabled at (188447): [8168ef5c] restore_args+0x0/0x30 [ 6480.152005] hardirqs last disabled at (188448): [81698e6a] apic_timer_interrupt+0x6a/0x80 [ 6480.152005] softirqs last enabled at (188446): [810516cf] __do_softirq+0x1cf/0x450 [ 6480.152005] softirqs last disabled at (188441): [81051c25] irq_exit+0xb5/0xc0 [ 6480.152005] CPU: 0 PID: 5383 Comm: modprobe Not tainted 3.15.0-rc8-fdm-btrfs-next-33+ #4 [ 6480.152005] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 6480.152005] task: 8802146125a0 ti: 8800d0d0 task.ti: 8800d0d0 [ 6480.152005] RIP: 0010:[81349a63] [81349a63] __write_lock_failed+0x13/0x20 [ 6480.152005] RSP: 0018:8800d0d038e8 EFLAGS: 0287 [ 6480.152005] RAX: RBX: 8168ef5c RCX: 05deb8525852 [ 6480.152005] RDX: RSI: 1d45 RDI: 8802105000b8 [ 6480.152005] RBP: 8800d0d038e8 R08: fe12710f63db R09: a03196fb [ 6480.152005] R10: 8802146125a0 R11: 880214612e28 R12: 8800d0d03858 [ 6480.152005] R13: R14: 8800d0d0 R15: 8802146125a0 [ 6480.152005] FS: 7f14ff804700() GS:880215e0() knlGS: [ 6480.152005] CS: 0010 DS: ES: 
CR0: 8005003b [ 6480.152005] CR2: 7fff4df0dac8 CR3: d1796000 CR4: 06f0 [ 6480.152005] Stack: [ 6480.152005] 8800d0d03908 810ae967 0001 8802105000b8 [ 6480.152005] 8800d0d03938 8168e57e a0319c16 0007 [ 6480.152005] 88021050 880210500100 8800d0d039b8 a0319c16 [ 6480.152005] Call Trace: [ 6480.152005] [810ae967] do_raw_write_lock+0x47/0xa0 [ 6480.152005] [8168e57e] _raw_write_lock+0x5e/0x80 [ 6480.152005] [a0319c16] ? btrfs_tree_lock+0x116/0x270 [btrfs] [ 6480.152005] [a0319c16] btrfs_tree_lock+0x116/0x270 [btrfs] [ 6480.152005] [a02b2acb] btrfs_lock_root_node+0x3b/0x50 [btrfs] [ 6480.152005] [a02b81a6] btrfs_search_slot+0x916/0xa20 [btrfs] [ 6480.152005] [811a727f] ? create_object+0x23f/0x300 [ 6480.152005] [a02b9958] btrfs_insert_empty_items+0x78/0xd0 [btrfs] [ 6480.152005] [a036041a] insert_normal_tree_ref.constprop.4+0xa2/0x19a [btrfs] [ 6480.152005] [a03605c3] test_no_shared_qgroup+0xb1/0x1ca [btrfs] [ 6480.152005] [8108cad6] ? local_clock+0x16/0x30 [ 6480.152005] [a035ef8e] btrfs_test_qgroups+0x1ae/0x1d7 [btrfs] [ 6480.152005] [a03a69d2] ? ftrace_define_fields_btrfs_space_reservation+0xfd/0xfd [btrfs] [ 6480.152005] [a03a6a86] init_btrfs_fs+0xb4/0x153 [btrfs] [ 6480.152005] [81000352] do_one_initcall+0x102/0x150 [ 6480.152005] [8103d223] ? set_memory_nx+0x43/0x50 [ 6480.152005] [81682668] ? set_section_ro_nx+0x6d/0x74 [ 6480.152005] [810d91cc] load_module+0x1cdc/0x2630 (...) Therefore initialize the extent buffer as an empty leaf (level 0). Issue easy to reproduce when btrfs is built as a module via: $ for ((i = 1; i = 100; i++)); do rmmod btrfs; modprobe btrfs; done Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Fixed silly mistake. Set root-node's header level and nritems after checking if root-node is not null. 
fs/btrfs/tests/qgroup-tests.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index fa691b7..ec3dcb2 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -415,6 +415,8 @@ int btrfs_test_qgroups(void) ret = -ENOMEM; goto out; } + btrfs_set_header_level(root-node, 0); + btrfs_set_header_nritems(root-node, 0); root-alloc_bytenr += 8192; tmp_root = btrfs_alloc_dummy_root(); -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord
[PATCH v6] xfstests: add test for btrfs cloning with file holes
Regression test for the btrfs ioctl clone operation when the source range contains hole(s) and the FS has the NO_HOLES feature enabled (file holes don't need file extent items in the btree to represent them). This issue is fixed by the following linux kernel btrfs patch: Btrfs: fix clone to deal with holes when NO_HOLES feature is enabled Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Increased test coverage by testing the cases where a hole overlaps the start and end of the cloning range. V3: Test the case where the cloning range includes an hole at the end of the source file and might increase the size of the target file. V4: Added test for the case where the clone range covers only a hole at the beginning of the source file. Made the test be skipped if the available version of mkfs.btrfs doesn't support the no-holes feature. And when testing the case where the no-holes feature isn't enabled, explicitly ask mkfs.btrfs to disable no-holes (future versions of mkfs.btrfs might enable this feature by default). V5: Detect if kernel supports NO_HOLES feature too. Added some messages (echoes) before each od call to make it easier to match output with each specific test. V6: Pass -s to xfs_io when creating the test files. common/rc | 25 tests/btrfs/055 | 165 + tests/btrfs/055.out | 347 tests/btrfs/group | 1 + 4 files changed, 538 insertions(+) create mode 100755 tests/btrfs/055 create mode 100644 tests/btrfs/055.out diff --git a/common/rc b/common/rc index f27ee53..e2136d0 100644 --- a/common/rc +++ b/common/rc @@ -2177,6 +2177,31 @@ _require_btrfs_send_stream_version() fi } +_require_btrfs_mkfs_feature() +{ + if [ -z $1 ]; then + echo Missing feature name argument for _require_btrfs_mkfs_feature + exit 1 + fi + feat=$1 + $MKFS_BTRFS_PROG -O list-all 21 | \ + grep '^[ \t]*'$feat'\b' /dev/null 21 + [ $? 
-eq 0 ] || \ + _notrun Feature $feat not supported in the available version of mkfs.btrfs +} + +_require_btrfs_fs_feature() +{ + if [ -z $1 ]; then + echo Missing feature name argument for _require_btrfs_fs_feature + exit 1 + fi + feat=$1 + modprobe btrfs /dev/null 21 + [ -e /sys/fs/btrfs/features/$feat ] || \ + _notrun Feature $feat not supported by the available btrfs version +} + init_rc() { if [ $iam == new ] diff --git a/tests/btrfs/055 b/tests/btrfs/055 new file mode 100755 index 000..10c6040 --- /dev/null +++ b/tests/btrfs/055 @@ -0,0 +1,165 @@ +#! /bin/bash +# FS QA Test No. btrfs/055 +# +# Regression test for the btrfs ioctl clone operation when the source range +# contains hole(s) and the FS has the NO_HOLES feature enabled (file holes +# don't need file extent items in the btree to represent them). +# +# This issue is fixed by the following linux kernel btrfs patch: +# +#Btrfs: fix clone to deal with holes when NO_HOLES feature is enabled +# +#--- +# Copyright (c) 2014 Filipe Manana. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq + +tmp=/tmp/$$ +status=1 # failure is the default! +trap _cleanup; exit \$status 0 1 2 3 15 + +_cleanup() +{ +rm -fr $tmp +} + +# get standard environment, filters and checks +. ./common/rc +. 
./common/filter + +# real QA test starts here +_supported_fs btrfs +_supported_os Linux +_require_scratch +_require_btrfs_cloner +_require_btrfs_fs_feature no_holes +_require_btrfs_mkfs_feature no-holes +_need_to_be_root + +rm -f $seqres.full + +test_btrfs_clone_with_holes() +{ + _scratch_mkfs $1 /dev/null 21 + _scratch_mount + + # Create a file with 4 extents and 1 hole, all with a size of 8Kb each. + # The hole is in the range [16384, 24576[. + $XFS_IO_PROG -s -f -c pwrite -S 0x01 -b 8192 0 8192 \ + -c pwrite -S 0x02 -b 8192 8192 8192 \ + -c pwrite -S 0x04 -b 8192 24576 8192 \ + -c pwrite
[PATCH v3] xfstests: add test for btrfs clone + fsync durability
Regression test for btrfs ioctl clone operation + fsync + log recovery. The issue was that doing an fsync after cloning into a file didn't give any persistence guarantees as it should. What happened was that the in-memory metadata (extent maps) weren't updated, which made the fsync code not able to detect that file data has been changed and must be persisted to the log. This issue is fixed by the following linux kernel btrfs patch: Btrfs: make fsync work after cloning into a file Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Test small files too, consisting of a single inline extent, as it triggers different code paths. V3: Pass -s to xfs_io when creating test files. tests/btrfs/056 | 145 tests/btrfs/056.out | 129 ++ tests/btrfs/group | 1 + 3 files changed, 275 insertions(+) create mode 100755 tests/btrfs/056 create mode 100644 tests/btrfs/056.out diff --git a/tests/btrfs/056 b/tests/btrfs/056 new file mode 100755 index 000..9ecfeb8 --- /dev/null +++ b/tests/btrfs/056 @@ -0,0 +1,145 @@ +#! /bin/bash +# FS QA Test No. btrfs/056 +# +# Regression test for btrfs ioctl clone operation + fsync + log recovery. +# The issue was that doing an fsync after cloning into a file didn't give any +# persistence guarantees as it should. What happened was that the in-memory +# metadata (extent maps) weren't updated, which made the fsync code not able +# to detect that file data has been changed. +# +# This issue is fixed by the following linux kernel btrfs patch: +# +#Btrfs: make fsync work after cloning into a file +# +#--- +# Copyright (c) 2014 Filipe Manana. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq + +tmp=/tmp/$$ +status=1 # failure is the default! +trap _cleanup; exit \$status 0 1 2 3 15 + +_cleanup() +{ + _cleanup_flakey + rm -fr $tmp +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter +. ./common/dmflakey + +# real QA test starts here +_supported_fs btrfs +_supported_os Linux +_require_scratch +_require_btrfs_cloner +_require_btrfs_fs_feature no_holes +_require_btrfs_mkfs_feature no-holes +_require_dm_flakey +_need_to_be_root + +rm -f $seqres.full + +test_btrfs_clone_fsync_log_recover() +{ + _scratch_mkfs $1 /dev/null 21 + _init_flakey + SAVE_MOUNT_OPTIONS=$MOUNT_OPTIONS + MOUNT_OPTIONS=$MOUNT_OPTIONS $2 + _mount_flakey + + # Create a file with 4 extents and 1 hole, all with a size of 8Kb each. + # The hole is in the range [16384, 24576[. + $XFS_IO_PROG -s -f -c pwrite -S 0x01 -b 8192 0 8192 \ + -c pwrite -S 0x02 -b 8192 8192 8192 \ + -c pwrite -S 0x04 -b 8192 24576 8192 \ + -c pwrite -S 0x05 -b 8192 32768 8192 \ + $SCRATCH_MNT/foo | _filter_xfs_io + + # Clone destination file, 1 extent of 96kb. + $XFS_IO_PROG -f -c pwrite -S 0xff -b 98304 0 98304 -c fsync \ + $SCRATCH_MNT/bar | _filter_xfs_io + + # Clone second half of the 2nd extent, the 8kb hole, the 3rd extent + # and the first half of the 4th extent into file bar. 
+ $CLONER_PROG -s 12288 -d 0 -l 24576 $SCRATCH_MNT/foo $SCRATCH_MNT/bar + $XFS_IO_PROG -c fsync $SCRATCH_MNT/bar + + # Test small files too consisting of 1 inline extent + $XFS_IO_PROG -f -c pwrite -S 0x00 -b 3500 0 3500 -c fsync \ + $SCRATCH_MNT/foo2 | _filter_xfs_io + + $XFS_IO_PROG -f -c pwrite -S 0xcc -b 1000 0 1000 -c fsync \ + $SCRATCH_MNT/bar2 | _filter_xfs_io + + # Clone the entire foo2 file into bar2, overwriting all data in bar2 + # and increasing its size. + $CLONER_PROG -s 0 -d 0 -l 3500 $SCRATCH_MNT/foo2 $SCRATCH_MNT/bar2 + $XFS_IO_PROG -c fsync $SCRATCH_MNT/bar2 + + _load_flakey_table $FLAKEY_DROP_WRITES + _unmount_flakey + + # Verify that there are no consistency errors. + _check_scratch_fs $FLAKEY_DEV + + _load_flakey_table $FLAKEY_ALLOW_WRITES + _mount_flakey
[PATCH] Btrfs: fix RCU correctness warning when running sanity tests
When CONFIG_PROVE_RCU=y and CONFIG_PROVE_RCU_REPEATEDLY=y, the following was dumped in dmesg: [ 3197.218064] === [ 3197.218064] [ INFO: suspicious RCU usage. ] [ 3197.218066] 3.15.0-rc8-fdm-btrfs-next-33+ #4 Not tainted [ 3197.218067] --- [ 3197.218068] include/linux/radix-tree.h:196 suspicious rcu_dereference_check() usage! [ 3197.218068] [ 3197.218068] other info that might help us debug this: [ 3197.218068] [ 3197.218070] [ 3197.218070] rcu_scheduler_active = 1, debug_locks = 1 [ 3197.218071] 1 lock held by modprobe/12024: [ 3197.218072] #0: ((fs_info-buffer_lock)-rlock){+.+...}, at: [a025c5fa] btrfs_free_dummy_root+0x5a/0x1d0 [btrfs] [ 3197.218093] [ 3197.218093] stack backtrace: [ 3197.218095] CPU: 3 PID: 12024 Comm: modprobe Not tainted 3.15.0-rc8-fdm-btrfs-next-33+ #4 [ 3197.218096] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 3197.218097] 0001 8800af18fc18 81685c5a feb0 [ 3197.218099] 8800cf6ccb40 8800af18fc48 810a6316 8801d955f640 [ 3197.218101] 8800d719e328 8800d719e370 8800d719c000 8800af18fcb8 [ 3197.218102] Call Trace: [ 3197.218105] [81685c5a] dump_stack+0x4e/0x68 [ 3197.218108] [810a6316] lockdep_rcu_suspicious+0xe6/0x130 [ 3197.218119] [a025c728] btrfs_free_dummy_root+0x188/0x1d0 [btrfs] [ 3197.218129] [a025f56a] btrfs_test_qgroups+0xea/0x1bb [btrfs] [ 3197.218137] [a03a19d2] ? ftrace_define_fields_btrfs_space_reservation+0xfd/0xfd [btrfs] [ 3197.218144] [a03a19d2] ? ftrace_define_fields_btrfs_space_reservation+0xfd/0xfd [btrfs] [ 3197.218151] [a03a1ab7] init_btrfs_fs+0xe5/0x184 [btrfs] [ 3197.218154] [81000352] do_one_initcall+0x102/0x150 [ 3197.218157] [8103d223] ? set_memory_nx+0x43/0x50 [ 3197.218160] [81682668] ? set_section_ro_nx+0x6d/0x74 [ 3197.218162] [810d91cc] load_module+0x1cdc/0x2630 [ 3197.218164] [810d4e90] ? 
show_initstate+0x60/0x60 [ 3197.218166] [810d9c9e] SyS_finit_module+0x8e/0x90 [ 3197.218168] [81698212] system_call_fastpath+0x16/0x1b Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/tests/btrfs-tests.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index a5dcacb..bbbfec9 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -130,8 +130,8 @@ static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) struct radix_tree_iter iter; void **slot; - spin_lock(fs_info-buffer_lock); restart: + rcu_read_lock(); radix_tree_for_each_slot(slot, fs_info-buffer_radix, iter, 0) { struct extent_buffer *eb; @@ -144,11 +144,11 @@ restart: goto restart; continue; } - spin_unlock(fs_info-buffer_lock); + rcu_read_unlock(); free_extent_buffer_stale(eb); - spin_lock(fs_info-buffer_lock); + goto restart; } - spin_unlock(fs_info-buffer_lock); + rcu_read_unlock(); btrfs_free_qgroup_config(fs_info); btrfs_free_fs_roots(fs_info); -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2] Btrfs: fix RCU correctness warning when running sanity tests
When CONFIG_PROVE_RCU=y and CONFIG_PROVE_RCU_REPEATEDLY=y, the following was dumped in dmesg: [ 3197.218064] === [ 3197.218064] [ INFO: suspicious RCU usage. ] [ 3197.218066] 3.15.0-rc8-fdm-btrfs-next-33+ #4 Not tainted [ 3197.218067] --- [ 3197.218068] include/linux/radix-tree.h:196 suspicious rcu_dereference_check() usage! [ 3197.218068] [ 3197.218068] other info that might help us debug this: [ 3197.218068] [ 3197.218070] [ 3197.218070] rcu_scheduler_active = 1, debug_locks = 1 [ 3197.218071] 1 lock held by modprobe/12024: [ 3197.218072] #0: ((fs_info-buffer_lock)-rlock){+.+...}, at: [a025c5fa] btrfs_free_dummy_root+0x5a/0x1d0 [btrfs] [ 3197.218093] [ 3197.218093] stack backtrace: [ 3197.218095] CPU: 3 PID: 12024 Comm: modprobe Not tainted 3.15.0-rc8-fdm-btrfs-next-33+ #4 [ 3197.218096] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 3197.218097] 0001 8800af18fc18 81685c5a feb0 [ 3197.218099] 8800cf6ccb40 8800af18fc48 810a6316 8801d955f640 [ 3197.218101] 8800d719e328 8800d719e370 8800d719c000 8800af18fcb8 [ 3197.218102] Call Trace: [ 3197.218105] [81685c5a] dump_stack+0x4e/0x68 [ 3197.218108] [810a6316] lockdep_rcu_suspicious+0xe6/0x130 [ 3197.218119] [a025c728] btrfs_free_dummy_root+0x188/0x1d0 [btrfs] [ 3197.218129] [a025f56a] btrfs_test_qgroups+0xea/0x1bb [btrfs] [ 3197.218137] [a03a19d2] ? ftrace_define_fields_btrfs_space_reservation+0xfd/0xfd [btrfs] [ 3197.218144] [a03a19d2] ? ftrace_define_fields_btrfs_space_reservation+0xfd/0xfd [btrfs] [ 3197.218151] [a03a1ab7] init_btrfs_fs+0xe5/0x184 [btrfs] [ 3197.218154] [81000352] do_one_initcall+0x102/0x150 [ 3197.218157] [8103d223] ? set_memory_nx+0x43/0x50 [ 3197.218160] [81682668] ? set_section_ro_nx+0x6d/0x74 [ 3197.218162] [810d91cc] load_module+0x1cdc/0x2630 [ 3197.218164] [810d4e90] ? 
show_initstate+0x60/0x60 [ 3197.218166] [810d9c9e] SyS_finit_module+0x8e/0x90 [ 3197.218168] [81698212] system_call_fastpath+0x16/0x1b Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Added missing rcu read unlock if a retry is needed. fs/btrfs/tests/btrfs-tests.c | 12 +++- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index a5dcacb..bdb1f05 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -130,8 +130,8 @@ static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) struct radix_tree_iter iter; void **slot; - spin_lock(fs_info-buffer_lock); restart: + rcu_read_lock(); radix_tree_for_each_slot(slot, fs_info-buffer_radix, iter, 0) { struct extent_buffer *eb; @@ -140,15 +140,17 @@ restart: continue; /* Shouldn't happen but that kind of thinking creates CVE's */ if (radix_tree_exception(eb)) { - if (radix_tree_deref_retry(eb)) + if (radix_tree_deref_retry(eb)) { + rcu_read_unlock(); goto restart; + } continue; } - spin_unlock(fs_info-buffer_lock); + rcu_read_unlock(); free_extent_buffer_stale(eb); - spin_lock(fs_info-buffer_lock); + goto restart; } - spin_unlock(fs_info-buffer_lock); + rcu_read_unlock(); btrfs_free_qgroup_config(fs_info); btrfs_free_fs_roots(fs_info); -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: ensure btrfs_prev_leaf doesn't miss 1 item
We might have had an item with the previous key in the tree right before we released our path. And after we released our path, that item might have been pushed to the first slot (0) of the leaf we were holding due to a tree balance. Alternatively, an item with the previous key can exist as the only element of a leaf (big fat item). Therefore account for these 2 cases, so that our callers (like btrfs_previous_item) don't miss an existing item with a key matching the previous key we computed above. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/ctree.c | 12 +++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d99d965..4eada52 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5097,7 +5097,17 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) return ret; btrfs_item_key(path->nodes[0], &found_key, 0); ret = comp_keys(&found_key, &key); - if (ret < 0) + /* +* We might have had an item with the previous key in the tree right +* before we released our path. And after we released our path, that +* item might have been pushed to the first slot (0) of the leaf we +* were holding due to a tree balance. Alternatively, an item with the +* previous key can exist as the only element of a leaf (big fat item). +* Therefore account for these 2 cases, so that our callers (like +* btrfs_previous_item) don't miss an existing item with a key matching +* the previous key we computed above. +*/ + if (ret <= 0) return 0; return 1; } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v4] Btrfs: make fsync work after cloning into a file
When cloning into a file, we were correctly replacing the extent items in the target range and removing the extent maps. However we weren't replacing the extent maps with new ones that point to the new extents - as a consequence, an incremental fsync (when the inode doesn't have the full sync flag) was a NOOP, since it relies on the existence of extent maps in the modified list of the inode's extent map tree, which was empty. Therefore add new extent maps to reflect the target clone range. A test case for xfstests follows. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Factored out needed code from inode.c:btrfs_get_extent() into a separate function so that it can be reused by the cloning code, avoiding some duplicated and non-trivial logic when populating an extent map from a file extent item. V3: Removed unused function parameter, leftover from V1. V4: Simplified some code in mapping from file extent item to extent map. fs/btrfs/ctree.h | 5 fs/btrfs/file-item.c | 66 fs/btrfs/inode.c | 41 +++- fs/btrfs/ioctl.c | 65 +++ 4 files changed, 139 insertions(+), 38 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index af523d6..a668fd9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3749,6 +3749,11 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 file_start, int contig); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); +void btrfs_extent_item_to_extent_map(struct inode *inode, +const struct btrfs_path *path, +struct btrfs_file_extent_item *fi, +struct extent_map *em); + /* inode.c */ struct btrfs_delalloc_work { struct inode *inode; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 609d56b..9732b33 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -885,3 +885,69 @@ out: fail_unlock: goto out; } + +void btrfs_extent_item_to_extent_map(struct inode *inode, +const struct btrfs_path *path, 
+struct btrfs_file_extent_item *fi, +struct extent_map *em) +{ + struct btrfs_root *root = BTRFS_I(inode)-root; + struct extent_buffer *leaf = path-nodes[0]; + const int slot = path-slots[0]; + struct btrfs_key key; + u64 extent_start, extent_end; + u64 bytenr; + u8 type = btrfs_file_extent_type(leaf, fi); + + em-bdev = root-fs_info-fs_devices-latest_bdev; + btrfs_item_key_to_cpu(leaf, key, slot); + extent_start = key.offset; + + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + extent_end = extent_start + + btrfs_file_extent_num_bytes(leaf, fi); + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + size_t size; + size = btrfs_file_extent_inline_len(leaf, slot, fi); + extent_end = ALIGN(extent_start + size, root-sectorsize); + } + + em-ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + em-start = extent_start; + em-orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); + em-compress_type = btrfs_file_extent_compression(leaf, fi); + if (em-compress_type != BTRFS_COMPRESS_NONE) + set_bit(EXTENT_FLAG_COMPRESSED, em-flags); + + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + em-len = extent_end - extent_start; + em-orig_start = extent_start - + btrfs_file_extent_offset(leaf, fi); + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + if (bytenr == 0) { + em-block_start = EXTENT_MAP_HOLE; + return; + } + if (em-compress_type != BTRFS_COMPRESS_NONE) { + em-block_len = em-orig_block_len; + } else { + bytenr += btrfs_file_extent_offset(leaf, fi); + em-block_len = em-len; + if (type == BTRFS_FILE_EXTENT_PREALLOC) + set_bit(EXTENT_FLAG_PREALLOC, em-flags); + } + em-block_start = bytenr; + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + em-block_start = EXTENT_MAP_INLINE; + em-orig_start = em-start; + em-len = extent_end - extent_start; + em-block_len = em-orig_block_len; + } else { + btrfs_err(root-fs_info
[PATCH v5] Btrfs: make fsync work after cloning into a file
When cloning into a file, we were correctly replacing the extent items in the target range and removing the extent maps. However we weren't replacing the extent maps with new ones that point to the new extents - as a consequence, an incremental fsync (when the inode doesn't have the full sync flag) was a NOOP, since it relies on the existence of extent maps in the modified list of the inode's extent map tree, which was empty. Therefore add new extent maps to reflect the target clone range. A test case for xfstests follows. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Factored out needed code from inode.c:btrfs_get_extent() into a separate function so that it can be reused by the cloning code, avoiding some duplicated and non-trivial logic when populating an extent map from a file extent item. V3: Removed unused function parameter, leftover from V1. V4: Simplified some code in mapping from file extent item to extent map. V5: Corrected refactoring to have the same exact behaviour as before (in btrfs_get_extent) for new inline extents. Fixed an issue introduced in V4 that made xfstests/generic/269 trigger an error and the following warning in dmesg: [13229.752008] WARNING: CPU: 1 PID: 13326 at fs/btrfs/extent_io.c:5097 map_private_extent_buffer+0xd4/0xe0 [btrfs]() [13229.752383] btrfs bad mapping eb start 78897152 len 4096, wanted 4098 8 Updated the corresponding test case for xfstests to test for inline file extents. 
fs/btrfs/ctree.h | 6 + fs/btrfs/file-item.c | 76 fs/btrfs/inode.c | 42 +++-- fs/btrfs/ioctl.c | 69 +++ 4 files changed, 155 insertions(+), 38 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index af523d6..b7e2c1c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3749,6 +3749,12 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 file_start, int contig); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); +void btrfs_extent_item_to_extent_map(struct inode *inode, +const struct btrfs_path *path, +struct btrfs_file_extent_item *fi, +const bool new_inline, +struct extent_map *em); + /* inode.c */ struct btrfs_delalloc_work { struct inode *inode; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 609d56b..f46cfe4 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -885,3 +885,79 @@ out: fail_unlock: goto out; } + +void btrfs_extent_item_to_extent_map(struct inode *inode, +const struct btrfs_path *path, +struct btrfs_file_extent_item *fi, +const bool new_inline, +struct extent_map *em) +{ + struct btrfs_root *root = BTRFS_I(inode)-root; + struct extent_buffer *leaf = path-nodes[0]; + const int slot = path-slots[0]; + struct btrfs_key key; + u64 extent_start, extent_end; + u64 bytenr; + u8 type = btrfs_file_extent_type(leaf, fi); + int compress_type = btrfs_file_extent_compression(leaf, fi); + + em-bdev = root-fs_info-fs_devices-latest_bdev; + btrfs_item_key_to_cpu(leaf, key, slot); + extent_start = key.offset; + + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + extent_end = extent_start + + btrfs_file_extent_num_bytes(leaf, fi); + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + size_t size; + size = btrfs_file_extent_inline_len(leaf, slot, fi); + extent_end = ALIGN(extent_start + size, root-sectorsize); + } + + em-ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + if (type == 
BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + em-start = extent_start; + em-len = extent_end - extent_start; + em-orig_start = extent_start - + btrfs_file_extent_offset(leaf, fi); + em-orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + if (bytenr == 0) { + em-block_start = EXTENT_MAP_HOLE; + return; + } + if (compress_type != BTRFS_COMPRESS_NONE) { + set_bit(EXTENT_FLAG_COMPRESSED, em-flags); + em-compress_type = compress_type; + em-block_start = bytenr
[PATCH v2] xfstests: add test for btrfs clone + fsync durability
Regression test for btrfs ioctl clone operation + fsync + log recovery. The issue was that doing an fsync after cloning into a file didn't give any persistence guarantees as it should. What happened was that the in-memory metadata (extent maps) weren't updated, which made the fsync code not able to detect that file data has been changed and must be persisted to the log. This issue is fixed by the following linux kernel btrfs patch: Btrfs: make fsync work after cloning into a file Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Test small files too, consisting of a single inline extent, as it triggers different code paths. tests/btrfs/056 | 150 tests/btrfs/056.out | 129 tests/btrfs/group | 1 + 3 files changed, 280 insertions(+) create mode 100755 tests/btrfs/056 create mode 100644 tests/btrfs/056.out diff --git a/tests/btrfs/056 b/tests/btrfs/056 new file mode 100755 index 000..e066442 --- /dev/null +++ b/tests/btrfs/056 @@ -0,0 +1,150 @@ +#! /bin/bash +# FS QA Test No. btrfs/056 +# +# Regression test for btrfs ioctl clone operation + fsync + log recovery. +# The issue was that doing an fsync after cloning into a file didn't give any +# persistence guarantees as it should. What happened was that the in-memory +# metadata (extent maps) weren't updated, which made the fsync code not able +# to detect that file data has been changed. +# +# This issue is fixed by the following linux kernel btrfs patch: +# +#Btrfs: make fsync work after cloning into a file +# +#--- +# Copyright (c) 2014 Filipe Manana. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq + +tmp=/tmp/$$ +status=1 # failure is the default! +trap _cleanup; exit \$status 0 1 2 3 15 + +_cleanup() +{ + _cleanup_flakey + rm -fr $tmp +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter +. ./common/dmflakey + +# real QA test starts here +_supported_fs btrfs +_supported_os Linux +_require_scratch +_require_btrfs_cloner +_require_btrfs_fs_feature no_holes +_require_btrfs_mkfs_feature no-holes +_require_dm_flakey +_need_to_be_root + +rm -f $seqres.full + +test_btrfs_clone_fsync_log_recover() +{ + _scratch_mkfs $1 /dev/null 21 + _init_flakey + SAVE_MOUNT_OPTIONS=$MOUNT_OPTIONS + MOUNT_OPTIONS=$MOUNT_OPTIONS $2 + _mount_flakey + + # Create a file with 4 extents and 1 hole, all with a size of 8Kb each. + # The hole is in the range [16384, 24576[. + $XFS_IO_PROG -f -c pwrite -S 0x01 -b 8192 0 8192 \ + -c fsync \ + -c pwrite -S 0x02 -b 8192 8192 8192 \ + -c fsync \ + -c pwrite -S 0x04 -b 8192 24576 8192 \ + -c fsync \ + -c pwrite -S 0x05 -b 8192 32768 8192 \ + -c fsync \ + $SCRATCH_MNT/foo | _filter_xfs_io + + # Clone destination file, 1 extent of 96kb. + $XFS_IO_PROG -f -c pwrite -S 0xff -b 98304 0 98304 -c fsync \ + $SCRATCH_MNT/bar | _filter_xfs_io + + # Clone second half of the 2nd extent, the 8kb hole, the 3rd extent + # and the first half of the 4th extent into file bar. 
+ $CLONER_PROG -s 12288 -d 0 -l 24576 $SCRATCH_MNT/foo $SCRATCH_MNT/bar + + $XFS_IO_PROG -c fsync $SCRATCH_MNT/bar + + # Test small files too consisting of 1 inline extent + $XFS_IO_PROG -f -c pwrite -S 0x00 -b 3500 0 3500 -c fsync \ + $SCRATCH_MNT/foo2 | _filter_xfs_io + + $XFS_IO_PROG -f -c pwrite -S 0xcc -b 1000 0 1000 -c fsync \ + $SCRATCH_MNT/bar2 | _filter_xfs_io + + # Clone the entire foo2 file into bar2, overwriting all data in bar2 + # and increasing its size. + $CLONER_PROG -s 0 -d 0 -l 3500 $SCRATCH_MNT/foo2 $SCRATCH_MNT/bar2 + $XFS_IO_PROG -c fsync $SCRATCH_MNT/bar2 + + _load_flakey_table $FLAKEY_DROP_WRITES + _unmount_flakey + + # Verify that there are no consistency errors. + _check_scratch_fs
[PATCH v3] Btrfs: make fsync work after cloning into a file
When cloning into a file, we were correctly replacing the extent items in the target range and removing the extent maps. However we weren't replacing the extent maps with new ones that point to the new extents - as a consequence, an incremental fsync (when the inode doesn't have the full sync flag) was a NOOP, since it relies on the existence of extent maps in the modified list of the inode's extent map tree, which was empty. Therefore add new extent maps to reflect the target clone range. A test case for xfstests follows. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Factored out needed code from inode.c:btrfs_get_extent() into a separate function so that it can be reused by the cloning code, avoiding some duplicated and non-trivial logic when populating an extent map from a file extent item. V3: Removed unused function parameter, leftover from V1. fs/btrfs/ctree.h | 5 fs/btrfs/file-item.c | 68 fs/btrfs/inode.c | 41 +++ fs/btrfs/ioctl.c | 65 + 4 files changed, 141 insertions(+), 38 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index af523d6..a668fd9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3749,6 +3749,11 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 file_start, int contig); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); +void btrfs_extent_item_to_extent_map(struct inode *inode, +const struct btrfs_path *path, +struct btrfs_file_extent_item *fi, +struct extent_map *em); + /* inode.c */ struct btrfs_delalloc_work { struct inode *inode; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 609d56b..f8f2436 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -885,3 +885,71 @@ out: fail_unlock: goto out; } + +void btrfs_extent_item_to_extent_map(struct inode *inode, +const struct btrfs_path *path, +struct btrfs_file_extent_item *fi, +struct extent_map *em) +{ + struct btrfs_root 
*root = BTRFS_I(inode)-root; + struct extent_buffer *leaf = path-nodes[0]; + const int slot = path-slots[0]; + struct btrfs_key key; + u64 extent_start, extent_end; + u64 bytenr; + u8 type = btrfs_file_extent_type(leaf, fi); + + em-bdev = root-fs_info-fs_devices-latest_bdev; + btrfs_item_key_to_cpu(leaf, key, slot); + extent_start = key.offset; + + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + extent_end = extent_start + + btrfs_file_extent_num_bytes(leaf, fi); + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + size_t size; + size = btrfs_file_extent_inline_len(leaf, slot, fi); + extent_end = ALIGN(extent_start + size, root-sectorsize); + } + + em-ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + em-compress_type = btrfs_file_extent_compression(leaf, fi); + if (em-compress_type != BTRFS_COMPRESS_NONE) + set_bit(EXTENT_FLAG_COMPRESSED, em-flags); + + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + em-start = extent_start; + em-len = extent_end - extent_start; + em-orig_start = extent_start - + btrfs_file_extent_offset(leaf, fi); + em-orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + if (bytenr == 0) { + em-block_start = EXTENT_MAP_HOLE; + return; + } + if (em-compress_type != BTRFS_COMPRESS_NONE) { + em-block_start = bytenr; + em-block_len = em-orig_block_len; + } else { + bytenr += btrfs_file_extent_offset(leaf, fi); + em-block_start = bytenr; + em-block_len = em-len; + if (type == BTRFS_FILE_EXTENT_PREALLOC) + set_bit(EXTENT_FLAG_PREALLOC, em-flags); + } + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + em-block_start = EXTENT_MAP_INLINE; + em-start = extent_start; + em-orig_start = EXTENT_MAP_HOLE; + em-len = extent_end - extent_start; + em-block_len = (u64)-1; + } else { + btrfs_err(root-fs_info
[PATCH 3/3] Btrfs: don't release invalid page in btrfs_page_exists_in_range()
In inode.c:btrfs_page_exists_in_range(), if the page we got from the radix tree is an exception entry, which can't be retried, we exit the loop with a non-NULL page and then call page_cache_release against it, which is not ok since it's not a valid page. This could also make us return true when we shouldn't. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/inode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f265f41..477e64a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6776,6 +6776,7 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) * here as an exceptional entry: so return it without * attempting to raise page count. */ + page = NULL; break; /* TODO: Is this relevant for this use case? */ } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/3] Btrfs: make sure we retry if we couldn't get the page
In inode.c:btrfs_page_exists_in_range(), if we can't get the page we need to retry. However we weren't retrying because we weren't setting page to NULL, which makes the while loop exit immediately and will make us call page_cache_release after exiting the loop which is incorrect because our page get didn't succeed. This could also make us return true when we shouldn't. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 38d1e7b..cdbd20e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6777,8 +6777,10 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) break; /* TODO: Is this relevant for this use case? */ } - if (!page_cache_get_speculative(page)) + if (!page_cache_get_speculative(page)) { + page = NULL; continue; + } /* * Has the page moved? -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/3] Btrfs: make sure we retry if page is a retriable exception
In inode.c:btrfs_page_exists_in_range(), if the page we get from the radix tree is an exception which should make us retry, set page to NULL in order to really retry, because otherwise we don't get another loop iteration executed (page != NULL makes the while loop exit). This also was making us call page_cache_release after exiting the loop, which isn't correct because page doesn't point to a valid page, and possibly return true from the function when we shouldn't. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cdbd20e..f265f41 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6767,8 +6767,10 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) break; if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) + if (radix_tree_deref_retry(page)) { + page = NULL; continue; + } /* * Otherwise, shmem/tmpfs must be storing a swap entry * here as an exceptional entry: so return it without -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: make fsync work after cloning into a file
When cloning into a file, we were correctly replacing the extent items in the target range and removing the extent maps. However we weren't replacing the extent maps with new ones that point to the new extents - as a consequence, an incremental fsync (when the inode doesn't have the full sync flag) was a NOOP, since it relies on the existence of extent maps in the modified list of the inode's extent map tree, which was empty. Therefore add new extent maps to reflect the target clone range. A test case for xfstests follows. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/ioctl.c | 114 +++ 1 file changed, 114 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 44dcfd0..1197478 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3043,6 +3043,104 @@ out: return ret; } +static void clone_update_extent_map(struct inode *inode, + const struct btrfs_trans_handle *trans, + const struct btrfs_path *path, + const struct btrfs_key *key, + struct btrfs_file_extent_item *fi, + const u64 hole_offset, + const u64 hole_len) +{ + struct extent_map_tree *em_tree = BTRFS_I(inode)-extent_tree; + struct btrfs_root *root = BTRFS_I(inode)-root; + struct extent_buffer *leaf = path-nodes[0]; + const int slot = path-slots[0]; + struct extent_map *em; + u64 extent_start, extent_end; + u64 bytenr; + u8 type; + int ret; + + em = alloc_extent_map(); + if (!em) { + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + BTRFS_I(inode)-runtime_flags); + return; + } + + em-bdev = root-fs_info-fs_devices-latest_bdev; + if (!fi) { + em-start = hole_offset; + em-len = hole_len; + em-ram_bytes = em-len; + em-orig_start = hole_offset; + em-block_start = EXTENT_MAP_HOLE; + em-block_len = 0; + em-orig_block_len = 0; + em-compress_type = BTRFS_COMPRESS_NONE; + em-generation = trans-transid; + goto insert_em; + } + + em-generation = -1; + extent_start = key-offset; + extent_end = extent_start + btrfs_file_extent_num_bytes(leaf, fi); + bytenr = 
btrfs_file_extent_disk_bytenr(leaf, fi); + type = btrfs_file_extent_type(leaf, fi); + + em-start = extent_start; + em-ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + em-compress_type = btrfs_file_extent_compression(leaf, fi); + + if (em-compress_type != BTRFS_COMPRESS_NONE) + set_bit(EXTENT_FLAG_COMPRESSED, em-flags); + + if (type == BTRFS_FILE_EXTENT_INLINE) { + em-len = ALIGN(btrfs_file_extent_inline_len(leaf, slot, fi), + root-sectorsize); + em-orig_block_len = em-len; + em-orig_start = em-start; + em-block_start = EXTENT_MAP_INLINE; + em-block_len = (u64)-1; + goto insert_em; + } + + em-len = extent_end - extent_start; + em-orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); + em-orig_start = extent_start - btrfs_file_extent_offset(leaf, fi); + if (bytenr == 0) + em-block_start = EXTENT_MAP_HOLE; + else + em-block_start = bytenr; + + if (em-compress_type == BTRFS_COMPRESS_NONE) { + em-block_start += btrfs_file_extent_offset(leaf, fi); + em-block_len = em-len; + } else { + em-block_len = em-orig_block_len; + } + + if (type == BTRFS_FILE_EXTENT_PREALLOC) + set_bit(EXTENT_FLAG_PREALLOC, em-flags); + +insert_em: + while (1) { + write_lock(em_tree-lock); + ret = add_extent_mapping(em_tree, em, 1); + write_unlock(em_tree-lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, em-start, + em-start + em-len - 1, 0); + } + + if (unlikely(ret)) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + BTRFS_I(inode)-runtime_flags); +} + /** * btrfs_clone() - clone a range from inode file to another * @@ -3361,8 +3459,19 @@ process_slot: btrfs_item_ptr_offset(leaf, slot), size); inode_add_bytes(inode, datal); + extent = btrfs_item_ptr(leaf, slot, + struct
[PATCH] xfstests: add test for btrfs clone + fsync durability
Regression test for btrfs ioctl clone operation + fsync + log recovery. The issue was that doing an fsync after cloning into a file didn't give any persistence guarantees as it should. What happened was that the in-memory metadata (extent maps) weren't updated, which made the fsync code not able to detect that file data has been changed and must be persisted to the log. This issue is fixed by the following linux kernel btrfs patch: Btrfs: make fsync work after cloning into a file Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- tests/btrfs/056 | 134 tests/btrfs/056.out | 89 ++ tests/btrfs/group | 1 + 3 files changed, 224 insertions(+) create mode 100755 tests/btrfs/056 create mode 100644 tests/btrfs/056.out diff --git a/tests/btrfs/056 b/tests/btrfs/056 new file mode 100755 index 000..cfe87cd --- /dev/null +++ b/tests/btrfs/056 @@ -0,0 +1,134 @@ +#! /bin/bash +# FS QA Test No. btrfs/056 +# +# Regression test for btrfs ioctl clone operation + fsync + log recovery. +# The issue was that doing an fsync after cloning into a file didn't give any +# persistence guarantees as it should. What happened was that the in-memory +# metadata (extent maps) weren't updated, which made the fsync code not able +# to detect that file data has been changed. +# +# This issue is fixed by the following linux kernel btrfs patch: +# +#Btrfs: make fsync work after cloning into a file +# +#--- +# Copyright (c) 2014 Filipe Manana. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq + +tmp=/tmp/$$ +status=1 # failure is the default! +trap _cleanup; exit \$status 0 1 2 3 15 + +_cleanup() +{ + _cleanup_flakey + rm -fr $tmp +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter +. ./common/dmflakey + +# real QA test starts here +_supported_fs btrfs +_supported_os Linux +_require_scratch +_require_btrfs_cloner +_require_btrfs_fs_feature no_holes +_require_btrfs_mkfs_feature no-holes +_require_dm_flakey +_need_to_be_root + +rm -f $seqres.full + +test_btrfs_clone_fsync_log_recover() +{ + _scratch_mkfs $1 /dev/null 21 + _init_flakey + SAVE_MOUNT_OPTIONS=$MOUNT_OPTIONS + MOUNT_OPTIONS=$MOUNT_OPTIONS $2 + _mount_flakey + + # Create a file with 4 extents and 1 hole, all with a size of 8Kb each. + # The hole is in the range [16384, 24576[. + $XFS_IO_PROG -f -c pwrite -S 0x01 -b 8192 0 8192 \ + -c fsync \ + -c pwrite -S 0x02 -b 8192 8192 8192 \ + -c fsync \ + -c pwrite -S 0x04 -b 8192 24576 8192 \ + -c fsync \ + -c pwrite -S 0x05 -b 8192 32768 8192 \ + -c fsync \ + $SCRATCH_MNT/foo | _filter_xfs_io + + # Clone destination file, 1 extent of 96kb. + $XFS_IO_PROG -f -c pwrite -S 0xff -b 98304 0 98304 -c fsync \ + $SCRATCH_MNT/bar | _filter_xfs_io + + # Clone second half of the 2nd extent, the 8kb hole, the 3rd extent + # and the first half of the 4th extent into file bar. + $CLONER_PROG -s 12288 -d 0 -l 24576 $SCRATCH_MNT/foo $SCRATCH_MNT/bar + + $XFS_IO_PROG -c fsync $SCRATCH_MNT/bar + + _load_flakey_table $FLAKEY_DROP_WRITES + _unmount_flakey + + # Verify that there are no consistency errors. 
+ _check_scratch_fs $FLAKEY_DEV + + _load_flakey_table $FLAKEY_ALLOW_WRITES + _mount_flakey + + # Verify the cloned range was persisted by fsync and the log recovery + # code did its work well. + od -t x1 $SCRATCH_MNT/bar + + _unmount_flakey + + # Verify that there are no consistency errors. + _check_scratch_fs $FLAKEY_DEV + + _cleanup_flakey + MOUNT_OPTIONS=$SAVE_MOUNT_OPTIONS +} + +# Regardless of the NO_HOLES feature being enabled or not, the test results +# should be exactly the same for both cases. + +echo Testing without the NO_HOLES feature +# As of btrfs-progs 3.14.x, the no-holes feature isn't enabled
[PATCH] Btrfs: update commit root on snapshot creation after orphan cleanup
On snapshot creation (either writable or read-only), we do orphan cleanup against the root of the snapshot. If the cleanup did remove any orphans, then the current root node will be different from the commit root node until the next transaction commit happens. A send operation always uses the commit root of a snapshot - this means it will see the orphans if it starts computing the send stream before the next transaction commit happens (triggered by a timer or sync(), for example), which is when the commit root gets assigned a reference to current root, where the orphans are not visible anymore. The consequence of send seeing the orphans is explained below. For example: mkfs.btrfs -f /dev/sdd mount -o commit=999 /dev/sdd /mnt # open a file with O_TMPFILE and leave it open # write some data to the file btrfs subvolume snapshot -r /mnt /mnt/snap1 btrfs send /mnt/snap1 -f /tmp/send.data The send operation will fail with the following error: ERROR: send ioctl failed with -116: Stale file handle What happens here is that our snapshot has an orphan inode still visible through the commit root, that corresponds to the tmpfile. However send will attempt to call inode.c:btrfs_iget(), with the goal of reading the file's data, which will return -ESTALE because it will use the current root (and not the commit root) of the snapshot. Of course, there are other cases where we can get orphans, but this example using a tmpfile makes it much easier to reproduce the issue. Therefore on snapshot creation, after calling btrfs_orphan_cleanup, if the commit root is different from the current root, just commit the transaction associated with the snapshot's root (if it exists), so that a send will not see any orphans that don't exist anymore. 
This also guarantees a send will always see the same content regardless of whether a transaction commit happened already before the send was requested and after the orphan cleanup (meaning the commit root and current roots are the same) or it hasn't happened yet (commit and current roots are different). Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/ioctl.c | 29 + 1 file changed, 29 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 95194a9..6680ad9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -712,6 +712,35 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto fail; + /* +* If orphan cleanup did remove any orphans, it means the tree was +* modified and therefore the commit root is not the same as the +* current root anymore. This is a problem, because send uses the +* commit root and therefore can see inode items that don't exist +* in the current root anymore, and for example make calls to +* btrfs_iget, which will do tree lookups based on the current root +* and not on the commit root. Those lookups will fail, returning a +* -ESTALE error, and making send fail with that error. So make sure +* a send does not see any orphans we have just removed, and that it +* will see the same inodes regardless of whether a transaction +* commit happened before it started (meaning that the commit root +* will be the same as the current root) or not. 
+*/ + if (readonly pending_snapshot-snap-node != + pending_snapshot-snap-commit_root) { + trans = btrfs_join_transaction(pending_snapshot-snap); + if (IS_ERR(trans) PTR_ERR(trans) != -ENOENT) { + ret = PTR_ERR(trans); + goto fail; + } + if (!IS_ERR(trans)) { + ret = btrfs_commit_transaction(trans, + pending_snapshot-snap); + if (ret) + goto fail; + } + } + inode = btrfs_lookup_dentry(dentry-d_parent-d_inode, dentry); if (IS_ERR(inode)) { ret = PTR_ERR(inode); -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5] xfstests: add test for btrfs cloning with file holes
Regression test for the btrfs ioctl clone operation when the source range contains hole(s) and the FS has the NO_HOLES feature enabled (file holes don't need file extent items in the btree to represent them). This issue is fixed by the following linux kernel btrfs patch: Btrfs: fix clone to deal with holes when NO_HOLES feature is enabled Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Increased test coverage by testing the cases where a hole overlaps the start and end of the cloning range. V3: Test the case where the cloning range includes an hole at the end of the source file and might increase the size of the target file. V4: Added test for the case where the clone range covers only a hole at the beginning of the source file. Made the test be skipped if the available version of mkfs.btrfs doesn't support the no-holes feature. And when testing the case where the no-holes feature isn't enabled, explicitly ask mkfs.btrfs to disable no-holes (future versions of mkfs.btrfs might enable this feature by default). V5: Detect if kernel supports NO_HOLES feature too. Added some messages (echoes) before each od call to make it easier to match output with each specific test. common/rc | 25 tests/btrfs/055 | 173 ++ tests/btrfs/055.out | 347 tests/btrfs/group | 1 + 4 files changed, 546 insertions(+) create mode 100755 tests/btrfs/055 create mode 100644 tests/btrfs/055.out diff --git a/common/rc b/common/rc index f27ee53..e2136d0 100644 --- a/common/rc +++ b/common/rc @@ -2177,6 +2177,31 @@ _require_btrfs_send_stream_version() fi } +_require_btrfs_mkfs_feature() +{ + if [ -z $1 ]; then + echo Missing feature name argument for _require_btrfs_mkfs_feature + exit 1 + fi + feat=$1 + $MKFS_BTRFS_PROG -O list-all 21 | \ + grep '^[ \t]*'$feat'\b' /dev/null 21 + [ $? 
-eq 0 ] || \ + _notrun Feature $feat not supported in the available version of mkfs.btrfs +} + +_require_btrfs_fs_feature() +{ + if [ -z $1 ]; then + echo Missing feature name argument for _require_btrfs_fs_feature + exit 1 + fi + feat=$1 + modprobe btrfs /dev/null 21 + [ -e /sys/fs/btrfs/features/$feat ] || \ + _notrun Feature $feat not supported by the available btrfs version +} + init_rc() { if [ $iam == new ] diff --git a/tests/btrfs/055 b/tests/btrfs/055 new file mode 100755 index 000..be38d09 --- /dev/null +++ b/tests/btrfs/055 @@ -0,0 +1,173 @@ +#! /bin/bash +# FS QA Test No. btrfs/055 +# +# Regression test for the btrfs ioctl clone operation when the source range +# contains hole(s) and the FS has the NO_HOLES feature enabled (file holes +# don't need file extent items in the btree to represent them). +# +# This issue is fixed by the following linux kernel btrfs patch: +# +#Btrfs: fix clone to deal with holes when NO_HOLES feature is enabled +# +#--- +# Copyright (c) 2014 Filipe Manana. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq + +tmp=/tmp/$$ +status=1 # failure is the default! +trap _cleanup; exit \$status 0 1 2 3 15 + +_cleanup() +{ +rm -fr $tmp +} + +# get standard environment, filters and checks +. ./common/rc +. 
./common/filter + +# real QA test starts here +_supported_fs btrfs +_supported_os Linux +_require_scratch +_require_btrfs_cloner +_require_btrfs_fs_feature no_holes +_require_btrfs_mkfs_feature no-holes +_need_to_be_root + +rm -f $seqres.full + +test_btrfs_clone_with_holes() +{ + _scratch_mkfs $1 /dev/null 21 + _scratch_mount + + # Create a file with 4 extents and 1 hole, all with a size of 8Kb each. + $XFS_IO_PROG -f -c pwrite -S 0x01 -b 8192 0 8192 $SCRATCH_MNT/foo \ + | _filter_xfs_io + sync + $XFS_IO_PROG -c pwrite -S 0x02 -b 8192 8192 8192 $SCRATCH_MNT/foo \ + | _filter_xfs_io + sync + # After the following write we get an hole in the range [16384, 24576
[PATCH v2] xfstests: add test for btrfs cloning with file holes
Regression test for the btrfs ioctl clone operation when the source range contains hole(s) and the FS has the NO_HOLES feature enabled (file holes don't need file extent items in the btree to represent them). This issue is fixed by the following linux kernel btrfs patch: Btrfs: fix clone to deal with holes when NO_HOLES feature is enabled Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Increased test coverage by testing the cases where a hole overlaps the start and end of the cloning range. tests/btrfs/055 | 112 + tests/btrfs/055.out | 117 tests/btrfs/group | 1 + 3 files changed, 230 insertions(+) create mode 100755 tests/btrfs/055 create mode 100644 tests/btrfs/055.out diff --git a/tests/btrfs/055 b/tests/btrfs/055 new file mode 100755 index 000..4a1614b --- /dev/null +++ b/tests/btrfs/055 @@ -0,0 +1,112 @@ +#! /bin/bash +# FS QA Test No. btrfs/055 +# +# Regression test for the btrfs ioctl clone operation when the source range +# contains hole(s) and the FS has the NO_HOLES feature enabled (file holes +# don't need file extent items in the btree to represent them). +# +# This issue is fixed by the following linux kernel btrfs patch: +# +#Btrfs: fix clone to deal with holes when NO_HOLES feature is enabled +# +#--- +# Copyright (c) 2014 Filipe Manana. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq + +tmp=/tmp/$$ +status=1 # failure is the default! +trap _cleanup; exit \$status 0 1 2 3 15 + +_cleanup() +{ +rm -fr $tmp +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter + +# real QA test starts here +_supported_fs btrfs +_supported_os Linux +_require_scratch +_require_btrfs_cloner +_need_to_be_root + +rm -f $seqres.full + +test_btrfs_clone_with_holes() +{ + _scratch_mkfs $1 /dev/null 21 + _scratch_mount + + # Create a file with 4 extents and 1 hole, all with a size of 8Kb each. + $XFS_IO_PROG -f -c pwrite -S 0x01 -b 8192 0 8192 $SCRATCH_MNT/foo \ + | _filter_xfs_io + sync + $XFS_IO_PROG -c pwrite -S 0x02 -b 8192 8192 8192 $SCRATCH_MNT/foo \ + | _filter_xfs_io + sync + # After the following write we get a hole in the range [16384, 24576[ + $XFS_IO_PROG -c pwrite -S 0x04 -b 8192 24576 8192 $SCRATCH_MNT/foo \ + | _filter_xfs_io + sync + $XFS_IO_PROG -c pwrite -S 0x05 -b 8192 32768 8192 $SCRATCH_MNT/foo \ + | _filter_xfs_io + sync + + # Clone destination file, 1 extent of 96kb. + $XFS_IO_PROG -f -c pwrite -S 0xff -b 98304 0 98304 $SCRATCH_MNT/bar \ + | _filter_xfs_io + sync + + # Clone 2nd extent, 8Kb hole and 3rd extent of foo into bar. + $CLONER_PROG -s 8192 -d 0 -l 24576 $SCRATCH_MNT/foo $SCRATCH_MNT/bar + + # Verify both extents and the hole were cloned. + od -t x1 $SCRATCH_MNT/bar + + # Cloning range starts at the middle of a hole. + $CLONER_PROG -s 20480 -d 32768 -l 12288 $SCRATCH_MNT/foo $SCRATCH_MNT/bar + + # Verify that half of the hole and the following 8Kb extent were cloned. + od -t x1 $SCRATCH_MNT/bar + + # Cloning range ends at the middle of a hole. 
+ $CLONER_PROG -s 0 -d 65536 -l 20480 $SCRATCH_MNT/foo $SCRATCH_MNT/bar + + # Verify that 2 extents of 8kb and a 4kb hole were cloned. + od -t x1 $SCRATCH_MNT/bar + + # Verify that there are no consistency errors. + _check_scratch_fs +} + +echo Testing without the NO_HOLES feature +test_btrfs_clone_with_holes + +_scratch_unmount + +echo Testing with the NO_HOLES feature enabled +test_btrfs_clone_with_holes -O no-holes + +status=0 +exit diff --git a/tests/btrfs/055.out b/tests/btrfs/055.out new file mode 100644 index 000..cd627ce --- /dev/null +++ b/tests/btrfs/055.out @@ -0,0 +1,117 @@ +QA output created by 055 +Testing without the NO_HOLES feature +wrote 8192/8192 bytes at offset 0 +XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 8192/8192 bytes at offset 8192 +XXX
[PATCH v2] Btrfs: fix clone to deal with holes when NO_HOLES feature is enabled
If the NO_HOLES feature is enabled holes don't have file extent items in the btree that represent them anymore. This made the clone operation ignore the gaps that exist between consecutive file extent items and therefore not create the holes at the destination. A test case for xfstests follows. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Deal with holes at the boundaries of the cloning range and that either overlap the boundary completely or partially. Test case for xfstests updated too to test these 2 cases. fs/btrfs/ioctl.c | 54 +- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 04ece8f..4a7a311 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2995,7 +2995,8 @@ out: * @destoff: Offset within @inode to start clone */ static int btrfs_clone(struct inode *src, struct inode *inode, - u64 off, u64 olen, u64 olen_aligned, u64 destoff) + const u64 off, const u64 olen, const u64 olen_aligned, + const u64 destoff) { struct btrfs_root *root = BTRFS_I(inode)-root; struct btrfs_path *path = NULL; @@ -3007,8 +3008,10 @@ static int btrfs_clone(struct inode *src, struct inode *inode, int slot; int ret; int no_quota; - u64 len = olen_aligned; + const u64 len = olen_aligned; u64 last_disko = 0; + u64 last_dest_end = destoff; + bool add_trailing_hole = false; ret = -ENOMEM; buf = vmalloc(btrfs_level_size(root, 0)); @@ -3077,6 +3080,7 @@ process_slot: u64 datao = 0, datal = 0; u8 comp; u64 endoff; + u64 drop_start; extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); @@ -3106,7 +3110,20 @@ process_slot: path-slots[0]++; goto process_slot; } else if (key.offset = off + len) { - break; + if (last_dest_end destoff + len) { + /* +* We have an implicit hole (NO_HOLES +* feature is enabled) that fully or +* partially overlaps our cloning range +* at its end. 
+*/ + btrfs_release_path(path); + path-leave_spinning = 0; + add_trailing_hole = true; + goto start_trans; + } else { + break; + } } size = btrfs_item_size_nr(leaf, slot); @@ -3125,6 +3142,19 @@ process_slot: new_key.offset = destoff; /* +* Deal with a hole that doesn't have an extent item +* that represents it (NO_HOLES feature enabled). +* This hole is either in the middle of the cloning +* range or at the beginning (fully overlaps it or +* partially overlaps it). +*/ + if (new_key.offset != last_dest_end) + drop_start = last_dest_end; + else + drop_start = new_key.offset; + +start_trans: + /* * 1 - adjusting old extent (we may have to split it) * 1 - add new extent * 1 - inode update @@ -3135,6 +3165,19 @@ process_slot: goto out; } + if (add_trailing_hole) { + ret = btrfs_drop_extents(trans, root, inode, +last_dest_end, +destoff + len, 1); + if (ret ret != -EOPNOTSUPP) + btrfs_abort_transaction(trans, root, + ret); + btrfs_end_transaction(trans, root); + if (ret) + goto out; + break; + } + if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC
[PATCH v3] Btrfs: fix clone to deal with holes when NO_HOLES feature is enabled
If the NO_HOLES feature is enabled holes don't have file extent items in the btree that represent them anymore. This made the clone operation ignore the gaps that exist between consecutive file extent items and therefore not create the holes at the destination. A test case for xfstests follows. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Deal with holes at the boundaries of the cloning range and that either overlap the boundary completely or partially. Test case for xfstests updated too to test these 2 cases. V3: Deal with the case where the cloning range overlaps (partially or completely) a hole at the end of the source file, and might increase the size of the target file. Updated the test for xfstests to cover these cases too. fs/btrfs/ioctl.c | 63 1 file changed, 59 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 04ece8f..f508f5e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2995,7 +2995,8 @@ out: * @destoff: Offset within @inode to start clone */ static int btrfs_clone(struct inode *src, struct inode *inode, - u64 off, u64 olen, u64 olen_aligned, u64 destoff) + const u64 off, const u64 olen, const u64 olen_aligned, + const u64 destoff) { struct btrfs_root *root = BTRFS_I(inode)-root; struct btrfs_path *path = NULL; @@ -3007,8 +3008,9 @@ static int btrfs_clone(struct inode *src, struct inode *inode, int slot; int ret; int no_quota; - u64 len = olen_aligned; + const u64 len = olen_aligned; u64 last_disko = 0; + u64 last_dest_end = destoff; ret = -ENOMEM; buf = vmalloc(btrfs_level_size(root, 0)); @@ -3077,6 +3079,7 @@ process_slot: u64 datao = 0, datal = 0; u8 comp; u64 endoff; + u64 drop_start; extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); @@ -3125,6 +3128,18 @@ process_slot: new_key.offset = destoff; /* +* Deal with a hole that doesn't have an extent item +* that represents it (NO_HOLES feature enabled). 
+* This hole is either in the middle of the cloning +* range or at the beginning (fully overlaps it or +* partially overlaps it). +*/ + if (new_key.offset != last_dest_end) + drop_start = last_dest_end; + else + drop_start = new_key.offset; + + /* * 1 - adjusting old extent (we may have to split it) * 1 - add new extent * 1 - inode update @@ -3153,7 +3168,7 @@ process_slot: } ret = btrfs_drop_extents(trans, root, inode, -new_key.offset, +drop_start, new_key.offset + datal, 1); if (ret) { @@ -3254,7 +3269,7 @@ process_slot: aligned_end = ALIGN(new_key.offset + datal, root-sectorsize); ret = btrfs_drop_extents(trans, root, inode, -new_key.offset, +drop_start, aligned_end, 1); if (ret) { @@ -3301,6 +3316,7 @@ process_slot: * but shouldn't round up the file size */ endoff = new_key.offset + datal; + last_dest_end = endoff; if (endoff destoff+olen) endoff = destoff+olen; if (endoff inode-i_size) @@ -3321,6 +3337,45 @@ process_slot: } ret = 0; + if (last_dest_end destoff + len) { + /* +* We have an implicit hole (NO_HOLES feature is enabled) that +* fully or partially overlaps our cloning range at its end. +*/ + btrfs_release_path(path); + path-leave_spinning = 0; + + /* +* 1 - remove extent(s) +* 1
[PATCH v3] xfstests: add test for btrfs cloning with file holes
Regression test for the btrfs ioctl clone operation when the source range contains hole(s) and the FS has the NO_HOLES feature enabled (file holes don't need file extent items in the btree to represent them). This issue is fixed by the following linux kernel btrfs patch: Btrfs: fix clone to deal with holes when NO_HOLES feature is enabled Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Increased test coverage by testing the cases where a hole overlaps the start and end of the cloning range. V3: Test the case where the cloning range includes an hole at the end of the source file and might increase the size of the target file. tests/btrfs/055 | 141 tests/btrfs/055.out | 225 tests/btrfs/group | 1 + 3 files changed, 367 insertions(+) create mode 100755 tests/btrfs/055 create mode 100644 tests/btrfs/055.out diff --git a/tests/btrfs/055 b/tests/btrfs/055 new file mode 100755 index 000..fad4b1c --- /dev/null +++ b/tests/btrfs/055 @@ -0,0 +1,141 @@ +#! /bin/bash +# FS QA Test No. btrfs/055 +# +# Regression test for the btrfs ioctl clone operation when the source range +# contains hole(s) and the FS has the NO_HOLES feature enabled (file holes +# don't need file extent items in the btree to represent them). +# +# This issue is fixed by the following linux kernel btrfs patch: +# +#Btrfs: fix clone to deal with holes when NO_HOLES feature is enabled +# +#--- +# Copyright (c) 2014 Filipe Manana. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq + +tmp=/tmp/$$ +status=1 # failure is the default! +trap _cleanup; exit \$status 0 1 2 3 15 + +_cleanup() +{ +rm -fr $tmp +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter + +# real QA test starts here +_supported_fs btrfs +_supported_os Linux +_require_scratch +_require_btrfs_cloner +_need_to_be_root + +rm -f $seqres.full + +test_btrfs_clone_with_holes() +{ + _scratch_mkfs $1 /dev/null 21 + _scratch_mount + + # Create a file with 4 extents and 1 hole, all with a size of 8Kb each. + $XFS_IO_PROG -f -c pwrite -S 0x01 -b 8192 0 8192 $SCRATCH_MNT/foo \ + | _filter_xfs_io + sync + $XFS_IO_PROG -c pwrite -S 0x02 -b 8192 8192 8192 $SCRATCH_MNT/foo \ + | _filter_xfs_io + sync + # After the following write we get a hole in the range [16384, 24576[ + $XFS_IO_PROG -c pwrite -S 0x04 -b 8192 24576 8192 $SCRATCH_MNT/foo \ + | _filter_xfs_io + sync + $XFS_IO_PROG -c pwrite -S 0x05 -b 8192 32768 8192 $SCRATCH_MNT/foo \ + | _filter_xfs_io + sync + + # Clone destination file, 1 extent of 96kb. + $XFS_IO_PROG -f -c pwrite -S 0xff -b 98304 0 98304 $SCRATCH_MNT/bar \ + | _filter_xfs_io + sync + + # Clone 2nd extent, 8Kb hole and 3rd extent of foo into bar. + $CLONER_PROG -s 8192 -d 0 -l 24576 $SCRATCH_MNT/foo $SCRATCH_MNT/bar + + # Verify both extents and the hole were cloned. + od -t x1 $SCRATCH_MNT/bar + + # Cloning range starts at the middle of a hole. + $CLONER_PROG -s 20480 -d 32768 -l 12288 $SCRATCH_MNT/foo \ + $SCRATCH_MNT/bar + + # Verify that half of the hole and the following 8Kb extent were cloned. + od -t x1 $SCRATCH_MNT/bar + + # Cloning range ends at the middle of a hole. 
+ $CLONER_PROG -s 0 -d 65536 -l 20480 $SCRATCH_MNT/foo $SCRATCH_MNT/bar + + # Verify that 2 extents of 8kb and a 4kb hole were cloned. + od -t x1 $SCRATCH_MNT/bar + + # Create a 24Kb hole at the end of the source file (foo). + $XFS_IO_PROG -c truncate 65536 $SCRATCH_MNT/foo + sync + + # Now clone a range that overlaps that hole at the end of the foo file. + # It should clone the last 4Kb of the extent at offset 32768 and the + # first 8kb of the 24kb hole at the end of foo. + $CLONER_PROG -s 36864 -d 86016 -l 12288 $SCRATCH_MNT/foo \ + $SCRATCH_MNT/bar + + # Verify that the second half of the 8Kb extent
[PATCH v4] Btrfs: fix clone to deal with holes when NO_HOLES feature is enabled
If the NO_HOLES feature is enabled holes don't have file extent items in the btree that represent them anymore. This made the clone operation ignore the gaps that exist between consecutive file extent items and therefore not create the holes at the destination. When not using the NO_HOLES feature, the holes were created at the destination. A test case for xfstests follows. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Deal with holes at the boundaries of the cloning range and that either overlap the boundary completely or partially. Test case for xfstests updated too to test these 2 cases. V3: Deal with the case where the cloning range overlaps (partially or completely) a hole at the end of the source file, and might increase the size of the target file. Updated the test for xfstests to cover these cases too. V4: Moved some duplicated code into an helper function. fs/btrfs/ioctl.c | 108 ++- 1 file changed, 83 insertions(+), 25 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 04ece8f..95194a9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2983,6 +2983,37 @@ out: return ret; } +static int clone_finish_inode_update(struct btrfs_trans_handle *trans, +struct inode *inode, +u64 endoff, +const u64 destoff, +const u64 olen) +{ + struct btrfs_root *root = BTRFS_I(inode)-root; + int ret; + + inode_inc_iversion(inode); + inode-i_mtime = inode-i_ctime = CURRENT_TIME; + /* +* We round up to the block size at eof when determining which +* extents to clone above, but shouldn't round up the file size. 
+*/ + if (endoff destoff + olen) + endoff = destoff + olen; + if (endoff inode-i_size) + btrfs_i_size_write(inode, endoff); + + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + goto out; + } + ret = btrfs_end_transaction(trans, root); +out: + return ret; +} + /** * btrfs_clone() - clone a range from inode file to another * @@ -2995,7 +3026,8 @@ out: * @destoff: Offset within @inode to start clone */ static int btrfs_clone(struct inode *src, struct inode *inode, - u64 off, u64 olen, u64 olen_aligned, u64 destoff) + const u64 off, const u64 olen, const u64 olen_aligned, + const u64 destoff) { struct btrfs_root *root = BTRFS_I(inode)-root; struct btrfs_path *path = NULL; @@ -3007,8 +3039,9 @@ static int btrfs_clone(struct inode *src, struct inode *inode, int slot; int ret; int no_quota; - u64 len = olen_aligned; + const u64 len = olen_aligned; u64 last_disko = 0; + u64 last_dest_end = destoff; ret = -ENOMEM; buf = vmalloc(btrfs_level_size(root, 0)); @@ -3076,7 +3109,7 @@ process_slot: u64 disko = 0, diskl = 0; u64 datao = 0, datal = 0; u8 comp; - u64 endoff; + u64 drop_start; extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); @@ -3125,6 +3158,18 @@ process_slot: new_key.offset = destoff; /* +* Deal with a hole that doesn't have an extent item +* that represents it (NO_HOLES feature enabled). +* This hole is either in the middle of the cloning +* range or at the beginning (fully overlaps it or +* partially overlaps it). +*/ + if (new_key.offset != last_dest_end) + drop_start = last_dest_end; + else + drop_start = new_key.offset; + + /* * 1 - adjusting old extent (we may have to split it) * 1 - add new extent * 1 - inode update @@ -3153,7 +3198,7 @@ process_slot: } ret = btrfs_drop_extents(trans, root, inode, -new_key.offset, +drop_start, new_key.offset + datal, 1); if (ret) { @@ -3254,7 +3299,7 @@ process_slot
[PATCH] Btrfs: avoid visiting all extent items when cloning a range
When cloning a range of a file, we were visiting all the extent items in the btree that belong to our source inode. We don't need to visit those extent items that don't overlap the range we are cloning, as doing so only makes us waste time and do unnecessary btree navigations (btrfs_next_leaf) for inodes that have a large number of file extent items in the btree. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/ioctl.c | 23 --- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 603c036..f20d91d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3025,7 +3025,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, /* clone data */ key.objectid = btrfs_ino(src); key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = 0; + key.offset = off; while (1) { /* @@ -3037,6 +3037,17 @@ static int btrfs_clone(struct inode *src, struct inode *inode, 0, 0); if (ret 0) goto out; + /* +* First search, if no extent item that starts at offset off was +* found but the previous item is an extent item, it's possible +* it might overlap our target range, therefore process it. +*/ + if (key.offset == off ret 0 path-slots[0] 0) { + btrfs_item_key_to_cpu(path-nodes[0], key, + path-slots[0] - 1); + if (key.type == BTRFS_EXTENT_DATA_KEY) + path-slots[0]--; + } nritems = btrfs_header_nritems(path-nodes[0]); process_slot: @@ -3086,10 +3097,16 @@ process_slot: extent); } - if (key.offset + datal = off || - key.offset = off + len - 1) { + /* +* The first search might have left us at an extent +* item that ends before our target range's start, can +* happen if we have holes and NO_HOLES feature enabled. 
+*/ + if (key.offset + datal = off) { path-slots[0]++; goto process_slot; + } else if (key.offset = off + len) { + break; } size = btrfs_item_size_nr(leaf, slot); -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: ioctl, don't re-lock extent range when not necessary
In ioctl.c:lock_extent_range(), after locking our target range, the ordered extent that btrfs_lookup_first_ordered_extent() returns us may not overlap our target range at all. In this case we would just unlock our target range, wait for any new ordered extents that overlap the range to complete, lock again the range and repeat all these steps until we don't get any ordered extent and the delalloc flag isn't set in the io tree for our target range. Therefore just stop if we get an ordered extent that doesn't overlap our target range and the delalloc flag isn't set for the range in the inode's io tree. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/ioctl.c | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 38f2169..603c036 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2700,10 +2700,15 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) lock_extent(BTRFS_I(inode)-io_tree, off, off + len - 1); ordered = btrfs_lookup_first_ordered_extent(inode, off + len - 1); - if (!ordered + if ((!ordered || +ordered-file_offset + ordered-len = off || +ordered-file_offset = off + len) !test_range_bit(BTRFS_I(inode)-io_tree, off, - off + len - 1, EXTENT_DELALLOC, 0, NULL)) + off + len - 1, EXTENT_DELALLOC, 0, NULL)) { + if (ordered) + btrfs_put_ordered_extent(ordered); break; + } unlock_extent(BTRFS_I(inode)-io_tree, off, off + len - 1); if (ordered) btrfs_put_ordered_extent(ordered); -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: fix clone to deal with holes when NO_HOLES feature is enabled
If the NO_HOLES feature is enabled holes don't have file extent items in the btree that represent them anymore. This made the clone operation ignore the gaps that exist between consecutive file extent items and therefore not create the holes at the destination. A test case for xfstests follows. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/ioctl.c | 17 +++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ecf56af..bf34b7a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3009,6 +3009,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, int no_quota; u64 len = olen_aligned; u64 last_disko = 0; + u64 last_dest_end = (u64)-1; ret = -ENOMEM; buf = vmalloc(btrfs_level_size(root, 0)); @@ -3077,6 +3078,7 @@ process_slot: u64 datao = 0, datal = 0; u8 comp; u64 endoff; + u64 drop_start; extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); @@ -3125,6 +3127,16 @@ process_slot: new_key.offset = destoff; /* +* Deal with a hole that doesn't have an extent item +* that represents it (NO_HOLES feature enabled). 
+*/ + if (last_dest_end != (u64)-1 + new_key.offset != last_dest_end) + drop_start = last_dest_end; + else + drop_start = new_key.offset; + + /* * 1 - adjusting old extent (we may have to split it) * 1 - add new extent * 1 - inode update @@ -3153,7 +3165,7 @@ process_slot: } ret = btrfs_drop_extents(trans, root, inode, -new_key.offset, +drop_start, new_key.offset + datal, 1); if (ret) { @@ -3254,7 +3266,7 @@ process_slot: aligned_end = ALIGN(new_key.offset + datal, root-sectorsize); ret = btrfs_drop_extents(trans, root, inode, -new_key.offset, +drop_start, aligned_end, 1); if (ret) { @@ -3301,6 +3313,7 @@ process_slot: * but shouldn't round up the file size */ endoff = new_key.offset + datal; + last_dest_end = endoff; if (endoff destoff+olen) endoff = destoff+olen; if (endoff inode-i_size) -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3] Btrfs: avoid visiting all extent items when cloning a range
When cloning a range of a file, we were visiting all the extent items in the btree that belong to our source inode. We don't need to visit those extent items that don't overlap the range we are cloning, as doing so only makes us waste time and do unnecessary btree navigations (btrfs_next_leaf) for inodes that have a large number of file extent items in the btree. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Leave the loop as soon as we process the last extent, to avoid one extra and unnecessary btree search. V3: Removed unintentional and unrelated change from v2. fs/btrfs/ioctl.c | 26 ++ 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 603c036..04ece8f 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3025,7 +3025,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, /* clone data */ key.objectid = btrfs_ino(src); key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = 0; + key.offset = off; while (1) { /* @@ -3037,6 +3037,17 @@ static int btrfs_clone(struct inode *src, struct inode *inode, 0, 0); if (ret 0) goto out; + /* +* First search, if no extent item that starts at offset off was +* found but the previous item is an extent item, it's possible +* it might overlap our target range, therefore process it. +*/ + if (key.offset == off ret 0 path-slots[0] 0) { + btrfs_item_key_to_cpu(path-nodes[0], key, + path-slots[0] - 1); + if (key.type == BTRFS_EXTENT_DATA_KEY) + path-slots[0]--; + } nritems = btrfs_header_nritems(path-nodes[0]); process_slot: @@ -3086,10 +3097,16 @@ process_slot: extent); } - if (key.offset + datal = off || - key.offset = off + len - 1) { + /* +* The first search might have left us at an extent +* item that ends before our target range's start, can +* happen if we have holes and NO_HOLES feature enabled. 
+*/ + if (key.offset + datal = off) { path-slots[0]++; goto process_slot; + } else if (key.offset = off + len) { + break; } size = btrfs_item_size_nr(leaf, slot); @@ -3296,6 +3313,8 @@ process_slot: goto out; } ret = btrfs_end_transaction(trans, root); + if (new_key.offset + datal = destoff + len) + break; } btrfs_release_path(path); key.offset++; @@ -3303,7 +3322,6 @@ process_slot: ret = 0; out: - btrfs_release_path(path); btrfs_free_path(path); vfree(buf); return ret; -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: fix transaction leak during fsync call
If btrfs_log_dentry_safe() returns an error, we set ret to 1 and fall through with the goal of committing the transaction. However, in the case where the inode doesn't need a full sync, we would call btrfs_wait_ordered_range() against the target range for our inode, and if it returned an error, we would return without committing or ending the transaction, leaving the transaction open forever. Since when btrfs_wait_ordered_range() doesn't return an error we commit the transaction, it doesn't make sense to make a call to btrfs_wait_ordered_range() because committing the transaction will wait for all ordered extents to complete anyway. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/file.c | 6 -- 1 file changed, 6 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8accf94..e0be468 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2022,12 +2022,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; } } - if (!full_sync) { - ret = btrfs_wait_ordered_range(inode, start, - end - start + 1); - if (ret) - goto out; - } ret = btrfs_commit_transaction(trans, root); } else { ret = btrfs_end_transaction(trans, root); -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2] Btrfs: fix transaction leak during fsync call
If btrfs_log_dentry_safe() returns an error, we set ret to 1 and fall through with the goal of committing the transaction. However, in the case where the inode doesn't need a full sync, we would call btrfs_wait_ordered_range() against the target range for our inode, and if it returned an error, we would return without committing or ending the transaction. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Leave the call to btrfs_wait_ordered_range(), it's needed if the fs is mounted with -o no_flushoncommit. fs/btrfs/file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e46bfaf..5a00597 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2025,8 +2025,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (!full_sync) { ret = btrfs_wait_ordered_range(inode, start, end - start + 1); - if (ret) + if (ret) { + btrfs_end_transaction(trans, root); goto out; + } } ret = btrfs_commit_transaction(trans, root); } else { -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/2 v3] xfstests: add test for btrfs ioctl clone operation
This is a test to verify that the btrfs ioctl clone operation is able to clone extents of a file to different positions of the file, that is, the source and target files are the same. Existing tests only cover the case where the source and target files are different. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Made the test exercise a more complex code path in the btrfs ioctl clone code. Now we have extents with different sizes and make the cloner process partial extents and split existing extents with smaller ones. V3: Add tests to verify that after writing to a cloned extent, the original extent isn't modified, that defragmenting a file with cloned extents doesn't change the file contents and that all the tests have the same exact semantics (as observed by an application/user) regardless of the following options (and any combination): cow/nodatacow/compression. tests/btrfs/052 | 171 ++ tests/btrfs/052.out | 499 tests/btrfs/group | 1 + 3 files changed, 671 insertions(+) create mode 100755 tests/btrfs/052 create mode 100644 tests/btrfs/052.out diff --git a/tests/btrfs/052 b/tests/btrfs/052 new file mode 100755 index 000..671034e --- /dev/null +++ b/tests/btrfs/052 @@ -0,0 +1,171 @@ +#! /bin/bash +# FS QA Test No. btrfs/052 +# +# Verify that the btrfs ioctl clone operation can operate on the same +# file as a source and target. That is, clone extents within the same +# file. +# +#--- +# Copyright (c) 2014 Filipe Manana. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq + +tmp=/tmp/$$ +status=1 # failure is the default! +trap _cleanup; exit \$status 0 1 2 3 15 + +_cleanup() +{ +rm -fr $tmp +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter + +# real QA test starts here +_supported_fs btrfs +_supported_os Linux +_require_scratch +_require_btrfs_cloner +_need_to_be_root + +rm -f $seqres.full + +test_btrfs_clone_same_file() +{ + if [ -z $1 ]; then + MOUNT_OPTIONS= + else + MOUNT_OPTIONS=-O $1 + fi + _scratch_mkfs /dev/null 21 + _scratch_mount $MOUNT_OPTIONS + + # Create a file with 5 extents, 4 of 8Kb each and 1 of 64Kb. + $XFS_IO_PROG -f -c pwrite -S 0x01 -b 8192 0 8192 $SCRATCH_MNT/foo \ + | _filter_xfs_io + sync + $XFS_IO_PROG -c pwrite -S 0x02 -b 8192 8192 8192 $SCRATCH_MNT/foo \ + | _filter_xfs_io + sync + $XFS_IO_PROG -c pwrite -S 0x03 -b 8192 16384 8192 $SCRATCH_MNT/foo \ + | _filter_xfs_io + sync + $XFS_IO_PROG -c pwrite -S 0x04 -b 8192 24576 8192 $SCRATCH_MNT/foo \ + | _filter_xfs_io + sync + $XFS_IO_PROG -c pwrite -S 0x05 -b 65536 32768 65536 $SCRATCH_MNT/foo \ + | _filter_xfs_io + sync + + # Digest of initial content. + md5sum $SCRATCH_MNT/foo | _filter_scratch + + # Same source and target ranges - must fail. + $CLONER_PROG -s 8192 -d 8192 -l 8192 $SCRATCH_MNT/foo $SCRATCH_MNT/foo + # Check file content didn't change. + md5sum $SCRATCH_MNT/foo | _filter_scratch + + # Intersection between source and target ranges - must fail too. + $CLONER_PROG -s 4096 -d 8192 -l 8192 $SCRATCH_MNT/foo $SCRATCH_MNT/foo + # Check file content didn't change. + md5sum $SCRATCH_MNT/foo | _filter_scratch + + # Clone an entire extent from a higher range to a lower range. 
+ $CLONER_PROG -s 24576 -d 0 -l 8192 $SCRATCH_MNT/foo $SCRATCH_MNT/foo + + # Check entire file, the 8Kb block at offset 0 now has the same content + # as the 8Kb block at offset 24576. + od -t x1 $SCRATCH_MNT/foo + + # Clone an entire extent from a lower range to a higher range. + $CLONER_PROG -s 8192 -d 16384 -l 8192 $SCRATCH_MNT/foo $SCRATCH_MNT/foo + + # Check entire file, the 8Kb block at offset 0 now has the same content + # as the 8Kb block at offset 24576, and the 8Kb block at offset 16384 + # now has the same content as the 8Kb block
[PATCH v2] xfstests: add test for btrfs send with large xattrs
Verify that btrfs send is able to replicate xattrs larger than PATH_MAX. This is possible if the b+tree leaf size is larger than 4Kb (mkfs.btrfs's default is max(16Kb, PAGE_SIZE) as of btrfs-progs v3.12, and max(4Kb, PAGE_SIZE in older versions). This issue is fixed by the following linux kernel btrfs patch: Btrfs: send, use the right limits for xattr names and values Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Updated second invocation of btrfs send to be incremental. This way we test both a full send (snapshot 1) and an incremental send (differences between snapshot 2 and snapshot 1). tests/btrfs/053 | 109 tests/btrfs/053.out | 1 + tests/btrfs/group | 1 + 3 files changed, 111 insertions(+) create mode 100755 tests/btrfs/053 create mode 100644 tests/btrfs/053.out diff --git a/tests/btrfs/053 b/tests/btrfs/053 new file mode 100755 index 000..3994110 --- /dev/null +++ b/tests/btrfs/053 @@ -0,0 +1,109 @@ +#! /bin/bash +# FS QA Test No. btrfs/053 +# +# Verify that btrfs send is able to replicate xattrs larger than PATH_MAX. +# This is possible if the b+tree leaf size is larger than 4Kb (mkfs.btrfs's +# default is max(16Kb, PAGE_SIZE) as of btrfs-progs v3.12, and max(4Kb, +# PAGE_SIZE in older versions). +# +# This issue is fixed by the following linux kernel btrfs patch: +# +# Btrfs: send, use the right limits for xattr names and values +# +#--- +# Copyright (c) 2014 Filipe Manana. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq + +tmp=/tmp/$$ +status=1 # failure is the default! +trap _cleanup; exit \$status 0 1 2 3 15 + +_cleanup() +{ +rm -fr $send_files_dir +rm -fr $tmp +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter +. ./common/attr + +# real QA test starts here +_supported_fs btrfs +_supported_os Linux +_require_scratch +_require_fssum +_require_attrs +_need_to_be_root + +# max(16384, PAGE_SIZE) is the default leaf/node size on btrfs-progs v3.12+. +# Older versions just use max(4096, PAGE_SIZE). +# mkfs.btrfs can't create an fs with a leaf/node size smaller than PAGE_SIZE. +leaf_size=$(echo -e 16384\n`getconf PAGE_SIZE` | sort -nr | head -1) + +send_files_dir=$TEST_DIR/btrfs-test-$seq + +rm -f $seqres.full +rm -fr $send_files_dir +mkdir $send_files_dir + +_scratch_mkfs -l $leaf_size /dev/null 21 +_scratch_mount + +echo hello world $SCRATCH_MNT/foobar + +$SETFATTR_PROG -n user.xattr_name_1 -v `$PERL_PROG -e 'print A x 6000;'` \ + $SCRATCH_MNT/foobar + +_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT $SCRATCH_MNT/mysnap1 +run_check $FSSUM_PROG -A -f -w $send_files_dir/1.fssum $SCRATCH_MNT/mysnap1 + +# Update existing xattr value and add a new xattr too. 
+$SETFATTR_PROG -n user.xattr_name_1 -v `$PERL_PROG -e 'print Z x ;'` \ + $SCRATCH_MNT/foobar +$SETFATTR_PROG -n user.xattr_name_2 -v `$PERL_PROG -e 'print U x ;'` \ + $SCRATCH_MNT/foobar + +_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT $SCRATCH_MNT/mysnap2 +run_check $FSSUM_PROG -A -f -w $send_files_dir/2.fssum \ + -x $SCRATCH_MNT/mysnap2/mysnap1 $SCRATCH_MNT/mysnap2 + +_run_btrfs_util_prog send $SCRATCH_MNT/mysnap1 -f $send_files_dir/1.snap +_run_btrfs_util_prog send -p $SCRATCH_MNT/mysnap1 $SCRATCH_MNT/mysnap2 \ + -f $send_files_dir/2.snap + +_scratch_unmount +_check_scratch_fs + +_scratch_mkfs -l $leaf_size /dev/null 21 +_scratch_mount + +_run_btrfs_util_prog receive $SCRATCH_MNT -f $send_files_dir/1.snap +run_check $FSSUM_PROG -r $send_files_dir/1.fssum $SCRATCH_MNT/mysnap1 + +_run_btrfs_util_prog receive $SCRATCH_MNT -f $send_files_dir/2.snap +run_check $FSSUM_PROG -r $send_files_dir/2.fssum $SCRATCH_MNT/mysnap2 + +_check_scratch_fs + +status=0 +exit diff --git a/tests/btrfs/053.out b/tests/btrfs/053.out new file mode 100644 index 000..4c61638 --- /dev/null +++ b/tests/btrfs/053.out @@ -0,0 +1 @@ +QA output created by 053 diff --git a/tests/btrfs/group b/tests/btrfs/group index 5ff9b8e..ea49c5c 100644 --- a/tests/btrfs/group +++ b/tests/btrfs/group @@ -55,3 +55,4 @@ 050 auto 051 auto quick 052 auto
[PATCH] Btrfs: send, don't error in the presence of subvols/snapshots
If we are doing an incremental send and the base snapshot has a directory with name X that doesn't exist anymore in the second snapshot and a new subvolume/snapshot exists in the second snapshot that has the same name as the directory (name X), the incremental send would fail with -ENOENT error. This is because it attempts to lookup for an inode with a number matching the objectid of a root, which doesn't exist. Steps to reproduce: mkfs.btrfs -f /dev/sdd mount /dev/sdd /mnt mkdir /mnt/testdir btrfs subvolume snapshot -r /mnt /mnt/mysnap1 rmdir /mnt/testdir btrfs subvolume create /mnt/testdir btrfs subvolume snapshot -r /mnt /mnt/mysnap2 btrfs send -p /mnt/mysnap1 /mnt/mysnap2 -f /tmp/send.data A test case for xfstests follows. Reported-by: Robert White rwh...@pobox.com Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/send.c | 12 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 1a65a40..f51525e 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1642,7 +1642,8 @@ out: static int lookup_dir_item_inode(struct btrfs_root *root, u64 dir, const char *name, int name_len, u64 *found_inode, -u8 *found_type) +u8 *found_type, +int *found_is_root) { int ret = 0; struct btrfs_dir_item *di; @@ -1666,6 +1667,8 @@ static int lookup_dir_item_inode(struct btrfs_root *root, btrfs_dir_item_key_to_cpu(path-nodes[0], di, key); *found_inode = key.objectid; *found_type = btrfs_dir_type(path-nodes[0], di); + if (found_is_root) + *found_is_root = (key.type == BTRFS_ROOT_ITEM_KEY); out: btrfs_free_path(path); @@ -1816,7 +1819,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, } ret = lookup_dir_item_inode(sctx-parent_root, dir, name, name_len, - other_inode, other_type); + other_inode, other_type, NULL); if (ret 0 ret != -ENOENT) goto out; if (ret) { @@ -1861,6 +1864,7 @@ static int did_overwrite_ref(struct send_ctx *sctx, u64 gen; u64 ow_inode; u8 other_type; + int other_is_root = 0; 
if (!sctx-parent_root) goto out; @@ -1871,10 +1875,10 @@ static int did_overwrite_ref(struct send_ctx *sctx, /* check if the ref was overwritten by another ref */ ret = lookup_dir_item_inode(sctx-send_root, dir, name, name_len, - ow_inode, other_type); + ow_inode, other_type, other_is_root); if (ret 0 ret != -ENOENT) goto out; - if (ret) { + if (ret || other_is_root) { /* was never and will never be overwritten */ ret = 0; goto out; -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: set dead flag on the right root when destroying snapshot
We were setting the BTRFS_ROOT_SUBVOL_DEAD flag on the root of the parent of our target snapshot, instead of setting it in the target snapshot's root. This is easy to observe by running the following scenario: mkfs.btrfs -f /dev/sdd mount /dev/sdd /mnt btrfs subvolume create /mnt/first_subvol btrfs subvolume snapshot -r /mnt /mnt/mysnap1 btrfs subvolume delete /mnt/first_subvol btrfs subvolume snapshot -r /mnt /mnt/mysnap2 btrfs send -p /mnt/mysnap1 /mnt/mysnap2 -f /tmp/send.data The send command failed because the send ioctl returned -EPERM. A test case for xfstests follows. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 362720a..482cad5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2314,7 +2314,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, spin_lock(dest-root_item_lock); root_flags = btrfs_root_flags(root-root_item); if (root-send_in_progress == 0) { - btrfs_set_root_flags(root-root_item, + btrfs_set_root_flags(dest-root_item, root_flags | BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(dest-root_item_lock); } else { @@ -2417,7 +2417,7 @@ out_unlock: if (err) { spin_lock(dest-root_item_lock); root_flags = btrfs_root_flags(root-root_item); - btrfs_set_root_flags(root-root_item, + btrfs_set_root_flags(dest-root_item, root_flags ~BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(dest-root_item_lock); } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2] Btrfs: set dead flag on the right root when destroying snapshot
We were setting the BTRFS_ROOT_SUBVOL_DEAD flag on the root of the parent of our target snapshot, instead of setting it in the target snapshot's root. This is easy to observe by running the following scenario: mkfs.btrfs -f /dev/sdd mount /dev/sdd /mnt btrfs subvolume create /mnt/first_subvol btrfs subvolume snapshot -r /mnt /mnt/mysnap1 btrfs subvolume delete /mnt/first_subvol btrfs subvolume snapshot -r /mnt /mnt/mysnap2 btrfs send -p /mnt/mysnap1 /mnt/mysnap2 -f /tmp/send.data The send command failed because the send ioctl returned -EPERM. A test case for xfstests follows. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Add missing replacements of 'root' with 'dest'. fs/btrfs/ioctl.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 362720a..38f2169 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2312,16 +2312,16 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * again is not run concurrently. 
*/ spin_lock(dest-root_item_lock); - root_flags = btrfs_root_flags(root-root_item); - if (root-send_in_progress == 0) { - btrfs_set_root_flags(root-root_item, + root_flags = btrfs_root_flags(dest-root_item); + if (dest-send_in_progress == 0) { + btrfs_set_root_flags(dest-root_item, root_flags | BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(dest-root_item_lock); } else { spin_unlock(dest-root_item_lock); btrfs_warn(root-fs_info, Attempt to delete subvolume %llu during send, - root-root_key.objectid); + dest-root_key.objectid); err = -EPERM; goto out_dput; } @@ -2416,8 +2416,8 @@ out_up_write: out_unlock: if (err) { spin_lock(dest-root_item_lock); - root_flags = btrfs_root_flags(root-root_item); - btrfs_set_root_flags(root-root_item, + root_flags = btrfs_root_flags(dest-root_item); + btrfs_set_root_flags(dest-root_item, root_flags ~BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(dest-root_item_lock); } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] xfstests: test for btrfs send when nested subvols/snapshots exist
Regression test for a btrfs incremental send issue where the difference between the snapshots used by the incremental send consists of one of these cases: 1) First snapshot has a directory with name X and in the second snapshot that directory doesn't exist anymore but a subvolume/snapshot with the same name (X) exists; 2) First snapshot has a subvolume/snapshot with name X and in the second snapshot that subvolume/snapshot doesn't exist anymore (might have been replaced by a directory with the same name or not). This issue is fixed by the following linux kernel btrfs patches: Btrfs: send, don't error in the presence of subvols/snapshots Btrfs: set dead flag on the right root when destroying snapshot Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- tests/btrfs/054 | 109 tests/btrfs/054.out | 1 + tests/btrfs/group | 1 + 3 files changed, 111 insertions(+) create mode 100755 tests/btrfs/054 create mode 100644 tests/btrfs/054.out diff --git a/tests/btrfs/054 b/tests/btrfs/054 new file mode 100755 index 000..215861c --- /dev/null +++ b/tests/btrfs/054 @@ -0,0 +1,109 @@ +#! /bin/bash +# FS QA Test No. btrfs/054 +# +# Regression test for a btrfs incremental send issue where the difference +# between the snapshots used by the incremental send consists of one of +# these cases: +# +# 1) First snapshot has a directory with name X and in the second snapshot +#that directory doesn't exist anymore but a subvolume/snapshot with +#the same name (X) exists; +# +# 2) First snapshot has a subvolume/snapshot with name X and in the second +#snapshot that subvolume/snapshot doesn't exist anymore (might have been +#replaced by a directory with the same name or not). +# +# This issue is fixed by the following linux kernel btrfs patches: +# +#Btrfs: send, don't error in the presence of subvols/snapshots +#Btrfs: set dead flag on the right root when destroying snapshot +# +#--- +# Copyright (c) 2014 Filipe Manana. All Rights Reserved. 
+# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq + +tmp=/tmp/$$ +status=1 # failure is the default! +trap _cleanup; exit \$status 0 1 2 3 15 + +_cleanup() +{ +rm -fr $send_files_dir +rm -fr $tmp +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter +. ./common/attr + +# real QA test starts here +_supported_fs btrfs +_supported_os Linux +_require_scratch +_need_to_be_root + +send_files_dir=$TEST_DIR/btrfs-test-$seq + +rm -f $seqres.full +rm -fr $send_files_dir +mkdir $send_files_dir + +_scratch_mkfs /dev/null 21 +_scratch_mount + +mkdir $SCRATCH_MNT/testdir +_run_btrfs_util_prog subvolume create $SCRATCH_MNT/first_subvol + +_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT $SCRATCH_MNT/mysnap1 + +# Replace the directory testdir with a subvolume that has the same name. +rmdir $SCRATCH_MNT/testdir +_run_btrfs_util_prog subvolume create $SCRATCH_MNT/testdir + +# Delete the subvolume first_subvol and create a directory with the same name. 
+_run_btrfs_util_prog subvolume delete $SCRATCH_MNT/first_subvol +mkdir $SCRATCH_MNT/first_subvol + +_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT $SCRATCH_MNT/mysnap2 + +_run_btrfs_util_prog send $SCRATCH_MNT/mysnap1 -f $send_files_dir/1.snap +_run_btrfs_util_prog send $SCRATCH_MNT/mysnap2 -p $SCRATCH_MNT/mysnap1 \ + -f $send_files_dir/2.snap + +_scratch_unmount +_check_scratch_fs + +_scratch_mkfs /dev/null 21 +_scratch_mount + +_run_btrfs_util_prog receive $SCRATCH_MNT -f $send_files_dir/1.snap +[ -e $SCRATCH_MNT/first_subvol ] \ + echo Subvolume first_subvol was not supposed to be replicated by full send! + +_run_btrfs_util_prog receive $SCRATCH_MNT -f $send_files_dir/2.snap +[ -e $SCRATCH_MNT/testdir ] \ + echo Directory testdir was supposed to be deleted after incremental send! + +_check_scratch_fs + +status=0 +exit diff --git a/tests/btrfs/054.out b/tests/btrfs/054.out new file mode 100644 index 000..03e258b --- /dev/null +++ b/tests/btrfs/054.out @@ -0,0 +1 @@ +QA output created by 054 diff --git a/tests/btrfs/group b/tests/btrfs/group index
[PATCH v2] Btrfs: send, don't error in the presence of subvols/snapshots
If we are doing an incremental send and the base snapshot has a directory with name X that doesn't exist anymore in the second snapshot and a new subvolume/snapshot exists in the second snapshot that has the same name as the directory (name X), the incremental send would fail with -ENOENT error. This is because it attempts to look up an inode with a number matching the objectid of a root, which doesn't exist. Steps to reproduce: mkfs.btrfs -f /dev/sdd mount /dev/sdd /mnt mkdir /mnt/testdir btrfs subvolume snapshot -r /mnt /mnt/mysnap1 rmdir /mnt/testdir btrfs subvolume create /mnt/testdir btrfs subvolume snapshot -r /mnt /mnt/mysnap2 btrfs send -p /mnt/mysnap1 /mnt/mysnap2 -f /tmp/send.data A test case for xfstests follows. Reported-by: Robert White rwh...@pobox.com Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Simpler version. fs/btrfs/send.c | 4 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 1a65a40..2722b26 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1664,6 +1664,10 @@ static int lookup_dir_item_inode(struct btrfs_root *root, goto out; } btrfs_dir_item_key_to_cpu(path-nodes[0], di, key); + if (key.type == BTRFS_ROOT_ITEM_KEY) { + ret = -ENOENT; + goto out; + } *found_inode = key.objectid; *found_type = btrfs_dir_type(path-nodes[0], di); -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/2 v2] xfstests: add test for btrfs ioctl clone operation
This is a test to verify that the btrfs ioctl clone operation is able to clone extents of a file to different positions of the file, that is, the source and target files are the same. Existing tests only cover the case where the source and target files are different. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Made the test exercise a more complex code path in the btrfs ioctl clone code. Now we have extents with different sizes and make the cloner process partial extents and split existing extents with smaller ones. tests/btrfs/052 | 116 tests/btrfs/052.out | 51 +++ tests/btrfs/group | 1 + 3 files changed, 168 insertions(+) create mode 100755 tests/btrfs/052 create mode 100644 tests/btrfs/052.out diff --git a/tests/btrfs/052 b/tests/btrfs/052 new file mode 100755 index 000..9b98521 --- /dev/null +++ b/tests/btrfs/052 @@ -0,0 +1,116 @@ +#! /bin/bash +# FS QA Test No. btrfs/052 +# +# Verify that the btrfs ioctl clone operation can operate on the same +# file as a source and target. That is, clone extents within the same +# file. +# +#--- +# Copyright (c) 2014 Filipe Manana. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq + +tmp=/tmp/$$ +status=1 # failure is the default! 
+trap _cleanup; exit \$status 0 1 2 3 15 + +_cleanup() +{ +rm -fr $tmp +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter + +# real QA test starts here +_supported_fs btrfs +_supported_os Linux +_require_scratch +_require_btrfs_cloner +_need_to_be_root + +rm -f $seqres.full + +_scratch_mkfs /dev/null 21 +_scratch_mount + +# Create a file with 5 extents, 4 of 8Kb each and 1 of 64Kb. +$XFS_IO_PROG -f -c pwrite -S 0x01 -b 8192 0 8192 $SCRATCH_MNT/foo \ + | _filter_xfs_io +sync +$XFS_IO_PROG -c pwrite -S 0x02 -b 8192 8192 8192 $SCRATCH_MNT/foo \ + | _filter_xfs_io +sync +$XFS_IO_PROG -c pwrite -S 0x03 -b 8192 16384 8192 $SCRATCH_MNT/foo \ + | _filter_xfs_io +sync +$XFS_IO_PROG -c pwrite -S 0x04 -b 8192 24576 8192 $SCRATCH_MNT/foo \ + | _filter_xfs_io +sync +$XFS_IO_PROG -c pwrite -S 0x05 -b 65536 32768 65536 $SCRATCH_MNT/foo \ + | _filter_xfs_io +sync + +# Digest of initial content. +md5sum $SCRATCH_MNT/foo | _filter_scratch + +# Same source and target ranges - must fail. +$CLONER_PROG -s 8192 -d 8192 -l 8192 $SCRATCH_MNT/foo $SCRATCH_MNT/foo +# Check file content didn't change. +md5sum $SCRATCH_MNT/foo | _filter_scratch + +# Intersection between source and target ranges - must fail too. +$CLONER_PROG -s 4096 -d 8192 -l 8192 $SCRATCH_MNT/foo $SCRATCH_MNT/foo +# Check file content didn't change. +md5sum $SCRATCH_MNT/foo | _filter_scratch + +# Clone an entire extent from a higher range to a lower range. +$CLONER_PROG -s 24576 -d 0 -l 8192 $SCRATCH_MNT/foo $SCRATCH_MNT/foo + +# Check entire file, the 8Kb block at offset 0 now has the same content as the +# 8Kb block at offset 24576. +od -t x1 $SCRATCH_MNT/foo + +# Clone an entire extent from a lower range to a higher range. 
+$CLONER_PROG -s 8192 -d 16384 -l 8192 $SCRATCH_MNT/foo $SCRATCH_MNT/foo + +# Check entire file, the 8Kb block at offset 0 now has the same content as the +# 8Kb block at offset 24576, and the 8Kb block at offset 16384 now has the same +# content as the 8Kb block at offset 8192. +od -t x1 $SCRATCH_MNT/foo + +# Now clone 1 extent and an half into the file range starting at offset 65536. +# So we get the second half of the extent at offset 16384 and the whole extent +# at 24576 cloned into the middle of the 64Kb extent that starts at file offset +# 32768. This makes the clone ioctl process more extent items from the b+tree +# and forces a split of the large 64Kb extent at the end of the file. +$CLONER_PROG -s 20480 -d 65536 -l 12288 $SCRATCH_MNT/foo $SCRATCH_MNT/foo + +# Check entire file. Besides the previous changes, we now should have 4096 bytes +# with the value 0x02 at file offset 65536, and 8192 bytes with value 0x04 at +# the file offset 69632. The ranges [32768, 65536[ and [77824, 98304[ should +# remain with all bytes having a value of 0x05. +od -t
[PATCH] Btrfs-progs: debug-tree, add option to dump a single tree
Very often while debugging filesystems with many subvolumes and/or snapshots, especially when they are large, I want to see only the content of one of the trees. So this change just adds an option to btrfs-debug-tree to allow specifying the id of the tree we're interested in dumping to stdout. Example: btrfs-debug-tree -t 257 /dev/sdc Will only dump the tree of the first snapshot or subvolume that was created. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- btrfs-debug-tree.c | 12 ++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/btrfs-debug-tree.c b/btrfs-debug-tree.c index cb6c106..36e1115 100644 --- a/btrfs-debug-tree.c +++ b/btrfs-debug-tree.c @@ -41,6 +41,8 @@ static int print_usage(void) fprintf(stderr, \t-u : print info of uuid tree only\n); fprintf(stderr, \t-b block_num : print info of the specified block only\n); + fprintf(stderr, + \t-t tree_id : print only the tree with the given id\n); fprintf(stderr, %s\n, BTRFS_BUILD_VERSION); exit(1); } @@ -136,12 +138,13 @@ int main(int ac, char **av) int root_backups = 0; u64 block_only = 0; struct btrfs_root *tree_root_scan; + u64 tree_id = 0; radix_tree_init(); while(1) { int c; - c = getopt(ac, av, deb:rRu); + c = getopt(ac, av, deb:rRut:); if (c 0) break; switch(c) { @@ -164,6 +167,9 @@ int main(int ac, char **av) case 'b': block_only = arg_strtou64(optarg); break; + case 't': + tree_id = arg_strtou64(optarg); + break; default: print_usage(); } @@ -208,7 +214,7 @@ int main(int ac, char **av) goto close_root; } - if (!(extent_only || uuid_tree_only)) { + if (!(extent_only || uuid_tree_only || tree_id)) { if (roots_only) { printf(root tree: %llu level %d\n, (unsigned long long)info-tree_root-node-start, @@ -268,6 +274,8 @@ again: 0); if (!extent_buffer_uptodate(buf)) goto next; + if (tree_id found_key.objectid != tree_id) + goto next; switch(found_key.objectid) { case BTRFS_ROOT_TREE_OBJECTID: -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the 
body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs-progs: receive, allow to continue after errors happen
Due to either bugs in send (kernel) that generate a command against a wrong path for example, or transient errors on the receiving side, we stopped processing the send stream immediately and exited with an error. It's often desirable to continue processing the send stream even if an error happens while processing a single command from the send stream. This change just adds a --max-errors N parameter, whose default value is 1 (preserving current behaviour), that allows to tolerate N errors before stopping. A value of 0 means to never stop no matter how many errors we get into while processing the send stream. Regardless of its value, errors are always printed to stderr when they happen, just like before this change. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- Documentation/btrfs-receive.txt | 3 +++ cmds-receive.c | 24 +++- send-stream.c | 22 ++ send-stream.h | 3 ++- 4 files changed, 42 insertions(+), 10 deletions(-) diff --git a/Documentation/btrfs-receive.txt b/Documentation/btrfs-receive.txt index a67bc66..a87c086 100644 --- a/Documentation/btrfs-receive.txt +++ b/Documentation/btrfs-receive.txt @@ -38,6 +38,9 @@ Use this option to specify a file to use instead. Terminate after receiving an end cmd in the data stream. Without this option, the receiver terminates only if an error is recognized or on EOF. +--max-errors N:: +Terminate as soon as N errors happened while processing commands from the send +stream. Default value is 1. A value of 0 means no limit. 
EXIT STATUS --- diff --git a/cmds-receive.c b/cmds-receive.c index 13db4c9..1aa4e52 100644 --- a/cmds-receive.c +++ b/cmds-receive.c @@ -33,6 +33,7 @@ #include wait.h #include assert.h #include time.h +#include getopt.h #include sys/stat.h #include sys/types.h @@ -954,7 +955,8 @@ static struct btrfs_send_ops send_ops = { .fallocate = process_fallocate, }; -static int do_receive(struct btrfs_receive *r, const char *tomnt, int r_fd) +static int do_receive(struct btrfs_receive *r, const char *tomnt, int r_fd, + u64 max_errors) { int ret; char *dest_dir_full_path; @@ -1006,7 +1008,8 @@ static int do_receive(struct btrfs_receive *r, const char *tomnt, int r_fd) while (!end) { ret = btrfs_read_and_process_send_stream(r_fd, send_ops, r, -r-honor_end_cmd); +r-honor_end_cmd, +max_errors); if (ret 0) goto out; if (ret) @@ -1049,6 +1052,11 @@ out: return ret; } +static const struct option long_opts[] = { + { max-errors, 1, NULL, 'E' }, + { NULL, 0, NULL, 0 } +}; + int cmd_receive(int argc, char **argv) { int c; @@ -1056,7 +1064,7 @@ int cmd_receive(int argc, char **argv) char *fromfile = NULL; struct btrfs_receive r; int receive_fd = fileno(stdin); - + u64 max_errors = 1; int ret; memset(r, 0, sizeof(r)); @@ -1064,7 +1072,7 @@ int cmd_receive(int argc, char **argv) r.write_fd = -1; r.dest_dir_fd = -1; - while ((c = getopt(argc, argv, evf:)) != -1) { + while ((c = getopt_long(argc, argv, evf:, long_opts, NULL)) != -1) { switch (c) { case 'v': g_verbose++; @@ -1075,6 +1083,9 @@ int cmd_receive(int argc, char **argv) case 'e': r.honor_end_cmd = 1; break; + case 'E': + max_errors = arg_strtou64(optarg); + break; case '?': default: fprintf(stderr, ERROR: receive args invalid.\n); @@ -1095,7 +1106,7 @@ int cmd_receive(int argc, char **argv) } } - ret = do_receive(r, tomnt, receive_fd); + ret = do_receive(r, tomnt, receive_fd, max_errors); return !!ret; } @@ -1121,5 +1132,8 @@ const char * const cmd_receive_usage[] = { in the data stream. 
Without this option,, the receiver terminates only if an error, is recognized or on EOF., + --max-errors N Terminate as soon as N errors happened while, +processing commands from the send stream., +Default value is 1. A value of 0 means no limit., NULL }; diff --git a/send-stream.c b/send-stream.c index 812639f..a9acdf5 100644 --- a/send-stream.c +++ b/send-stream.c @@ -452,13 +452,21 @@ out: return ret; } +/* + * If max_errors is 0, then don't stop processing the stream if one of the + * callbacks in btrfs_send_ops structure returns an error. If greater than
[PATCH] xfstests: add test for btrfs send with large xattrs
Verify that btrfs send is able to replicate xattrs larger than PATH_MAX. This is possible if the b+tree leaf size is larger than 4Kb (mkfs.btrfs's default is max(16Kb, PAGE_SIZE) as of btrfs-progs v3.12, and max(4Kb, PAGE_SIZE in older versions). This issue is fixed by the following linux kernel btrfs patch: Btrfs: send, use the right limits for xattr names and values Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- tests/btrfs/053 | 108 tests/btrfs/053.out | 1 + tests/btrfs/group | 1 + 3 files changed, 110 insertions(+) create mode 100755 tests/btrfs/053 create mode 100644 tests/btrfs/053.out diff --git a/tests/btrfs/053 b/tests/btrfs/053 new file mode 100755 index 000..4dbdf59 --- /dev/null +++ b/tests/btrfs/053 @@ -0,0 +1,108 @@ +#! /bin/bash +# FS QA Test No. btrfs/053 +# +# Verify that btrfs send is able to replicate xattrs larger than PATH_MAX. +# This is possible if the b+tree leaf size is larger than 4Kb (mkfs.btrfs's +# default is max(16Kb, PAGE_SIZE) as of btrfs-progs v3.12, and max(4Kb, +# PAGE_SIZE in older versions). +# +# This issue is fixed by the following linux kernel btrfs patch: +# +# Btrfs: send, use the right limits for xattr names and values +# +#--- +# Copyright (c) 2014 Filipe Manana. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq + +tmp=/tmp/$$ +status=1 # failure is the default! +trap _cleanup; exit \$status 0 1 2 3 15 + +_cleanup() +{ +rm -fr $send_files_dir +rm -fr $tmp +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter +. ./common/attr + +# real QA test starts here +_supported_fs btrfs +_supported_os Linux +_require_scratch +_require_fssum +_require_attrs +_need_to_be_root + +# max(16384, PAGE_SIZE) is the default leaf/node size on btrfs-progs v3.12+. +# Older versions just use max(4096, PAGE_SIZE). +# mkfs.btrfs can't create an fs with a leaf/node size smaller than PAGE_SIZE. +leaf_size=$(echo -e 16384\n`getconf PAGE_SIZE` | sort -nr | head -1) + +send_files_dir=$TEST_DIR/btrfs-test-$seq + +rm -f $seqres.full +rm -fr $send_files_dir +mkdir $send_files_dir + +_scratch_mkfs -l $leaf_size /dev/null 21 +_scratch_mount + +echo hello world $SCRATCH_MNT/foobar + +$SETFATTR_PROG -n user.xattr_name_1 -v `$PERL_PROG -e 'print A x 6000;'` \ + $SCRATCH_MNT/foobar + +_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT $SCRATCH_MNT/mysnap1 +run_check $FSSUM_PROG -A -f -w $send_files_dir/1.fssum $SCRATCH_MNT/mysnap1 + +# Update existing xattr value and add a new xattr too. 
+$SETFATTR_PROG -n user.xattr_name_1 -v `$PERL_PROG -e 'print Z x ;'` \ + $SCRATCH_MNT/foobar +$SETFATTR_PROG -n user.xattr_name_2 -v `$PERL_PROG -e 'print U x ;'` \ + $SCRATCH_MNT/foobar + +_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT $SCRATCH_MNT/mysnap2 +run_check $FSSUM_PROG -A -f -w $send_files_dir/2.fssum \ + -x $SCRATCH_MNT/mysnap2/mysnap1 $SCRATCH_MNT/mysnap2 + +_run_btrfs_util_prog send $SCRATCH_MNT/mysnap1 -f $send_files_dir/1.snap +_run_btrfs_util_prog send $SCRATCH_MNT/mysnap2 -f $send_files_dir/2.snap + +_scratch_unmount +_check_scratch_fs + +_scratch_mkfs -l $leaf_size /dev/null 21 +_scratch_mount + +_run_btrfs_util_prog receive $SCRATCH_MNT -f $send_files_dir/1.snap +run_check $FSSUM_PROG -r $send_files_dir/1.fssum $SCRATCH_MNT/mysnap1 + +_run_btrfs_util_prog receive $SCRATCH_MNT -f $send_files_dir/2.snap +run_check $FSSUM_PROG -r $send_files_dir/2.fssum $SCRATCH_MNT/mysnap2 + +_check_scratch_fs + +status=0 +exit diff --git a/tests/btrfs/053.out b/tests/btrfs/053.out new file mode 100644 index 000..4c61638 --- /dev/null +++ b/tests/btrfs/053.out @@ -0,0 +1 @@ +QA output created by 053 diff --git a/tests/btrfs/group b/tests/btrfs/group index 5ff9b8e..ea49c5c 100644 --- a/tests/btrfs/group +++ b/tests/btrfs/group @@ -55,3 +55,4 @@ 050 auto 051 auto quick 052 auto quick +053 auto quick -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: send, use the right limits for xattr names and values
We were limiting the sum of the xattr name and value lengths to PATH_MAX, which is not correct, especially on filesystems created with btrfs-progs v3.12 or higher, where the default leaf size is max(16384, PAGE_SIZE), or systems with page sizes larger than 4096 bytes. Xattrs have their own specific maximum name and value lengths, which depend on the leaf size, therefore use these limits to be able to send xattrs with sizes larger than PATH_MAX. A test case for xfstests follows. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/send.c | 30 +++--- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 70c5e8c..1a65a40 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -995,7 +995,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_dir_item *di; struct btrfs_key di_key; char *buf = NULL; - const int buf_len = PATH_MAX; + int buf_len; u32 name_len; u32 data_len; u32 cur; @@ -1005,6 +1005,11 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, int num; u8 type; + if (found_key-type == BTRFS_XATTR_ITEM_KEY) + buf_len = BTRFS_MAX_XATTR_SIZE(root); + else + buf_len = PATH_MAX; + buf = kmalloc(buf_len, GFP_NOFS); if (!buf) { ret = -ENOMEM; @@ -1026,12 +1031,23 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, type = btrfs_dir_type(eb, di); btrfs_dir_item_key_to_cpu(eb, di, di_key); - /* -* Path too long -*/ - if (name_len + data_len buf_len) { - ret = -ENAMETOOLONG; - goto out; + if (type == BTRFS_FT_XATTR) { + if (name_len XATTR_NAME_MAX) { + ret = -ENAMETOOLONG; + goto out; + } + if (name_len + data_len buf_len) { + ret = -E2BIG; + goto out; + } + } else { + /* +* Path too long +*/ + if (name_len + data_len buf_len) { + ret = -ENAMETOOLONG; + goto out; + } } read_extent_buffer(eb, buf, (unsigned long)(di + 1), -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the 
body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/2] xfstests: add helper require function _require_btrfs_cloner
So that the same check (btrfs cloner program presence) can be reused by other tests. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- common/rc | 7 +++ tests/btrfs/035 | 4 +--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/common/rc b/common/rc index d1788d1..f27ee53 100644 --- a/common/rc +++ b/common/rc @@ -2085,6 +2085,13 @@ _require_fssum() [ -x $FSSUM_PROG ] || _notrun fssum not built } +_require_btrfs_cloner() +{ + CLONER_PROG=$here/src/cloner + [ -x $CLONER_PROG ] || \ + _notrun cloner binary not present at $CLONER_PROG +} + # Given 2 files, verify that they have the same mapping but different # inodes - i.e. an undisturbed reflink # Silent if so, make noise if not diff --git a/tests/btrfs/035 b/tests/btrfs/035 index 6808179..dd303af 100755 --- a/tests/btrfs/035 +++ b/tests/btrfs/035 @@ -45,13 +45,11 @@ trap _cleanup ; exit \$status 0 1 2 3 15 _supported_fs btrfs _supported_os Linux _require_scratch +_require_btrfs_cloner _scratch_mkfs /dev/null 21 _scratch_mount -CLONER_PROG=$here/src/cloner -[ -x $CLONER_PROG ] || _notrun cloner binary not present at $CLONER_PROG - src_str=aa echo -n $src_str $SCRATCH_MNT/src -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3] Btrfs: ensure readers see new data after a clone operation
; } fd2 = open(DST_FILE, O_RDWR); if (fd2 0) { fprintf(stderr, Error open dst file: %s\n, strerror(errno)); return 1; } clone_args.src_fd = fd1; clone_args.src_offset = 0; clone_args.src_length = 4096; clone_args.dest_offset = 0; ret = ioctl(fd2, BTRFS_IOC_CLONE_RANGE, clone_args); assert(ret == 0); close(fd1); close(fd2); pthread_mutex_lock(mutex); clone_done = 1; pthread_mutex_unlock(mutex); ret = pthread_join(reader, NULL); assert(ret == 0); pthread_mutex_lock(mutex); ret = stale_data ? 1 : 0; pthread_mutex_unlock(mutex); return ret; } Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Protect against ongoing writes by locking the target range in the io tree and wait for any existing ordered extents for that range to finish before starting the clone operation. V3: Fixed the locking ranges for the case where the source and target inodes are the same. I was passing an end offset to lock_extent_range when that function expects a range length instead. This resulted in incorrect unlocking, leave some extent states locked forever. This is now tested with a new test case for xfstests. fs/btrfs/ioctl.c | 36 +++- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fba7a00..362720a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3410,15 +3410,41 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, goto out_unlock; } - /* truncate page cache pages from target inode range */ - truncate_inode_pages_range(inode-i_data, destoff, - PAGE_CACHE_ALIGN(destoff + len) - 1); + /* +* Lock the target range too. Right after we replace the file extent +* items in the fs tree (which now point to the cloned data), we might +* have a worker replace them with extent items relative to a write +* operation that was issued before this clone operation (i.e. confront +* with inode.c:btrfs_finish_ordered_io). 
+*/ + if (same_inode) { + u64 lock_start = min_t(u64, off, destoff); + u64 lock_len = max_t(u64, off, destoff) + len - lock_start; - lock_extent_range(src, off, len); + lock_extent_range(src, lock_start, lock_len); + } else { + lock_extent_range(src, off, len); + lock_extent_range(inode, destoff, len); + } ret = btrfs_clone(src, inode, off, olen, len, destoff); - unlock_extent(BTRFS_I(src)-io_tree, off, off + len - 1); + if (same_inode) { + u64 lock_start = min_t(u64, off, destoff); + u64 lock_end = max_t(u64, off, destoff) + len - 1; + + unlock_extent(BTRFS_I(src)-io_tree, lock_start, lock_end); + } else { + unlock_extent(BTRFS_I(src)-io_tree, off, off + len - 1); + unlock_extent(BTRFS_I(inode)-io_tree, destoff, + destoff + len - 1); + } + /* +* Truncate page cache pages so that future reads will see the cloned +* data immediately and not the previous data. +*/ + truncate_inode_pages_range(inode-i_data, destoff, + PAGE_CACHE_ALIGN(destoff + len) - 1); out_unlock: if (!same_inode) { if (inode src) { -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/2] xfstests: add test for btrfs ioctl clone operation
This is a test to verify that the btrfs ioctl clone operation is able to clone extents of a file to different positions of the file, that is, the source and target files are the same. Existing tests only cover the case where the source and target files are different. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- tests/btrfs/052 | 100 tests/btrfs/052.out | 30 tests/btrfs/group | 1 + 3 files changed, 131 insertions(+) create mode 100755 tests/btrfs/052 create mode 100644 tests/btrfs/052.out diff --git a/tests/btrfs/052 b/tests/btrfs/052 new file mode 100755 index 000..292eb50 --- /dev/null +++ b/tests/btrfs/052 @@ -0,0 +1,100 @@ +#! /bin/bash +# FS QA Test No. btrfs/052 +# +# Verify that the btrfs ioctl clone operation can operate on the same +# file as a source and target. That is, clone extents within the same +# file. +# +#--- +# Copyright (c) 2014 Filipe Manana. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq + +tmp=/tmp/$$ +status=1 # failure is the default! +trap _cleanup; exit \$status 0 1 2 3 15 + +_cleanup() +{ +rm -fr $tmp +} + +# get standard environment, filters and checks +. ./common/rc +. 
./common/filter + +# real QA test starts here +_supported_fs btrfs +_supported_os Linux +_require_scratch +_require_btrfs_cloner +_need_to_be_root + +rm -f $seqres.full + +_scratch_mkfs /dev/null 21 +_scratch_mount + +# Create a file with 4 extents of 8Kb each. +$XFS_IO_PROG -f -c pwrite -S 0x01 -b 8192 0 8192 $SCRATCH_MNT/foo \ + | _filter_xfs_io +sync +$XFS_IO_PROG -c pwrite -S 0x02 -b 8192 8192 8192 $SCRATCH_MNT/foo \ + | _filter_xfs_io +sync +$XFS_IO_PROG -c pwrite -S 0x03 -b 8192 16384 8192 $SCRATCH_MNT/foo \ + | _filter_xfs_io +sync +$XFS_IO_PROG -c pwrite -S 0x04 -b 8192 24576 8192 $SCRATCH_MNT/foo \ + | _filter_xfs_io +sync + +# Digest of initial content. +md5sum $SCRATCH_MNT/foo | _filter_scratch + +# Same source and target ranges - must fail. +$CLONER_PROG -s 8192 -d 8192 -l 8192 $SCRATCH_MNT/foo $SCRATCH_MNT/foo +# Check file content didn't change. +md5sum $SCRATCH_MNT/foo | _filter_scratch + +# Intersection between source and target ranges - must fail too. +$CLONER_PROG -s 4096 -d 8192 -l 8192 $SCRATCH_MNT/foo $SCRATCH_MNT/foo +# Check file content didn't change. +md5sum $SCRATCH_MNT/foo | _filter_scratch + +# Clone from a higher range to a lower range. +$CLONER_PROG -s 24576 -d 0 -l 8192 $SCRATCH_MNT/foo $SCRATCH_MNT/foo + +# Check entire file, the 8Kb block at offset 0 now has the same content as the +# 8Kb block at offset 24576. +od -t x1 $SCRATCH_MNT/foo + +# Clone from a lower range to a higher range. +$CLONER_PROG -s 8192 -d 16384 -l 8192 $SCRATCH_MNT/foo $SCRATCH_MNT/foo + +# Check entire file, the 8Kb block at offset 0 now has the same content as the +# 8Kb block at offset 24576, and the 8Kb block at offset 16384 now has the same +# content as the 8Kb block at offset 8192. 
+od -t x1 $SCRATCH_MNT/foo + +_check_scratch_fs + +status=0 +exit diff --git a/tests/btrfs/052.out b/tests/btrfs/052.out new file mode 100644 index 000..0073813 --- /dev/null +++ b/tests/btrfs/052.out @@ -0,0 +1,30 @@ +QA output created by 052 +wrote 8192/8192 bytes at offset 0 +XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 8192/8192 bytes at offset 8192 +XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 8192/8192 bytes at offset 16384 +XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 8192/8192 bytes at offset 24576 +XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +b328fe91ed791d96b3ca6830ef50475f SCRATCH_MNT/foo +clone failed: Invalid argument +b328fe91ed791d96b3ca6830ef50475f SCRATCH_MNT/foo +clone failed: Invalid argument +b328fe91ed791d96b3ca6830ef50475f SCRATCH_MNT/foo +000 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 +* +002 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 +* +004 03 03 03 03 03 03 03 03 03 03 03 03 03 03 03 03 +* +006 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 +* +010 +000 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 +* +002 02
[PATCH] Btrfs: send, fix corrupted path strings for long paths
If a path has more than 230 characters, we allocate a new buffer to use for the path, but we were forgetting to copy the contents of the previous buffer into the new one, which has random content from the kmalloc call. Test: mkfs.btrfs -f /dev/sdd mount /dev/sdd /mnt TEST_PATH=/mnt/fdmanana/.config/google-chrome-mysetup/Default/Pepper_Data/Shockwave_Flash/WritableRoot/#SharedObjects/JSHJ4ZKN/s.wsj.net/[[IMPORT]]/players.edgesuite.net/flash/plugins/osmf/advanced-streaming-plugin/v2.7/osmf1.6/Ak# mkdir -p $TEST_PATH echo hello world $TEST_PATH/amaiAdvancedStreamingPlugin.txt btrfs subvolume snapshot -r /mnt /mnt/mysnap1 btrfs send /mnt/mysnap1 -f /tmp/1.snap A test for xfstests follows. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com Cc: Marc Merlin m...@merlins.org --- fs/btrfs/send.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f6bbc1e..70c5e8c 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -368,10 +368,13 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) /* * First time the inline_buf does not suffice */ - if (p-buf == p-inline_buf) + if (p-buf == p-inline_buf) { tmp_buf = kmalloc(len, GFP_NOFS); - else + if (tmp_buf) + memcpy(tmp_buf, p-buf, old_buf_len); + } else { tmp_buf = krealloc(p-buf, len, GFP_NOFS); + } if (!tmp_buf) return -ENOMEM; p-buf = tmp_buf; -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] xfstests: add test for btrfs send with long paths
Regression test for btrfs send where long paths (exceeding 230 characters) made send produce paths with random characters from a memory buffer returned by kmalloc, as send forgot to populate the new buffer with the path string. This issue is fixed by the following linux kernel btrfs patch: Btrfs: send, fix corrupted path strings for long paths Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- tests/btrfs/051 | 85 + tests/btrfs/051.out | 1 + tests/btrfs/group | 1 + 3 files changed, 87 insertions(+) create mode 100755 tests/btrfs/051 create mode 100644 tests/btrfs/051.out diff --git a/tests/btrfs/051 b/tests/btrfs/051 new file mode 100755 index 000..53df664 --- /dev/null +++ b/tests/btrfs/051 @@ -0,0 +1,85 @@ +#! /bin/bash +# FS QA Test No. btrfs/051 +# +# Regression test for btrfs send where long paths (exceeding 230 characters) +# made send produce paths with random characters from a memory buffer returned +# by kmalloc, as send forgot to populate the new buffer with the path string. +# +# This issue is fixed by the following linux kernel btrfs patch: +# +# Btrfs: send, fix corrupted path strings for long paths +# +#--- +# Copyright (c) 2014 Filipe Manana. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq + +tmp=/tmp/$$ +status=1 # failure is the default! 
+trap _cleanup; exit \$status 0 1 2 3 15 + +_cleanup() +{ +rm -fr $send_files_dir +rm -fr $tmp +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter + +# real QA test starts here +_supported_fs btrfs +_supported_os Linux +_require_scratch +_require_fssum +_need_to_be_root + +send_files_dir=$TEST_DIR/btrfs-test-$seq + +rm -f $seqres.full +rm -fr $send_files_dir +mkdir $send_files_dir + +_scratch_mkfs /dev/null 21 +_scratch_mount + +TEST_PATH=$SCRATCH_MNT/fdmanana/.config/google-chrome-mysetup/Default/Pepper_Data/Shockwave_Flash/WritableRoot/#SharedObjects/JSHJ4ZKN/s.wsj.net/[[IMPORT]]/players.edgesuite.net/flash/plugins/osmf/advanced-streaming-plugin/v2.7/osmf1.6/Ak# + +mkdir -p $TEST_PATH +echo hello world $TEST_PATH/amaiAdvancedStreamingPlugin.txt + +_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT $SCRATCH_MNT/mysnap1 +run_check $FSSUM_PROG -A -f -w $send_files_dir/1.fssum $SCRATCH_MNT/mysnap1 +_run_btrfs_util_prog send $SCRATCH_MNT/mysnap1 -f $send_files_dir/1.snap + +_scratch_unmount +_check_scratch_fs + +_scratch_mkfs /dev/null 21 +_scratch_mount + +_run_btrfs_util_prog receive $SCRATCH_MNT -f $send_files_dir/1.snap +run_check $FSSUM_PROG -r $send_files_dir/1.fssum $SCRATCH_MNT/mysnap1 + +_check_scratch_fs + +status=0 +exit diff --git a/tests/btrfs/051.out b/tests/btrfs/051.out new file mode 100644 index 000..636dcef --- /dev/null +++ b/tests/btrfs/051.out @@ -0,0 +1 @@ +QA output created by 051 diff --git a/tests/btrfs/group b/tests/btrfs/group index 69a80e0..0673449 100644 --- a/tests/btrfs/group +++ b/tests/btrfs/group @@ -53,3 +53,4 @@ 048 auto quick 049 auto quick 050 auto +051 auto quick -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2] Btrfs: send, fix corrupted path strings for long paths
If a path has more than 230 characters, we allocate a new buffer to use for the path, but we were forgetting to copy the contents of the previous buffer into the new one, which has random content from the kmalloc call. Test: mkfs.btrfs -f /dev/sdd mount /dev/sdd /mnt TEST_PATH=/mnt/fdmanana/.config/google-chrome-mysetup/Default/Pepper_Data/Shockwave_Flash/WritableRoot/#SharedObjects/JSHJ4ZKN/s.wsj.net/[[IMPORT]]/players.edgesuite.net/flash/plugins/osmf/advanced-streaming-plugin/v2.7/osmf1.6/Ak# mkdir -p $TEST_PATH echo hello world $TEST_PATH/amaiAdvancedStreamingPlugin.txt btrfs subvolume snapshot -r /mnt /mnt/mysnap1 btrfs send /mnt/mysnap1 -f /tmp/1.snap A test for xfstests follows. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com Cc: Marc Merlin m...@merlins.org --- V2: Fix change title, paths to path. fs/btrfs/send.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f6bbc1e..70c5e8c 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -368,10 +368,13 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) /* * First time the inline_buf does not suffice */ - if (p-buf == p-inline_buf) + if (p-buf == p-inline_buf) { tmp_buf = kmalloc(len, GFP_NOFS); - else + if (tmp_buf) + memcpy(tmp_buf, p-buf, old_buf_len); + } else { tmp_buf = krealloc(p-buf, len, GFP_NOFS); + } if (!tmp_buf) return -ENOMEM; p-buf = tmp_buf; -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2] Btrfs: ensure readers see new data after a clone operation
; } fd2 = open(DST_FILE, O_RDWR); if (fd2 0) { fprintf(stderr, Error open dst file: %s\n, strerror(errno)); return 1; } clone_args.src_fd = fd1; clone_args.src_offset = 0; clone_args.src_length = 4096; clone_args.dest_offset = 0; ret = ioctl(fd2, BTRFS_IOC_CLONE_RANGE, clone_args); assert(ret == 0); close(fd1); close(fd2); pthread_mutex_lock(mutex); clone_done = 1; pthread_mutex_unlock(mutex); ret = pthread_join(reader, NULL); assert(ret == 0); pthread_mutex_lock(mutex); ret = stale_data ? 1 : 0; pthread_mutex_unlock(mutex); return ret; } Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Protect against ongoing writes by locking the target range in the io tree and wait for any existing ordered extents for that range to finish before starting the clone operation. fs/btrfs/ioctl.c | 34 -- 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fba7a00..12a60ef 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3410,15 +3410,37 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, goto out_unlock; } - /* truncate page cache pages from target inode range */ - truncate_inode_pages_range(inode-i_data, destoff, - PAGE_CACHE_ALIGN(destoff + len) - 1); - - lock_extent_range(src, off, len); + /* +* Lock the target range too. Right after we replace the file extent +* items in the fs tree (which now point to the cloned data), we might +* have a worker replace them with extent items relative to a write +* operation that was issued before this clone operation (i.e. confront +* with inode.c:btrfs_finish_ordered_io). 
+*/ + if (same_inode) { + lock_extent_range(src, min_t(u64, off, destoff), + max_t(u64, off + len, destoff + len)); + } else { + lock_extent_range(src, off, len); + lock_extent_range(inode, destoff, len); + } ret = btrfs_clone(src, inode, off, olen, len, destoff); - unlock_extent(BTRFS_I(src)-io_tree, off, off + len - 1); + if (same_inode) { + unlock_extent(BTRFS_I(src)-io_tree, min_t(u64, off, destoff), + max_t(u64, off + len, destoff + len) - 1); + } else { + unlock_extent(BTRFS_I(src)-io_tree, off, off + len - 1); + unlock_extent(BTRFS_I(inode)-io_tree, destoff, + destoff + len - 1); + } + /* +* Truncate page cache pages so that future reads will see the cloned +* data immediately and not the previous data. +*/ + truncate_inode_pages_range(inode-i_data, destoff, + PAGE_CACHE_ALIGN(destoff + len) - 1); out_unlock: if (!same_inode) { if (inode src) { -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: fix leak of block group cache objects
The change titled: Btrfs: fix broken free space cache after the system crashed can increment a block group cache object twice in find_free_extent() and never decrement it twice, resulting in a memory leak. This is easy to reproduce by having kmemleak enabled and the following steps: mkfs.btrfs -f /dev/sdd mount /dev/sdd /mnt umount /mnt rmmod btrfs cat /sys/kernel/debug/kmemleak unreferenced object 0x8802089249d8 (size 512): comm mount, pid 6826, jiffies 430600 (age 3065.636s) hex dump (first 32 bytes): 00 00 c0 01 00 00 00 00 c0 00 00 00 40 00 00 00 @... 00 00 c0 01 00 00 00 00 00 00 01 00 00 00 00 00 backtrace: [816ab3b6] kmemleak_alloc+0x26/0x50 [8119447d] kmem_cache_alloc_trace+0x11d/0x1e0 [a02a369c] btrfs_create_block_group_cache+0x3c/0x160 [btrfs] [a02adf07] btrfs_read_block_groups+0x1d7/0x650 [btrfs] [a02bc800] open_ctree+0x16a0/0x20c0 [btrfs] [a0293321] btrfs_mount+0x6b1/0x980 [btrfs] [811aa9c0] mount_fs+0x20/0xe0 [811c76f3] vfs_kern_mount+0x73/0x170 [811ca046] do_mount+0x206/0xb20 [811cac4e] SyS_mount+0x8e/0xe0 [816c8492] system_call_fastpath+0x16/0x1b [] 0x unreferenced object 0x8802019571d0 (size 128): comm mount, pid 6826, jiffies 430600 (age 3065.684s) hex dump (first 32 bytes): 4d 06 4d 06 ad 4e ad de ff ff ff ff 00 00 00 00 M.M..N.. ff ff ff ff ff ff ff ff 90 0d 36 a0 ff ff ff ff ..6. backtrace: [816ab3b6] kmemleak_alloc+0x26/0x50 [8119447d] kmem_cache_alloc_trace+0x11d/0x1e0 [a02a36be] btrfs_create_block_group_cache+0x5e/0x160 [btrfs] [a02adf07] btrfs_read_block_groups+0x1d7/0x650 [btrfs] [a02bc800] open_ctree+0x16a0/0x20c0 [btrfs] [a0293321] btrfs_mount+0x6b1/0x980 [btrfs] [811aa9c0] mount_fs+0x20/0xe0 [811c76f3] vfs_kern_mount+0x73/0x170 [811ca046] do_mount+0x206/0xb20 [811cac4e] SyS_mount+0x8e/0xe0 [816c8492] system_call_fastpath+0x16/0x1b [] 0x Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- Note: this only affects Chris' integration branch. 
fs/btrfs/extent-tree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index eb0760f..0bad610 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6456,6 +6456,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, } else { index = get_block_group_index(block_group); btrfs_grab_block_group(block_group, delalloc); + /* compensate get by btrfs_grab_block_group() */ + btrfs_put_block_group(block_group); goto have_block_group; } } else if (block_group) { -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: send, fix incorrect ref access when using extrefs
When running send, if an inode only has extended reference items associated to it and no regular references, send.c:get_first_ref() was incorrectly assuming the reference it found was of type BTRFS_INODE_REF_KEY due to use of the wrong key variable. This caused weird behaviour when using the found item as a regular reference, such as a weird path string, and occasionally (when lucky) a crash: [ 190.600652] general protection fault: [#1] SMP DEBUG_PAGEALLOC [ 190.600994] Modules linked in: btrfs xor raid6_pq binfmt_misc nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache sunrpc psmouse serio_raw evbug pcspkr i2c_piix4 e1000 floppy [ 190.602565] CPU: 2 PID: 14520 Comm: btrfs Not tainted 3.13.0-fdm-btrfs-next-26+ #1 [ 190.602728] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 190.602868] task: 8800d447c920 ti: 8801fa79e000 task.ti: 8801fa79e000 [ 190.603030] RIP: 0010:[813266b4] [813266b4] memcpy+0x54/0x110 [ 190.603262] RSP: 0018:8801fa79f880 EFLAGS: 00010202 [ 190.603395] RAX: 8800d4326e3f RBX: 036a RCX: 8800 [ 190.603553] RDX: 032a RSI: ffe708844042936a RDI: 8800d43271a9 [ 190.603710] RBP: 8801fa79f8c8 R08: 003a4ef0 R09: [ 190.603867] R10: 793a4ef09f00 R11: 9f53726f R12: 8800d43271a9 [ 190.604020] R13: 1600 R14: 8802110134f0 R15: 036a [ 190.604020] FS: 7fb423d09b80() GS:88021620() knlGS: [ 190.604020] CS: 0010 DS: ES: CR0: 8005003b [ 190.604020] CR2: 7fb4229d4b78 CR3: 0001f5d76000 CR4: 06e0 [ 190.604020] Stack: [ 190.604020] a01f4d49 8801fa79f8f0 09f9 8801fa79f8c8 [ 190.604020] 09f9 880211013260 f971 88021147dba8 [ 190.604020] 09f9 8801fa79f918 a02367f5 8801fa79f928 [ 190.604020] Call Trace: [ 190.604020] [a01f4d49] ? read_extent_buffer+0xb9/0x120 [btrfs] [ 190.604020] [a02367f5] fs_path_add_from_extent_buffer+0x45/0x60 [btrfs] [ 190.604020] [a0238806] get_first_ref+0x1f6/0x210 [btrfs] [ 190.604020] [a0238994] __get_cur_name_and_parent+0x174/0x3a0 [btrfs] [ 190.604020] [8118df3d] ? kmem_cache_alloc_trace+0x11d/0x1e0 [ 190.604020] [a0236674] ? 
fs_path_alloc+0x24/0x60 [btrfs] [ 190.604020] [a0238c91] get_cur_path+0xd1/0x240 [btrfs] (...) Steps to reproduce (either crash or some weirdness like an odd path string): mkfs.btrfs -f -O extref /dev/sdd mount /dev/sdd /mnt mkdir /mnt/testdir touch /mnt/testdir/foobar for i in `seq 1 2550`; do ln /mnt/testdir/foobar /mnt/testdir/foobar_link_`printf %04d $i` done ln /mnt/testdir/foobar /mnt/testdir/final_foobar_name rm -f /mnt/testdir/foobar for i in `seq 1 2550`; do rm -f /mnt/testdir/foobar_link_`printf %04d $i` done btrfs subvolume snapshot -r /mnt /mnt/mysnap btrfs send /mnt/mysnap -f /tmp/mysnap.send Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 40f353f..0035bdd 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1688,7 +1688,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, goto out; } - if (key.type == BTRFS_INODE_REF_KEY) { + if (found_key.type == BTRFS_INODE_REF_KEY) { struct btrfs_inode_ref *iref; iref = btrfs_item_ptr(path-nodes[0], path-slots[0], struct btrfs_inode_ref); -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] xfstests: btrfs, add regression test for send with extrefs
Regression test for btrfs send when an inode only has extended references associated with it (no regular references present). This used to cause incorrect access to a b+tree leaf, where an extended reference item was accessed as if it were a regular reference item, causing unexpected and unpredictable behaviour such as producing a random/weird path string or a crash. This issue is fixed by the following Linux kernel btrfs patch: Btrfs: send, fix incorrect ref access when using extrefs Cc: Josef Bacik jba...@fb.com Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- tests/btrfs/050 | 109 tests/btrfs/050.out | 1 + tests/btrfs/group | 1 + 3 files changed, 111 insertions(+) create mode 100755 tests/btrfs/050 create mode 100644 tests/btrfs/050.out diff --git a/tests/btrfs/050 b/tests/btrfs/050 new file mode 100755 index 000..6e4bd13 --- /dev/null +++ b/tests/btrfs/050 @@ -0,0 +1,109 @@ +#! /bin/bash +# FS QA Test No. btrfs/050 +# +# Regression for btrfs send when an inode only has extended references +# associated to it (no regular references present). This used to cause +# incorrect access to a b+tree leaf, where an extended reference item +# was accessed as if it were a regular reference item, causing unexpected +# and unpredictable behaviour such as producing a random/weird path string +# or a crash. +# +# This issue is fixed by the following linux kernel btrfs patch: +# +# Btrfs: send, fix incorrect ref access when using extrefs +# +#--- +# Copyright (c) 2014 Filipe Manana. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq + +tmp=/tmp/$$ +status=1 # failure is the default! +trap _cleanup; exit \$status 0 1 2 3 15 + +_cleanup() +{ +rm -fr $send_files_dir +rm -fr $tmp +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter + +# real QA test starts here +_supported_fs btrfs +_supported_os Linux +_require_scratch +_require_fssum +_need_to_be_root + +send_files_dir=$TEST_DIR/btrfs-test-$seq + +rm -f $seqres.full +rm -fr $send_files_dir +mkdir $send_files_dir + +_scratch_mkfs -O extref /dev/null 21 +_scratch_mount + +# 2550 hard links is enough to cause creation of extended references +# even if the leaf/node size is 64Kb (largest possible). +NUM_LINKS=2550 +TEST_PATH=$SCRATCH_MNT/home/john/files/series/qwerty + +mkdir -p $TEST_PATH +touch $TEST_PATH/foobar + +# Create a bunch of hard links for the file, such that at least one +# inode extended reference item is created. +for i in `seq 1 $NUM_LINKS`; do + ln $TEST_PATH/foobar $TEST_PATH/foobar_link_`printf %04d $i` +done + +# The only link we'll have alive at the end. +ln $TEST_PATH/foobar $TEST_PATH/final_foobar_name + +# Now delete all previous hard links (except the last one). This will +# remove the regular inode reference item from the b+tree, and will +# leave only an inode extended reference item, which is the condition +# necessary to trigger the bug. 
+rm -f $TEST_PATH/foobar +for i in `seq 1 $NUM_LINKS`; do + rm -f $TEST_PATH/foobar_link_`printf %04d $i` +done + +_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT $SCRATCH_MNT/mysnap1 +run_check $FSSUM_PROG -A -f -w $send_files_dir/1.fssum $SCRATCH_MNT/mysnap1 +_run_btrfs_util_prog send $SCRATCH_MNT/mysnap1 -f $send_files_dir/1.snap + +_scratch_unmount +_check_scratch_fs + +_scratch_mkfs /dev/null 21 +_scratch_mount + +_run_btrfs_util_prog receive $SCRATCH_MNT -f $send_files_dir/1.snap +run_check $FSSUM_PROG -r $send_files_dir/1.fssum $SCRATCH_MNT/mysnap1 + +_check_scratch_fs + +status=0 +exit diff --git a/tests/btrfs/050.out b/tests/btrfs/050.out new file mode 100644 index 000..37f2cbc --- /dev/null +++ b/tests/btrfs/050.out @@ -0,0 +1 @@ +QA output created by 050 diff --git a/tests/btrfs/group b/tests/btrfs/group index 59b0c98..69a80e0 100644 --- a/tests/btrfs/group +++ b/tests/btrfs/group @@ -52,3 +52,4 @@ 047 auto quick 048 auto quick 049 auto quick +050 auto -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http
[PATCH v3] Btrfs: fix hang on error (such as ENOSPC) when writing extent pages
When running low on available disk space and having several processes doing buffered file IO, I got the following trace in dmesg: [ 4202.720152] INFO: task kworker/u8:1:5450 blocked for more than 120 seconds. [ 4202.720401] Not tainted 3.13.0-fdm-btrfs-next-26+ #1 [ 4202.720596] echo 0 /proc/sys/kernel/hung_task_timeout_secs disables this message. [ 4202.720874] kworker/u8:1D 0001 0 5450 2 0x [ 4202.720904] Workqueue: btrfs-flush_delalloc normal_work_helper [btrfs] [ 4202.720908] 8801f62ddc38 0082 880203ac2490 001d3f40 [ 4202.720913] 8801f62ddfd8 001d3f40 8800c4f0c920 880203ac2490 [ 4202.720918] 001d4a40 88020fe85a40 88020fe85ab8 0001 [ 4202.720922] Call Trace: [ 4202.720931] [816a3cb9] schedule+0x29/0x70 [ 4202.720950] [a01ec48d] btrfs_start_ordered_extent+0x6d/0x110 [btrfs] [ 4202.720956] [8108e620] ? bit_waitqueue+0xc0/0xc0 [ 4202.720972] [a01ec559] btrfs_run_ordered_extent_work+0x29/0x40 [btrfs] [ 4202.720988] [a0201987] normal_work_helper+0x137/0x2c0 [btrfs] [ 4202.720994] [810680e5] process_one_work+0x1f5/0x530 (...) [ 4202.721027] 2 locks held by kworker/u8:1/5450: [ 4202.721028] #0: (%s-%s){..}, at: [81068083] process_one_work+0x193/0x530 [ 4202.721037] #1: ((work-normal_work)){+.+...}, at: [81068083] process_one_work+0x193/0x530 [ 4202.721054] INFO: task btrfs:7891 blocked for more than 120 seconds. [ 4202.721258] Not tainted 3.13.0-fdm-btrfs-next-26+ #1 [ 4202.721444] echo 0 /proc/sys/kernel/hung_task_timeout_secs disables this message. [ 4202.721699] btrfs D 0001 0 7891 7890 0x0001 [ 4202.721704] 88018c2119e8 0086 8800a33d2490 001d3f40 [ 4202.721710] 88018c211fd8 001d3f40 8802144b 8800a33d2490 [ 4202.721714] 8800d8576640 88020fe85bc0 88020fe85bc8 7fff [ 4202.721718] Call Trace: [ 4202.721723] [816a3cb9] schedule+0x29/0x70 [ 4202.721727] [816a2ebc] schedule_timeout+0x1dc/0x270 [ 4202.721732] [8109bd79] ? mark_held_locks+0xb9/0x140 [ 4202.721736] [816a90c0] ? _raw_spin_unlock_irq+0x30/0x40 [ 4202.721740] [8109bf0d] ? 
trace_hardirqs_on_caller+0x10d/0x1d0 [ 4202.721744] [816a488f] wait_for_completion+0xdf/0x120 [ 4202.721749] [8107fa90] ? try_to_wake_up+0x310/0x310 [ 4202.721765] [a01ebee4] btrfs_wait_ordered_extents+0x1f4/0x280 [btrfs] [ 4202.721781] [a020526e] btrfs_mksubvol.isra.62+0x30e/0x5a0 [btrfs] [ 4202.721786] [8108e620] ? bit_waitqueue+0xc0/0xc0 [ 4202.721799] [a02056a9] btrfs_ioctl_snap_create_transid+0x1a9/0x1b0 [btrfs] [ 4202.721813] [a020583a] btrfs_ioctl_snap_create_v2+0x10a/0x170 [btrfs] (...) It turns out that extent_io.c:__extent_writepage(), which ends up being called through filemap_fdatawrite_range() in btrfs_start_ordered_extent(), was getting -ENOSPC when calling the fill_delalloc callback. In this situation, it returned without the writepage_end_io_hook callback (inode.c:btrfs_writepage_end_io_hook) ever being called for the respective page, which prevents the ordered extent's bytes_left count from ever reaching 0, and therefore a finish_ordered_fn work is never queued into the endio_write_workers queue. This makes the task that called btrfs_start_ordered_extent() hang forever on the wait queue of the ordered extent. This is fairly easy to reproduce using a small filesystem and fsstress on a quad core vm: mkfs.btrfs -f -b `expr 2100 \* 1024 \* 1024` /dev/sdd mount /dev/sdd /mnt fsstress -p 6 -d /mnt -n 10 -x \ btrfs subvolume snapshot -r /mnt /mnt/mysnap \ -f allocsp=0 \ -f bulkstat=0 \ -f bulkstat1=0 \ -f chown=0 \ -f creat=1 \ -f dread=0 \ -f dwrite=0 \ -f fallocate=1 \ -f fdatasync=0 \ -f fiemap=0 \ -f freesp=0 \ -f fsync=0 \ -f getattr=0 \ -f getdents=0 \ -f link=0 \ -f mkdir=0 \ -f mknod=0 \ -f punch=1 \ -f read=0 \ -f readlink=0 \ -f rename=0 \ -f resvsp=0 \ -f rmdir=0 \ -f setxattr=0 \ -f stat=0 \ -f symlink=0 \ -f sync=0 \ -f truncate=1 \ -f unlink=0 \ -f unresvsp=0 \ -f write=4 So just ensure that if an error happens while writing the extent page we call the writepage_end_io_hook callback. 
Also make it return the error code and ensure the caller (extent_write_cache_pages) processes all pages in the page vector even if an error happens only for some of them, so that ordered extents end up released. Signed-off-by: Filipe David Borba Manana
[PATCH] Btrfs: fix hang on error (such as ENOSPC) when writing extent pages
When running low on available disk space and having several processes doing buffered file IO, I got the following trace in dmesg: [ 4202.720152] INFO: task kworker/u8:1:5450 blocked for more than 120 seconds. [ 4202.720401] Not tainted 3.13.0-fdm-btrfs-next-26+ #1 [ 4202.720596] echo 0 /proc/sys/kernel/hung_task_timeout_secs disables this message. [ 4202.720874] kworker/u8:1D 0001 0 5450 2 0x [ 4202.720904] Workqueue: btrfs-flush_delalloc normal_work_helper [btrfs] [ 4202.720908] 8801f62ddc38 0082 880203ac2490 001d3f40 [ 4202.720913] 8801f62ddfd8 001d3f40 8800c4f0c920 880203ac2490 [ 4202.720918] 001d4a40 88020fe85a40 88020fe85ab8 0001 [ 4202.720922] Call Trace: [ 4202.720931] [816a3cb9] schedule+0x29/0x70 [ 4202.720950] [a01ec48d] btrfs_start_ordered_extent+0x6d/0x110 [btrfs] [ 4202.720956] [8108e620] ? bit_waitqueue+0xc0/0xc0 [ 4202.720972] [a01ec559] btrfs_run_ordered_extent_work+0x29/0x40 [btrfs] [ 4202.720988] [a0201987] normal_work_helper+0x137/0x2c0 [btrfs] [ 4202.720994] [810680e5] process_one_work+0x1f5/0x530 (...) [ 4202.721027] 2 locks held by kworker/u8:1/5450: [ 4202.721028] #0: (%s-%s){..}, at: [81068083] process_one_work+0x193/0x530 [ 4202.721037] #1: ((work-normal_work)){+.+...}, at: [81068083] process_one_work+0x193/0x530 [ 4202.721054] INFO: task btrfs:7891 blocked for more than 120 seconds. [ 4202.721258] Not tainted 3.13.0-fdm-btrfs-next-26+ #1 [ 4202.721444] echo 0 /proc/sys/kernel/hung_task_timeout_secs disables this message. [ 4202.721699] btrfs D 0001 0 7891 7890 0x0001 [ 4202.721704] 88018c2119e8 0086 8800a33d2490 001d3f40 [ 4202.721710] 88018c211fd8 001d3f40 8802144b 8800a33d2490 [ 4202.721714] 8800d8576640 88020fe85bc0 88020fe85bc8 7fff [ 4202.721718] Call Trace: [ 4202.721723] [816a3cb9] schedule+0x29/0x70 [ 4202.721727] [816a2ebc] schedule_timeout+0x1dc/0x270 [ 4202.721732] [8109bd79] ? mark_held_locks+0xb9/0x140 [ 4202.721736] [816a90c0] ? _raw_spin_unlock_irq+0x30/0x40 [ 4202.721740] [8109bf0d] ? 
trace_hardirqs_on_caller+0x10d/0x1d0 [ 4202.721744] [816a488f] wait_for_completion+0xdf/0x120 [ 4202.721749] [8107fa90] ? try_to_wake_up+0x310/0x310 [ 4202.721765] [a01ebee4] btrfs_wait_ordered_extents+0x1f4/0x280 [btrfs] [ 4202.721781] [a020526e] btrfs_mksubvol.isra.62+0x30e/0x5a0 [btrfs] [ 4202.721786] [8108e620] ? bit_waitqueue+0xc0/0xc0 [ 4202.721799] [a02056a9] btrfs_ioctl_snap_create_transid+0x1a9/0x1b0 [btrfs] [ 4202.721813] [a020583a] btrfs_ioctl_snap_create_v2+0x10a/0x170 [btrfs] (...) It turns out that extent_io.c:__extent_writepage(), which ends up being called through filemap_fdatawrite_range() in btrfs_start_ordered_extent(), was getting -ENOSPC when calling the fill_delalloc callback. In this situation, it returned without the writepage_end_io_hook callback (inode.c:btrfs_writepage_end_io_hook) ever being called for the respective page, which prevents the ordered extent's bytes_left count from ever reaching 0, and therefore a finish_ordered_fn work is never queued into the endio_write_workers queue. This makes the task that called btrfs_start_ordered_extent() hang forever on the wait queue of the ordered extent. 
This is fairly easy to reproduce using a small filesystem and fsstress on a quad core vm: mkfs.btrfs -f -b `expr 2100 \* 1024 \* 1024` /dev/sdd mount /dev/sdd /mnt fsstress -p 6 -d /mnt -n 10 -x \ btrfs subvolume snapshot -r /mnt /mnt/mysnap \ -f allocsp=0 \ -f bulkstat=0 \ -f bulkstat1=0 \ -f chown=0 \ -f creat=1 \ -f dread=0 \ -f dwrite=0 \ -f fallocate=1 \ -f fdatasync=0 \ -f fiemap=0 \ -f freesp=0 \ -f fsync=0 \ -f getattr=0 \ -f getdents=0 \ -f link=0 \ -f mkdir=0 \ -f mknod=0 \ -f punch=1 \ -f read=0 \ -f readlink=0 \ -f rename=0 \ -f resvsp=0 \ -f rmdir=0 \ -f setxattr=0 \ -f stat=0 \ -f symlink=0 \ -f sync=0 \ -f truncate=1 \ -f unlink=0 \ -f unresvsp=0 \ -f write=4 Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/extent_io.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0c43896..b5a097f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3374,9 +3374,13 @@ done: done_unlocked: + if (PageError(page
[PATCH v2] Btrfs: fix hang on error (such as ENOSPC) when writing extent pages
When running low on available disk space and having several processes doing buffered file IO, I got the following trace in dmesg: [ 4202.720152] INFO: task kworker/u8:1:5450 blocked for more than 120 seconds. [ 4202.720401] Not tainted 3.13.0-fdm-btrfs-next-26+ #1 [ 4202.720596] echo 0 /proc/sys/kernel/hung_task_timeout_secs disables this message. [ 4202.720874] kworker/u8:1D 0001 0 5450 2 0x [ 4202.720904] Workqueue: btrfs-flush_delalloc normal_work_helper [btrfs] [ 4202.720908] 8801f62ddc38 0082 880203ac2490 001d3f40 [ 4202.720913] 8801f62ddfd8 001d3f40 8800c4f0c920 880203ac2490 [ 4202.720918] 001d4a40 88020fe85a40 88020fe85ab8 0001 [ 4202.720922] Call Trace: [ 4202.720931] [816a3cb9] schedule+0x29/0x70 [ 4202.720950] [a01ec48d] btrfs_start_ordered_extent+0x6d/0x110 [btrfs] [ 4202.720956] [8108e620] ? bit_waitqueue+0xc0/0xc0 [ 4202.720972] [a01ec559] btrfs_run_ordered_extent_work+0x29/0x40 [btrfs] [ 4202.720988] [a0201987] normal_work_helper+0x137/0x2c0 [btrfs] [ 4202.720994] [810680e5] process_one_work+0x1f5/0x530 (...) [ 4202.721027] 2 locks held by kworker/u8:1/5450: [ 4202.721028] #0: (%s-%s){..}, at: [81068083] process_one_work+0x193/0x530 [ 4202.721037] #1: ((work-normal_work)){+.+...}, at: [81068083] process_one_work+0x193/0x530 [ 4202.721054] INFO: task btrfs:7891 blocked for more than 120 seconds. [ 4202.721258] Not tainted 3.13.0-fdm-btrfs-next-26+ #1 [ 4202.721444] echo 0 /proc/sys/kernel/hung_task_timeout_secs disables this message. [ 4202.721699] btrfs D 0001 0 7891 7890 0x0001 [ 4202.721704] 88018c2119e8 0086 8800a33d2490 001d3f40 [ 4202.721710] 88018c211fd8 001d3f40 8802144b 8800a33d2490 [ 4202.721714] 8800d8576640 88020fe85bc0 88020fe85bc8 7fff [ 4202.721718] Call Trace: [ 4202.721723] [816a3cb9] schedule+0x29/0x70 [ 4202.721727] [816a2ebc] schedule_timeout+0x1dc/0x270 [ 4202.721732] [8109bd79] ? mark_held_locks+0xb9/0x140 [ 4202.721736] [816a90c0] ? _raw_spin_unlock_irq+0x30/0x40 [ 4202.721740] [8109bf0d] ? 
trace_hardirqs_on_caller+0x10d/0x1d0 [ 4202.721744] [816a488f] wait_for_completion+0xdf/0x120 [ 4202.721749] [8107fa90] ? try_to_wake_up+0x310/0x310 [ 4202.721765] [a01ebee4] btrfs_wait_ordered_extents+0x1f4/0x280 [btrfs] [ 4202.721781] [a020526e] btrfs_mksubvol.isra.62+0x30e/0x5a0 [btrfs] [ 4202.721786] [8108e620] ? bit_waitqueue+0xc0/0xc0 [ 4202.721799] [a02056a9] btrfs_ioctl_snap_create_transid+0x1a9/0x1b0 [btrfs] [ 4202.721813] [a020583a] btrfs_ioctl_snap_create_v2+0x10a/0x170 [btrfs] (...) It turns out that extent_io.c:__extent_writepage(), which ends up being called through filemap_fdatawrite_range() in btrfs_start_ordered_extent(), was getting -ENOSPC when calling the fill_delalloc callback. In this situation, it returned without the writepage_end_io_hook callback (inode.c:btrfs_writepage_end_io_hook) ever being called for the respective page, which prevents the ordered extent's bytes_left count from ever reaching 0, and therefore a finish_ordered_fn work is never queued into the endio_write_workers queue. This makes the task that called btrfs_start_ordered_extent() hang forever on the wait queue of the ordered extent. 
This is fairly easy to reproduce using a small filesystem and fsstress on a quad core vm: mkfs.btrfs -f -b `expr 2100 \* 1024 \* 1024` /dev/sdd mount /dev/sdd /mnt fsstress -p 6 -d /mnt -n 10 -x \ btrfs subvolume snapshot -r /mnt /mnt/mysnap \ -f allocsp=0 \ -f bulkstat=0 \ -f bulkstat1=0 \ -f chown=0 \ -f creat=1 \ -f dread=0 \ -f dwrite=0 \ -f fallocate=1 \ -f fdatasync=0 \ -f fiemap=0 \ -f freesp=0 \ -f fsync=0 \ -f getattr=0 \ -f getdents=0 \ -f link=0 \ -f mkdir=0 \ -f mknod=0 \ -f punch=1 \ -f read=0 \ -f readlink=0 \ -f rename=0 \ -f resvsp=0 \ -f rmdir=0 \ -f setxattr=0 \ -f stat=0 \ -f symlink=0 \ -f sync=0 \ -f truncate=1 \ -f unlink=0 \ -f unresvsp=0 \ -f write=4 Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Grab error from `em` pointer if available, do the error check and end_extent_writepage call before unlocking the page (just like end_bio_extent_writepage does). fs/btrfs/extent_io.c | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs
[PATCH] Btrfs-progs: check, fix csum check in the presence of non-inlined refs
When we have non-inlined extent references, we were failing to find the corresponding extent item for an existing csum item in the csum tree. Reproducer: mkfs.btrfs -f /dev/sdd mount /dev/sdd /mnt xfs_io -f -c falloc 780366 135302 /mnt/foo xfs_io -c falloc 327680 151552 /mnt/foo xfs_io -c pwrite -S 0xff -b 131072 0 131072 /mnt/foo sync for i in `seq 1 40`; do btrfs subvolume snapshot /mnt /mnt/snap$i ; done umount /mnt btrfs check /dev/sdd The check command exited with status 1 and the following output: Checking filesystem on /dev/sdd UUID: 2416ab5f-9d71-457e-bb13-a27d4f6b399a checking extents checking free space cache checking fs roots checking csums There are no extents for csum range 12980224-12984320 Csum exists for 12980224-12984320 but there is no extent record found 1388544 bytes used err is 1 total csum bytes: 132 total tree bytes: 704512 total fs tree bytes: 573440 total extent tree bytes: 16384 btree space waste bytes: 564479 file data blocks allocated: 19341312 referenced 14606336 Btrfs v3.14.1-94-g80597e7 After this change it no longer erroneously reports a missing extent for the csum item and exits with a status of 0. Also added missing btrfs_prev_leaf() return value checks, as we were ignoring errors and non-existence of left siblings completely. 
Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- cmds-check.c | 38 +++--- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/cmds-check.c b/cmds-check.c index 103efc5..18612c8 100644 --- a/cmds-check.c +++ b/cmds-check.c @@ -3650,8 +3650,7 @@ static int check_extent_exists(struct btrfs_root *root, u64 bytenr, key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = 0; - + key.offset = (u64)-1; again: ret = btrfs_search_slot(NULL, root-fs_info-extent_root, key, path, @@ -3661,10 +3660,17 @@ again: btrfs_free_path(path); return ret; } else if (ret) { - if (path-slots[0]) + if (path-slots[0] 0) { path-slots[0]--; - else - btrfs_prev_leaf(root, path); + } else { + ret = btrfs_prev_leaf(root, path); + if (ret 0) { + goto out; + } else if (ret 0) { + ret = 0; + goto out; + } + } } btrfs_item_key_to_cpu(path-nodes[0], key, path-slots[0]); @@ -3674,13 +3680,22 @@ again: * bytenr, so walk back one more just in case. Dear future traveler, * first congrats on mastering time travel. Now if it's not too much * trouble could you go back to 2006 and tell Chris to make the -* BLOCK_GROUP_ITEM_KEY lower than the EXTENT_ITEM_KEY please? +* BLOCK_GROUP_ITEM_KEY (and BTRFS_*_REF_KEY) lower than the +* EXTENT_ITEM_KEY please? 
*/ - if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { - if (path-slots[0]) + while (key.type BTRFS_EXTENT_ITEM_KEY) { + if (path-slots[0] 0) { path-slots[0]--; - else - btrfs_prev_leaf(root, path); + } else { + ret = btrfs_prev_leaf(root, path); + if (ret 0) { + goto out; + } else if (ret 0) { + ret = 0; + goto out; + } + } + btrfs_item_key_to_cpu(path-nodes[0], key, path-slots[0]); } while (num_bytes) { @@ -3752,7 +3767,8 @@ again: } ret = 0; - if (num_bytes) { +out: + if (num_bytes !ret) { fprintf(stderr, There are no extents for csum range %Lu-%Lu\n, bytenr, bytenr+num_bytes); ret = 1; -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2] Btrfs: fix leaf corruption caused by ENOSPC while hole punching
my other patch to perform the leaf sanity checks once a leaf is marked as dirty (if the integrity checker is enabled), it would have been much harder to debug this issue. This change might fix a few similar issues reported by users in the mailing list regarding assertion failures in btrfs_set_item_key_safe calls performed by __btrfs_drop_extents, such as the following report: http://comments.gmane.org/gmane.comp.file-systems.btrfs/32938 Asking fill_holes() to create a 0 bytes wide file hole item also produced the first warning in the trace above, as we passed a range to btrfs_drop_extent_cache that has an end smaller (by -1) than its start. On 3.14 kernels this issue manifests itself through leaf corruption, as we get duplicated file extent item keys in a leaf when calling setup_items_for_insert(), but on older kernels, setup_items_for_insert() isn't called by __btrfs_drop_extents(), instead we have callers of __btrfs_drop_extents(), namely the functions inode.c:insert_inline_extent() and inode.c:insert_reserved_file_extent(), calling btrfs_insert_empty_item() to insert the new file extent item, which would fail with error -EEXIST, instead of inserting a duplicated key - which is still a serious issue as it would make all similar file extent item replace operations keep failing if they target the same file range. Cc: sta...@vger.kernel.org Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Updated commit message to mention difference between 3.14 kernels and older releases and cc'ed stable. Made the logic in __btrfs_drop_extents simpler and made it remove any 0 bytes file extent item within the target range, and not only extent items that have an offset matching search_start. 
fs/btrfs/file.c | 20 +++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 49e5fbf..7c3c84f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -785,6 +785,18 @@ next_slot: extent_end = search_start; } + /* +* Don't skip extent items representing 0 byte lengths. They +* used to be created (bug) if while punching holes we hit +* -ENOSPC condition. So if we find one here, just ensure we +* delete it, otherwise we would insert a new file extent item +* with the same key (offset) as that 0 bytes length file +* extent item in the call to setup_items_for_insert() later +* in this function. +*/ + if (extent_end == key.offset extent_end = search_start) + goto delete_extent_item; + if (extent_end = search_start) { path-slots[0]++; goto next_slot; @@ -898,6 +910,7 @@ next_slot: *| -- extent -- | */ if (start = key.offset end = extent_end) { +delete_extent_item: if (del_nr == 0) { del_slot = path-slots[0]; del_nr = 1; @@ -2353,7 +2366,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } trans-block_rsv = root-fs_info-trans_block_rsv; - if (cur_offset ino_size) { + /* +* Don't insert file hole extent item if it's for a range beyond eof +* (because it's useless) or if it represents a 0 bytes range (when +* cur_offset == drop_end). +*/ + if (cur_offset ino_size cur_offset drop_end) { ret = fill_holes(trans, inode, path, cur_offset, drop_end); if (ret) { err = ret; -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2] Btrfs: implement inode_operations callback tmpfile
This implements the tmpfile callback of struct inode_operations, introduced in the linux kernel 3.11, and implemented already by some filesystems. This callback is invoked by the VFS when the flag O_TMPFILE is passed to the open system call. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Updated change log and comment about how many metadata units are needed for the transaction. Left the ACL inheritance in the callback (like ext4 does) since the thread in linux-fsdevel seems to have ended with the conclusion that this is the right behaviour (as Andreas Gruenbacher says). fs/btrfs/inode.c | 118 +-- 1 file changed, 98 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0c0bb45..b5397db 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5549,6 +5549,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, struct btrfs_inode_ref *ref; struct btrfs_key key[2]; u32 sizes[2]; + int nitems = name ? 2 : 1; unsigned long ptr; int ret; @@ -5568,7 +5569,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, */ inode-i_ino = objectid; - if (dir) { + if (dir name) { trace_btrfs_inode_request(dir); ret = btrfs_set_inode_index(dir, index); @@ -5577,6 +5578,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, iput(inode); return ERR_PTR(ret); } + } else if (dir) { + *index = 0; } /* * index_cnt is ignored for everything but a dir, @@ -5601,21 +5604,24 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_set_key_type(key[0], BTRFS_INODE_ITEM_KEY); key[0].offset = 0; - /* -* Start new inodes with an inode_ref. This is slightly more -* efficient for small numbers of hard links since they will -* be packed into one item. Extended refs will kick in if we -* add more hard links than can fit in the ref item. 
-*/ - key[1].objectid = objectid; - btrfs_set_key_type(key[1], BTRFS_INODE_REF_KEY); - key[1].offset = ref_objectid; - sizes[0] = sizeof(struct btrfs_inode_item); - sizes[1] = name_len + sizeof(*ref); + + if (name) { + /* +* Start new inodes with an inode_ref. This is slightly more +* efficient for small numbers of hard links since they will +* be packed into one item. Extended refs will kick in if we +* add more hard links than can fit in the ref item. +*/ + key[1].objectid = objectid; + btrfs_set_key_type(key[1], BTRFS_INODE_REF_KEY); + key[1].offset = ref_objectid; + + sizes[1] = name_len + sizeof(*ref); + } path-leave_spinning = 1; - ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); + ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems); if (ret != 0) goto fail; @@ -5628,12 +5634,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, sizeof(*inode_item)); fill_inode_item(trans, path-nodes[0], inode_item, inode); - ref = btrfs_item_ptr(path-nodes[0], path-slots[0] + 1, -struct btrfs_inode_ref); - btrfs_set_inode_ref_name_len(path-nodes[0], ref, name_len); - btrfs_set_inode_ref_index(path-nodes[0], ref, *index); - ptr = (unsigned long)(ref + 1); - write_extent_buffer(path-nodes[0], name, ptr, name_len); + if (name) { + ref = btrfs_item_ptr(path-nodes[0], path-slots[0] + 1, +struct btrfs_inode_ref); + btrfs_set_inode_ref_name_len(path-nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path-nodes[0], ref, *index); + ptr = (unsigned long)(ref + 1); + write_extent_buffer(path-nodes[0], name, ptr, name_len); + } btrfs_mark_buffer_dirty(path-nodes[0]); btrfs_free_path(path); @@ -5669,7 +5677,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, return inode; fail: - if (dir) + if (dir name) BTRFS_I(dir)-index_cnt--; btrfs_free_path(path); iput(inode); @@ -5954,6 +5962,15 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, err = btrfs_update_inode(trans, root, inode); if 
(err) goto fail; + if (inode-i_nlink == 1) { + /* +* If new hard link count is 1, it's a file created
[PATCH] Btrfs: fix leaf corruption caused by ENOSPC while hole punching
my other patch to perform the leaf sanity checks once a leaf is marked as dirty (if the integrity checker is enabled), it would have been much harder to debug this issue. This change might fix a few similar issues reported by users in the mailing list regarding assertion failures in btrfs_set_item_key_safe calls performed by __btrfs_drop_extents, such as the following report: http://comments.gmane.org/gmane.comp.file-systems.btrfs/32938 Asking fill_holes() to create a 0 bytes wide file hole item also produced the first warning in the trace above, as we passed a range to btrfs_drop_extent_cache that has an end smaller (by -1) than its start. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/file.c | 19 +-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 49e5fbf..cac902a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -785,7 +785,17 @@ next_slot: extent_end = search_start; } - if (extent_end <= search_start) { + /* +* Don't skip extent items representing 0 byte lengths. They +* used to be created (bug) if while punching holes we hit +* -ENOSPC condition. So if we find one here, just ensure we +* delete it, otherwise we would insert a new file extent item +* with the same key (offset) as that 0 bytes length file +* extent item in the call to setup_items_for_insert() later +* in this function. +*/ + if (extent_end <= search_start && + !(extent_end == key.offset && extent_end == search_start)) { path->slots[0]++; goto next_slot; } @@ -2353,7 +2363,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } trans->block_rsv = &root->fs_info->trans_block_rsv; - if (cur_offset < ino_size) { + /* +* Don't insert file hole extent item if it's for a range beyond eof +* (because it's useless) or if it represents a 0 bytes range (when +* cur_offset == drop_end). 
+*/ + if (cur_offset < ino_size && cur_offset < drop_end) { ret = fill_holes(trans, inode, path, cur_offset, drop_end); if (ret) { err = ret; -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: read inode size after acquiring the mutex when punching a hole
In a previous change, commit 12870f1c9b2de7d475d22e73fd7db1b418599725, I accidentally moved the roundup of inode->i_size to outside of the critical section delimited by the inode mutex, which is not atomic and not correct since the size can be changed by another task before we acquire the mutex. Therefore fix it. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 23f6a9d..efaad37 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2192,13 +2192,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) bool same_page = ((offset >> PAGE_CACHE_SHIFT) == ((offset + len - 1) >> PAGE_CACHE_SHIFT)); bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); - u64 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); + u64 ino_size; ret = btrfs_wait_ordered_range(inode, offset, len); if (ret) return ret; mutex_lock(&inode->i_mutex); + ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); /* * We needn't truncate any page which is beyond the end of the file * because we are sure there is no data there. -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: correctly set profile flags on seqlock retry
If we had to retry on the profiles seqlock (due to a concurrent write), we would set bits on the input flags that corresponded both to the current profile and to previous values of the profile. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/extent-tree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 678cb35..5590af9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3543,11 +3543,13 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) return extended_to_chunk(flags | tmp); } -static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) +static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags) { unsigned seq; + u64 flags; do { + flags = orig_flags; seq = read_seqbegin(&root->fs_info->profiles_lock); if (flags & BTRFS_BLOCK_GROUP_DATA) -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: use correct key when repeating search for extent item
If skinny metadata is enabled and our first tree search fails to find a skinny extent item, we may repeat a tree search for a fat extent item (if the previous item in the leaf is not the fat extent we're looking for). However we were not setting the new key's objectid to the right value, as we previously used the same key variable to peek at the previous item in the leaf, which has a different objectid. So just set the right objectid to avoid modifying/deleting a wrong item if we repeat the tree search. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/extent-tree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1306487..678cb35 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1542,6 +1542,7 @@ again: ret = 0; } if (ret) { + key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; btrfs_release_path(path); @@ -5719,6 +5720,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (ret > 0 && skinny_metadata) { skinny_metadata = false; + key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; btrfs_release_path(path); -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/6 v3] Btrfs: send, bump stream version
This increases the send stream version from version 1 to version 2, adding new commands: 1) total data size - used to tell the receiver how much file data the stream will add or update; 2) fallocate - used to pre-allocate space for files and to punch holes in files; 3) inode set flags; 4) set inode otime. This is preparation work for subsequent changes that implement the new features. A version 2 stream is only produced if the send ioctl caller passes in one of the new flags (BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE | BTRFS_SEND_FLAG_STREAM_V2), meaning old clients are unaffected. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: A v2 stream is now only produced if the send ioctl caller passes in one of the new flags (BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE | BTRFS_SEND_FLAG_SUPPORT_FALLOCATE) to avoid breaking old clients. V3: Removed BTRFS_SEND_FLAG_SUPPORT_FALLOCATE and added BTRFS_SEND_FLAG_STREAM_V2, added commands for inode set flags and otime. fs/btrfs/send.c| 7 ++- fs/btrfs/send.h| 21 - include/uapi/linux/btrfs.h | 21 - 3 files changed, 46 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 289e9f3..7b4b0c3 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -632,7 +632,10 @@ static int send_header(struct send_ctx *sctx) struct btrfs_stream_header hdr; strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); - hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION); + if (sctx-flags BTRFS_SEND_FLAG_STREAM_V2) + hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION_2); + else + hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION_1); return write_buf(sctx-send_filp, hdr, sizeof(hdr), sctx-send_off); @@ -5554,6 +5557,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) INIT_LIST_HEAD(sctx-name_cache_list); sctx-flags = arg-flags; + if (sctx-flags BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE) + sctx-flags |= BTRFS_SEND_FLAG_STREAM_V2; sctx-send_filp = fget(arg-send_fd); if (!sctx-send_filp) { diff --git a/fs/btrfs/send.h 
b/fs/btrfs/send.h index 48d425a..96f583c 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -20,7 +20,8 @@ #include ctree.h #define BTRFS_SEND_STREAM_MAGIC btrfs-stream -#define BTRFS_SEND_STREAM_VERSION 1 +#define BTRFS_SEND_STREAM_VERSION_1 1 +#define BTRFS_SEND_STREAM_VERSION_2 2 #define BTRFS_SEND_BUF_SIZE (1024 * 64) #define BTRFS_SEND_READ_SIZE (1024 * 48) @@ -87,6 +88,15 @@ enum btrfs_send_cmd { BTRFS_SEND_C_END, BTRFS_SEND_C_UPDATE_EXTENT, + + /* +* The following commands were added in stream version 2. +*/ + BTRFS_SEND_C_TOTAL_DATA_SIZE, + BTRFS_SEND_C_FALLOCATE, + BTRFS_SEND_C_INODE_SET_FLAGS, + BTRFS_SEND_C_UTIMES2, /* Same as UTIMES, but it includes OTIME too. */ + __BTRFS_SEND_C_MAX, }; #define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1) @@ -125,10 +135,19 @@ enum { BTRFS_SEND_A_CLONE_OFFSET, BTRFS_SEND_A_CLONE_LEN, + /* +* The following attributes were added in stream version 2. +*/ + BTRFS_SEND_A_FALLOCATE_FLAGS, /* 32 bits */ + BTRFS_SEND_A_INODE_FLAGS, /* 32 bits */ + __BTRFS_SEND_A_MAX, }; #define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1) +#define BTRFS_SEND_A_FALLOCATE_FLAG_KEEP_SIZE (1 0) +#define BTRFS_SEND_A_FALLOCATE_FLAG_PUNCH_HOLE (1 1) + #ifdef __KERNEL__ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg); #endif diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index b4d6909..8ab2761 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -464,10 +464,29 @@ struct btrfs_ioctl_received_subvol_args { */ #define BTRFS_SEND_FLAG_OMIT_END_CMD 0x4 +/* + * Calculate the amount (in bytes) of new file data between the send and + * parent snapshots, or in case of a full send, the total amount of file data + * we will send. + * This corresponds to the sum of the data lengths of each write, clone and + * fallocate commands that are sent through the send stream. The receiving end + * can use this information to compute progress. 
+ * + * Added in send stream version 2, and implies producing a version 2 stream. + */ +#define BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE0x8 + +/* + * Used by a client to request a version 2 of the send stream. + */ +#define BTRFS_SEND_FLAG_STREAM_V2 0x10 + #define BTRFS_SEND_FLAG_MASK \ (BTRFS_SEND_FLAG_NO_FILE_DATA | \ BTRFS_SEND_FLAG_OMIT_STREAM_HEADER | \ -BTRFS_SEND_FLAG_OMIT_END_CMD) +BTRFS_SEND_FLAG_OMIT_END_CMD | \ +BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE | \ +BTRFS_SEND_FLAG_STREAM_V2) struct btrfs_ioctl_send_args { __s64 send_fd
[PATCH 2/6 v3] Btrfs: send, implement total data size command to allow for progress estimation
This new send flag makes send calculate first the amount of new file data (in bytes) the send root has relative to the parent root, or for the case of a non-incremental send, the total amount of file data the stream will create (including holes and prealloc extents). In other words, it computes the sum of the lengths of all write, clone and fallocate operations that will be sent through the send stream. This data size value is sent in a new command, named BTRFS_SEND_C_TOTAL_DATA_SIZE, that immediately follows a BTRFS_SEND_C_SUBVOL or BTRFS_SEND_C_SNAPSHOT command, and precedes any command that changes a file or the filesystem hierarchy. Upon receiving a write, clone or fallocate command, the receiving end can increment a counter by the data length of that command and therefore report progress by comparing the counter's value with the data size value received in the BTRFS_SEND_C_TOTAL_DATA_SIZE command. The approach is simple: before the normal operation of send, do a scan in the file system tree for new inodes and new/changed file extent items, just like in send's normal operation, and keep incrementing a counter with new inodes' size and the size of file extents (and file holes) that are going to be written, cloned or fallocated. This is actually a simpler and more lightweight tree scan/processing than the one we do when sending the changes, as it doesn't process inode references nor does any lookups in the extent tree for example. 
After modifying btrfs-progs to understand this new command and report progress, here's an example (the -s flag tells btrfs send to pass the new flag to the kernel's send ioctl): $ btrfs send -s --stream-version 2 /mnt/sdd/snap_base | btrfs receive /mnt/sdc At subvol /mnt/sdd/snap_base At subvol snap_base About to receive 9212392667 bytes Subvolume /mnt/sdc//snap_base, 4059722426 / 9212392667 bytes received, 44.07%, 40.32MB/s $ btrfs send -s --stream-version 2 -p /mnt/sdd/snap_base /mnt/sdd/snap_incr | btrfs receive /mnt/sdc At subvol /mnt/sdd/snap_incr At subvol snap_incr About to receive 9571342213 bytes Subvolume /mnt/sdc//snap_incr, 6557345221 / 9571342213 bytes received, 68.51%, 51.04MB/s Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: A v2 stream is now only produced if the send ioctl caller passes in one of the new flags (BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE | BTRFS_SEND_FLAG_SUPPORT_FALLOCATE) to avoid breaking old clients. V3: Removed BTRFS_SEND_FLAG_SUPPORT_FALLOCATE and added BTRFS_SEND_FLAG_STREAM_V2, added commands for inode set flags and otime. 
fs/btrfs/send.c | 194 ++-- 1 file changed, 162 insertions(+), 32 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 7b4b0c3..2a52cc9 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -81,7 +81,13 @@ struct clone_root { #define SEND_CTX_MAX_NAME_CACHE_SIZE 128 #define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2) +enum btrfs_send_phase { + SEND_PHASE_STREAM_CHANGES, + SEND_PHASE_COMPUTE_DATA_SIZE, +}; + struct send_ctx { + enum btrfs_send_phase phase; struct file *send_filp; loff_t send_off; char *send_buf; @@ -116,6 +122,7 @@ struct send_ctx { u64 cur_inode_last_extent; u64 send_progress; + u64 total_data_size; struct list_head new_refs; struct list_head deleted_refs; @@ -691,6 +698,8 @@ static int send_rename(struct send_ctx *sctx, { int ret; + ASSERT(sctx-phase != SEND_PHASE_COMPUTE_DATA_SIZE); + verbose_printk(btrfs: send_rename %s - %s\n, from-start, to-start); ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME); @@ -715,6 +724,8 @@ static int send_link(struct send_ctx *sctx, { int ret; + ASSERT(sctx-phase != SEND_PHASE_COMPUTE_DATA_SIZE); + verbose_printk(btrfs: send_link %s - %s\n, path-start, lnk-start); ret = begin_cmd(sctx, BTRFS_SEND_C_LINK); @@ -738,6 +749,8 @@ static int send_unlink(struct send_ctx *sctx, struct fs_path *path) { int ret; + ASSERT(sctx-phase != SEND_PHASE_COMPUTE_DATA_SIZE); + verbose_printk(btrfs: send_unlink %s\n, path-start); ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK); @@ -760,6 +773,8 @@ static int send_rmdir(struct send_ctx *sctx, struct fs_path *path) { int ret; + ASSERT(sctx-phase != SEND_PHASE_COMPUTE_DATA_SIZE); + verbose_printk(btrfs: send_rmdir %s\n, path-start); ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR); @@ -2307,6 +2322,9 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) int ret = 0; struct fs_path *p; + if (sctx-phase == SEND_PHASE_COMPUTE_DATA_SIZE) + return 0; + verbose_printk(btrfs: send_truncate %llu size=%llu\n, ino, size); p = fs_path_alloc(); @@ -2336,6 
+2354,8 @@ static int send_chmod(struct send_ctx *sctx, u64
[PATCH 3/6 v4] Btrfs: send, use fallocate command to punch holes
Instead of sending a write command with a data buffer filled with 0 value bytes, use the fallocate command, introduced in the send stream version 2, to tell the receiver to punch a file hole using the fallocate system call. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: A v2 stream is now only produced if the send ioctl caller passes in one of the new flags (BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE | BTRFS_SEND_FLAG_SUPPORT_FALLOCATE) to avoid breaking old clients. V3: Added missing path allocation, messed up rebase. V4: Removed BTRFS_SEND_FLAG_SUPPORT_FALLOCATE and added BTRFS_SEND_FLAG_STREAM_V2, added commands for inode set flags and otime. fs/btrfs/send.c | 55 --- fs/btrfs/send.h | 4 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 2a52cc9..e57000b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -564,6 +564,7 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) return tlv_put(sctx, attr, __tmp, sizeof(__tmp)); \ } +TLV_PUT_DEFINE_INT(32) TLV_PUT_DEFINE_INT(64) static int tlv_put_string(struct send_ctx *sctx, u16 attr, @@ -4482,18 +4483,59 @@ out: return ret; } +static int send_fallocate(struct send_ctx *sctx, u32 flags, + u64 offset, u64 len) +{ + struct fs_path *p = NULL; + int ret = 0; + + ASSERT(sctx-flags BTRFS_SEND_FLAG_STREAM_V2); + + if (sctx-phase == SEND_PHASE_COMPUTE_DATA_SIZE) { + sctx-total_data_size += len; + return 0; + } + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + ret = get_cur_path(sctx, sctx-cur_ino, sctx-cur_inode_gen, p); + if (ret 0) + goto out; + + ret = begin_cmd(sctx, BTRFS_SEND_C_FALLOCATE); + if (ret 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U32(sctx, BTRFS_SEND_A_FALLOCATE_FLAGS, flags); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len); + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + return ret; +} + static int 
send_hole(struct send_ctx *sctx, u64 end) { struct fs_path *p = NULL; u64 offset = sctx-cur_inode_last_extent; - u64 len; + u64 len = end - offset; int ret = 0; if (sctx-phase == SEND_PHASE_COMPUTE_DATA_SIZE) { - sctx-total_data_size += end - offset; + sctx-total_data_size += len; return 0; } + if (sctx-flags BTRFS_SEND_FLAG_STREAM_V2) + return send_fallocate(sctx, + BTRFS_SEND_PUNCH_HOLE_FALLOC_FLAGS, + offset, + len); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -4550,7 +4592,8 @@ static int send_write_or_clone(struct send_ctx *sctx, len = btrfs_file_extent_num_bytes(path-nodes[0], ei); } - if (offset + len sctx-cur_inode_size) + if (offset sctx-cur_inode_size + offset + len sctx-cur_inode_size) len = sctx-cur_inode_size - offset; if (len == 0) { ret = 0; @@ -4567,6 +4610,12 @@ static int send_write_or_clone(struct send_ctx *sctx, ret = send_clone(sctx, offset, len, clone_root); } else if (sctx-flags BTRFS_SEND_FLAG_NO_FILE_DATA) { ret = send_update_extent(sctx, offset, len); + } else if (btrfs_file_extent_disk_bytenr(path-nodes[0], ei) == 0 + type != BTRFS_FILE_EXTENT_INLINE + (sctx-flags BTRFS_SEND_FLAG_STREAM_V2) + offset sctx-cur_inode_size) { + ret = send_fallocate(sctx, BTRFS_SEND_PUNCH_HOLE_FALLOC_FLAGS, +offset, len); } else { while (pos len) { l = len - pos; diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 96f583c..987936c 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -148,6 +148,10 @@ enum { #define BTRFS_SEND_A_FALLOCATE_FLAG_KEEP_SIZE (1 0) #define BTRFS_SEND_A_FALLOCATE_FLAG_PUNCH_HOLE (1 1) +#define BTRFS_SEND_PUNCH_HOLE_FALLOC_FLAGS\ + (BTRFS_SEND_A_FALLOCATE_FLAG_KEEP_SIZE | \ +BTRFS_SEND_A_FALLOCATE_FLAG_PUNCH_HOLE) + #ifdef __KERNEL__ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg); #endif -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/6 v4] Btrfs: send, use fallocate command to allocate extents
The send stream version 2 adds the fallocate command, which can be used to allocate extents for a file or punch holes in a file. Previously we were ignoring file prealloc extents or treating them as extents filled with 0 bytes and sending a regular write command to the stream. After this change, together with my previous change titled: Btrfs: send, use fallocate command to punch holes an incremental send preserves the hole and data structure of files, which can be seen via calls to lseek with the whence parameter set to SEEK_DATA or SEEK_HOLE, as the example below shows: mkfs.btrfs -f /dev/sdc mount /dev/sdc /mnt xfs_io -f -c pwrite -S 0x01 -b 30 0 30 /mnt/foo btrfs subvolume snapshot -r /mnt /mnt/mysnap1 xfs_io -c fpunch 10 5 /mnt/foo xfs_io -c falloc 10 5 /mnt/foo xfs_io -c pwrite -S 0xff -b 1000 12 1000 /mnt/foo xfs_io -c fpunch 25 2 /mnt/foo # prealloc extents that start beyond the inode's size xfs_io -c falloc -k 30 100 /mnt/foo xfs_io -c falloc -k 900 200 /mnt/foo btrfs subvolume snapshot -r /mnt /mnt/mysnap2 btrfs send /mnt/mysnap1 -f /tmp/1.snap btrfs send -p /mnt/mysnap1 /mnt/mysnap2 -f /tmp/2.snap mkfs.btrfs -f /dev/sdd mount /dev/sdd /mnt2 btrfs receive /mnt2 -f /tmp/1.snap btrfs receive /mnt2 -f /tmp/2.snap Before this change the hole/data structure differed between both filesystems: $ xfs_io -r -c 'seek -r -a 0' /mnt/mysnap2/foo Whence Result DATA0 HOLE102400 DATA118784 HOLE122880 DATA147456 HOLE253952 DATA266240 HOLE30 $ xfs_io -r -c 'seek -r -a 0' /mnt2/mysnap2/foo Whence Result DATA0 HOLE30 After this change the second filesystem (/dev/sdd) ends up with the same hole/data structure as the first filesystem. Also, after this change, prealloc extents that lie beyond the inode's size (were allocated with fallocate + keep size flag) are also replicated by an incremental send. 
For the above test, it can be observed via fiemap (or btrfs-debug-tree): $ xfs_io -r -c 'fiemap -l' /mnt2/mysnap2/foo 0: [0..191]: 25096..25287 192 blocks 1: [192..199]: 24672..24679 8 blocks 2: [200..231]: 24584..24615 32 blocks 3: [232..239]: 24680..24687 8 blocks 4: [240..287]: 24616..24663 48 blocks 5: [288..295]: 24688..24695 8 blocks 6: [296..487]: 25392..25583 192 blocks 7: [488..495]: 24696..24703 8 blocks 8: [496..519]: hole 24 blocks 9: [520..527]: 24704..24711 8 blocks 10: [528..583]: 25624..25679 56 blocks 11: [584..591]: 24712..24719 8 blocks 12: [592..2543]: 26192..28143 1952 blocks 13: [2544..17575]: hole 15032 blocks 14: [17576..21487]: 28144..32055 3912 blocks A test case for xfstests will follow. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Added new send ioctl flag BTRFS_SEND_FLAG_SUPPORT_FALLOCATE. A version 2 stream is now only produced if the ioctl caller specifies at least one of the new send flags (BTRFS_SEND_FLAG_SUPPORT_FALLOCATE or BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE). V3: Fixed rebase, removed some duplicate logic on truncate + falloc -k. V4: Removed BTRFS_SEND_FLAG_SUPPORT_FALLOCATE and added BTRFS_SEND_FLAG_STREAM_V2, added commands for inode set flags and otime. 
fs/btrfs/send.c | 78 + 1 file changed, 57 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e57000b..d6c9466 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -113,9 +113,10 @@ struct send_ctx { */ u64 cur_ino; u64 cur_inode_gen; - int cur_inode_new; - int cur_inode_new_gen; - int cur_inode_deleted; + u8 cur_inode_new:1; + u8 cur_inode_new_gen:1; + u8 cur_inode_skip_truncate:1; + u8 cur_inode_deleted:1; u64 cur_inode_size; u64 cur_inode_mode; u64 cur_inode_rdev; @@ -4562,6 +4563,19 @@ tlv_put_failure: return ret; } +static int truncate_before_falloc(struct send_ctx *sctx) +{ + int ret = 0; + + if (!sctx-cur_inode_skip_truncate) { + ret = send_truncate(sctx, sctx-cur_ino, + sctx-cur_inode_gen, + sctx-cur_inode_size); + sctx-cur_inode_skip_truncate = 1; + } + return ret; +} + static int send_write_or_clone(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key, @@ -4601,8 +4615,7 @@ static int send_write_or_clone(struct send_ctx *sctx, } if (sctx-phase == SEND_PHASE_COMPUTE_DATA_SIZE) { - if (offset sctx-cur_inode_size) - sctx-total_data_size += len; + sctx
[PATCH 5/6] Btrfs: add missing cleanup on sysfs init failure
If we failed during initialization of sysfs, we weren't unregistering the top level btrfs sysfs entry nor the debugfs stuff. Not unregistering the top level sysfs entry makes future attempts to reload the btrfs module impossible and the following is reported in dmesg: [ 2246.451296] WARNING: CPU: 3 PID: 10999 at fs/sysfs/dir.c:486 sysfs_warn_dup+0x91/0xb0() [ 2246.451298] sysfs: cannot create duplicate filename '/fs/btrfs' [ 2246.451298] Modules linked in: btrfs(+) raid6_pq xor bnep rfcomm bluetooth binfmt_misc nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache sunrpc parport_pc parport psmouse serio_raw pcspkr evbug i2c_piix4 e1000 floppy [last unloaded: btrfs] [ 2246.451310] CPU: 3 PID: 10999 Comm: modprobe Tainted: GW 3.13.0-fdm-btrfs-next-24+ #7 [ 2246.451311] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 2246.451312] 0009 8800d353fa08 816f1da6 0410 [ 2246.451314] 8800d353fa58 8800d353fa48 8104a32c 88020821a290 [ 2246.451316] 88020821a290 88020821a290 8802148f 8800d353fb80 [ 2246.451318] Call Trace: [ 2246.451322] [816f1da6] dump_stack+0x4e/0x68 [ 2246.451324] [8104a32c] warn_slowpath_common+0x8c/0xc0 [ 2246.451325] [8104a416] warn_slowpath_fmt+0x46/0x50 [ 2246.451328] [81367dc5] ? 
strlcat+0x65/0x90 () Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/sysfs.c | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index c5eb214..58a1dd1 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -672,10 +672,18 @@ int btrfs_init_sysfs(void) ret = btrfs_init_debugfs(); if (ret) - return ret; + goto out1; init_feature_attrs(); ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); + if (ret) + goto out2; + + return 0; +out2: + debugfs_remove_recursive(btrfs_debugfs_root_dentry); +out1: + kset_unregister(btrfs_kset); return ret; } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 6/6] Btrfs: add send_stream_version attribute to sysfs
So that applications can find out what's the highest send stream version supported/implemented by the running kernel: $ cat /sys/fs/btrfs/send_stream_version 2 Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/send.h | 1 + fs/btrfs/sysfs.c | 36 2 files changed, 37 insertions(+) diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 987936c..047fd6d 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -22,6 +22,7 @@ #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" #define BTRFS_SEND_STREAM_VERSION_1 1 #define BTRFS_SEND_STREAM_VERSION_2 2 +#define BTRFS_SEND_STREAM_VERSION_LATEST BTRFS_SEND_STREAM_VERSION_2 #define BTRFS_SEND_BUF_SIZE (1024 * 64) #define BTRFS_SEND_READ_SIZE (1024 * 48) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 58a1dd1..2f8fff6 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -31,6 +31,7 @@ #include "transaction.h" #include "sysfs.h" #include "volumes.h" +#include "send.h" static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); @@ -662,6 +663,36 @@ static int btrfs_init_debugfs(void) return 0; } +static ssize_t send_stream_version_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", + BTRFS_SEND_STREAM_VERSION_LATEST); +} + +BTRFS_ATTR(send_stream_version, 0444, send_stream_version_show); + +static const struct attribute *btrfs_root_attrs[] = { + BTRFS_ATTR_PTR(send_stream_version), + NULL +}; + +static int add_root_attrs(struct kobject *kobj) +{ + int error = 0; + int i; + + for (i = 0; btrfs_root_attrs[i]; i++) { + const struct attribute *a = btrfs_root_attrs[i]; + + error = sysfs_add_file_to_group(kobj->parent, a, kobj->name); + if (error) + break; + } + return error; +} + int btrfs_init_sysfs(void) { int ret; @@ -678,8 +709,13 @@ int btrfs_init_sysfs(void) ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); if (ret) goto out2; + ret = add_root_attrs(&btrfs_kset->kobj); + if (ret) + goto out3; return 0; +out3: + 
sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); out2: debugfs_remove_recursive(btrfs_debugfs_root_dentry); out1: -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/4 v3] Btrfs-progs: send, bump stream version
This increases the send stream version from version 1 to version 2, adding new commands: 1) total data size - used to tell the receiver how much file data the stream will add or update; 2) fallocate - used to pre-allocate space for files and to punch holes in files; 3) inode set flags; 4) set inode otime. This is preparation work for subsequent changes that implement the new features. This doesn't break compatibility with older kernels or clients. In order to get a version 2 send stream, new flags must be passed to the send ioctl. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Added new send ioctl flag BTRFS_SEND_FLAG_SUPPORT_FALLOCATE. A version 2 stream is now only produced if the ioctl caller specifies at least one of the new send flags (BTRFS_SEND_FLAG_SUPPORT_FALLOCATE or BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE). V3: Removed BTRFS_SEND_FLAG_SUPPORT_FALLOCATE flag and -a command line option for btrfs-send. Both were replaced with BTRFS_SEND_FLAG_STREAM_V2 and --stream-version=version_number respectively. Added commands for inode set flags and otime too. Documentation/btrfs-send.txt | 3 +++ cmds-send.c | 57 ++-- ioctl.h | 15 send-stream.c| 2 +- send.h | 23 +- 5 files changed, 85 insertions(+), 15 deletions(-) diff --git a/Documentation/btrfs-send.txt b/Documentation/btrfs-send.txt index 18a98fa..067fc27 100644 --- a/Documentation/btrfs-send.txt +++ b/Documentation/btrfs-send.txt @@ -40,6 +40,9 @@ Use this snapshot as a clone source for an incremental send (multiple allowed). -f outfile:: Output is normally written to stdout. To write to a file, use this option. An alternative would be to use pipes. +--stream-version version:: +Ask the kernel to produce a specific send stream version. More recent stream versions provide +new features and better performance. Default value is 1. 
EXIT STATUS --- diff --git a/cmds-send.c b/cmds-send.c index 1cd457d..bd575f8 100644 --- a/cmds-send.c +++ b/cmds-send.c @@ -32,6 +32,7 @@ #include libgen.h #include mntent.h #include assert.h +#include getopt.h #include uuid/uuid.h @@ -45,6 +46,7 @@ #include send-utils.h static int g_verbose = 0; +static int g_stream_version = BTRFS_SEND_STREAM_VERSION_1; struct btrfs_send { int send_fd; @@ -281,6 +283,8 @@ static int do_send(struct btrfs_send *send, u64 parent_root_id, io_send.flags |= BTRFS_SEND_FLAG_OMIT_STREAM_HEADER; if (!is_last_subvol) io_send.flags |= BTRFS_SEND_FLAG_OMIT_END_CMD; + if (g_stream_version == BTRFS_SEND_STREAM_VERSION_2) + io_send.flags |= BTRFS_SEND_FLAG_STREAM_V2; ret = ioctl(subvol_fd, BTRFS_IOC_SEND, io_send); if (ret) { ret = -errno; @@ -406,6 +410,11 @@ out: return ret; } +static const struct option long_options[] = { + { stream-version, 1, NULL, 'V' }, + { NULL, 0, NULL, 0 } +}; + int cmd_send(int argc, char **argv) { char *subvol = NULL; @@ -424,7 +433,8 @@ int cmd_send(int argc, char **argv) memset(send, 0, sizeof(send)); send.dump_fd = fileno(stdout); - while ((c = getopt(argc, argv, vec:f:i:p:)) != -1) { + while ((c = getopt_long(argc, argv, vec:f:i:p:, + long_options, NULL)) != -1) { switch (c) { case 'v': g_verbose++; @@ -511,6 +521,24 @@ int cmd_send(int argc, char **argv) ERROR: -i was removed, use -c instead\n); ret = 1; goto out; + case 'V': + if (sscanf(optarg, %d, g_stream_version) != 1) { + fprintf(stderr, + ERROR: invalid value for stream version: %s\n, + optarg); + ret = 1; + goto out; + } + if (g_stream_version = 0 || + g_stream_version BTRFS_SEND_STREAM_VERSION_MAX) { + fprintf(stderr, + ERROR: unsupported stream version %d, minimum: 1, maximum: %d\n, + g_stream_version, + BTRFS_SEND_STREAM_VERSION_MAX); + ret = 1; + goto out; + } + break; case '?': default: fprintf(stderr, ERROR: send args invalid.\n); @@ -673,7 +701,7 @@ out: } const char * const cmd_send_usage[] = { - btrfs send [-ve] [-p parent] [-c clone-src] [-f 
outfile] subvol [subvol
[PATCH 2/4 v4] Btrfs-progs: send, implement total data size callback and progress report
This is a followup to the kernel patch titled: Btrfs: send, implement total data size command to allow for progress estimation This makes the btrfs send and receive commands aware of the new send flag, named BTRFS_SEND_C_TOTAL_DATA_SIZE, which tells us the amount of file data that is new between the parent and send snapshots/roots. As this command immediately follows the commands to start a snapshot/subvolume, it can be used to report and compute progress, by keeping a counter that is incremented with the data length of each write, clone and fallocate command that is received from the stream. Example: $ btrfs send -s --stream-version 2 /mnt/sdd/snap_base | btrfs receive /mnt/sdc At subvol /mnt/sdd/snap_base At subvol snap_base About to receive 9212392667 bytes Subvolume /mnt/sdc//snap_base, 4059722426 / 9212392667 bytes received, 44.07%, 40.32MB/s $ btrfs send -s --stream-version 2 -p /mnt/sdd/snap_base /mnt/sdd/snap_incr | btrfs receive /mnt/sdc At subvol /mnt/sdd/snap_incr At subvol snap_incr About to receive 9571342213 bytes Subvolume /mnt/sdc//snap_incr, 6557345221 / 9571342213 bytes received, 68.51%, 51.04MB/s At the moment progress is only reported by btrfs-receive, but it is possible and simple to do it for btrfs-send too, so that we can get a progress report when not piping btrfs-send output to btrfs-receive (directly to a file). Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Added new send ioctl flag BTRFS_SEND_FLAG_SUPPORT_FALLOCATE. A version 2 stream is now only produced if the ioctl caller specifies at least one of the new send flags (BTRFS_SEND_FLAG_SUPPORT_FALLOCATE or BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE). V3: Renamed option -o to -s, removed some duplicated code (progress reset). V4: Removed BTRFS_SEND_FLAG_SUPPORT_FALLOCATE flag and -a command line option for btrfs-send. Both were replaced with BTRFS_SEND_FLAG_STREAM_V2 and --stream-version=version_number respectively. Added commands for inode set flags and otime too. 
Documentation/btrfs-send.txt | 4 ++ cmds-receive.c | 91 cmds-send.c | 23 ++- send-stream.c| 4 ++ send-stream.h| 1 + 5 files changed, 121 insertions(+), 2 deletions(-) diff --git a/Documentation/btrfs-send.txt b/Documentation/btrfs-send.txt index 067fc27..1b18d32 100644 --- a/Documentation/btrfs-send.txt +++ b/Documentation/btrfs-send.txt @@ -43,6 +43,10 @@ An alternative would be to use pipes. --stream-version version:: Ask the kernel to produce a specific send stream version. More recent stream versions provide new features and better performance. Default value is 1. +-s:: +Obtain the total data size for each subvolume or snapshot to send. This demands additional +processing (mostly IO bound) but is useful for the receive command to report progress. This +option requires send stream version 2 or higher. EXIT STATUS --- diff --git a/cmds-receive.c b/cmds-receive.c index d6cd3da..bd5255c 100644 --- a/cmds-receive.c +++ b/cmds-receive.c @@ -32,6 +32,7 @@ #include ftw.h #include wait.h #include assert.h +#include time.h #include sys/stat.h #include sys/types.h @@ -71,6 +72,14 @@ struct btrfs_receive struct subvol_uuid_search sus; int honor_end_cmd; + + /* For the subvolume/snapshot we're currently receiving. 
*/ + u64 total_data_size; + u64 bytes_received; + time_t last_progress_update; + u64 bytes_received_last_update; + float progress; + const char *target; }; static int finish_subvol(struct btrfs_receive *r) @@ -143,6 +152,16 @@ out: return ret; } +static void reset_progress(struct btrfs_receive *r, const char *dest) +{ + r-total_data_size = 0; + r-bytes_received = 0; + r-progress = 0.0; + r-last_progress_update = 0; + r-bytes_received_last_update = 0; + r-target = dest; +} + static int process_subvol(const char *path, const u8 *uuid, u64 ctransid, void *user) { @@ -156,6 +175,7 @@ static int process_subvol(const char *path, const u8 *uuid, u64 ctransid, goto out; r-cur_subvol = calloc(1, sizeof(*r-cur_subvol)); + reset_progress(r, Subvolume); if (strlen(r-dest_dir_path) == 0) r-cur_subvol-path = strdup(path); @@ -205,6 +225,7 @@ static int process_snapshot(const char *path, const u8 *uuid, u64 ctransid, goto out; r-cur_subvol = calloc(1, sizeof(*r-cur_subvol)); + reset_progress(r, Snapshot); if (strlen(r-dest_dir_path) == 0) r-cur_subvol-path = strdup(path); @@ -287,6 +308,73 @@ out: return ret; } +static int process_total_data_size(u64 size, void *user) +{ + struct btrfs_receive *r = user; + + r-total_data_size = size; + fprintf(stdout, About
[PATCH 3/4 v4] Btrfs-progs: send, implement fallocate command callback
The fallocate send stream command, added in stream version 2, is used to pre-allocate space for files and punch file holes. This change implements the callback for that new command, using the fallocate function from the standard C library to carry out the specified action (allocate file space or punch a file hole). Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Use the new send ioctl flag BTRFS_SEND_FLAG_SUPPORT_FALLOCATE if the user asks for it (-a command line option), which will make the kernel generate a version 2 send stream, so that old clients aren't affected. V3: Rebased on new patchset (new version of patch 2/4). V4: Removed BTRFS_SEND_FLAG_SUPPORT_FALLOCATE flag and -a command line option for btrfs-send. Both were replaced with BTRFS_SEND_FLAG_STREAM_V2 and --stream-version=version_number respectively. Added commands for inode set flags and otime too. cmds-receive.c | 38 ++ send-stream.c | 13 + send-stream.h | 2 ++ 3 files changed, 53 insertions(+) diff --git a/cmds-receive.c b/cmds-receive.c index bd5255c..5e96423 100644 --- a/cmds-receive.c +++ b/cmds-receive.c @@ -41,6 +41,7 @@ #include sys/types.h #include sys/xattr.h #include uuid/uuid.h +#include linux/falloc.h #include ctree.h #include ioctl.h @@ -887,6 +888,42 @@ out: return ret; } +static int process_fallocate(const char *path, u32 flags, u64 offset, +u64 len, void *user) +{ + struct btrfs_receive *r = user; + char *full_path = path_cat(r-full_subvol_path, path); + int mode = 0; + int ret; + + if (flags BTRFS_SEND_A_FALLOCATE_FLAG_KEEP_SIZE) + mode |= FALLOC_FL_KEEP_SIZE; + if (flags BTRFS_SEND_A_FALLOCATE_FLAG_PUNCH_HOLE) + mode |= FALLOC_FL_PUNCH_HOLE; + + if (g_verbose = 2) + fprintf(stderr, + fallocate %s - flags %u, offset %llu, len %llu\n, + path, flags, offset, len); + + ret = open_inode_for_write(r, full_path); + if (ret 0) + goto out; + + ret = fallocate(r-write_fd, mode, offset, len); + if (ret) { + ret = -errno; + fprintf(stderr, + ERROR: fallocate against %s 
failed. %s\n, + path, strerror(-ret)); + goto out; + } + update_progress(r, len); + +out: + free(full_path); + return ret; +} static struct btrfs_send_ops send_ops = { .subvol = process_subvol, @@ -910,6 +947,7 @@ static struct btrfs_send_ops send_ops = { .chown = process_chown, .utimes = process_utimes, .total_data_size = process_total_data_size, + .fallocate = process_fallocate, }; static int do_receive(struct btrfs_receive *r, const char *tomnt, int r_fd) diff --git a/send-stream.c b/send-stream.c index e1bd4ce..812639f 100644 --- a/send-stream.c +++ b/send-stream.c @@ -425,6 +425,19 @@ static int read_and_process_cmd(struct btrfs_send_stream *s) TLV_GET_U64(s, BTRFS_SEND_A_SIZE, tmp); ret = s-ops-total_data_size(tmp, s-user); break; + case BTRFS_SEND_C_FALLOCATE: + { + u32 flags; + u64 len; + + TLV_GET_STRING(s, BTRFS_SEND_A_PATH, path); + TLV_GET_U32(s, BTRFS_SEND_A_FALLOCATE_FLAGS, flags); + TLV_GET_U64(s, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_GET_U64(s, BTRFS_SEND_A_SIZE, len); + ret = s-ops-fallocate(path, flags, offset, len, + s-user); + } + break; case BTRFS_SEND_C_END: ret = 1; break; diff --git a/send-stream.h b/send-stream.h index 3a653a9..479e40c 100644 --- a/send-stream.h +++ b/send-stream.h @@ -55,6 +55,8 @@ struct btrfs_send_ops { void *user); int (*update_extent)(const char *path, u64 offset, u64 len, void *user); int (*total_data_size)(u64 size, void *user); + int (*fallocate)(const char *path, u32 flags, u64 offset, +u64 len, void *user); }; int btrfs_read_and_process_send_stream(int fd, -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v4] xfstests: btrfs, test send's ability to punch holes and prealloc extents
This test verifies that after an incremental btrfs send the replicated file has the same exact hole and data structure as in the origin filesystem. This didn't use to be the case before the send stream version 2 - holes were sent as write operations of 0 valued bytes instead of punching holes with the fallocate system call, and pre-allocated extents were sent as well as write operations of 0 valued bytes instead of instructions for the receiver to use the fallocate system call. It also checks that prealloc extents that lie beyond the file's size are replicated by an incremental send. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Addressed Dave's comments, and updated btrfs send invocation, by specifying the new command line option (-a) that enables use of fallocate - added function _require_btrfs_send_fallocate_flag() to skip the test when an old version of btrfs-progs is found. V3: Corrected use of fiemap with _filter_fiemap. Was passing -l instead of -v to fiemap, which resulted in output consisting only of a single line related to a hole instead of all holes and data extents (and I wanted to verify the falloc -k extents were preserved after the btrfs send). V4: Updated invocation of btrfs send, as the flag -a was removed from btrfs-send in favour of --stream-version=version common/rc | 9 tests/btrfs/047 | 122 tests/btrfs/047.out | 35 +++ tests/btrfs/group | 1 + 4 files changed, 167 insertions(+) create mode 100755 tests/btrfs/047 create mode 100644 tests/btrfs/047.out diff --git a/common/rc b/common/rc index acf419b..a9d1c4c 100644 --- a/common/rc +++ b/common/rc @@ -2262,6 +2262,15 @@ _run_btrfs_util_prog() run_check $BTRFS_UTIL_PROG $* } +_require_btrfs_send_stream_version() +{ + $BTRFS_UTIL_PROG send 21 | \ + grep '^[ \t]*\-\-stream\-version version' /dev/null 21 + if [ $? 
-ne 0 ]; then + _notrun Missing btrfs-progs send --stream-version command line option, skipped this test + fi +} + init_rc() { if [ $iam == new ] diff --git a/tests/btrfs/047 b/tests/btrfs/047 new file mode 100755 index 000..cc1936d --- /dev/null +++ b/tests/btrfs/047 @@ -0,0 +1,122 @@ +#! /bin/bash +# FS QA Test No. btrfs/047 +# +# Verify that after an incremental btrfs send the replicated file has +# the same exact hole and data structure as in the origin filesystem. +# This didn't use to be the case before the send stream version 2 - +# holes were sent as write operations of 0 valued bytes instead of punching +# holes with the fallocate system call, and pre-allocated extents were sent +# as well as write operations of 0 valued bytes instead of intructions for +# the receiver to use the fallocate system call. Also check that prealloc +# extents that lie beyond the file's size are replicated by an incremental +# send. +# +# More specifically, this structure preserving guarantee was added by the +# following linux kernel commits: +# +#Btrfs: send, use fallocate command to punch holes +#Btrfs: send, use fallocate command to allocate extents +# +#--- +# Copyright (c) 2014 Filipe Manana. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq + +tmp=/tmp/$$ +status=1 # failure is the default! 
+trap _cleanup; exit \$status 0 1 2 3 15 + +_cleanup() +{ +rm -fr $send_files_dir +rm -fr $tmp +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter +. ./common/punch + +# real QA test starts here +_supported_fs btrfs +_supported_os Linux +_require_scratch +_require_fssum +_require_xfs_io_fiemap +_require_btrfs_send_stream_version +_need_to_be_root + +send_files_dir=$TEST_DIR/btrfs-test-$seq + +rm -f $seqres.full +rm -fr $send_files_dir +mkdir $send_files_dir + +_scratch_mkfs /dev/null 21 +_scratch_mount + +$XFS_IO_PROG -f -c pwrite -S 0x01 -b 30 0 30 $SCRATCH_MNT/foo \ + | _filter_xfs_io + +_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT $SCRATCH_MNT/mysnap1 + +$XFS_IO_PROG -c fpunch 10 5 $SCRATCH_MNT/foo +$XFS_IO_PROG -c
[PATCH 6/6 v2] Btrfs: add send_stream_version attribute to sysfs
So that applications can find out what's the highest send stream version supported/implemented by the running kernel: $ cat /sys/fs/btrfs/send/stream_version 2 Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Renamed /sys/fs/btrfs/send_stream_version to /sys/fs/btrfs/send/stream_version, as in the future it might be useful to add other sysfs attributes related to send (other ro information or tunables like internal buffer sizes, etc). fs/btrfs/send.h | 1 + fs/btrfs/sysfs.c | 27 +++ 2 files changed, 28 insertions(+) diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 987936c..047fd6d 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -22,6 +22,7 @@ #define BTRFS_SEND_STREAM_MAGIC btrfs-stream #define BTRFS_SEND_STREAM_VERSION_1 1 #define BTRFS_SEND_STREAM_VERSION_2 2 +#define BTRFS_SEND_STREAM_VERSION_LATEST BTRFS_SEND_STREAM_VERSION_2 #define BTRFS_SEND_BUF_SIZE (1024 * 64) #define BTRFS_SEND_READ_SIZE (1024 * 48) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 58a1dd1..d93c0b5 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -31,6 +31,7 @@ #include transaction.h #include sysfs.h #include volumes.h +#include send.h static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); @@ -662,6 +663,26 @@ static int btrfs_init_debugfs(void) return 0; } +static ssize_t send_stream_version_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, %d\n, + BTRFS_SEND_STREAM_VERSION_LATEST); +} + +BTRFS_ATTR(stream_version, 0444, send_stream_version_show); + +static struct attribute *btrfs_send_attrs[] = { + BTRFS_ATTR_PTR(stream_version), + NULL +}; + +static const struct attribute_group btrfs_send_attr_group = { + .name = send, + .attrs = btrfs_send_attrs, +}; + int btrfs_init_sysfs(void) { int ret; @@ -678,8 +699,13 @@ int btrfs_init_sysfs(void) ret = sysfs_create_group(btrfs_kset-kobj, btrfs_feature_attr_group); if (ret) goto out2; + ret = sysfs_create_group(btrfs_kset-kobj, 
btrfs_send_attr_group); + if (ret) + goto out3; return 0; +out3: + sysfs_remove_group(btrfs_kset-kobj, btrfs_feature_attr_group); out2: debugfs_remove_recursive(btrfs_debugfs_root_dentry); out1: @@ -691,6 +717,7 @@ out1: void btrfs_exit_sysfs(void) { sysfs_remove_group(btrfs_kset-kobj, btrfs_feature_attr_group); + sysfs_remove_group(btrfs_kset-kobj, btrfs_send_attr_group); kset_unregister(btrfs_kset); debugfs_remove_recursive(btrfs_debugfs_root_dentry); } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] xfstests: btrfs/004, fix filefrag filter for files with 1 block only
If the file consists of a single block, then filefrag mentions '1 block of ...', and the filter expected 'blocks of ...'. Example: $ echo qwerty foobar $ filefrag -v foobar Filesystem type is: ef53 File size of foobar is 7 (1 block of 4096 bytes) ext: logical_offset:physical_offset: length: expected: flags: 0:0.. 0: 0.. 0: 1: unknown,delalloc,eof foobar: 1 extent found Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- tests/btrfs/004 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/btrfs/004 b/tests/btrfs/004 index 211d8bc..670e1c2 100755 --- a/tests/btrfs/004 +++ b/tests/btrfs/004 @@ -58,7 +58,7 @@ _require_command /usr/sbin/filefrag rm -f $seqres.full FILEFRAG_FILTER=' - if (/blocks of (\d+) bytes/) { + if (/blocks? of (\d+) bytes/) { $blocksize = $1; next } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] xfstests: btrfs/004, fix failure with inlined file extents
Files that consist of an inline extent, have the corresponding data in the filesystem btree and not on a dedicated extent. For such extents filefrag (fiemap) will report a physical location of 0 for that extent and set the 'inline' flag. The btrfs inspect-internal logical-resolve command will cause a lookup in the extent tree for the extent address we give it as an argument, which fails with errno ENOENT if it is 0. This error didn't happen always, as the test uses fsstress to generate a random filesystem, which needed to generate at least one file that could be inlined (content less than 4018 bytes). Example, taken from results/btrfs/004.full: # filefrag -v /home/fdmanana/btrfs-tests/scratch_1/snap1/p0/de/d1b/dcb/fb1 Filesystem type is: 9123683e File size of /home/fdmanana/btrfs-tests/scratch_1/snap1/p0/de/d1b/dcb/fb1 is 3860 (1 block of 4096 bytes) ext: logical_offset:physical_offset: length: expected: flags: 0:0..4095: 0.. 4095: 4096: not_aligned,inline,eof 1: 280.. 344: 35190.. 35254: 65: 1: eof /home/fdmanana/btrfs-tests/scratch_1/snap1/p0/de/d1b/dcb/fb1: 2 extents found after filter: 0#0#0 0#0#0 # stat -c %i /home/fdmanana/btrfs-tests/scratch_1/snap1/p0/de/d1b/dcb/fb1 403 # /home/fdmanana/git/hub/btrfs-progs/btrfs inspect-internal logical-resolve -P 0 /home/fdmanana/btrfs-tests/scratch_1 ioctl ret=-1, error: No such file or directory Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- tests/btrfs/004 | 34 +++--- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/tests/btrfs/004 b/tests/btrfs/004 index 670e1c2..1d5b816 100755 --- a/tests/btrfs/004 +++ b/tests/btrfs/004 @@ -65,9 +65,11 @@ FILEFRAG_FILTER=' ($ext, $logical, $physical, $length) = (/^\s*(\d+):\s+(\d+)..\s+\d+:\s+(\d+)..\s+\d+:\s+(\d+):/) or next; + ($flags) = /.*:\s*(\S*)$/; print $physical * $blocksize, #, $length * $blocksize, #, - $logical * $blocksize, ' + $logical * $blocksize, #, + $flags, ' # this makes filefrag output script readable by using a perl helper. 
# output is one extent per line, with three numbers separated by '#' @@ -230,16 +232,26 @@ workout() continue; fi for i in $extents; do - physical=$i - length=$i - logical=$i - physical=`echo $physical | sed -e 's/#.*//'` - length=`echo $length | sed -e 's/[^#]+#//'` - length=`echo $length | sed -e 's/#.*//'` - logical=`echo $logical | sed -e 's/.*#//'` - _btrfs_inspect_check $file $physical $length $logical \ - $snap_name - ret=$? + physical=`echo $i | cut -d '#' -f 1` + length=`echo $i | cut -d '#' -f 2` + logical=`echo $i | cut -d '#' -f 3` + flags=`echo $i | cut -d '#' -f 4` + # Skip inline extents, otherwise btrfs inspect-internal + # logical-resolve will fail (with errno ENOENT), as it + # can't find an extent with a start address of 0 in the + # extent tree. + if [ $physical -eq 0 ]; then + echo $flags | grep -E '(^|,)inline(,|$)' \ +/dev/null + ret=$? + if [ $ret -ne 0 ]; then + echo Unexpected physical address 0 for non-inline extent, file $file, flags $flags + fi + else + _btrfs_inspect_check $file $physical $length \ + $logical $snap_name + ret=$? + fi if [ $ret -ne 0 ]; then errcnt=`expr $errcnt + 1` fi -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/4 v2] Btrfs: send, bump stream version
This increases the send stream version from version 1 to version 2, adding 2 new commands: 1) total data size - used to tell the receiver how much file data the stream will add or update; 2) fallocate - used to pre-allocate space for files and to punch holes in files. This is preparation work for subsequent changes that implement the new features (computing total data size and use fallocate for better performance). A version 2 stream is only produced if the send ioctl caller passes in one of the new flags (BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE | BTRFS_SEND_FLAG_SUPPORT_FALLOCATE), meaning old clients are unaffected. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: A v2 stream is now only produced if the send ioctl caller passes in one of the new flags (BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE | BTRFS_SEND_FLAG_SUPPORT_FALLOCATE) to avoid breaking old clients. fs/btrfs/send.c| 6 +- fs/btrfs/send.h| 14 +- include/uapi/linux/btrfs.h | 24 +++- 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 289e9f3..53712aa 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -632,7 +632,11 @@ static int send_header(struct send_ctx *sctx) struct btrfs_stream_header hdr; strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); - hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION); + if (sctx-flags (BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE | + BTRFS_SEND_FLAG_SUPPORT_FALLOCATE)) + hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION_2); + else + hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION_1); return write_buf(sctx-send_filp, hdr, sizeof(hdr), sctx-send_off); diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 48d425a..367030d 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -20,7 +20,8 @@ #include ctree.h #define BTRFS_SEND_STREAM_MAGIC btrfs-stream -#define BTRFS_SEND_STREAM_VERSION 1 +#define BTRFS_SEND_STREAM_VERSION_1 1 +#define BTRFS_SEND_STREAM_VERSION_2 2 #define BTRFS_SEND_BUF_SIZE (1024 * 64) #define 
BTRFS_SEND_READ_SIZE (1024 * 48) @@ -87,6 +88,11 @@ enum btrfs_send_cmd { BTRFS_SEND_C_END, BTRFS_SEND_C_UPDATE_EXTENT, + + /* added in stream version 2 */ + BTRFS_SEND_C_TOTAL_DATA_SIZE, + BTRFS_SEND_C_FALLOCATE, + __BTRFS_SEND_C_MAX, }; #define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1) @@ -125,10 +131,16 @@ enum { BTRFS_SEND_A_CLONE_OFFSET, BTRFS_SEND_A_CLONE_LEN, + /* added in stream version 2 */ + BTRFS_SEND_A_FALLOCATE_FLAGS, + __BTRFS_SEND_A_MAX, }; #define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1) +#define BTRFS_SEND_A_FALLOCATE_FLAG_KEEP_SIZE (1 0) +#define BTRFS_SEND_A_FALLOCATE_FLAG_PUNCH_HOLE (1 1) + #ifdef __KERNEL__ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg); #endif diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index b4d6909..6611406 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -464,10 +464,32 @@ struct btrfs_ioctl_received_subvol_args { */ #define BTRFS_SEND_FLAG_OMIT_END_CMD 0x4 +/* + * Calculate the amount (in bytes) of new file data between the send and + * parent snapshots, or in case of a full send, the total amount of file data + * we will send. + * This corresponds to the sum of the data lengths of each write, clone and + * fallocate commands that are sent through the send stream. The receiving end + * can use this information to compute progress. + * + * Added in send stream version 2. + */ +#define BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE0x8 + +/* + * Use fallocate command to pre-allocate file extents and punch file holes, + * instead of write commands with data buffers filled with 0 value bytes. + * + * Added in send stream version 2. 
+ */ +#define BTRFS_SEND_FLAG_SUPPORT_FALLOCATE 0x10 + #define BTRFS_SEND_FLAG_MASK \ (BTRFS_SEND_FLAG_NO_FILE_DATA | \ BTRFS_SEND_FLAG_OMIT_STREAM_HEADER | \ -BTRFS_SEND_FLAG_OMIT_END_CMD) +BTRFS_SEND_FLAG_OMIT_END_CMD | \ +BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE | \ +BTRFS_SEND_FLAG_SUPPORT_FALLOCATE) struct btrfs_ioctl_send_args { __s64 send_fd; /* in */ -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/4 v2] Btrfs: send, use fallocate command to punch holes
Instead of sending a write command with a data buffer filled with 0 value bytes, use the fallocate command, introduced in the send stream version 2, to tell the receiver to punch a file hole using the fallocate system call. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: A v2 stream is now only produced if the send ioctl caller passes in one of the new flags (BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE | BTRFS_SEND_FLAG_SUPPORT_FALLOCATE) to avoid breaking old clients. fs/btrfs/send.c | 56 +++- fs/btrfs/send.h | 4 2 files changed, 55 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f5db492..2c6d58c 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -564,6 +564,7 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) return tlv_put(sctx, attr, __tmp, sizeof(__tmp)); \ } +TLV_PUT_DEFINE_INT(32) TLV_PUT_DEFINE_INT(64) static int tlv_put_string(struct send_ctx *sctx, u16 attr, @@ -4483,15 +4484,16 @@ out: return ret; } -static int send_hole(struct send_ctx *sctx, u64 end) +static int send_fallocate(struct send_ctx *sctx, u32 flags, + u64 offset, u64 len) { struct fs_path *p = NULL; - u64 offset = sctx-cur_inode_last_extent; - u64 len; int ret = 0; + ASSERT(sctx-flags BTRFS_SEND_FLAG_SUPPORT_FALLOCATE); + if (sctx-phase == SEND_PHASE_COMPUTE_DATA_SIZE) { - sctx-total_data_size += end - offset; + sctx-total_data_size += len; return 0; } @@ -4500,6 +4502,43 @@ static int send_hole(struct send_ctx *sctx, u64 end) return -ENOMEM; ret = get_cur_path(sctx, sctx-cur_ino, sctx-cur_inode_gen, p); if (ret 0) + goto out; + + ret = begin_cmd(sctx, BTRFS_SEND_C_FALLOCATE); + if (ret 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U32(sctx, BTRFS_SEND_A_FALLOCATE_FLAGS, flags); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len); + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + return ret; +} + +static int 
send_hole(struct send_ctx *sctx, u64 end) +{ + struct fs_path *p = NULL; + u64 offset = sctx-cur_inode_last_extent; + u64 len = end - offset; + int ret = 0; + + if (sctx-phase == SEND_PHASE_COMPUTE_DATA_SIZE) { + sctx-total_data_size += len; + return 0; + } + + if (sctx-flags BTRFS_SEND_FLAG_SUPPORT_FALLOCATE) + return send_fallocate(sctx, + BTRFS_SEND_PUNCH_HOLE_FALLOC_FLAGS, + offset, + len); + + ret = get_cur_path(sctx, sctx-cur_ino, sctx-cur_inode_gen, p); + if (ret 0) goto tlv_put_failure; memset(sctx-read_buf, 0, BTRFS_SEND_READ_SIZE); while (offset end) { @@ -4551,7 +4590,8 @@ static int send_write_or_clone(struct send_ctx *sctx, len = btrfs_file_extent_num_bytes(path-nodes[0], ei); } - if (offset + len sctx-cur_inode_size) + if (offset sctx-cur_inode_size + offset + len sctx-cur_inode_size) len = sctx-cur_inode_size - offset; if (len == 0) { ret = 0; @@ -4568,6 +4608,12 @@ static int send_write_or_clone(struct send_ctx *sctx, ret = send_clone(sctx, offset, len, clone_root); } else if (sctx-flags BTRFS_SEND_FLAG_NO_FILE_DATA) { ret = send_update_extent(sctx, offset, len); + } else if (btrfs_file_extent_disk_bytenr(path-nodes[0], ei) == 0 + type != BTRFS_FILE_EXTENT_INLINE + (sctx-flags BTRFS_SEND_FLAG_SUPPORT_FALLOCATE) + offset sctx-cur_inode_size) { + ret = send_fallocate(sctx, BTRFS_SEND_PUNCH_HOLE_FALLOC_FLAGS, +offset, len); } else { while (pos len) { l = len - pos; diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 367030d..a632c0d 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -141,6 +141,10 @@ enum { #define BTRFS_SEND_A_FALLOCATE_FLAG_KEEP_SIZE (1 0) #define BTRFS_SEND_A_FALLOCATE_FLAG_PUNCH_HOLE (1 1) +#define BTRFS_SEND_PUNCH_HOLE_FALLOC_FLAGS\ + (BTRFS_SEND_A_FALLOCATE_FLAG_KEEP_SIZE | \ +BTRFS_SEND_A_FALLOCATE_FLAG_PUNCH_HOLE) + #ifdef __KERNEL__ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg); #endif -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to 
majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/4 v2] Btrfs: send, use fallocate command to allocate extents
The send stream version 2 adds the fallocate command, which can be used to allocate extents for a file or punch holes in a file. Previously we were ignoring file prealloc extents or treating them as extents filled with 0 bytes and sending a regular write command to the stream. After this change, together with my previous change titled: Btrfs: send, use fallocate command to punch holes an incremental send preserves the hole and data structure of files, which can be seen via calls to lseek with the whence parameter set to SEEK_DATA or SEEK_HOLE, as the example below shows: mkfs.btrfs -f /dev/sdc mount /dev/sdc /mnt xfs_io -f -c pwrite -S 0x01 -b 30 0 30 /mnt/foo btrfs subvolume snapshot -r /mnt /mnt/mysnap1 xfs_io -c fpunch 10 5 /mnt/foo xfs_io -c falloc 10 5 /mnt/foo xfs_io -c pwrite -S 0xff -b 1000 12 1000 /mnt/foo xfs_io -c fpunch 25 2 /mnt/foo # prealloc extents that start beyond the inode's size xfs_io -c falloc -k 30 100 /mnt/foo xfs_io -c falloc -k 900 200 /mnt/foo btrfs subvolume snapshot -r /mnt /mnt/mysnap2 btrfs send /mnt/mysnap1 -f /tmp/1.snap btrfs send -p /mnt/mysnap1 /mnt/mysnap2 -f /tmp/2.snap mkfs.btrfs -f /dev/sdd mount /dev/sdd /mnt2 btrfs receive /mnt2 -f /tmp/1.snap btrfs receive /mnt2 -f /tmp/2.snap Before this change the hole/data structure differed between both filesystems: $ xfs_io -r -c 'seek -r -a 0' /mnt/mysnap2/foo Whence Result DATA0 HOLE102400 DATA118784 HOLE122880 DATA147456 HOLE253952 DATA266240 HOLE30 $ xfs_io -r -c 'seek -r -a 0' /mnt2/mysnap2/foo Whence Result DATA0 HOLE30 After this change the second filesystem (/dev/sdd) ends up with the same hole/data structure as the first filesystem. Also, after this change, prealloc extents that lie beyond the inode's size (were allocated with fallocate + keep size flag) are also replicated by an incremental send. 
For the above test, it can be observed via fiemap (or btrfs-debug-tree): $ xfs_io -r -c 'fiemap -l' /mnt2/mysnap2/foo 0: [0..191]: 25096..25287 192 blocks 1: [192..199]: 24672..24679 8 blocks 2: [200..231]: 24584..24615 32 blocks 3: [232..239]: 24680..24687 8 blocks 4: [240..287]: 24616..24663 48 blocks 5: [288..295]: 24688..24695 8 blocks 6: [296..487]: 25392..25583 192 blocks 7: [488..495]: 24696..24703 8 blocks 8: [496..519]: hole 24 blocks 9: [520..527]: 24704..24711 8 blocks 10: [528..583]: 25624..25679 56 blocks 11: [584..591]: 24712..24719 8 blocks 12: [592..2543]: 26192..28143 1952 blocks 13: [2544..17575]: hole 15032 blocks 14: [17576..21487]: 28144..32055 3912 blocks A test case for xfstests will follow. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: A v2 stream is now only produced if the send ioctl caller passes in one of the new flags (BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE | BTRFS_SEND_FLAG_SUPPORT_FALLOCATE) to avoid breaking old clients. fs/btrfs/send.c | 70 +++-- 1 file changed, 48 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 2c6d58c..043fd43 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -113,9 +113,10 @@ struct send_ctx { */ u64 cur_ino; u64 cur_inode_gen; - int cur_inode_new; - int cur_inode_new_gen; - int cur_inode_deleted; + u8 cur_inode_new:1; + u8 cur_inode_new_gen:1; + u8 cur_inode_skip_truncate:1; + u8 cur_inode_deleted:1; u64 cur_inode_size; u64 cur_inode_mode; u64 cur_inode_rdev; @@ -4599,8 +4600,7 @@ static int send_write_or_clone(struct send_ctx *sctx, } if (sctx-phase == SEND_PHASE_COMPUTE_DATA_SIZE) { - if (offset sctx-cur_inode_size) - sctx-total_data_size += len; + sctx-total_data_size += len; goto out; } @@ -4614,6 +4614,27 @@ static int send_write_or_clone(struct send_ctx *sctx, offset sctx-cur_inode_size) { ret = send_fallocate(sctx, BTRFS_SEND_PUNCH_HOLE_FALLOC_FLAGS, offset, len); + } else if (type == BTRFS_FILE_EXTENT_PREALLOC + (sctx-flags 
BTRFS_SEND_FLAG_SUPPORT_FALLOCATE)) { + u32 mode = 0; + if (offset sctx-cur_inode_size) { + ret = send_fallocate(sctx, +BTRFS_SEND_PUNCH_HOLE_FALLOC_FLAGS, +offset, len); + if (ret) + goto out; + } else { + if (!sctx-cur_inode_skip_truncate
[PATCH 1/4 v2] Btrfs-progs: send, bump stream version
This increases the send stream version from version 1 to version 2, adding 2 new commands: 1) total data size - used to tell the receiver how much file data the stream will add or update; 2) fallocate - used to pre-allocate space for files and to punch holes in files. This is preparation work for subsequent changes that implement the new features (computing total data size and use fallocate for better performance). This doesn't break compatibility with older kernels or clients. In order to get a version 2 send stream, new flags must be passed to the send ioctl. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Added new send ioctl flag BTRFS_SEND_FLAG_SUPPORT_FALLOCATE. A version 2 stream is now only produced if the ioctl caller specifies at least one of the new send flags (BTRFS_SEND_FLAG_SUPPORT_FALLOCATE or BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE). ioctl.h | 18 ++ send.h | 13 - 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/ioctl.h b/ioctl.h index 231660a..e2c506b 100644 --- a/ioctl.h +++ b/ioctl.h @@ -392,6 +392,24 @@ struct btrfs_ioctl_received_subvol_args { */ #define BTRFS_SEND_FLAG_OMIT_END_CMD 0x4 +/* + * The sum of all length fields the receiver will get in write, clone and + * fallocate commands. + * This can be used by the receiver to compute progress, at the expense of some + * initial metadata scan performed by the sender (kernel). + * + * Added in send stream version 2. + */ +#define BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE0x8 + +/* + * Use fallocate command to pre-allocate file extents and punch file holes, + * instead of write commands with data buffers filled with 0 value bytes. + * + * Added in send stream version 2. 
+ */ +#define BTRFS_SEND_FLAG_SUPPORT_FALLOCATE 0x10 + struct btrfs_ioctl_send_args { __s64 send_fd; /* in */ __u64 clone_sources_count; /* in */ diff --git a/send.h b/send.h index e8da785..69e81fb 100644 --- a/send.h +++ b/send.h @@ -24,7 +24,7 @@ extern C { #endif #define BTRFS_SEND_STREAM_MAGIC btrfs-stream -#define BTRFS_SEND_STREAM_VERSION 1 +#define BTRFS_SEND_STREAM_VERSION 2 #define BTRFS_SEND_BUF_SIZE (1024 * 64) #define BTRFS_SEND_READ_SIZE (1024 * 48) @@ -91,6 +91,11 @@ enum btrfs_send_cmd { BTRFS_SEND_C_END, BTRFS_SEND_C_UPDATE_EXTENT, + + /* added in stream version 2 */ + BTRFS_SEND_C_TOTAL_DATA_SIZE, + BTRFS_SEND_C_FALLOCATE, + __BTRFS_SEND_C_MAX, }; #define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1) @@ -129,10 +134,16 @@ enum { BTRFS_SEND_A_CLONE_OFFSET, BTRFS_SEND_A_CLONE_LEN, + /* added in stream version 2 */ + BTRFS_SEND_A_FALLOCATE_FLAGS, + __BTRFS_SEND_A_MAX, }; #define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1) +#define BTRFS_SEND_A_FALLOCATE_FLAG_KEEP_SIZE (1 0) +#define BTRFS_SEND_A_FALLOCATE_FLAG_PUNCH_HOLE (1 1) + #ifdef __KERNEL__ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg); #endif -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/4 v2] Btrfs-progs: send, implement total data size callback and progress report
This is a followup to the kernel patch titled: Btrfs: send, implement total data size command to allow for progress estimation This makes the btrfs send and receive commands aware of the new send flag, named BTRFS_SEND_C_TOTAL_DATA_SIZE, which tells us the amount of file data that is new between the parent and send snapshots/roots. As this command immediately follows the commands to start a snapshot/subvolume, it can be used to report and compute progress, by keeping a counter that is incremented with the data length of each write, clone and fallocate command that is received from the stream. Example: $ btrfs send -o /mnt/sdd/snap_base | btrfs receive /mnt/sdc At subvol /mnt/sdd/snap_base At subvol snap_base About to receive 9212392667 bytes Subvolume /mnt/sdc//snap_base, 4059722426 / 9212392667 bytes received, 44.07%, 40.32MB/s $ btrfs send -o -p /mnt/sdd/snap_base /mnt/sdd/snap_incr | btrfs receive /mnt/sdc At subvol /mnt/sdd/snap_incr At subvol snap_incr About to receive 9571342213 bytes Subvolume /mnt/sdc//snap_incr, 6557345221 / 9571342213 bytes received, 68.51%, 51.04MB/s At the moment progress is only reported by btrfs-receive, but it is possible and simple to do it for btrfs-send too, so that we can get progress report when not piping btrfs-send output to btrfs-receive (directly to a file). Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Added new send ioctl flag BTRFS_SEND_FLAG_SUPPORT_FALLOCATE. A version 2 stream is now only produced if the ioctl caller specifies at least one of the new send flags (BTRFS_SEND_FLAG_SUPPORT_FALLOCATE or BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE). 
Documentation/btrfs-send.txt | 3 ++ cmds-receive.c | 91 cmds-send.c | 14 ++- send-stream.c| 4 ++ send-stream.h| 1 + 5 files changed, 111 insertions(+), 2 deletions(-) diff --git a/Documentation/btrfs-send.txt b/Documentation/btrfs-send.txt index 18a98fa..38470b0 100644 --- a/Documentation/btrfs-send.txt +++ b/Documentation/btrfs-send.txt @@ -40,6 +40,9 @@ Use this snapshot as a clone source for an incremental send (multiple allowed). -f outfile:: Output is normally written to stdout. To write to a file, use this option. An alternative would be to use pipes. +-o:: +Obtain the total data size for each subvolume or snapshot to send. This demands additional +processing (mostly IO bound) but is useful for the receive command to report progress. EXIT STATUS --- diff --git a/cmds-receive.c b/cmds-receive.c index d6cd3da..19300fc 100644 --- a/cmds-receive.c +++ b/cmds-receive.c @@ -32,6 +32,7 @@ #include ftw.h #include wait.h #include assert.h +#include time.h #include sys/stat.h #include sys/types.h @@ -71,6 +72,14 @@ struct btrfs_receive struct subvol_uuid_search sus; int honor_end_cmd; + + /* For the subvolume/snapshot we're currently receiving. 
*/ + u64 total_data_size; + u64 bytes_received; + time_t last_progress_update; + u64 bytes_received_last_update; + float progress; + const char *target; }; static int finish_subvol(struct btrfs_receive *r) @@ -156,6 +165,12 @@ static int process_subvol(const char *path, const u8 *uuid, u64 ctransid, goto out; r-cur_subvol = calloc(1, sizeof(*r-cur_subvol)); + r-total_data_size = 0; + r-bytes_received = 0; + r-progress = 0.0; + r-last_progress_update = 0; + r-bytes_received_last_update = 0; + r-target = Subvolume; if (strlen(r-dest_dir_path) == 0) r-cur_subvol-path = strdup(path); @@ -205,6 +220,12 @@ static int process_snapshot(const char *path, const u8 *uuid, u64 ctransid, goto out; r-cur_subvol = calloc(1, sizeof(*r-cur_subvol)); + r-total_data_size = 0; + r-bytes_received = 0; + r-progress = 0.0; + r-last_progress_update = 0; + r-bytes_received_last_update = 0; + r-target = Snapshot; if (strlen(r-dest_dir_path) == 0) r-cur_subvol-path = strdup(path); @@ -287,6 +308,73 @@ out: return ret; } +static int process_total_data_size(u64 size, void *user) +{ + struct btrfs_receive *r = user; + + r-total_data_size = size; + fprintf(stdout, About to receive %llu bytes\n, size); + + return 0; +} + +static void update_progress(struct btrfs_receive *r, u64 bytes) +{ + float new_progress; + time_t now; + time_t tdiff; + + if (r-total_data_size == 0) + return; + + r-bytes_received += bytes; + + now = time(NULL); + tdiff = now - r-last_progress_update; + if (tdiff 1) { + if (r-bytes_received == r-total_data_size) + fprintf(stdout, \n); + return; + } + + new_progress = ((float
[PATCH 3/4 v2] Btrfs-progs: send, implement fallocate command callback
The fallocate send stream command, added in stream version 2, is used to pre-allocate space for files and punch file holes. This change implements the callback for that new command, using the fallocate function from the standard C library to carry out the specified action (allocate file space or punch a file hole). Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Use the new send ioctl flag BTRFS_SEND_FLAG_SUPPORT_FALLOCATE if the user asks for it (-a command line option), which will make the kernel generate a version 2 send stream, so that old clients aren't affected. Documentation/btrfs-send.txt | 3 +++ cmds-receive.c | 38 ++ cmds-send.c | 12 ++-- send-stream.c| 13 + send-stream.h| 2 ++ 5 files changed, 66 insertions(+), 2 deletions(-) diff --git a/Documentation/btrfs-send.txt b/Documentation/btrfs-send.txt index 38470b0..e96be07 100644 --- a/Documentation/btrfs-send.txt +++ b/Documentation/btrfs-send.txt @@ -43,6 +43,9 @@ An alternative would be to use pipes. -o:: Obtain the total data size for each subvolume or snapshot to send. This demands additional processing (mostly IO bound) but is useful for the receive command to report progress. +-a:: +Use fallocate to pre-allocate file extents and to punch file holes, instead of writing zeroes +to files. 
EXIT STATUS --- diff --git a/cmds-receive.c b/cmds-receive.c index 19300fc..3f30066 100644 --- a/cmds-receive.c +++ b/cmds-receive.c @@ -41,6 +41,7 @@ #include sys/types.h #include sys/xattr.h #include uuid/uuid.h +#include linux/falloc.h #include ctree.h #include ioctl.h @@ -887,6 +888,42 @@ out: return ret; } +static int process_fallocate(const char *path, u32 flags, u64 offset, +u64 len, void *user) +{ + struct btrfs_receive *r = user; + char *full_path = path_cat(r-full_subvol_path, path); + int mode = 0; + int ret; + + if (flags BTRFS_SEND_A_FALLOCATE_FLAG_KEEP_SIZE) + mode |= FALLOC_FL_KEEP_SIZE; + if (flags BTRFS_SEND_A_FALLOCATE_FLAG_PUNCH_HOLE) + mode |= FALLOC_FL_PUNCH_HOLE; + + if (g_verbose = 2) + fprintf(stderr, + fallocate %s - flags %u, offset %llu, len %llu\n, + path, flags, offset, len); + + ret = open_inode_for_write(r, full_path); + if (ret 0) + goto out; + + ret = fallocate(r-write_fd, mode, offset, len); + if (ret) { + ret = -errno; + fprintf(stderr, + ERROR: fallocate against %s failed. 
%s\n, + path, strerror(-ret)); + goto out; + } + update_progress(r, len); + +out: + free(full_path); + return ret; +} static struct btrfs_send_ops send_ops = { .subvol = process_subvol, @@ -910,6 +947,7 @@ static struct btrfs_send_ops send_ops = { .chown = process_chown, .utimes = process_utimes, .total_data_size = process_total_data_size, + .fallocate = process_fallocate, }; static int do_receive(struct btrfs_receive *r, const char *tomnt, int r_fd) diff --git a/cmds-send.c b/cmds-send.c index 69f5ba1..2a62e68 100644 --- a/cmds-send.c +++ b/cmds-send.c @@ -46,6 +46,7 @@ static int g_verbose = 0; static int g_total_data_size = 0; +static int g_fallocate = 0; struct btrfs_send { int send_fd; @@ -284,6 +285,8 @@ static int do_send(struct btrfs_send *send, u64 parent_root_id, io_send.flags |= BTRFS_SEND_FLAG_OMIT_END_CMD; if (g_total_data_size) io_send.flags |= BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE; + if (g_fallocate) + io_send.flags |= BTRFS_SEND_FLAG_SUPPORT_FALLOCATE; ret = ioctl(subvol_fd, BTRFS_IOC_SEND, io_send); if (ret) { ret = -errno; @@ -427,7 +430,7 @@ int cmd_send(int argc, char **argv) memset(send, 0, sizeof(send)); send.dump_fd = fileno(stdout); - while ((c = getopt(argc, argv, veoc:f:i:p:)) != -1) { + while ((c = getopt(argc, argv, veoac:f:i:p:)) != -1) { switch (c) { case 'v': g_verbose++; @@ -517,6 +520,9 @@ int cmd_send(int argc, char **argv) case 'o': g_total_data_size = 1; break; + case 'a': + g_fallocate = 1; + break; case '?': default: fprintf(stderr, ERROR: send args invalid.\n); @@ -679,7 +685,7 @@ out: } const char * const cmd_send_usage[] = { - btrfs send [-veo] [-p parent] [-c clone-src] [-f outfile] subvol [subvol...], + btrfs send [-veoa] [-p parent] [-c clone-src] [-f outfile] subvol [subvol...], Send
[PATCH 2/4 v2] Btrfs: send, implement total data size command to allow for progress estimation
This new send flag makes send calculate first the amount of new file data (in bytes) the send root has relative to the parent root, or for the case of a non-incremental send, the total amount of file data the stream will create (including holes and prealloc extents). In other words, it computes the sum of the lengths of all write, clone and fallocate operations that will be sent through the send stream. This data size value is sent in a new command, named BTRFS_SEND_C_TOTAL_DATA_SIZE, that immediately follows a BTRFS_SEND_C_SUBVOL or BTRFS_SEND_C_SNAPSHOT command, and precedes any command that changes a file or the filesystem hierarchy. Upon receiving a write, clone or fallocate command, the receiving end can increment a counter by the data length of that command and therefore report progress by comparing the counter's value with the data size value received in the BTRFS_SEND_C_TOTAL_DATA_SIZE command. The approach is simple, before the normal operation of send, do a scan in the file system tree for new inodes and new/changed file extent items, just like in send's normal operation, and keep incrementing a counter with new inodes' size and the size of file extents (and file holes) that are going to be written, cloned or fallocated. This is actually a simpler and more lightweight tree scan/processing than the one we do when sending the changes, as it doesn't process inode references nor does any lookups in the extent tree for example. 
After modifying btrfs-progs to understand this new command and report progress, here's an example (the -o flag tells btrfs send to pass the new flag to the kernel's send ioctl): $ btrfs send -o /mnt/sdd/snap_base | btrfs receive /mnt/sdc At subvol /mnt/sdd/snap_base At subvol snap_base About to receive 9212392667 bytes Subvolume /mnt/sdc//snap_base, 4059722426 / 9212392667 bytes received, 44.07%, 40.32MB/s $ btrfs send -o -p /mnt/sdd/snap_base /mnt/sdd/snap_incr | btrfs receive /mnt/sdc At subvol /mnt/sdd/snap_incr At subvol snap_incr About to receive 9571342213 bytes Subvolume /mnt/sdc//snap_incr, 6557345221 / 9571342213 bytes received, 68.51%, 51.04MB/s Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: A v2 stream is now only produced if the send ioctl caller passes in one of the new flags (BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE | BTRFS_SEND_FLAG_SUPPORT_FALLOCATE) to avoid breaking old clients. fs/btrfs/send.c | 194 ++-- 1 file changed, 162 insertions(+), 32 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 53712aa..f5db492 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -81,7 +81,13 @@ struct clone_root { #define SEND_CTX_MAX_NAME_CACHE_SIZE 128 #define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2) +enum btrfs_send_phase { + SEND_PHASE_STREAM_CHANGES, + SEND_PHASE_COMPUTE_DATA_SIZE, +}; + struct send_ctx { + enum btrfs_send_phase phase; struct file *send_filp; loff_t send_off; char *send_buf; @@ -116,6 +122,7 @@ struct send_ctx { u64 cur_inode_last_extent; u64 send_progress; + u64 total_data_size; struct list_head new_refs; struct list_head deleted_refs; @@ -692,6 +699,8 @@ static int send_rename(struct send_ctx *sctx, { int ret; + ASSERT(sctx-phase != SEND_PHASE_COMPUTE_DATA_SIZE); + verbose_printk(btrfs: send_rename %s - %s\n, from-start, to-start); ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME); @@ -716,6 +725,8 @@ static int send_link(struct send_ctx *sctx, { int ret; + ASSERT(sctx-phase != 
SEND_PHASE_COMPUTE_DATA_SIZE); + verbose_printk(btrfs: send_link %s - %s\n, path-start, lnk-start); ret = begin_cmd(sctx, BTRFS_SEND_C_LINK); @@ -739,6 +750,8 @@ static int send_unlink(struct send_ctx *sctx, struct fs_path *path) { int ret; + ASSERT(sctx-phase != SEND_PHASE_COMPUTE_DATA_SIZE); + verbose_printk(btrfs: send_unlink %s\n, path-start); ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK); @@ -761,6 +774,8 @@ static int send_rmdir(struct send_ctx *sctx, struct fs_path *path) { int ret; + ASSERT(sctx-phase != SEND_PHASE_COMPUTE_DATA_SIZE); + verbose_printk(btrfs: send_rmdir %s\n, path-start); ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR); @@ -2308,6 +2323,9 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) int ret = 0; struct fs_path *p; + if (sctx-phase == SEND_PHASE_COMPUTE_DATA_SIZE) + return 0; + verbose_printk(btrfs: send_truncate %llu size=%llu\n, ino, size); p = fs_path_alloc(); @@ -2337,6 +2355,8 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode) int ret = 0; struct fs_path *p; + ASSERT(sctx-phase != SEND_PHASE_COMPUTE_DATA_SIZE); + verbose_printk(btrfs: send_chmod
[PATCH 4/4 v2] Btrfs-progs: add write and clone commands debug info to receive
When specifying -vv print information about received write and clone commands too, as we do this for other commands already and it's very useful for debugging and troubleshooting. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Added new send ioctl flag BTRFS_SEND_FLAG_SUPPORT_FALLOCATE. A version 2 stream is now only produced if the ioctl caller specifies at least one of the new send flags (BTRFS_SEND_FLAG_SUPPORT_FALLOCATE or BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE). cmds-receive.c | 9 + 1 file changed, 9 insertions(+) diff --git a/cmds-receive.c b/cmds-receive.c index 3f30066..7a23823 100644 --- a/cmds-receive.c +++ b/cmds-receive.c @@ -636,6 +636,10 @@ static int process_write(const char *path, const void *data, u64 offset, u64 pos = 0; int w; + if (g_verbose = 2) + fprintf(stderr, write %s, offset %llu, len %llu\n, + path, offset, len); + ret = open_inode_for_write(r, full_path); if (ret 0) goto out; @@ -672,6 +676,11 @@ static int process_clone(const char *path, u64 offset, u64 len, char *full_clone_path = NULL; int clone_fd = -1; + if (g_verbose = 2) + fprintf(stderr, + clone %s, offset %llu, len %llu, clone path %s, clone offset %llu\n, + path, offset, len, clone_path, clone_offset); + ret = open_inode_for_write(r, full_path); if (ret 0) goto out; -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2] xfstests: btrfs, test send's ability to punch holes and prealloc extents
This test verifies that after an incremental btrfs send the replicated file has the same exact hole and data structure as in the origin filesystem. This didn't use to be the case before the send stream version 2 - holes were sent as write operations of 0 valued bytes instead of punching holes with the fallocate system call, and pre-allocated extents were sent as well as write operations of 0 valued bytes instead of instructions for the receiver to use the fallocate system call. It also checks that prealloc extents that lie beyond the file's size are replicated by an incremental send. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Addressed Dave's comments, and updated btrfs send invocation, by specifying the new command line option (-a) that enables use of fallocate - added function _require_btrfs_send_fallocate_flag() to skip the test when an old version of btrfs-progs is found. common/rc | 9 tests/btrfs/047 | 121 tests/btrfs/047.out | 27 tests/btrfs/group | 1 + 4 files changed, 158 insertions(+) create mode 100755 tests/btrfs/047 create mode 100644 tests/btrfs/047.out diff --git a/common/rc b/common/rc index acf419b..e94e51c 100644 --- a/common/rc +++ b/common/rc @@ -2262,6 +2262,15 @@ _run_btrfs_util_prog() run_check $BTRFS_UTIL_PROG $* } +_require_btrfs_send_fallocate_flag() +{ + $BTRFS_UTIL_PROG send 21 | \ + grep '^[ \t]*\-a[ \t]\+.* fallocate ' /dev/null 21 + if [ $? -ne 0 ]; then + _notrun Missing btrfs-progs send -a command line option, skipped this test + fi +} + init_rc() { if [ $iam == new ] diff --git a/tests/btrfs/047 b/tests/btrfs/047 new file mode 100755 index 000..c8171a5 --- /dev/null +++ b/tests/btrfs/047 @@ -0,0 +1,121 @@ +#! /bin/bash +# FS QA Test No. btrfs/047 +# +# Verify that after an incremental btrfs send the replicated file has +# the same exact hole and data structure as in the origin filesystem. 
+# This didn't use to be the case before the send stream version 2 - +# holes were sent as write operations of 0 valued bytes instead of punching +# holes with the fallocate system call, and pre-allocated extents were sent +# as well as write operations of 0 valued bytes instead of intructions for +# the receiver to use the fallocate system call. Also check that prealloc +# extents that lie beyond the file's size are replicated by an incremental +# send. +# +# More specifically, this structure preserving guarantee was added by the +# following linux kernel commits: +# +#Btrfs: send, use fallocate command to punch holes +#Btrfs: send, use fallocate command to allocate extents +# +#--- +# Copyright (c) 2014 Filipe Manana. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq + +tmp=/tmp/$$ +status=1 # failure is the default! +trap _cleanup; exit \$status 0 1 2 3 15 + +_cleanup() +{ +rm -fr $send_files_dir +rm -fr $tmp +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter +. 
./common/punch + +# real QA test starts here +_supported_fs btrfs +_supported_os Linux +_require_scratch +_require_fssum +_require_xfs_io_fiemap +_require_btrfs_send_fallocate_flag +_need_to_be_root + +send_files_dir=$TEST_DIR/btrfs-test-$seq + +rm -f $seqres.full +rm -fr $send_files_dir +mkdir $send_files_dir + +_scratch_mkfs /dev/null 21 +_scratch_mount + +$XFS_IO_PROG -f -c pwrite -S 0x01 -b 30 0 30 $SCRATCH_MNT/foo \ + | _filter_xfs_io + +_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT $SCRATCH_MNT/mysnap1 + +$XFS_IO_PROG -c fpunch 10 5 $SCRATCH_MNT/foo +$XFS_IO_PROG -c falloc 10 5 $SCRATCH_MNT/foo +$XFS_IO_PROG -c pwrite -S 0xff -b 1000 12 1000 $SCRATCH_MNT/foo \ + | _filter_xfs_io +$XFS_IO_PROG -c fpunch 25 2 $SCRATCH_MNT/foo + +$XFS_IO_PROG -c falloc -k 30 100 $SCRATCH_MNT/foo +$XFS_IO_PROG -c falloc -k 900 200 $SCRATCH_MNT/foo + +_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT $SCRATCH_MNT/mysnap2 + +_run_btrfs_util_prog send -a $SCRATCH_MNT/mysnap1 -f
[PATCH v2] xfstests: btrfs, add test for btrfs properties
This test case verifies the btrfs properties feature, a new feature introduced in the linux kernel version 3.14. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: Addressed Dave's comments, removed function to check for existence of the btrfs-progs property command and use instead existing function _require_btrfs which checks if a btrfs-progs command exists and is equivalent to what I had before. tests/btrfs/048 | 220 tests/btrfs/048.out | 78 +++ tests/btrfs/group | 1 + 3 files changed, 299 insertions(+) create mode 100755 tests/btrfs/048 create mode 100644 tests/btrfs/048.out diff --git a/tests/btrfs/048 b/tests/btrfs/048 new file mode 100755 index 000..e998f97 --- /dev/null +++ b/tests/btrfs/048 @@ -0,0 +1,220 @@ +#! /bin/bash +# FS QA Test No. btrfs/048 +# +# Btrfs properties test. The btrfs properties feature was introduced in the +# linux kernel 3.14. +# +#--- +# Copyright (c) 2014 Filipe Manana. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq + +here=`pwd` +tmp=/tmp/$$ + +status=1 # failure is the default! +trap _cleanup; exit \$status 0 1 2 3 15 + +_cleanup() +{ +rm -fr $send_files_dir +rm -fr $tmp +} + +# get standard environment, filters and checks +. ./common/rc +. 
./common/filter + +# real QA test starts here +_supported_fs btrfs +_supported_os Linux +_require_scratch +_require_btrfs property +_need_to_be_root + +send_files_dir=$TEST_DIR/btrfs-test-$seq + +rm -f $seqres.full +rm -fr $send_files_dir +mkdir $send_files_dir + +_scratch_mkfs /dev/null 21 +_scratch_mount + +echo Testing label property +$BTRFS_UTIL_PROG property get $SCRATCH_MNT label +echo *** +$BTRFS_UTIL_PROG property set $SCRATCH_MNT label foobar +$BTRFS_UTIL_PROG property get $SCRATCH_MNT label +echo *** +$BTRFS_UTIL_PROG property get $SCRATCH_MNT +echo *** +$BTRFS_UTIL_PROG property set $SCRATCH_MNT label '' +$BTRFS_UTIL_PROG property get $SCRATCH_MNT label +echo *** +mkdir $SCRATCH_MNT/testdir +$BTRFS_UTIL_PROG property get $SCRATCH_MNT/testdir label +echo *** + +echo -e \nTesting subvolume ro property +_run_btrfs_util_prog subvolume create $SCRATCH_MNT/sv1 +$BTRFS_UTIL_PROG property get $SCRATCH_MNT/sv1 ro +echo *** +$BTRFS_UTIL_PROG property set $SCRATCH_MNT/sv1 ro foo +echo *** +$BTRFS_UTIL_PROG property set $SCRATCH_MNT/sv1 ro true +echo *** +$BTRFS_UTIL_PROG property get $SCRATCH_MNT/sv1 ro +echo *** +touch $SCRATCH_MNT/sv1/foobar 21 | _filter_scratch +echo *** +$BTRFS_UTIL_PROG property set $SCRATCH_MNT/sv1 ro false +touch $SCRATCH_MNT/sv1/foobar 21 | _filter_scratch +$BTRFS_UTIL_PROG property get $SCRATCH_MNT/sv1 +echo *** + +echo -e \nTesting compression property +mkdir $SCRATCH_MNT/testdir/subdir1 +touch $SCRATCH_MNT/testdir/file1 +$BTRFS_UTIL_PROG property get $SCRATCH_MNT/testdir/file1 compression +$BTRFS_UTIL_PROG property get $SCRATCH_MNT/testdir/subdir1 compression +echo *** +$BTRFS_UTIL_PROG property set $SCRATCH_MNT/testdir/file1 compression \ + foo 21 | _filter_scratch +echo *** +$BTRFS_UTIL_PROG property set $SCRATCH_MNT/testdir/file1 compression lzo +$BTRFS_UTIL_PROG property get $SCRATCH_MNT/testdir/file1 compression + +# Verify property was persisted. 
+_scratch_unmount +_check_scratch_fs +_scratch_mount +$BTRFS_UTIL_PROG property get $SCRATCH_MNT/testdir/file1 compression +$BTRFS_UTIL_PROG property set $SCRATCH_MNT/testdir/file1 compression zlib +$BTRFS_UTIL_PROG property get $SCRATCH_MNT/testdir/file1 compression +$BTRFS_UTIL_PROG property set $SCRATCH_MNT/testdir/file1 compression '' +$BTRFS_UTIL_PROG property get $SCRATCH_MNT/testdir/file1 compression + +# Test compression property inheritance. +echo *** +$BTRFS_UTIL_PROG property set $SCRATCH_MNT/testdir/subdir1 compression lzo +$BTRFS_UTIL_PROG property get $SCRATCH_MNT/testdir/subdir1 compression +echo *** +mkdir $SCRATCH_MNT/testdir/subdir1/subsubdir +touch $SCRATCH_MNT/testdir/subdir1/some_file +$BTRFS_UTIL_PROG property get $SCRATCH_MNT/testdir/subdir1/subsubdir compression +echo *** +$BTRFS_UTIL_PROG property get $SCRATCH_MNT/testdir/subdir1/some_file compression +echo *** +mkdir
[PATCH 3/4 v3] Btrfs: send, use fallocate command to punch holes
Instead of sending a write command with a data buffer filled with 0 value bytes, use the fallocate command, introduced in the send stream version 2, to tell the receiver to punch a file hole using the fallocate system call. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- V2: A v2 stream is now only produced if the send ioctl caller passes in one of the new flags (BTRFS_SEND_FLAG_CALCULATE_DATA_SIZE | BTRFS_SEND_FLAG_SUPPORT_FALLOCATE) to avoid breaking old clients. V3: Added missing path allocation, messed up rebase. fs/btrfs/send.c | 55 --- fs/btrfs/send.h | 4 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f5db492..bb9afea 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -564,6 +564,7 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) return tlv_put(sctx, attr, __tmp, sizeof(__tmp)); \ } +TLV_PUT_DEFINE_INT(32) TLV_PUT_DEFINE_INT(64) static int tlv_put_string(struct send_ctx *sctx, u16 attr, @@ -4483,18 +4484,59 @@ out: return ret; } +static int send_fallocate(struct send_ctx *sctx, u32 flags, + u64 offset, u64 len) +{ + struct fs_path *p = NULL; + int ret = 0; + + ASSERT(sctx-flags BTRFS_SEND_FLAG_SUPPORT_FALLOCATE); + + if (sctx-phase == SEND_PHASE_COMPUTE_DATA_SIZE) { + sctx-total_data_size += len; + return 0; + } + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + ret = get_cur_path(sctx, sctx-cur_ino, sctx-cur_inode_gen, p); + if (ret 0) + goto out; + + ret = begin_cmd(sctx, BTRFS_SEND_C_FALLOCATE); + if (ret 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U32(sctx, BTRFS_SEND_A_FALLOCATE_FLAGS, flags); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len); + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + return ret; +} + static int send_hole(struct send_ctx *sctx, u64 end) { struct fs_path *p = NULL; u64 offset = sctx-cur_inode_last_extent; - u64 len; + u64 len = 
end - offset; int ret = 0; if (sctx-phase == SEND_PHASE_COMPUTE_DATA_SIZE) { - sctx-total_data_size += end - offset; + sctx-total_data_size += len; return 0; } + if (sctx-flags BTRFS_SEND_FLAG_SUPPORT_FALLOCATE) + return send_fallocate(sctx, + BTRFS_SEND_PUNCH_HOLE_FALLOC_FLAGS, + offset, + len); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -4551,7 +4593,8 @@ static int send_write_or_clone(struct send_ctx *sctx, len = btrfs_file_extent_num_bytes(path-nodes[0], ei); } - if (offset + len sctx-cur_inode_size) + if (offset sctx-cur_inode_size + offset + len sctx-cur_inode_size) len = sctx-cur_inode_size - offset; if (len == 0) { ret = 0; @@ -4568,6 +4611,12 @@ static int send_write_or_clone(struct send_ctx *sctx, ret = send_clone(sctx, offset, len, clone_root); } else if (sctx-flags BTRFS_SEND_FLAG_NO_FILE_DATA) { ret = send_update_extent(sctx, offset, len); + } else if (btrfs_file_extent_disk_bytenr(path-nodes[0], ei) == 0 + type != BTRFS_FILE_EXTENT_INLINE + (sctx-flags BTRFS_SEND_FLAG_SUPPORT_FALLOCATE) + offset sctx-cur_inode_size) { + ret = send_fallocate(sctx, BTRFS_SEND_PUNCH_HOLE_FALLOC_FLAGS, +offset, len); } else { while (pos len) { l = len - pos; diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 367030d..a632c0d 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -141,6 +141,10 @@ enum { #define BTRFS_SEND_A_FALLOCATE_FLAG_KEEP_SIZE (1 0) #define BTRFS_SEND_A_FALLOCATE_FLAG_PUNCH_HOLE (1 1) +#define BTRFS_SEND_PUNCH_HOLE_FALLOC_FLAGS\ + (BTRFS_SEND_A_FALLOCATE_FLAG_KEEP_SIZE | \ +BTRFS_SEND_A_FALLOCATE_FLAG_PUNCH_HOLE) + #ifdef __KERNEL__ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg); #endif -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html