Re: [PATCH 1/5] Btrfs-progs: fix closing of devices
On 11/06/2013, at 16:00, David Sterba dste...@suse.cz wrote: On Mon, Jun 10, 2013 at 09:07:55PM +0100, Filipe David Borba Manana wrote: --- a/disk-io.c +++ b/disk-io.c @@ -1267,12 +1267,12 @@ static int close_all_devices(struct btrfs_fs_info *fs_info) while (!list_empty(list)) { device = list_entry(list->next, struct btrfs_device, dev_list); list_del_init(&device->dev_list); -if (device->fd) { +if (device->fd > 0) { -device = kmalloc(sizeof(*device), GFP_NOFS); +device = kzalloc(sizeof(*device), GFP_NOFS); if (!device) return -ENOMEM; -device->total_ios = 0; Can you please describe the incremental change between the two patches? Why do you exclude fd == 0? Sorry, I messed up my git send-email foo. I excluded 0 because the original code did it too, however it was logically incorrect. The last patch version (v2, 3rd email) does not exclude 0 and ensures that -1 is used everywhere as a marker for invalid fd. (Hopefully now I figured out how to use git send-email to update a patch correctly) Thanks david -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: read lock extent buffer while walking backrefs
Before processing the extent buffer, acquire a read lock on it, so that we're safe against concurrent updates on the extent buffer. Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/backref.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index e25564b..a1efd39 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1001,8 +1001,11 @@ again: ret = -EIO; goto out; } + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); ret = find_extent_in_eb(eb, bytenr, *extent_item_pos, eie); + btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); if (ret < 0) goto out; -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: reduce size of struct extent_state
The tree field of struct extent_state was only used to figure out if an extent state was connected to an inode's io tree or not. For this we can just use the rb_node field itself. On a x86_64 system with this change the sizeof(struct extent_state) is reduced from 96 bytes down to 88 bytes, meaning that with a page size of 4096 bytes we can now store 46 extent states per page instead of 42. Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/extent_io.c | 37 - fs/btrfs/extent_io.h | 1 - 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 23398ad..7e44e18 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -25,6 +25,11 @@ static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; static struct bio_set *btrfs_bioset; +static inline bool extent_state_in_tree(const struct extent_state *state) +{ + return !RB_EMPTY_NODE(state-rb_node); +} + #ifdef CONFIG_BTRFS_DEBUG static LIST_HEAD(buffers); static LIST_HEAD(states); @@ -59,9 +64,9 @@ void btrfs_leak_debug_check(void) while (!list_empty(states)) { state = list_entry(states.next, struct extent_state, leak_list); - printk(KERN_ERR BTRFS: state leak: start %llu end %llu - state %lu in tree %p refs %d\n, - state-start, state-end, state-state, state-tree, + pr_err(BTRFS: state leak: start %llu end %llu state %lu in tree %d refs %d\n, + state-start, state-end, state-state, + extent_state_in_tree(state), atomic_read(state-refs)); list_del(state-leak_list); kmem_cache_free(extent_state_cache, state); @@ -209,7 +214,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) return state; state-state = 0; state-private = 0; - state-tree = NULL; + RB_CLEAR_NODE(state-rb_node); btrfs_leak_debug_add(state-leak_list, states); atomic_set(state-refs, 1); init_waitqueue_head(state-wq); @@ -222,7 +227,7 @@ void free_extent_state(struct extent_state *state) if (!state) return; if (atomic_dec_and_test(state-refs)) { - 
WARN_ON(state-tree); + WARN_ON(extent_state_in_tree(state)); btrfs_leak_debug_del(state-leak_list); trace_free_extent_state(state, _RET_IP_); kmem_cache_free(extent_state_cache, state); @@ -371,8 +376,8 @@ static void merge_state(struct extent_io_tree *tree, other-state == state-state) { merge_cb(tree, state, other); state-start = other-start; - other-tree = NULL; rb_erase(other-rb_node, tree-state); + RB_CLEAR_NODE(other-rb_node); free_extent_state(other); } } @@ -383,8 +388,8 @@ static void merge_state(struct extent_io_tree *tree, other-state == state-state) { merge_cb(tree, state, other); state-end = other-end; - other-tree = NULL; rb_erase(other-rb_node, tree-state); + RB_CLEAR_NODE(other-rb_node); free_extent_state(other); } } @@ -442,7 +447,6 @@ static int insert_state(struct extent_io_tree *tree, found-start, found-end, start, end); return -EEXIST; } - state-tree = tree; merge_state(tree, state); return 0; } @@ -486,7 +490,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, free_extent_state(prealloc); return -EEXIST; } - prealloc-tree = tree; return 0; } @@ -524,9 +527,9 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, wake_up(state-wq); if (state-state == 0) { next = next_state(state); - if (state-tree) { + if (extent_state_in_tree(state)) { rb_erase(state-rb_node, tree-state); - state-tree = NULL; + RB_CLEAR_NODE(state-rb_node); free_extent_state(state); } else { WARN_ON(1); @@ -606,8 +609,8 @@ again: cached_state = NULL; } - if (cached cached-tree cached-start = start - cached-end start) { + if (cached extent_state_in_tree(cached) + cached-start = start cached-end start) { if (clear) atomic_dec(cached-refs
[PATCH] Btrfs: set error return value in btrfs_get_blocks_direct
We were returning with 0 (success) because we weren't extracting the error code from em (PTR_ERR(em)). Fix it. Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6b65fab..8a946c0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6998,8 +6998,10 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, block_start, len, orig_block_len, ram_bytes, type); - if (IS_ERR(em)) + if (IS_ERR(em)) { + ret = PTR_ERR(em); goto unlock_err; + } } ret = btrfs_add_ordered_extent_dio(inode, start, -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: race free update of commit root for ro snapshots
This is a better solution for the problem addressed in the following commit: Btrfs: update commit root on snapshot creation after orphan cleanup (3821f348889e506efbd268cc8149e0ebfa47c4e5) The previous solution wasn't the best for 2 reasons: 1) It added another full transaction commit, which is more expensive than just swapping the commit root with the root; 2) Not completely race-free. As soon as the transaction commits, the snapshot becomes visible from user space, and before we do the orphan cleanup, user space can ask for a send operation that uses the new snapshot. This change addresses those 2 issues. Special thanks to Alex Lyakas for spotting the second issue. Cc: Alex Lyakas alex.bt...@zadarastorage.com Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/inode.c | 29 + fs/btrfs/ioctl.c | 29 - 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1d5f0b3..982a8f7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5227,6 +5227,35 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) iput(inode); inode = ERR_PTR(ret); } + /* +* If orphan cleanup did remove any orphans, it means the tree +* was modified and therefore the commit root is not the same as +* the current root anymore. This is a problem, because send +* uses the commit root and therefore can see inode items that +* don't exist in the current root anymore, and for example make +* calls to btrfs_iget, which will do tree lookups based on the +* current root and not on the commit root. Those lookups will +* fail, returning a -ESTALE error, and making send fail with +* that error. So make sure a send does not see any orphans we +* have just removed, and that it will see the same inodes +* regardless of whether a transaction commit happened before +* it started (meaning that the commit root will be the same as +* the current root) or not. 
+*/ + if (sub_root-node != sub_root-commit_root) { + u64 sub_flags = btrfs_root_flags(sub_root-root_item); + + if (sub_flags BTRFS_ROOT_SUBVOL_RDONLY) { + struct extent_buffer *eb; + + down_write(root-fs_info-commit_root_sem); + eb = sub_root-commit_root; + sub_root-commit_root = + btrfs_root_node(sub_root); + up_write(root-fs_info-commit_root_sem); + free_extent_buffer(eb); + } + } } return inode; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2a30ac1..d44abc0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -715,35 +715,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto fail; - /* -* If orphan cleanup did remove any orphans, it means the tree was -* modified and therefore the commit root is not the same as the -* current root anymore. This is a problem, because send uses the -* commit root and therefore can see inode items that don't exist -* in the current root anymore, and for example make calls to -* btrfs_iget, which will do tree lookups based on the current root -* and not on the commit root. Those lookups will fail, returning a -* -ESTALE error, and making send fail with that error. So make sure -* a send does not see any orphans we have just removed, and that it -* will see the same inodes regardless of whether a transaction -* commit happened before it started (meaning that the commit root -* will be the same as the current root) or not. -*/ - if (readonly pending_snapshot-snap-node != - pending_snapshot-snap-commit_root) { - trans = btrfs_join_transaction(pending_snapshot-snap); - if (IS_ERR(trans) PTR_ERR(trans) != -ENOENT) { - ret = PTR_ERR(trans); - goto fail; - } - if (!IS_ERR(trans)) { - ret = btrfs_commit_transaction(trans, - pending_snapshot-snap); - if (ret) - goto fail; - } - } - inode = btrfs_lookup_dentry(dentry-d_parent-d_inode, dentry); if (IS_ERR(inode)) { ret = PTR_ERR(inode
[PATCH] Btrfs: unlock nodes earlier when inserting items in a btree
In ctree.c:setup_items_for_insert(), we can unlock all nodes in our path before we process the leaf (shift items and data, adjust data offsets, etc.). This allows for better btree concurrency, as we're often holding a write lock on at least the node at level 1. Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/ctree.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 263145b..bd0ae3e 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4738,6 +4738,12 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, int slot; struct btrfs_map_token token; + if (path->slots[0] == 0) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key); + fixup_low_keys(root, path, &disk_key, 1); + } + btrfs_unlock_up_safe(path, 1); + btrfs_init_map_token(&token); leaf = path->nodes[0]; @@ -4798,12 +4804,6 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, } btrfs_set_header_nritems(leaf, nritems + nr); - - if (slot == 0) { - btrfs_cpu_key_to_disk(&disk_key, cpu_key); - fixup_low_keys(root, path, &disk_key, 1); - } - btrfs_unlock_up_safe(path, 1); btrfs_mark_buffer_dirty(leaf); if (btrfs_leaf_free_space(root, leaf) < 0) { -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: avoid unnecessary switch of path locks to blocking mode
If we need to cow a node, increase the write lock level and retry the tree search, there's no point in changing the node locks in our path to blocking mode, as we only waste time and unnecessarily wake up other tasks waiting on the spinning locks (just to block them again shortly after) because we release our path before repeating the tree search. Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/ctree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index bd0ae3e..783ea3b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2792,8 +2792,6 @@ again: if (!should_cow_block(trans, root, b)) goto cow_done; - btrfs_set_path_blocking(p); - /* * must have write locks on this node and the * parent @@ -2807,6 +2805,7 @@ again: goto again; } + btrfs_set_path_blocking(p); err = btrfs_cow_block(trans, root, b, p->nodes[level + 1], p->slots[level + 1], &b); -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: return path with unlocked nodes in btrfs_next_leaf
Calling unlock_up() to release our new path doesn't release the read lock on the node at level 1, because our return path has path->slots[0] == 0, which makes unlock_up() skip unlocking that node. Since we don't need to return that node locked, call btrfs_unlock_up_safe() instead of unlock_up(), which will release all nodes in the path (except the leaf of course). For any level N >= 2, the corresponding node lock isn't released by unlock_up() too if path->slots[N - 1] == 0. Releasing the read lock immediately will allow concurrent writers to write lock that node at level 1 (or higher levels if applicable) while the btrfs_next_leaf() caller processes the leaf. Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/ctree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 783ea3b..8ca6761 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5833,7 +5833,7 @@ again: } ret = 0; done: - unlock_up(path, 0, 1, 0, NULL); + btrfs_unlock_up_safe(path, 1); path->leave_spinning = old_spinning; if (!old_spinning) btrfs_set_path_blocking(path); -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2] Btrfs: race free update of commit root for ro snapshots
This is a better solution for the problem addressed in the following commit: Btrfs: update commit root on snapshot creation after orphan cleanup (3821f348889e506efbd268cc8149e0ebfa47c4e5) The previous solution wasn't the best because of 2 reasons: 1) It added another full transaction commit, which is more expensive than just swapping the commit root with the root; 2) If a reboot happened after the first transaction commit (the one that creates the snapshot) and before the second transaction commit, then we would end up with the same problem if a send using that snapshot was requested before the first transaction commit after the reboot. This change addresses those 2 issues. The second issue is addressed by switching the commit root in the dentry lookup VFS callback, which is also called by the snapshot/subvol creation ioctl and performs orphan cleanup if needed. Like the vfs, the ioctl locks the parent inode too, preventing race issues between a dentry lookup and snapshot creation. Cc: Alex Lyakas alex.bt...@zadarastorage.com Signed-off-by: Filipe Manana fdman...@suse.com --- V2: Updated commit message, as original second issue was not correct. Removed redundant btrfs_orphan_cleanup() call in the snapshot creation ioctl, as it's performed by btrfs_lookup_dentry() which is called by the ioctl. fs/btrfs/inode.c | 36 fs/btrfs/ioctl.c | 33 - 2 files changed, 36 insertions(+), 33 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1d5f0b3..4f35c6c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5227,6 +5227,42 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) iput(inode); inode = ERR_PTR(ret); } + /* +* If orphan cleanup did remove any orphans, it means the tree +* was modified and therefore the commit root is not the same as +* the current root anymore. 
This is a problem, because send +* uses the commit root and therefore can see inode items that +* don't exist in the current root anymore, and for example make +* calls to btrfs_iget, which will do tree lookups based on the +* current root and not on the commit root. Those lookups will +* fail, returning a -ESTALE error, and making send fail with +* that error. So make sure a send does not see any orphans we +* have just removed, and that it will see the same inodes +* regardless of whether a transaction commit happened before +* it started (meaning that the commit root will be the same as +* the current root) or not. +*/ + if (sub_root-node != sub_root-commit_root) { + u64 sub_flags = btrfs_root_flags(sub_root-root_item); + + if (sub_flags BTRFS_ROOT_SUBVOL_RDONLY) { + struct extent_buffer *eb; + + /* +* Assert we can't have races between dentry +* lookup called through the snapshot creation +* ioctl and the VFS. +*/ + ASSERT(mutex_is_locked(dir-i_mutex)); + + down_write(root-fs_info-commit_root_sem); + eb = sub_root-commit_root; + sub_root-commit_root = + btrfs_root_node(sub_root); + up_write(root-fs_info-commit_root_sem); + free_extent_buffer(eb); + } + } } return inode; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2a30ac1..ef2e073 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -711,39 +711,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto fail; - ret = btrfs_orphan_cleanup(pending_snapshot-snap); - if (ret) - goto fail; - - /* -* If orphan cleanup did remove any orphans, it means the tree was -* modified and therefore the commit root is not the same as the -* current root anymore. This is a problem, because send uses the -* commit root and therefore can see inode items that don't exist -* in the current root anymore, and for example make calls to -* btrfs_iget, which will do tree lookups based on the current root -* and not on the commit root. 
Those lookups will fail, returning a -* -ESTALE error, and making send fail with that error. So make sure -* a send does
[PATCH] Btrfs: ensure tmpfile inode is always persisted with link count of 0
If we open a file with O_TMPFILE, don't do any further operation on it (so that the inode item isn't updated) and then force a transaction commit, we get a persisted inode item with a link count of 1, and not 0 as it should be. Steps to reproduce it (requires a modern xfs_io with -T support): $ mkfs.btrfs -f /dev/sdd $ mount -o /dev/sdd /mnt $ xfs_io -T /mnt $ sync Then btrfs-debug-tree shows the inode item with a link count of 1: $ btrfs-debug-tree /dev/sdd (...) fs tree key (FS_TREE ROOT_ITEM 0) leaf 29556736 items 4 free space 15851 generation 6 owner 5 fs uuid f164d01b-1b92-481d-a4e4-435fb0f843d0 chunk uuid 0e3d0e56-bcca-4a1c-aa5f-cec2c6f4f7a6 item 0 key (256 INODE_ITEM 0) itemoff 16123 itemsize 160 inode generation 3 transid 6 size 0 block group 0 mode 40755 links 1 item 1 key (256 INODE_REF 256) itemoff 16111 itemsize 12 inode ref index 0 namelen 2 name: .. item 2 key (257 INODE_ITEM 0) itemoff 15951 itemsize 160 inode generation 6 transid 6 size 0 block group 0 mode 100600 links 1 item 3 key (ORPHAN ORPHAN_ITEM 257) itemoff 15951 itemsize 0 orphan item checksum tree key (CSUM_TREE ROOT_ITEM 0) (...) Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/inode.c | 15 +++ 1 file changed, 15 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4f35c6c..8ad3ea9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5688,6 +5688,13 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, } /* +* O_TMPFILE, set link count to 0, so that after this point, +* we fill in an inode item with the correct link count. +*/ + if (!name) + set_nlink(inode, 0); + + /* * we have to initialize this early, so we can reclaim the inode * number if we fail afterwards in this function. 
*/ @@ -9133,6 +9140,14 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) if (ret) goto out; + /* +* We set number of links to 0 in btrfs_new_inode(), and here we set +* it to 1 because d_tmpfile() will issue a warning if the count is 0, +* through: +* +*d_tmpfile() - inode_dec_link_count() - drop_nlink() +*/ + set_nlink(inode, 1); d_tmpfile(dentry, inode); mark_inode_dirty(inode); -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] xfstests: add regression test for btrfs send with orphans
Regression test for a btrfs issue where we create a RO snapshot to use for a send operation, which fails with a -ESTALE error, due to the presence of orphan inodes accessible through the snapshot's commit root but no longer present through the main root. This issue is fixed by the following linux kernel btrfs patch: Btrfs: update commit root on snapshot creation after orphan cleanup Signed-off-by: Filipe Manana fdman...@suse.com --- tests/btrfs/057 | 81 + tests/btrfs/057.out | 1 + tests/btrfs/group | 1 + 3 files changed, 83 insertions(+) create mode 100755 tests/btrfs/057 create mode 100644 tests/btrfs/057.out diff --git a/tests/btrfs/057 b/tests/btrfs/057 new file mode 100755 index 000..2174077 --- /dev/null +++ b/tests/btrfs/057 @@ -0,0 +1,81 @@ +#! /bin/bash +# FS QA Test No. btrfs/057 +# +# Regression test for a btrfs issue where we create a RO snapshot to use for +# a send operation which fails with a -ESTALE error, due to the presence of +# orphan inodes accessible through the snapshot's commit root but no longer +# present through the main root. +# +# This issue is fixed by the following linux kernel btrfs patch: +# +#Btrfs: update commit root on snapshot creation after orphan cleanup +# +#--- +# Copyright (C) 2014 SUSE Linux Products GmbH. All Rights Reserved. +# Author: Filipe Manana fdman...@suse.com +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq + +tmp=/tmp/$$ +status=1 # failure is the default! +trap _cleanup; exit \$status 0 1 2 3 15 + +_cleanup() +{ + if [ ! -z $XFS_IO_PID ]; then + kill $XFS_IO_PID /dev/null 21 + fi + rm -fr $tmp +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter + +# real QA test starts here +_supported_fs btrfs +_supported_os Linux +_require_scratch +# Requiring flink command tests for the presence of the -T option used +# to pass O_TMPFILE to open(2). +_require_xfs_io_command flink +_need_to_be_root + +rm -f $seqres.full + +_scratch_mkfs /dev/null 21 +_scratch_mount + +# Create a tmpfile file, write some data to it and leave it open, so that our +# main subvolume has an orphan inode item. +$XFS_IO_PROG -T $SCRATCH_MNT $seqres.full 21 ( + echo pwrite 0 65536 + read +) +XFS_IO_PID=$! + +# With the tmpfile open, create a RO snapshot and use it for a send operation. +# The send operation used to fail with -ESTALE due to the presence of the +# orphan inode. 
+_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT $SCRATCH_MNT/mysnap +_run_btrfs_util_prog send $SCRATCH_MNT/mysnap -f /dev/null + +status=0 +exit diff --git a/tests/btrfs/057.out b/tests/btrfs/057.out new file mode 100644 index 000..b26eefe --- /dev/null +++ b/tests/btrfs/057.out @@ -0,0 +1 @@ +QA output created by 057 diff --git a/tests/btrfs/group b/tests/btrfs/group index 2da7127..ebc38c5 100644 --- a/tests/btrfs/group +++ b/tests/btrfs/group @@ -59,3 +59,4 @@ 054 auto quick 055 auto quick 056 auto quick +057 auto quick -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2] xfstests: add regression test for btrfs send with orphans
Regression test for a btrfs issue where we create a RO snapshot to use for a send operation, which fails with a -ESTALE error, due to the presence of orphan inodes accessible through the snapshot's commit root but no longer present through the main root. This issue is fixed by the following linux kernel btrfs patch: Btrfs: update commit root on snapshot creation after orphan cleanup Signed-off-by: Filipe Manana fdman...@suse.com --- V2: Replaced a redirect with a redirect to $seqres.full, and added a sleep. tests/btrfs/057 | 84 + tests/btrfs/057.out | 1 + tests/btrfs/group | 1 + 3 files changed, 86 insertions(+) create mode 100755 tests/btrfs/057 create mode 100644 tests/btrfs/057.out diff --git a/tests/btrfs/057 b/tests/btrfs/057 new file mode 100755 index 000..1e313e9 --- /dev/null +++ b/tests/btrfs/057 @@ -0,0 +1,84 @@ +#! /bin/bash +# FS QA Test No. btrfs/057 +# +# Regression test for a btrfs issue where we create a RO snapshot to use for +# a send operation which fails with a -ESTALE error, due to the presence of +# orphan inodes accessible through the snapshot's commit root but no longer +# present through the main root. +# +# This issue is fixed by the following linux kernel btrfs patch: +# +#Btrfs: update commit root on snapshot creation after orphan cleanup +# +#--- +# Copyright (C) 2014 SUSE Linux Products GmbH. All Rights Reserved. +# Author: Filipe Manana fdman...@suse.com +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq + +tmp=/tmp/$$ +status=1 # failure is the default! +trap _cleanup; exit \$status 0 1 2 3 15 + +_cleanup() +{ + if [ ! -z $XFS_IO_PID ]; then + kill $XFS_IO_PID /dev/null 21 + fi + rm -fr $tmp +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter + +# real QA test starts here +_supported_fs btrfs +_supported_os Linux +_require_scratch +# Requiring flink command tests for the presence of the -T option used +# to pass O_TMPFILE to open(2). +_require_xfs_io_command flink +_need_to_be_root + +rm -f $seqres.full + +_scratch_mkfs /dev/null 21 +_scratch_mount + +# Create a tmpfile file, write some data to it and leave it open, so that our +# main subvolume has an orphan inode item. +$XFS_IO_PROG -T $SCRATCH_MNT $seqres.full 21 ( + echo pwrite 0 65536 + read +) +XFS_IO_PID=$! + +# Give it some time to the xfs_io process to create the tmpfile. +sleep 3 + +# With the tmpfile open, create a RO snapshot and use it for a send operation. +# The send operation used to fail with -ESTALE due to the presence of the +# orphan inode. 
+_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT $SCRATCH_MNT/mysnap +_run_btrfs_util_prog send $SCRATCH_MNT/mysnap -f /dev/null + +status=0 +exit diff --git a/tests/btrfs/057.out b/tests/btrfs/057.out new file mode 100644 index 000..b26eefe --- /dev/null +++ b/tests/btrfs/057.out @@ -0,0 +1 @@ +QA output created by 057 diff --git a/tests/btrfs/group b/tests/btrfs/group index 2da7127..ebc38c5 100644 --- a/tests/btrfs/group +++ b/tests/btrfs/group @@ -59,3 +59,4 @@ 054 auto quick 055 auto quick 056 auto quick +057 auto quick -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: make btrfs_search_forward return with nodes unlocked
None of the uses of btrfs_search_forward() need to have the path nodes (level >= 1) read locked, only the leaf needs to be locked while the caller processes it. Therefore make it return a path with all nodes unlocked, except for the leaf. This change is motivated by the observation that during a file fsync we repeatedly call btrfs_search_forward() and process the returned leaf while upper nodes of the returned path (level >= 1) are read locked, which unnecessarily blocks other tasks that want to write to the same fs/subvol btree. Therefore instead of modifying the fsync code to unlock all nodes with level >= 1 immediately after calling btrfs_search_forward(), change btrfs_search_forward() to do it, so that it benefits all callers. Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/ctree.c | 11 +++ fs/btrfs/ioctl.c | 5 - fs/btrfs/tree-log.c | 3 --- fs/btrfs/uuid-tree.c | 1 - fs/btrfs/volumes.c | 2 -- 5 files changed, 7 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 8ca6761..993d81b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5144,8 +5144,9 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, u32 nritems; int level; int ret = 1; + int keep_locks = path->keep_locks; - WARN_ON(!path->keep_locks); + path->keep_locks = 1; again: cur = btrfs_read_lock_root_node(root); level = btrfs_header_level(cur); @@ -5209,7 +5210,6 @@ find_next_key: path->slots[level] = slot; if (level == path->lowest_level) { ret = 0; - unlock_up(path, level, 1, 0, NULL); goto out; } btrfs_set_path_blocking(path); @@ -5224,9 +5224,12 @@ find_next_key: btrfs_clear_path_blocking(path, NULL, 0); } out: - if (ret == 0) + path->keep_locks = keep_locks; + if (ret == 0) { + btrfs_unlock_up_safe(path, path->lowest_level + 1); + btrfs_set_path_blocking(path); memcpy(min_key, &found_key, sizeof(found_key)); - btrfs_set_path_blocking(path); + } return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ef2e073..d490abd 100644 
--- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -936,12 +936,9 @@ static int find_new_extents(struct btrfs_root *root, min_key.offset = *off; while (1) { - path-keep_locks = 1; ret = btrfs_search_forward(root, min_key, path, newer_than); if (ret != 0) goto none; - path-keep_locks = 0; - btrfs_unlock_up_safe(path, 1); process_slot: if (min_key.objectid != ino) goto none; @@ -2083,8 +2080,6 @@ static noinline int search_ioctl(struct inode *inode, key.type = sk-min_type; key.offset = sk-min_offset; - path-keep_locks = 1; - while (1) { ret = btrfs_search_forward(root, key, path, sk-min_transid); if (ret != 0) { diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6e0fa17..df332dd 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2981,8 +2981,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, min_key.type = key_type; min_key.offset = min_offset; - path-keep_locks = 1; - ret = btrfs_search_forward(root, min_key, path, trans-transid); /* @@ -3950,7 +3948,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, err = ret; goto out_unlock; } - path-keep_locks = 1; while (1) { ins_nr = 0; diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index f6a4c03..7782829 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -279,7 +279,6 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, key.offset = 0; again_search_slot: - path-keep_locks = 1; ret = btrfs_search_forward(root, key, path, 0); if (ret) { if (ret 0) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0daf748..73e4d30 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3629,8 +3629,6 @@ static int btrfs_uuid_scan_kthread(void *data) max_key.type = BTRFS_ROOT_ITEM_KEY; max_key.offset = (u64)-1; - path-keep_locks = 1; - while (1) { ret = btrfs_search_forward(root, key, path, 0); if (ret) { -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More 
majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: fix hole detection during file fsync
The file hole detection logic during a file fsync wasn't correct, because it didn't look back (in a previous leaf) for the last file extent item that can be in a leaf to the left of our leaf and that has a generation lower than the current transaction id. This made it assume that a hole exists when it really doesn't exist in the file. Such false positive hole detection happens in the following scenario: * We have a file that has many file extent items, covering 3 or more btree leafs (the first leaf must contain non file extent items too). * Two ranges of the file are modified, with their extent items being located at 2 different leafs and those leafs aren't consecutive. * When processing the second leaf, we weren't checking if some file extent item exists that is located in some leaf that is between our 2 leafs, and therefore assumed the range defined between the last file extent item in first leaf and the first file extent item in the second leaf matched a hole. Fortunately this didn't result in overriding the log with wrong data, instead it made the last loop in copy_items() attempt to insert a duplicated key (for a hole file extent item), which makes the file fsync code return with -EEXIST to file.c:btrfs_sync_file() which in turn ends up doing a full transaction commit. I could trigger this issue with the following test for xfstests (which never fails, either without or with this patch). The last fsync call results in a full transaction commit, due to the -EEXIST error mentioned above. I could also observe this behaviour happening frequently when running xfstests/generic/075 in a loop. Test: _cleanup() { _cleanup_flakey rm -fr $tmp } # get standard environment, filters and checks . ./common/rc . ./common/filter . ./common/dmflakey # real QA test starts here _supported_fs btrfs _supported_os Linux _require_scratch _require_dm_flakey _need_to_be_root rm -f $seqres.full # Create a file with many file extent items, each representing a 4Kb extent. 
# These items span 3 btree leaves, of 16Kb each (default mkfs.btrfs leaf size # as of btrfs-progs 3.12). _scratch_mkfs -l 16384 /dev/null 21 _init_flakey SAVE_MOUNT_OPTIONS=$MOUNT_OPTIONS MOUNT_OPTIONS=$MOUNT_OPTIONS -o commit=999 _mount_flakey # First fsync, inode has BTRFS_INODE_NEEDS_FULL_SYNC flag set. $XFS_IO_PROG -f -c pwrite -S 0x01 -b 4096 0 4096 -c fsync \ $SCRATCH_MNT/foo | _filter_xfs_io # For any of the following fsync calls, inode doesn't have the flag # BTRFS_INODE_NEEDS_FULL_SYNC set. for ((i = 1; i = 500; i++)); do OFFSET=$((4096 * i)) LEN=4096 $XFS_IO_PROG -c pwrite -S 0x01 $OFFSET $LEN -c fsync \ $SCRATCH_MNT/foo | _filter_xfs_io done # Commit transaction and bump next transaction's id (to 7). sync # Truncate will set the BTRFS_INODE_NEEDS_FULL_SYNC flag in the btrfs's # inode runtime flags. $XFS_IO_PROG -c truncate 2048000 $SCRATCH_MNT/foo # Commit transaction and bump next transaction's id (to 8). sync # Touch 1 extent item from the first leaf and 1 from the last leaf. The leaf # in the middle, containing only file extent items, isn't touched. So the # next fsync, when calling btrfs_search_forward(), won't visit that middle # leaf. First and 3rd leaf have generation 6, while middle one has generation 8. $XFS_IO_PROG \ -c pwrite -S 0xee -b 4096 0 4096 \ -c pwrite -S 0xff -b 4096 2043904 4096 \ -c fsync \ $SCRATCH_MNT/foo | _filter_xfs_io _load_flakey_table $FLAKEY_DROP_WRITES md5sum $SCRATCH_MNT/foo | _filter_scratch _unmount_flakey _load_flakey_table $FLAKEY_ALLOW_WRITES # During mount, we'll replay the log created by the fsync above, and the file's # md5 digest should be the same we got before the unmount. 
_mount_flakey md5sum $SCRATCH_MNT/foo | _filter_scratch _unmount_flakey MOUNT_OPTIONS=$SAVE_MOUNT_OPTIONS status=0 exit Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/tree-log.c | 17 + 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index df332dd..5a917a6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3296,7 +3296,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct list_head ordered_sums; int skip_csum = BTRFS_I(inode)-flags BTRFS_INODE_NODATASUM; bool has_extents = false; - bool need_find_last_extent = (*last_extent == 0); + bool need_find_last_extent = true; bool done = false; INIT_LIST_HEAD(ordered_sums); @@ -3350,8 +3350,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, */ if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { has_extents = true
[PATCH v2] Btrfs: fix hole detection during file fsync
The file hole detection logic during a file fsync wasn't correct, because it didn't look back (in a previous leaf) for the last file extent item that can be in a leaf to the left of our leaf and that has a generation lower than the current transaction id. This made it assume that a hole exists when it really doesn't exist in the file. Such false positive hole detection happens in the following scenario: * We have a file that has many file extent items, covering 3 or more btree leafs (the first leaf must contain non file extent items too). * Two ranges of the file are modified, with their extent items being located at 2 different leafs and those leafs aren't consecutive. * When processing the second modified leaf, we weren't checking if some file extent item exists that is located in some leaf that is between our 2 modified leafs, and therefore assumed the range defined between the last file extent item in the first leaf and the first file extent item in the second leaf matched a hole. Fortunately this didn't result in overriding the log with wrong data, instead it made the last loop in copy_items() attempt to insert a duplicated key (for a hole file extent item), which makes the file fsync code return with -EEXIST to file.c:btrfs_sync_file() which in turn ends up doing a full transaction commit, which is much more expensive than writing only to the log tree and waiting for it to be durably persisted (as well as the file's modified extents/pages). Therefore fix the hole detection logic, so that we don't pay the cost of doing full transaction commits. I could trigger this issue with the following test for xfstests (which never fails, either without or with this patch). The last fsync call results in a full transaction commit, due to the -EEXIST error mentioned above. I could also observe this behaviour happening frequently when running xfstests/generic/075 in a loop. Test: _cleanup() { _cleanup_flakey rm -fr $tmp } # get standard environment, filters and checks . 
./common/rc . ./common/filter . ./common/dmflakey # real QA test starts here _supported_fs btrfs _supported_os Linux _require_scratch _require_dm_flakey _need_to_be_root rm -f $seqres.full # Create a file with many file extent items, each representing a 4Kb extent. # These items span 3 btree leaves, of 16Kb each (default mkfs.btrfs leaf size # as of btrfs-progs 3.12). _scratch_mkfs -l 16384 /dev/null 21 _init_flakey SAVE_MOUNT_OPTIONS=$MOUNT_OPTIONS MOUNT_OPTIONS=$MOUNT_OPTIONS -o commit=999 _mount_flakey # First fsync, inode has BTRFS_INODE_NEEDS_FULL_SYNC flag set. $XFS_IO_PROG -f -c pwrite -S 0x01 -b 4096 0 4096 -c fsync \ $SCRATCH_MNT/foo | _filter_xfs_io # For any of the following fsync calls, inode doesn't have the flag # BTRFS_INODE_NEEDS_FULL_SYNC set. for ((i = 1; i = 500; i++)); do OFFSET=$((4096 * i)) LEN=4096 $XFS_IO_PROG -c pwrite -S 0x01 $OFFSET $LEN -c fsync \ $SCRATCH_MNT/foo | _filter_xfs_io done # Commit transaction and bump next transaction's id (to 7). sync # Truncate will set the BTRFS_INODE_NEEDS_FULL_SYNC flag in the btrfs's # inode runtime flags. $XFS_IO_PROG -c truncate 2048000 $SCRATCH_MNT/foo # Commit transaction and bump next transaction's id (to 8). sync # Touch 1 extent item from the first leaf and 1 from the last leaf. The leaf # in the middle, containing only file extent items, isn't touched. So the # next fsync, when calling btrfs_search_forward(), won't visit that middle # leaf. First and 3rd leaf have now a generation with value 8, while the # middle leaf remains with a generation with value 6. $XFS_IO_PROG \ -c pwrite -S 0xee -b 4096 0 4096 \ -c pwrite -S 0xff -b 4096 2043904 4096 \ -c fsync \ $SCRATCH_MNT/foo | _filter_xfs_io _load_flakey_table $FLAKEY_DROP_WRITES md5sum $SCRATCH_MNT/foo | _filter_scratch _unmount_flakey _load_flakey_table $FLAKEY_ALLOW_WRITES # During mount, we'll replay the log created by the fsync above, and the file's # md5 digest should be the same we got before the unmount. 
_mount_flakey md5sum $SCRATCH_MNT/foo | _filter_scratch _unmount_flakey MOUNT_OPTIONS=$SAVE_MOUNT_OPTIONS status=0 exit Signed-off-by: Filipe Manana fdman...@suse.com --- V2: Updated commit message, fixed a couple typos, grammar and a more clear explanation of the problem. fs/btrfs/tree-log.c | 17 + 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index df332dd..5a917a6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3296,7 +3296,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct list_head ordered_sums; int skip_csum = BTRFS_I(inode)-flags
[PATCH] Btrfs: don't monopolize a core when evicting inode
If an inode has a very large number of extent maps, we can spend a lot of time freeing them, which triggers a soft lockup warning. Therefore reschedule if we need to when freeing the extent maps while evicting the inode. I could trigger this all the time by running xfstests/generic/299 on a file system with the no-holes feature enabled. That test creates an inode with 11386677 extent maps. $ mkfs.btrfs -f -O no-holes $TEST_DEV $ MKFS_OPTIONS=-O no-holes ./check generic/299 generic/299 382s ... Message from syslogd@debian-vm3 at Aug 7 10:44:29 ... kernel:[85304.208017] BUG: soft lockup - CPU#0 stuck for 22s! [umount:25330] 384s Ran: generic/299 Passed all 1 tests $ dmesg (...) [86304.300017] BUG: soft lockup - CPU#0 stuck for 23s! [umount:25330] (...) [86304.300036] Call Trace: [86304.300036] [81698ba9] __slab_free+0x54/0x295 [86304.300036] [a02ee9cc] ? free_extent_map+0x5c/0xb0 [btrfs] [86304.300036] [811a6cd2] kmem_cache_free+0x282/0x2a0 [86304.300036] [a02ee9cc] free_extent_map+0x5c/0xb0 [btrfs] [86304.300036] [a02e3775] btrfs_evict_inode+0xd5/0x660 [btrfs] [86304.300036] [811e7c8d] ? __inode_wait_for_writeback+0x6d/0xc0 [86304.300036] [816a389b] ? _raw_spin_unlock+0x2b/0x40 [86304.300036] [811d8cbb] evict+0xab/0x180 [86304.300036] [811d8dce] dispose_list+0x3e/0x60 [86304.300036] [811d9b04] evict_inodes+0xf4/0x110 [86304.300036] [811bd953] generic_shutdown_super+0x53/0x110 [86304.300036] [811bdaa6] kill_anon_super+0x16/0x30 [86304.300036] [a02a78ba] btrfs_kill_super+0x1a/0xa0 [btrfs] [86304.300036] [811bd3a9] deactivate_locked_super+0x59/0x80 [86304.300036] [811be44e] deactivate_super+0x4e/0x70 [86304.300036] [811dec14] mntput_no_expire+0x174/0x1f0 [86304.300036] [811deab7] ? mntput_no_expire+0x17/0x1f0 [86304.300036] [811e0517] SyS_umount+0x97/0x100 (...) 
Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/inode.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8ad3ea9..00b4bd3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4718,6 +4718,11 @@ static void evict_inode_truncate_pages(struct inode *inode) clear_bit(EXTENT_FLAG_LOGGING, em-flags); remove_extent_mapping(map_tree, em); free_extent_map(em); + if (need_resched()) { + write_unlock(map_tree-lock); + cond_resched(); + write_lock(map_tree-lock); + } } write_unlock(map_tree-lock); @@ -4740,6 +4745,7 @@ static void evict_inode_truncate_pages(struct inode *inode) cached_state, GFP_NOFS); free_extent_state(state); + cond_resched(); spin_lock(io_tree-lock); } spin_unlock(io_tree-lock); -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: fix csum tree corruption, duplicate and outdated checksums
Under rare circumstances we can end up leaving 2 versions of a checksum for the same file extent range. The reason for this is that after calling btrfs_next_leaf we process slot 0 of the leaf it returns, instead of processing the slot set in path->slots[0]. Most of the time (by far) path->slots[0] is 0, but after btrfs_next_leaf() releases the path and before it searches for the next leaf, another task might cause a split of the next leaf, which migrates some of its keys to the leaf we were processing before calling btrfs_next_leaf(). In this case btrfs_next_leaf() returns again the same leaf but with path->slots[0] having a slot number corresponding to the first new key it got, that is, a slot number that didn't exist before calling btrfs_next_leaf(), as the leaf now has more keys than it had before. So we must really process the returned leaf starting at path->slots[0] always, as it isn't always 0, and the key at slot 0 can have an offset much lower than our search offset/bytenr. For example, consider the following scenario, where we have: sums->bytenr: 40157184, sums->len: 16384, sums end: 40173568 four 4kb file data blocks with offsets 40157184, 40161280, 40165376, 40169472 Leaf N: slot = 0 slot = btrfs_header_nritems() - 1 |---| | [(CSUM CSUM 39239680), size 8] ... [(CSUM CSUM 40116224), size 4] | |---| Leaf N + 1: slot = 0 slot = btrfs_header_nritems() - 1 || | [(CSUM CSUM 40161280), size 32] ... [((CSUM CSUM 40615936), size 8 | || Because we are at the last slot of leaf N, we call btrfs_next_leaf() to find the next highest key, which releases the current path and then searches for that next key. However after releasing the path and before finding that next key, the item at slot 0 of leaf N + 1 gets moved to leaf N, due to a call to ctree.c:push_leaf_left() (via ctree.c:split_leaf()), and therefore btrfs_next_leaf() will return us a path again with leaf N but with the slot pointing to its new last key (CSUM CSUM 40161280). 
This new version of leaf N is then: slot = 0 slot = btrfs_header_nritems() - 2 slot = btrfs_header_nritems() - 1 || | [(CSUM CSUM 39239680), size 8] ... [(CSUM CSUM 40116224), size 4] [(CSUM CSUM 40161280), size 32] | || And incorrectly using slot 0, makes us set next_offset to 39239680 and we jump into the insert: label, which will set tmp to: tmp = min((sums->len - total_bytes) >> blocksize_bits, (next_offset - file_key.offset) >> blocksize_bits) = min((16384 - 0) >> 12, (39239680 - 40157184) >> 12) = min(4, (u64)-917504 = 18446744073708634112 >> 12) = 4 and ins_size = csum_size * tmp = 4 * 4 = 16 bytes. In other words, we insert a new csum item in the tree with key (CSUM_OBJECTID CSUM_KEY 40157184 = sums->bytenr) that contains the checksums for all the data (4 blocks of 4096 bytes each = sums->len). Which is wrong, because the item with key (CSUM CSUM 40161280) (the one that was moved from leaf N + 1 to the end of leaf N) contains the old checksums of the last 12288 bytes of our data and won't get those old checksums removed. So this leaves us 2 different checksums for three 4kb blocks of data in the tree, and breaks the logical rule: Key_N+1.offset >= Key_N.offset + length_of_data_its_checksums_cover An obvious bad effect of this is that a subsequent csum tree lookup to get the checksum of any of the blocks with logical offset of 40161280, 40165376 or 40169472 (the last three 4kb blocks of file data), will get the old checksums. 
Cc: sta...@vger.kernel.org Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/file-item.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index a1f97de..7897dcd 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -746,7 +746,7 @@ again: found_next = 1; if (ret != 0) goto insert; - slot = 0; + slot = path-slots[0]; } btrfs_item_key_to_cpu(path-nodes[0], found_key, slot); if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: send, lower mem requirements for processing xattrs
Maximum xattr size can be up to nearly the leaf size. For an fs with a leaf size larger than the page size, using kmalloc requires allocating multiple pages that are contiguous, which might not be possible if there's heavy memory fragmentation. Therefore fallback to vmalloc if we fail to allocate with kmalloc. Also start with a smaller buffer size, since xattr values typically are smaller than a page. Reported-by: Chris Murphy li...@colorremedies.com Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/send.c | 41 + 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 3c63b29..215064d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -997,6 +997,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key di_key; char *buf = NULL; int buf_len; + bool contig_buf; u32 name_len; u32 data_len; u32 cur; @@ -1006,11 +1007,13 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, int num; u8 type; - if (found_key-type == BTRFS_XATTR_ITEM_KEY) - buf_len = BTRFS_MAX_XATTR_SIZE(root); - else - buf_len = PATH_MAX; - + /* +* Start with a small buffer (1 page). If later we end up needing more +* space, which can happen for xattrs on a fs with a leaf size 4Kb, +* attempt to increase the buffer. Typically xattr values are small. 
+*/ + buf_len = PATH_MAX; + contig_buf = true; buf = kmalloc(buf_len, GFP_NOFS); if (!buf) { ret = -ENOMEM; @@ -1037,7 +1040,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, ret = -ENAMETOOLONG; goto out; } - if (name_len + data_len buf_len) { + if (name_len + data_len BTRFS_MAX_XATTR_SIZE(root)) { ret = -E2BIG; goto out; } @@ -1045,12 +1048,31 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, /* * Path too long */ - if (name_len + data_len buf_len) { + if (name_len + data_len PATH_MAX) { ret = -ENAMETOOLONG; goto out; } } + if (name_len + data_len buf_len) { + if (contig_buf) + kfree(buf); + else + vfree(buf); + buf = NULL; + buf_len = name_len + data_len; + if (contig_buf) + buf = kmalloc(buf_len, GFP_NOFS); + if (!buf) { + buf = vmalloc(buf_len); + if (!buf) { + ret = -ENOMEM; + goto out; + } + contig_buf = false; + } + } + read_extent_buffer(eb, buf, (unsigned long)(di + 1), name_len + data_len); @@ -1071,7 +1093,10 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, } out: - kfree(buf); + if (contig_buf) + kfree(buf); + else + vfree(buf); return ret; } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2] Btrfs: send, lower mem requirements for processing xattrs
Maximum xattr size can be up to nearly the leaf size. For an fs with a leaf size larger than the page size, using kmalloc requires allocating multiple pages that are contiguous, which might not be possible if there's heavy memory fragmentation. Therefore fallback to vmalloc if we fail to allocate with kmalloc. Also start with a smaller buffer size, since xattr values typically are smaller than a page. Reported-by: Chris Murphy li...@colorremedies.com Signed-off-by: Filipe Manana fdman...@suse.com --- V2: Use is_vmalloc_addr() instead of keeping a boolean variable around. fs/btrfs/send.c | 39 +++ 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 3c63b29..a7ce318 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1006,11 +1006,13 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, int num; u8 type; - if (found_key-type == BTRFS_XATTR_ITEM_KEY) - buf_len = BTRFS_MAX_XATTR_SIZE(root); - else - buf_len = PATH_MAX; - + /* +* Start with a small buffer (1 page). If later we end up needing more +* space, which can happen for xattrs on a fs with a leaf size greater +* then the page size, attempt to increase the buffer. Typically xattr +* values are small. 
+*/ + buf_len = PATH_MAX; buf = kmalloc(buf_len, GFP_NOFS); if (!buf) { ret = -ENOMEM; @@ -1037,7 +1039,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, ret = -ENAMETOOLONG; goto out; } - if (name_len + data_len buf_len) { + if (name_len + data_len BTRFS_MAX_XATTR_SIZE(root)) { ret = -E2BIG; goto out; } @@ -1045,12 +1047,30 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, /* * Path too long */ - if (name_len + data_len buf_len) { + if (name_len + data_len PATH_MAX) { ret = -ENAMETOOLONG; goto out; } } + if (name_len + data_len buf_len) { + buf_len = name_len + data_len; + if (is_vmalloc_addr(buf)) { + vfree(buf); + buf = NULL; + } else { + kfree(buf); + buf = kmalloc(buf_len, GFP_NOFS); + } + if (!buf) { + buf = vmalloc(buf_len); + if (!buf) { + ret = -ENOMEM; + goto out; + } + } + } + read_extent_buffer(eb, buf, (unsigned long)(di + 1), name_len + data_len); @@ -1071,7 +1091,10 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, } out: - kfree(buf); + if (is_vmalloc_addr(buf)) + vfree(buf); + else + kfree(buf); return ret; } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3] Btrfs: send, lower mem requirements for processing xattrs
Maximum xattr size can be up to nearly the leaf size. For an fs with a leaf size larger than the page size, using kmalloc requires allocating multiple pages that are contiguous, which might not be possible if there's heavy memory fragmentation. Therefore fallback to vmalloc if we fail to allocate with kmalloc. Also start with a smaller buffer size, since xattr values typically are smaller than a page. Reported-by: Chris Murphy li...@colorremedies.com Signed-off-by: Filipe Manana fdman...@suse.com --- V2: Use is_vmalloc_addr() instead of keeping a boolean variable around. V3: Use krealloc instead of kfree + kmalloc. fs/btrfs/send.c | 41 + 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 3c63b29..8b2780d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1006,11 +1006,13 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, int num; u8 type; - if (found_key-type == BTRFS_XATTR_ITEM_KEY) - buf_len = BTRFS_MAX_XATTR_SIZE(root); - else - buf_len = PATH_MAX; - + /* +* Start with a small buffer (1 page). If later we end up needing more +* space, which can happen for xattrs on a fs with a leaf size greater +* then the page size, attempt to increase the buffer. Typically xattr +* values are small. 
+*/ + buf_len = PATH_MAX; buf = kmalloc(buf_len, GFP_NOFS); if (!buf) { ret = -ENOMEM; @@ -1037,7 +1039,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, ret = -ENAMETOOLONG; goto out; } - if (name_len + data_len buf_len) { + if (name_len + data_len BTRFS_MAX_XATTR_SIZE(root)) { ret = -E2BIG; goto out; } @@ -1045,12 +1047,32 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, /* * Path too long */ - if (name_len + data_len buf_len) { + if (name_len + data_len PATH_MAX) { ret = -ENAMETOOLONG; goto out; } } + if (name_len + data_len buf_len) { + buf_len = name_len + data_len; + if (is_vmalloc_addr(buf)) { + vfree(buf); + buf = NULL; + } else { + char *tmp = krealloc(buf, buf_len, GFP_NOFS); + if (!tmp) + kfree(buf); + buf = tmp; + } + if (!buf) { + buf = vmalloc(buf_len); + if (!buf) { + ret = -ENOMEM; + goto out; + } + } + } + read_extent_buffer(eb, buf, (unsigned long)(di + 1), name_len + data_len); @@ -1071,7 +1093,10 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, } out: - kfree(buf); + if (is_vmalloc_addr(buf)) + vfree(buf); + else + kfree(buf); return ret; } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v4] Btrfs: send, lower mem requirements for processing xattrs
Maximum xattr size can be up to nearly the leaf size. For an fs with a leaf size larger than the page size, using kmalloc requires allocating multiple pages that are contiguous, which might not be possible if there's heavy memory fragmentation. Therefore fallback to vmalloc if we fail to allocate with kmalloc. Also start with a smaller buffer size, since xattr values typically are smaller than a page. Reported-by: Chris Murphy li...@colorremedies.com Signed-off-by: Filipe Manana fdman...@suse.com --- V2: Use is_vmalloc_addr() instead of keeping a boolean variable around. V3: Use krealloc instead of kfree + kmalloc. V4: Fixed a checkpatch warning about missing blank line after var declaration. fs/btrfs/send.c | 42 ++ 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 3c63b29..b29fc5c 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1006,11 +1006,13 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, int num; u8 type; - if (found_key-type == BTRFS_XATTR_ITEM_KEY) - buf_len = BTRFS_MAX_XATTR_SIZE(root); - else - buf_len = PATH_MAX; - + /* +* Start with a small buffer (1 page). If later we end up needing more +* space, which can happen for xattrs on a fs with a leaf size greater +* then the page size, attempt to increase the buffer. Typically xattr +* values are small. 
+*/ + buf_len = PATH_MAX; buf = kmalloc(buf_len, GFP_NOFS); if (!buf) { ret = -ENOMEM; @@ -1037,7 +1039,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, ret = -ENAMETOOLONG; goto out; } - if (name_len + data_len buf_len) { + if (name_len + data_len BTRFS_MAX_XATTR_SIZE(root)) { ret = -E2BIG; goto out; } @@ -1045,12 +1047,33 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, /* * Path too long */ - if (name_len + data_len buf_len) { + if (name_len + data_len PATH_MAX) { ret = -ENAMETOOLONG; goto out; } } + if (name_len + data_len buf_len) { + buf_len = name_len + data_len; + if (is_vmalloc_addr(buf)) { + vfree(buf); + buf = NULL; + } else { + char *tmp = krealloc(buf, buf_len, GFP_NOFS); + + if (!tmp) + kfree(buf); + buf = tmp; + } + if (!buf) { + buf = vmalloc(buf_len); + if (!buf) { + ret = -ENOMEM; + goto out; + } + } + } + read_extent_buffer(eb, buf, (unsigned long)(di + 1), name_len + data_len); @@ -1071,7 +1094,10 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, } out: - kfree(buf); + if (is_vmalloc_addr(buf)) + vfree(buf); + else + kfree(buf); return ret; } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: shrink further sizeof(struct extent_buffer)
The map_start and map_len fields aren't used anywhere, so just remove them. On an x86_64 system, this reduced sizeof(struct extent_buffer) from 296 bytes to 280 bytes, and therefore 14 extent_buffer structs can now fit into a page instead of 13. Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/extent_io.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index ce02cc9..5e91fb9 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -125,8 +125,6 @@ struct extent_state { struct extent_buffer { u64 start; unsigned long len; - unsigned long map_start; - unsigned long map_len; unsigned long bflags; struct btrfs_fs_info *fs_info; spinlock_t refs_lock; -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5] Btrfs: send, lower mem requirements for processing xattrs
Maximum xattr size can be up to nearly the leaf size. For an fs with a leaf size larger than the page size, using kmalloc requires allocating multiple pages that are contiguous, which might not be possible if there's heavy memory fragmentation. Therefore fallback to vmalloc if we fail to allocate with kmalloc. Also start with a smaller buffer size, since xattr values typically are smaller than a page. Reported-by: Chris Murphy li...@colorremedies.com Signed-off-by: Filipe Manana fdman...@suse.com --- V2: Use is_vmalloc_addr() instead of keeping a boolean variable around. V3: Use krealloc instead of kfree + kmalloc. V4: Fixed a checkpatch warning about missing blank line after var declaration. V5: Use kvfree() and pass __GFP_NOWARN to krealloc(). fs/btrfs/send.c | 40 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 3c63b29..3290da9 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1006,11 +1006,13 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, int num; u8 type; - if (found_key-type == BTRFS_XATTR_ITEM_KEY) - buf_len = BTRFS_MAX_XATTR_SIZE(root); - else - buf_len = PATH_MAX; - + /* +* Start with a small buffer (1 page). If later we end up needing more +* space, which can happen for xattrs on a fs with a leaf size greater +* then the page size, attempt to increase the buffer. Typically xattr +* values are small. 
+*/ + buf_len = PATH_MAX; buf = kmalloc(buf_len, GFP_NOFS); if (!buf) { ret = -ENOMEM; @@ -1037,7 +1039,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, ret = -ENAMETOOLONG; goto out; } - if (name_len + data_len buf_len) { + if (name_len + data_len BTRFS_MAX_XATTR_SIZE(root)) { ret = -E2BIG; goto out; } @@ -1045,12 +1047,34 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, /* * Path too long */ - if (name_len + data_len buf_len) { + if (name_len + data_len PATH_MAX) { ret = -ENAMETOOLONG; goto out; } } + if (name_len + data_len buf_len) { + buf_len = name_len + data_len; + if (is_vmalloc_addr(buf)) { + vfree(buf); + buf = NULL; + } else { + char *tmp = krealloc(buf, buf_len, +GFP_NOFS | __GFP_NOWARN); + + if (!tmp) + kfree(buf); + buf = tmp; + } + if (!buf) { + buf = vmalloc(buf_len); + if (!buf) { + ret = -ENOMEM; + goto out; + } + } + } + read_extent_buffer(eb, buf, (unsigned long)(di + 1), name_len + data_len); @@ -1071,7 +1095,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, } out: - kfree(buf); + kvfree(buf); return ret; } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: improve free space cache management and space allocation
While under random IO, a block group's free space cache eventually reaches a state where it has a mix of extent entries and bitmap entries representing free space regions. As later free space regions are returned to the cache, some of them are merged with existing extent entries if they are contiguous with them. But others are not merged, because despite the existence of adjacent free space regions in the cache, the merging doesn't happen because the existing free space regions are represented in bitmap entries. Even when new free space regions are merged with existing extent entries (enlarging the free space range they represent), we increase the chances of ending up with an enlarged region that is contiguous with some other region represented in a bitmap entry. Both clustered and non-clustered space allocation work by iterating over our extent and bitmap entries and skipping any that represents a region smaller than the allocation request (and giving preference to extent entries before bitmap entries). By having a contiguous free space region that is represented by 2 (or more) entries (mix of extent and bitmap entries), we end up not satisfying an allocation request with a size larger than the size of any of the entries but no larger than the sum of their sizes. This makes the caller assume we're under an ENOSPC condition or forces it to allocate multiple smaller space regions (as we do for file data writes), which adds extra overhead and more chances of causing fragmentation due to the smaller regions being all spread apart from each other (more likely when under concurrency). 
For example, if we have the following in the cache: * extent entry representing free space range: [128Mb - 256Kb, 128Mb[ * bitmap entry covering the range [128Mb, 256Mb[, but only with the bits representing the range [128Mb, 128Mb + 768Kb[ set - that is, only that space in this 128Mb area is marked as free An allocation request for 1Mb, starting at offset not greater than 128Mb - 256Kb, would fail before, despite the existence of such contiguous free space area in the cache. The caller could only allocate up to 768Kb of space at once and later another 256Kb (or vice-versa). In between each smaller allocation request, another task working on a different file/inode might come in and take that space, preventing the former task from getting a contiguous 1Mb region of free space. Therefore this change implements the ability to move free space from bitmap entries into existing and new free space regions represented with extent entries. This is done when a space region is added to the cache. A test was added to the sanity tests that explains in detail the issue too. Some performance test results with compilebench on a 4-core machine, with 32Gb of ram and using an HDD follow. 
Test: compilebench -D /mnt -i 30 -r 1000 --makej Before this change: intial create total runs 30 avg 69.02 MB/s (user 0.28s sys 0.57s) compile total runs 30 avg 314.96 MB/s (user 0.12s sys 0.25s) read compiled tree total runs 3 avg 27.14 MB/s (user 1.52s sys 0.90s) delete compiled tree total runs 30 avg 3.14 seconds (user 0.15s sys 0.66s) After this change: intial create total runs 30 avg 68.37 MB/s (user 0.29s sys 0.55s) compile total runs 30 avg 382.83 MB/s (user 0.12s sys 0.24s) read compiled tree total runs 3 avg 27.82 MB/s (user 1.45s sys 0.97s) delete compiled tree total runs 30 avg 3.18 seconds (user 0.17s sys 0.65s) Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/free-space-cache.c | 149 ++- fs/btrfs/tests/free-space-tests.c | 514 ++ 2 files changed, 662 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 2f0fe10..23632ba 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1951,6 +1951,137 @@ out: return ret; } +static void steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, +struct btrfs_free_space *info, +bool update_stat) +{ + struct btrfs_free_space *bitmap; + u64 bitmap_offset = info-offset; + unsigned long i; + unsigned long j; + const u64 end = info-offset + info-bytes; + u64 bytes; + +again: + bitmap = tree_search_offset(ctl, offset_to_bitmap(ctl, bitmap_offset), + 1, 0); + if (!bitmap) + goto out; + + if (end bitmap-offset || (bitmap-offset + bitmap-bytes end)) + return; + + i = offset_to_bit(bitmap-offset, ctl-unit, end); + j = find_next_zero_bit(bitmap-bitmap, BITS_PER_BITMAP, i); + if (j == i) + return; + bytes = (j - i) * ctl-unit; + info-bytes += bytes; + + if (update_stat) + bitmap_clear_bits(ctl, bitmap, end, bytes); + else + __bitmap_clear_bits(ctl, bitmap, end, bytes
[PATCH] Btrfs: fix corruption after write/fsync failure + fsync + log recovery
While writing to a file, in inode.c:cow_file_range() (and same applies to submit_compressed_extents()), after reserving an extent for the file data, we create a new extent map for the written range and insert it into the extent map cache. After that, we create an ordered operation, but if it fails (due to a transient/temporary-ENOMEM), we return without dropping that extent map, which points to a reserved extent that is freed when we return. A subsequent incremental fsync (when the btrfs inode doesn't have the flag BTRFS_INODE_NEEDS_FULL_SYNC) considers this extent map valid and logs a file extent item based on that extent map, which points to a disk extent that doesn't contain valid data - it was freed by us earlier, at this point it might contain any random/garbage data. Therefore, if we reach an error condition when cowing a file range after we added the new extent map to the cache, drop it from the cache before returning. Some sequence of steps that lead to this: $ mkfs.btrfs -f /dev/sdd $ mount -o commit= /dev/sdd /mnt $ cd /mnt $ xfs_io -f -c pwrite -S 0x01 -b 4096 0 4096 -c fsync foo $ xfs_io -c pwrite -S 0x02 -b 4096 4096 4096 $ sync $ od -t x1 foo 000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 * 001 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 * 002 $ xfs_io -c pwrite -S 0xa1 -b 4096 0 4096 foo # Now this write + fsync fail with -ENOMEM, which was returned by # btrfs_add_ordered_extent() in inode.c:cow_file_range(). $ xfs_io -c pwrite -S 0xff -b 4096 4096 4096 foo $ xfs_io -c fsync foo fsync: Cannot allocate memory # Now do a new write + fsync, which will succeed. Our previous # -ENOMEM was a transient/temporary error. 
$ xfs_io -c pwrite -S 0xee -b 4096 16384 4096 foo $ xfs_io -c fsync foo # Our file content (in page cache) is now: $ od -t x1 foo 000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 * 001 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff * 002 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 004 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee * 005 # Now reboot the machine, and mount the fs, so that fsync log replay # takes place. # The file content is now weird, in particular the first 8Kb, which # do not match our data before nor after the sync command above. $ od -t x1 foo 000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee * 001 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 * 002 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 004 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee * 005 # In fact these first 4Kb are a duplicate of the last 4kb block. # The last write got an extent map/file extent item that points to # the same disk extent that we got in the write+fsync that failed # with the -ENOMEM error. btrfs-debug-tree and btrfsck allow us to # verify that: $ btrfs-debug-tree /dev/sdd (...) 
item 6 key (257 EXTENT_DATA 0) itemoff 15819 itemsize 53 extent data disk byte 12582912 nr 8192 extent data offset 0 nr 8192 ram 8192 item 7 key (257 EXTENT_DATA 8192) itemoff 15766 itemsize 53 extent data disk byte 0 nr 0 extent data offset 0 nr 8192 ram 8192 item 8 key (257 EXTENT_DATA 16384) itemoff 15713 itemsize 53 extent data disk byte 12582912 nr 4096 extent data offset 0 nr 4096 ram 4096 $ umount /dev/sdd $ btrfsck /dev/sdd Checking filesystem on /dev/sdd UUID: db5e60e1-050d-41e6-8c7f-3d742dea5d8f checking extents extent item 12582912 has multiple extent items ref mismatch on [12582912 4096] extent item 1, found 2 Backref bytes do not match extent backref, bytenr=12582912, ref bytes=4096, backref bytes=8192 backpointer mismatch on [12582912 4096] Errors found in extent allocation tree or chunk allocation checking free space cache checking fs roots root 5 inode 257 errors 1000, some csum missing found 131074 bytes used err is 1 total csum bytes: 4 total tree bytes: 131072 total fs tree bytes: 32768 total extent tree bytes: 16384 btree space waste bytes: 123404 file data blocks allocated: 274432 referenced 274432 Btrfs v3.14.1-96-gcc7fd5a-dirty Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/inode.c | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c678dea..16e8146 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -792,8 +792,12 @@ retry: ins.offset, BTRFS_ORDERED_COMPRESSED, async_extent-compress_type); - if (ret) + if (ret
[PATCH v3] Btrfs: improve free space cache management and space allocation
While under random IO, a block group's free space cache eventually reaches a state where it has a mix of extent entries and bitmap entries representing free space regions. As later free space regions are returned to the cache, some of them are merged with existing extent entries if they are contiguous with them. But others are not merged, because despite the existence of adjacent free space regions in the cache, the merging doesn't happen because the existing free space regions are represented in bitmap extents. Even when new free space regions are merged with existing extent entries (enlarging the free space range they represent), we create chances of ending up with an enlarged region that is contiguous with some other region represented in a bitmap entry. Both clustered and non-clustered space allocation work by iterating over our extent and bitmap entries and skipping any that represents a region smaller than the allocation request (and giving preference to extent entries before bitmap entries). By having a contiguous free space region that is represented by 2 (or more) entries (mix of extent and bitmap entries), we end up not satisfying an allocation request with a size larger than the size of any of the entries but no larger than the sum of their sizes. This makes the caller assume we're under an ENOSPC condition or forces it to allocate multiple smaller space regions (as we do for file data writes), which adds extra overhead and more chances of causing fragmentation due to the smaller regions being all spread apart from each other (more likely when under concurrency). 
For example, if we have the following in the cache: * extent entry representing free space range: [128Mb - 256Kb, 128Mb[ * bitmap entry covering the range [128Mb, 256Mb[, but only with the bits representing the range [128Mb, 128Mb + 768Kb[ set - that is, only that space in this 128Mb area is marked as free An allocation request for 1Mb, starting at offset not greater than 128Mb - 256Kb, would fail before, despite the existence of such contiguous free space area in the cache. The caller could only allocate up to 768Kb of space at once and later another 256Kb (or vice-versa). In between each smaller allocation request, another task working on a different file/inode might come in and take that space, preventing the former task of getting a contiguous 1Mb region of free space. Therefore this change implements the ability to move free space from bitmap entries into existing and new free space regions represented with extent entries. This is done when a space region is added to the cache. A test was added to the sanity tests that explains in detail the issue too. Some performance test results with compilebench on a 4 cores machine, with 32Gb of ram and using an HDD follow. Test: compilebench -D /mnt -i 30 -r 1000 --makej Before this change: intial create total runs 30 avg 69.02 MB/s (user 0.28s sys 0.57s) compile total runs 30 avg 314.96 MB/s (user 0.12s sys 0.25s) read compiled tree total runs 3 avg 27.14 MB/s (user 1.52s sys 0.90s) delete compiled tree total runs 30 avg 3.14 seconds (user 0.15s sys 0.66s) After this change: intial create total runs 30 avg 68.37 MB/s (user 0.29s sys 0.55s) compile total runs 30 avg 382.83 MB/s (user 0.12s sys 0.24s) read compiled tree total runs 3 avg 27.82 MB/s (user 1.45s sys 0.97s) delete compiled tree total runs 30 avg 3.18 seconds (user 0.17s sys 0.65s) Signed-off-by: Filipe Manana fdman...@suse.com --- V2: Simplified bitmap search logic, shorter and cleaner now, and one less rbtree search. 
V3: Fixed a corner case where all bits in the bitmap to the left of our range were set but we didn't claim the first bit. Attempt regular extent merge if we were able to steal free space from a bitmap into our new extent. fs/btrfs/free-space-cache.c | 140 ++- fs/btrfs/tests/free-space-tests.c | 514 ++ 2 files changed, 653 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 2f0fe10..3384819 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1997,6 +1997,128 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, return merged; } +static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, +struct btrfs_free_space *info, +bool update_stat) +{ + struct btrfs_free_space *bitmap; + unsigned long i; + unsigned long j; + const u64 end = info-offset + info-bytes; + const u64 bitmap_offset = offset_to_bitmap(ctl, end); + u64 bytes; + + bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); + if (!bitmap) + return false; + + i = offset_to_bit(bitmap-offset, ctl-unit, end); + j = find_next_zero_bit(bitmap-bitmap, BITS_PER_BITMAP, i); + if (j == i
[PATCH] Btrfs: fix crash while a ranged msync() is ongoing
After the commit 7fc34a62ca4434a79c68e23e70ed26111b7a4cf8 (titled mm/msync.c: sync only the requested range in msync()), our fsync callback can be called with a range that covers only part of the file and not the whole file anymore. Under certain circumstances this leads to crashes that produce the following trace: [41074.641913] invalid opcode: [#1] SMP DEBUG_PAGEALLOC (...) [41074.642692] CPU: 0 PID: 24580 Comm: fsx Not tainted 3.16.0-fdm-btrfs-next-45+ #1 (...) [41074.643886] RIP: 0010:[a01ecc99] [a01ecc99] btrfs_ordered_update_i_size+0x279/0x2b0 [btrfs] (...) [41074.644919] Stack: (...) [41074.644919] Call Trace: [41074.644919] [a01db531] btrfs_truncate_inode_items+0x3f1/0xa10 [btrfs] [41074.644919] [a01eb54f] ? btrfs_get_logged_extents+0x4f/0x80 [btrfs] [41074.644919] [a02137a9] btrfs_log_inode+0x2f9/0x970 [btrfs] [41074.644919] [81090875] ? sched_clock_local+0x25/0xa0 [41074.644919] [8164a55e] ? mutex_unlock+0xe/0x10 [41074.644919] [810af51d] ? trace_hardirqs_on+0xd/0x10 [41074.644919] [a0214b4f] btrfs_log_inode_parent+0x1ef/0x560 [btrfs] [41074.644919] [811d0c55] ? dget_parent+0x5/0x180 [41074.644919] [a0215d11] btrfs_log_dentry_safe+0x51/0x80 [btrfs] [41074.644919] [a01e2d1a] btrfs_sync_file+0x1ba/0x3e0 [btrfs] [41074.644919] [811eda6b] vfs_fsync_range+0x1b/0x30 (...) 
The necessary conditions that lead to such a crash are: * an incremental fsync (when the inode doesn't have the BTRFS_INODE_NEEDS_FULL_SYNC flag set) happened for our file and it logged a file extent item ending at offset X; * the file got the flag BTRFS_INODE_NEEDS_FULL_SYNC set in its inode, due to a file truncate operation that reduces the file to a size smaller than X; * an msync call happens, with a range that doesn't cover the whole file and the end of this range, let's call it Y, is smaller than X; * btrfs_log_inode sees the flag BTRFS_INODE_NEEDS_FULL_SYNC set and calls btrfs_truncate_inode_items() to remove all items from the log tree that are associated with our file; * btrfs_truncate_inode_items() removes all of the inode's items, and the lowest file extent item it removed is the one ending at offset X, where X > 0 and X > Y - before returning, it calls btrfs_ordered_update_i_size() with an offset parameter set to X; * btrfs_ordered_update_i_size() sees that X is greater than the current ordered size (btrfs_inode's disk_i_size) and then it assumes there can't be any ongoing ordered operation with a range covering the offset X, calling a BUG_ON() if such an ordered operation exists. This assumption is made because the disk_i_size is only increased after the corresponding file extent item is added to the btree (btrfs_finish_ordered_io); * But because our msync/fsync covers only a limited range, such an ordered extent might exist, and our fsync callback (btrfs_sync_file) doesn't wait for such an ordered extent to finish when calling btrfs_wait_ordered_range(); And then by the time btrfs_ordered_update_i_size() is called, via: btrfs_sync_file() -> btrfs_log_dentry_safe() -> btrfs_log_inode_parent() -> btrfs_log_inode() -> btrfs_truncate_inode_items() -> btrfs_ordered_update_i_size() We hit the BUG_ON(), which could never happen when msync() used the whole file range when calling fsync (i.e. before 7fc34a62ca4434a79c68e23e70ed26111b7a4cf8). 
So just don't call btrfs_ordered_update_i_size() if we're removing inode items from a log tree, which isn't supposed to change the in memory inode's disk_i_size, and never did before commit 7fc34a62ca4434a79c68e23e70ed26111b7a4cf8 because we used to wait for all ordered extents (and therefore the end of any extent found by btrfs_truncate_inode_items was always smaller than the in memory inode's disk_i_size). Issue found while running xfstests/generic/127 (happens very rarely for me). Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 16e8146..c5ef9eb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4296,7 +4296,8 @@ out: btrfs_abort_transaction(trans, root, ret); } error: - if (last_size != (u64)-1) + if (last_size != (u64)-1 + root-root_key.objectid != BTRFS_TREE_LOG_OBJECTID) btrfs_ordered_update_i_size(inode, last_size, NULL); btrfs_free_path(path); return err; -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2] Btrfs: fix crash while doing a ranged fsync
While doing a ranged fsync, that is, one whose range doesn't cover the whole possible file range (0 to LLONG_MAX), we can crash under certain circumstances with a trace like the following: [41074.641913] invalid opcode: [#1] SMP DEBUG_PAGEALLOC (...) [41074.642692] CPU: 0 PID: 24580 Comm: fsx Not tainted 3.16.0-fdm-btrfs-next-45+ #1 (...) [41074.643886] RIP: 0010:[a01ecc99] [a01ecc99] btrfs_ordered_update_i_size+0x279/0x2b0 [btrfs] (...) [41074.644919] Stack: (...) [41074.644919] Call Trace: [41074.644919] [a01db531] btrfs_truncate_inode_items+0x3f1/0xa10 [btrfs] [41074.644919] [a01eb54f] ? btrfs_get_logged_extents+0x4f/0x80 [btrfs] [41074.644919] [a02137a9] btrfs_log_inode+0x2f9/0x970 [btrfs] [41074.644919] [81090875] ? sched_clock_local+0x25/0xa0 [41074.644919] [8164a55e] ? mutex_unlock+0xe/0x10 [41074.644919] [810af51d] ? trace_hardirqs_on+0xd/0x10 [41074.644919] [a0214b4f] btrfs_log_inode_parent+0x1ef/0x560 [btrfs] [41074.644919] [811d0c55] ? dget_parent+0x5/0x180 [41074.644919] [a0215d11] btrfs_log_dentry_safe+0x51/0x80 [btrfs] [41074.644919] [a01e2d1a] btrfs_sync_file+0x1ba/0x3e0 [btrfs] [41074.644919] [811eda6b] vfs_fsync_range+0x1b/0x30 (...) 
The necessary conditions that lead to such a crash are: * an incremental fsync (when the inode doesn't have the BTRFS_INODE_NEEDS_FULL_SYNC flag set) happened for our file and it logged a file extent item ending at offset X; * the file got the flag BTRFS_INODE_NEEDS_FULL_SYNC set in its inode, due to a file truncate operation that reduces the file to a size smaller than X; * a ranged fsync call happens (via an msync for example), with a range that doesn't cover the whole file and the end of this range, let's call it Y, is smaller than X; * btrfs_log_inode sees the flag BTRFS_INODE_NEEDS_FULL_SYNC set and calls btrfs_truncate_inode_items() to remove all items from the log tree that are associated with our file; * btrfs_truncate_inode_items() removes all of the inode's items, and the lowest file extent item it removed is the one ending at offset X, where X > 0 and X > Y - before returning, it calls btrfs_ordered_update_i_size() with an offset parameter set to X; * btrfs_ordered_update_i_size() sees that X is greater than the current ordered size (btrfs_inode's disk_i_size) and then it assumes there can't be any ongoing ordered operation with a range covering the offset X, calling a BUG_ON() if such an ordered operation exists. 
This assumption is made because the disk_i_size is only increased after the corresponding file extent item is added to the btree (btrfs_finish_ordered_io); * But because our fsync covers only a limited range, such an ordered extent might exist, and our fsync callback (btrfs_sync_file) doesn't wait for such ordered extent to finish when calling btrfs_wait_ordered_range(); And then by the time btrfs_ordered_update_i_size() is called, via: btrfs_sync_file() - btrfs_log_dentry_safe() - btrfs_log_inode_parent() - btrfs_log_inode() - btrfs_truncate_inode_items() - btrfs_ordered_update_i_size() We hit the BUG_ON(), which could never happen if the fsync range covered the whole possible file range (0 to LLONG_MAX), as we would wait for all ordered extents to finish before calling btrfs_truncate_inode_items(). So just don't call btrfs_ordered_update_i_size() if we're removing the inode's items from a log tree, which isn't supposed to change the in memory inode's disk_i_size. Issue found while running xfstests/generic/127 (happens very rarely for me), more specifically via the fsx calls that use memory mapped IO (and issue msync calls). Signed-off-by: Filipe Manana fdman...@suse.com --- V2: Updated commit message, such that it reflects the fact that ranged fsyncs are not used only by msync. fs/btrfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 16e8146..c5ef9eb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4296,7 +4296,8 @@ out: btrfs_abort_transaction(trans, root, ret); } error: - if (last_size != (u64)-1) + if (last_size != (u64)-1 + root-root_key.objectid != BTRFS_TREE_LOG_OBJECTID) btrfs_ordered_update_i_size(inode, last_size, NULL); btrfs_free_path(path); return err; -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: fix fsync data loss after a ranged fsync
While we're doing a full fsync (when the inode has the flag BTRFS_INODE_NEEDS_FULL_SYNC set) that is ranged too (covers only a portion of the file), we might have ordered operations that are started while we're logging the inode and that fall outside the fsync range. This means we can get extent maps outside our range added to the inode's extent map tree's modified list for which the corresponding ordered operation wasn't captured by our call to btrfs_get_logged_extents() - the fill delalloc callbacks, inode.c:cow_file_range() and inode.c:submit_compressed_extents() add an extent map to the modified list before creating the respective ordered operation - and they do this without holding the inode's mutex nor the inode's log mutex. Therefore when a full ranged fsync finishes don't remove every extent map from the modified list of extent maps - as for some of them, that fall outside our fsync range, we might have not waited for their respective ordered operation to finish (meaning the corresponding file extent item wasn't inserted into the fs/subvol tree yet), and we must let the next fsync (very likely a fast one that checks only the modified list) see this extent map and log a matching file extent item to the log btree and wait for its ordered operation to finish (if it's still ongoing). Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/file.c | 2 +- fs/btrfs/tree-log.c | 50 -- fs/btrfs/tree-log.h | 2 ++ 3 files changed, 43 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 66c4076..e5534c1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1979,7 +1979,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) btrfs_init_log_ctx(ctx); - ret = btrfs_log_dentry_safe(trans, root, dentry, ctx); + ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, ctx); if (ret 0) { /* Fallthrough and commit/free transaction. 
*/ ret = 1; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 5a917a6..8b18a2d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -94,8 +94,10 @@ #define LOG_WALK_REPLAY_ALL 3 static int btrfs_log_inode(struct btrfs_trans_handle *trans, -struct btrfs_root *root, struct inode *inode, -int inode_only); + struct btrfs_root *root, struct inode *inode, + int inode_only, + const loff_t start, + const loff_t end); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); @@ -3856,8 +3858,10 @@ process: * This handles both files and directories. */ static int btrfs_log_inode(struct btrfs_trans_handle *trans, -struct btrfs_root *root, struct inode *inode, -int inode_only) + struct btrfs_root *root, struct inode *inode, + int inode_only, + const loff_t start, + const loff_t end) { struct btrfs_path *path; struct btrfs_path *dst_path; @@ -4050,8 +4054,27 @@ log_extents: struct extent_map *em, *n; write_lock(tree-lock); - list_for_each_entry_safe(em, n, tree-modified_extents, list) + /* +* We can't just remove every em if we're called for a ranged +* fsync - that is, one that doesn't cover the whole possible +* file range (0 to LLONG_MAX). This is because we can have +* em's that fall outside the range and therefore their ordered +* operations haven't completed yet (btrfs_finish_ordered_io() +* not invoked yet). Their ordered operations might have started +* after we called btrfs_get_logged_extents() too, so we don't +* end up waiting for them to complete when syncing the log. +* Removing every em outside the range would make a subsequent +* fsync that does a fast search (BTRFS_INODE_NEEDS_FULL_SYNC +* flag not set) not log the extent represented by an em, +* therefore making us lose data after a log replay. 
+*/ + list_for_each_entry_safe(em, n, tree-modified_extents, list) { + if (em-mod_start end) + continue; + if (em-mod_start + em-mod_len = start) + continue; list_del_init(em-list); + } write_unlock(tree-lock); } @@ -4158,7 +4181,10 @@ out: */ static int btrfs_log_inode_parent
[PATCH] Btrfs: fix fsync race leading to invalid data after log replay
When the fsync callback (btrfs_sync_file) starts, it first waits for the writeback of any dirty pages to start and finish without holding the inode's mutex (to reduce contention). After this it acquires the inode's mutex and repeats that process via btrfs_wait_ordered_range only if we're doing a full sync (BTRFS_INODE_NEEDS_FULL_SYNC flag is set on the inode). This is not safe for a non full sync - we need to start and wait for writeback to finish of any pages that might have been made dirty before acquiring the inode's mutex and after that first step mentioned before. Why this is needed is explained by the following comment added to btrfs_sync_file: Right before acquiring the inode's mutex, we might have new writes dirtying pages, which won't immediately start the respective ordered operations - that is done through the fill_delalloc callbacks invoked from the writepage and writepages address space operations. So make sure we start all ordered operations before starting to log our inode. Not doing this means that while logging the inode, writeback could start and invoke writepage/writepages, which would call the fill_delalloc callbacks (cow_file_range, submit_compressed_extents). These callbacks add first an extent map to the modified list of extents and then create the respective ordered operation, which means in tree-log.c:btrfs_log_inode() we might capture all existing ordered operations (with btrfs_get_logged_extents()) before the fill_delalloc callback adds its ordered operation, and by the time we visit the modified list of extent maps (with btrfs_log_changed_extents()), we see and process the extent map they created. 
We then use their extent map to construct a file extent item for logging without waiting for the respective ordered operation to finish - these file extent items point to a disk location that might not have yet been written to, containing random data - so after a crash a log replay will make our inode have file extent items that point to disk locations containing invalid data. Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/file.c | 33 +++-- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e5534c1..5e9d108 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1912,12 +1912,33 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(root-log_batch); full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, BTRFS_I(inode)-runtime_flags); - if (full_sync) { - ret = btrfs_wait_ordered_range(inode, start, end - start + 1); - if (ret) { - mutex_unlock(inode-i_mutex); - goto out; - } + /* +* Right before acquiring the inode's mutex, we might have new writes +* dirtying pages, which won't immediately start the respective ordered +* operations - that is done through the fill_delalloc callbacks invoked +* from the writepage and writepages address space operations. So make +* sure we start all ordered operations before starting to log our +* inode. Not doing this means that while logging the inode, writeback +* could start and invoke writepage/writepages, which would call the +* fill_delalloc callbacks (cow_file_range, submit_compressed_extents). 
+* These callbacks add first an extent map to the modified list of +* extents and then create the respective ordered operation, which means +* in tree-log.c:btrfs_log_inode() we might capture all existing ordered +* operations (with btrfs_get_logged_extents()) before the fill_delalloc +* callback adds its ordered operation, and by the time we visit the +* modified list of extent maps (with btrfs_log_changed_extents()), we +* see and process the extent map they created. We then use their extent +* map to construct a file extent item for logging without waiting for +* the respective ordered operation to finish - these file extent items +* point to a disk location that might not have yet been written to, +* containing random data - so after a crash a log replay will make our +* inode have file extent items that point to disk locations containing +* invalid data. +*/ + ret = btrfs_wait_ordered_range(inode, start, end - start + 1); + if (ret) { + mutex_unlock(inode-i_mutex); + goto out; } atomic_inc(root-log_batch); -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2] Btrfs: fix fsync race leading to invalid data after log replay
When the fsync callback (btrfs_sync_file) starts, it first waits for the writeback of any dirty pages to start and finish without holding the inode's mutex (to reduce contention). After this it acquires the inode's mutex and repeats that process via btrfs_wait_ordered_range only if we're doing a full sync (BTRFS_INODE_NEEDS_FULL_SYNC flag is set on the inode). This is not safe for a non full sync - we need to start and wait for writeback to finish for any pages that might have been made dirty before acquiring the inode's mutex and after that first step mentioned before. Why this is needed is explained by the following comment added to btrfs_sync_file: Right before acquiring the inode's mutex, we might have new writes dirtying pages, which won't immediately start the respective ordered operations - that is done through the fill_delalloc callbacks invoked from the writepage and writepages address space operations. So make sure we start all ordered operations before starting to log our inode. Not doing this means that while logging the inode, writeback could start and invoke writepage/writepages, which would call the fill_delalloc callbacks (cow_file_range, submit_compressed_extents). These callbacks add first an extent map to the modified list of extents and then create the respective ordered operation, which means in tree-log.c:btrfs_log_inode() we might capture all existing ordered operations (with btrfs_get_logged_extents()) before the fill_delalloc callback adds its ordered operation, and by the time we visit the modified list of extent maps (with btrfs_log_changed_extents()), we see and process the extent map they created. 
We then use the extent map to construct a file extent item for logging without waiting for the respective ordered operation to finish - this file extent item points to a disk location that might not have yet been written to, containing random data - so after a crash a log replay will make our inode have file extent items that point to disk locations containing invalid data, as we returned success to userspace without waiting for the respective ordered operation to finish, because it wasn't captured by btrfs_get_logged_extents(). Signed-off-by: Filipe Manana fdman...@suse.com --- V2: Better comments and for the non full sync case, start only the ordered operations, instead of starting them and waiting for them to complete. Waiting for their completion is already done later by btrfs_sync_log(), so like this we can reduce some latency as some ordered operations might complete (or get closer to) while writing to the log tree (btrfs_log_dentry_safe). fs/btrfs/file.c | 78 ++--- 1 file changed, 63 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e5534c1..5427ba8 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1865,6 +1865,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp) return 0; } +static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) +{ + int ret; + + atomic_inc(BTRFS_I(inode)-sync_writers); + ret = filemap_fdatawrite_range(inode-i_mapping, start, end); + if (!ret test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, +BTRFS_I(inode)-runtime_flags)) + ret = filemap_fdatawrite_range(inode-i_mapping, start, end); + atomic_dec(BTRFS_I(inode)-sync_writers); + + return ret; +} + /* * fsync call for both files and directories. This logs the inode into * the tree log instead of forcing full commits whenever possible. @@ -1894,30 +1908,64 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * multi-task, and make the performance up. 
See * btrfs_wait_ordered_range for an explanation of the ASYNC check. */ - atomic_inc(BTRFS_I(inode)-sync_writers); - ret = filemap_fdatawrite_range(inode-i_mapping, start, end); - if (!ret test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, -BTRFS_I(inode)-runtime_flags)) - ret = filemap_fdatawrite_range(inode-i_mapping, start, end); - atomic_dec(BTRFS_I(inode)-sync_writers); + ret = start_ordered_ops(inode, start, end); if (ret) return ret; mutex_lock(inode-i_mutex); - - /* -* We flush the dirty pages again to avoid some dirty pages in the -* range being left. -*/ atomic_inc(root-log_batch); full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, BTRFS_I(inode)-runtime_flags); + /* +* We might have have had more pages made dirty after calling +* start_ordered_ops and before acquiring the inode's i_mutex. +*/ if (full_sync) { + /* +* For a full sync, we need
[PATCH v2] Btrfs: fix fsync data loss after a ranged fsync
While we're doing a full fsync (when the inode has the flag BTRFS_INODE_NEEDS_FULL_SYNC set) that is ranged too (covers only a portion of the file), we might have ordered operations that are started before or while we're logging the inode and that fall outside the fsync range. Therefore when a full ranged fsync finishes don't remove every extent map from the list of modified extent maps - as for some of them, that fall outside our fsync range, their respective ordered operation hasn't finished yet, meaning the corresponding file extent item wasn't inserted into the fs/subvol tree yet and therefore we didn't log it, and we must let the next fast fsync (one that checks only the modified list) see this extent map and log a matching file extent item to the log btree and wait for its ordered operation to finish (if it's still ongoing). Signed-off-by: Filipe Manana fdman...@suse.com --- V2: No code change, only updated the changelog and the comment, to make them more clear and accurate. fs/btrfs/file.c | 2 +- fs/btrfs/tree-log.c | 54 +++-- fs/btrfs/tree-log.h | 2 ++ 3 files changed, 47 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 66c4076..e5534c1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1979,7 +1979,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) btrfs_init_log_ctx(ctx); - ret = btrfs_log_dentry_safe(trans, root, dentry, ctx); + ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, ctx); if (ret 0) { /* Fallthrough and commit/free transaction. 
*/ ret = 1; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 5a917a6..93d3c16 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -94,8 +94,10 @@ #define LOG_WALK_REPLAY_ALL 3 static int btrfs_log_inode(struct btrfs_trans_handle *trans, -struct btrfs_root *root, struct inode *inode, -int inode_only); + struct btrfs_root *root, struct inode *inode, + int inode_only, + const loff_t start, + const loff_t end); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); @@ -3856,8 +3858,10 @@ process: * This handles both files and directories. */ static int btrfs_log_inode(struct btrfs_trans_handle *trans, -struct btrfs_root *root, struct inode *inode, -int inode_only) + struct btrfs_root *root, struct inode *inode, + int inode_only, + const loff_t start, + const loff_t end) { struct btrfs_path *path; struct btrfs_path *dst_path; @@ -4050,8 +4054,31 @@ log_extents: struct extent_map *em, *n; write_lock(tree-lock); - list_for_each_entry_safe(em, n, tree-modified_extents, list) + /* +* We can't just remove every em if we're called for a ranged +* fsync - that is, one that doesn't cover the whole possible +* file range (0 to LLONG_MAX). This is because we can have +* em's that fall outside the range we're logging and therefore +* their ordered operations haven't completed yet +* (btrfs_finish_ordered_io() not invoked yet). This means we +* didn't get their respective file extent item in the fs/subvol +* tree yet, and need to let the next fast fsync (one which +* consults the list of modified extent maps) find the em so +* that it logs a matching file extent item and waits for the +* respective ordered operation to complete (if it's still +* running). +* +* Removing every em outside the range we're logging would make +* the next fast fsync not log their matching file extent items, +* therefore making us lose data after a log replay. 
+*/ + list_for_each_entry_safe(em, n, tree-modified_extents, list) { + if (em-mod_start end) + continue; + if (em-mod_start + em-mod_len = start) + continue; list_del_init(em-list); + } write_unlock(tree-lock); } @@ -4158,7 +4185,10 @@ out: */ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - struct dentry
[PATCH] Btrfs: fix data corruption after fast fsync and writeback error
When we do a fast fsync, we start all ordered operations and then while they're running in parallel we visit the list of modified extent maps and construct their matching file extent items and write them to the log btree. After that, in btrfs_sync_log() we wait for all the ordered operations to finish (via btrfs_wait_logged_extents). The problem with this is that we were completely ignoring errors that can happen in the extent write path, such as -ENOSPC, a temporary -ENOMEM or -EIO errors for example. When such error happens, it means we have parts of the on disk extent that weren't written to, and so we end up logging file extent items that point to these extents that contain garbage/random data - so after a crash/reboot plus log replay, we get our inode's metadata pointing to those extents. This worked in contrast with the full (non-fast) fsync path, where we start all ordered operations, wait for them to finish and then write to the log btree. In this path, after each ordered operation completes we check if it's flagged with an error (BTRFS_ORDERED_IOERR) and return -EIO if so (via btrfs_wait_ordered_range). So if an error happens with any ordered operation, just return a -EIO error to userspace, so that it knows that not all of its previous writes were durably persisted and the application can take proper action (like redo the writes for e.g.) - and definitely not leave any file extent items in the log refer to non fully written extents. 
Signed-off-by: Filipe Manana fdman...@suse.com --- This patch applies on top of the patches with the titles: Btrfs: fix fsync data loss after a ranged fsync Btrfs: fix fsync race leading to invalid data after log replay fs/btrfs/file.c | 19 fs/btrfs/tree-log.c | 247 ++-- fs/btrfs/tree-log.h | 2 + 3 files changed, 166 insertions(+), 102 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5427ba8..4494b4e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2045,6 +2045,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ mutex_unlock(inode-i_mutex); + /* +* If any of the ordered extents had an error, just return it to user +* space, so that the application knows some writes didn't succeed and +* can take proper action (retry for e.g.). Blindly committing the +* transaction in this case, would fool userspace that everything was +* successful. And we also want to make sure our log doesn't contain +* file extent items pointing to extents that weren't fully written to - +* just like in the non fast fsync path, where we check for the ordered +* operation's error flag before writing to the log tree and return -EIO +* if any of them had this flag set (btrfs_wait_ordered_range) - +* therefore we need to check for errors in the ordered operations, +* which are indicated by ctx.io_err. 
+*/ + if (ctx.io_err) { + btrfs_end_transaction(trans, root); + ret = ctx.io_err; + goto out; + } + if (ret != BTRFS_NO_LOG_SYNC) { if (!ret) { ret = btrfs_sync_log(trans, root, ctx); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 93d3c16..128f301 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -97,7 +97,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, int inode_only, const loff_t start, - const loff_t end); + const loff_t end, + struct btrfs_log_ctx *ctx); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); @@ -3571,107 +3572,33 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) return 0; } -static int log_one_extent(struct btrfs_trans_handle *trans, - struct inode *inode, struct btrfs_root *root, - struct extent_map *em, struct btrfs_path *path, - struct list_head *logged_list) +static int wait_ordered_extents(struct btrfs_trans_handle *trans, + struct inode *inode, + struct btrfs_root *root, + const struct extent_map *em, + const struct list_head *logged_list, + bool *ordered_io_error) { - struct btrfs_root *log = root-log_root; - struct btrfs_file_extent_item *fi; - struct extent_buffer *leaf; struct btrfs_ordered_extent *ordered; - struct list_head ordered_sums; - struct
[PATCH v3] Btrfs: fix fsync data loss after a ranged fsync
While we're doing a full fsync (when the inode has the flag BTRFS_INODE_NEEDS_FULL_SYNC set) that is ranged too (covers only a portion of the file), we might have ordered operations that are started before or while we're logging the inode and that fall outside the fsync range. Therefore when a full ranged fsync finishes don't remove every extent map from the list of modified extent maps - as for some of them, that fall outside our fsync range, their respective ordered operation hasn't finished yet, meaning the corresponding file extent item wasn't inserted into the fs/subvol tree yet and therefore we didn't log it, and we must let the next fast fsync (one that checks only the modified list) see this extent map and log a matching file extent item to the log btree and wait for its ordered operation to finish (if it's still ongoing). A test case for xfstests follows. Signed-off-by: Filipe Manana fdman...@suse.com --- V2: No code change, only updated the changelog and the comment, to make them more clear and accurate. V3: Added missing condition to exclude extent map from the modified list and ensure btrfs_log_inode() is called for the next fsync if the modified list didn't get empty after logging the inode. This time this follows with a test case for xfstests that is better than my previous local test and benefits everyone. fs/btrfs/file.c | 2 +- fs/btrfs/tree-log.c | 78 ++--- fs/btrfs/tree-log.h | 2 ++ 3 files changed, 66 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 66c4076..e5534c1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1979,7 +1979,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) btrfs_init_log_ctx(ctx); - ret = btrfs_log_dentry_safe(trans, root, dentry, ctx); + ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, ctx); if (ret 0) { /* Fallthrough and commit/free transaction. 
*/ ret = 1; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 5a917a6..6d774c9 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -94,8 +94,10 @@ #define LOG_WALK_REPLAY_ALL 3 static int btrfs_log_inode(struct btrfs_trans_handle *trans, -struct btrfs_root *root, struct inode *inode, -int inode_only); + struct btrfs_root *root, struct inode *inode, + int inode_only, + const loff_t start, + const loff_t end); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); @@ -3856,8 +3858,10 @@ process: * This handles both files and directories. */ static int btrfs_log_inode(struct btrfs_trans_handle *trans, -struct btrfs_root *root, struct inode *inode, -int inode_only) + struct btrfs_root *root, struct inode *inode, + int inode_only, + const loff_t start, + const loff_t end) { struct btrfs_path *path; struct btrfs_path *dst_path; @@ -3874,6 +3878,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, int ins_nr; bool fast_search = false; u64 ino = btrfs_ino(inode); + struct extent_map_tree *em_tree = BTRFS_I(inode)-extent_tree; path = btrfs_alloc_path(); if (!path) @@ -4046,13 +4051,38 @@ log_extents: goto out_unlock; } } else if (inode_only == LOG_INODE_ALL) { - struct extent_map_tree *tree = BTRFS_I(inode)-extent_tree; struct extent_map *em, *n; - write_lock(tree-lock); - list_for_each_entry_safe(em, n, tree-modified_extents, list) + write_lock(em_tree-lock); + /* +* We can't just remove every em if we're called for a ranged +* fsync - that is, one that doesn't cover the whole possible +* file range (0 to LLONG_MAX). This is because we can have +* em's that fall outside the range we're logging and therefore +* their ordered operations haven't completed yet +* (btrfs_finish_ordered_io() not invoked yet). 
This means we +* didn't get their respective file extent item in the fs/subvol +* tree yet, and need to let the next fast fsync (one which +* consults the list of modified extent maps) find the em so +* that it logs a matching file extent item and waits for the +* respective ordered operation to complete (if it's still +* running
[PATCH] xfstests: generic: add test for double msync, motivated by a btrfs bug
This test is motivated by a btrfs issue where a ranged fsync would prevent a subsequent fsync from persisting any extents that were dirty at the time of the first fsync but that were outside the range of that first fsync (which should have been persisted by the second fsync). This bug in btrfs is fixed by the following linux kernel patch: Btrfs: fix fsync data loss after a ranged fsync Signed-off-by: Filipe Manana fdman...@suse.com --- tests/generic/325 | 106 ++ tests/generic/325.out | 19 + tests/generic/group | 1 + 3 files changed, 126 insertions(+) create mode 100755 tests/generic/325 create mode 100644 tests/generic/325.out diff --git a/tests/generic/325 b/tests/generic/325 new file mode 100755 index 000..c47e372 --- /dev/null +++ b/tests/generic/325 @@ -0,0 +1,106 @@ +#! /bin/bash +# FS QA Test No. 325 +# +# Make some pages/extents of a file dirty, do a ranged fsync that covers +# only some of the dirty pages/extents, and then do a regular fsync (or +# another ranged fsync that covers the remaining dirty pages/extents). +# Verify after that all extents were persisted. +# +# This test is motivated by a btrfs issue where the first ranged fsync +# would prevent the following fsync from persisting the remaining dirty +# pages/extents. This was fixed by the following btrfs kernel patch: +# +# Btrfs: fix fsync data loss after a ranged fsync +# +#--- +# Copyright (C) 2014 SUSE Linux Products GmbH. All Rights Reserved. +# Author: Filipe Manana fdman...@suse.com +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq + +here=`pwd` +status=1 # failure is the default! + +_cleanup() +{ + _cleanup_flakey +} +trap _cleanup; exit \$status 0 1 2 3 15 + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter +. ./common/dmflakey + +# real QA test starts here +_supported_fs generic +_supported_os Linux +_need_to_be_root +_require_scratch +_require_dm_flakey + +rm -f $seqres.full + +_scratch_mkfs $seqres.full 21 + +_init_flakey +_mount_flakey + +# Create the file first. +$XFS_IO_PROG -f -c pwrite -S 0xff 0 64K $SCRATCH_MNT/foo | _filter_xfs_io + +# Now sync the file data to disk using 'sync' and not an fsync. This is because +# in btrfs the first fsync clears the btrfs inode full fsync flag, which must +# be set when the first msync below happens in order to trigger the bug. +sync + +# Now update the first 4Kb and the last 4Kb of the file, using memory mapped IO +# because an msync(), since the linux kernel commit +# 7fc34a62ca4434a79c68e23e70ed26111b7a4cf8, invokes a ranged fsync. +# +# After those writes, msync a range covering the first 4Kb and then after +# perform a msync with a range covering the last 4Kb of the file. +# This second msync() used to be a no-op for that btrfs bug (and the first fsync +# didn't log the last 4Kb extent as expected too). 
+$XFS_IO_PROG \ + -c mmap -w 0 64K \ + -c mwrite -S 0xaa 0 4K \ + -c mwrite -S 0xbb 60K 4K \ + -c msync -s 0K 16K \ + -c msync -s 32K 32K \ + -c munmap\ + $SCRATCH_MNT/foo | _filter_xfs_io + +echo File content before crash/reboot: +od -t x1 $SCRATCH_MNT/foo + +_load_flakey_table $FLAKEY_DROP_WRITES +_unmount_flakey + +_load_flakey_table $FLAKEY_ALLOW_WRITES +_mount_flakey + +echo File content after crash/reboot and fs mount: +od -t x1 $SCRATCH_MNT/foo + +_unmount_flakey + +status=0 +exit diff --git a/tests/generic/325.out b/tests/generic/325.out new file mode 100644 index 000..9a78c3e --- /dev/null +++ b/tests/generic/325.out @@ -0,0 +1,19 @@ +QA output created by 325 +wrote 65536/65536 bytes at offset 0 +XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +File content before crash/reboot: +000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa +* +001 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff +* +017 bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb +* +020 +File content after crash/reboot and fs mount: +000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa +* +001 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff +* +017 bb bb bb bb bb
[PATCH v4] Btrfs: fix fsync data loss after a ranged fsync
While we're doing a full fsync (when the inode has the flag BTRFS_INODE_NEEDS_FULL_SYNC set) that is ranged too (covers only a portion of the file), we might have ordered operations that are started before or while we're logging the inode and that fall outside the fsync range. Therefore when a full ranged fsync finishes don't remove every extent map from the list of modified extent maps - as for some of them, that fall outside our fsync range, their respective ordered operation hasn't finished yet, meaning the corresponding file extent item wasn't inserted into the fs/subvol tree yet and therefore we didn't log it, and we must let the next fast fsync (one that checks only the modified list) see this extent map and log a matching file extent item to the log btree and wait for its ordered operation to finish (if it's still ongoing). A test case for xfstests follows. Signed-off-by: Filipe Manana fdman...@suse.com --- V2: No code change, only updated the changelog and the comment, to make them more clear and accurate. V3: Added missing condition to exclude extent map from the modified list and ensure btrfs_log_inode() is called for the next fsync if the modified list didn't get empty after logging the inode. This time this follows with a test case for xfstests that is better than my previous local test and benefits everyone. V4: Simplified em exclusion logic. fs/btrfs/file.c | 2 +- fs/btrfs/tree-log.c | 77 ++--- fs/btrfs/tree-log.h | 2 ++ 3 files changed, 64 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 66c4076..e5534c1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1979,7 +1979,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) btrfs_init_log_ctx(ctx); - ret = btrfs_log_dentry_safe(trans, root, dentry, ctx); + ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, ctx); if (ret 0) { /* Fallthrough and commit/free transaction. 
*/ ret = 1; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 5a917a6..82db14f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -94,8 +94,10 @@ #define LOG_WALK_REPLAY_ALL 3 static int btrfs_log_inode(struct btrfs_trans_handle *trans, -struct btrfs_root *root, struct inode *inode, -int inode_only); + struct btrfs_root *root, struct inode *inode, + int inode_only, + const loff_t start, + const loff_t end); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); @@ -3856,8 +3858,10 @@ process: * This handles both files and directories. */ static int btrfs_log_inode(struct btrfs_trans_handle *trans, -struct btrfs_root *root, struct inode *inode, -int inode_only) + struct btrfs_root *root, struct inode *inode, + int inode_only, + const loff_t start, + const loff_t end) { struct btrfs_path *path; struct btrfs_path *dst_path; @@ -3874,6 +3878,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, int ins_nr; bool fast_search = false; u64 ino = btrfs_ino(inode); + struct extent_map_tree *em_tree = BTRFS_I(inode)-extent_tree; path = btrfs_alloc_path(); if (!path) @@ -4046,13 +4051,35 @@ log_extents: goto out_unlock; } } else if (inode_only == LOG_INODE_ALL) { - struct extent_map_tree *tree = BTRFS_I(inode)-extent_tree; struct extent_map *em, *n; - write_lock(tree-lock); - list_for_each_entry_safe(em, n, tree-modified_extents, list) - list_del_init(em-list); - write_unlock(tree-lock); + write_lock(em_tree-lock); + /* +* We can't just remove every em if we're called for a ranged +* fsync - that is, one that doesn't cover the whole possible +* file range (0 to LLONG_MAX). This is because we can have +* em's that fall outside the range we're logging and therefore +* their ordered operations haven't completed yet +* (btrfs_finish_ordered_io() not invoked yet). 
This means we +* didn't get their respective file extent item in the fs/subvol +* tree yet, and need to let the next fast fsync (one which +* consults the list of modified extent maps) find the em so +* that it logs a matching file extent item
[PATCH] Btrfs: fix directory recovery from fsync log
When replaying a directory from the fsync log, if a directory entry exists both in the fs/subvol tree and in the log, the directory's inode got its i_size updated incorrectly, accounting for the dentry's name twice. Reproducer, from a test for xfstests: _scratch_mkfs >> $seqres.full 2>&1 _init_flakey _mount_flakey touch $SCRATCH_MNT/foo sync touch $SCRATCH_MNT/bar xfs_io -c "fsync" $SCRATCH_MNT xfs_io -c "fsync" $SCRATCH_MNT/bar _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey [ -f $SCRATCH_MNT/foo ] || echo "file foo is missing" [ -f $SCRATCH_MNT/bar ] || echo "file bar is missing" _unmount_flakey _check_scratch_fs $FLAKEY_DEV The filesystem check at the end failed with the message: "root 5 root dir 256 error". A test case for xfstests follows. Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/tree-log.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index faa568e..d60425c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1638,6 +1638,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, found_key.type == log_key.type found_key.offset == log_key.offset btrfs_dir_type(path-nodes[0], dst_di) == log_type) { + update_size = false; goto out; } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] xfstests: generic: add dir fsync test, motivated by a btrfs bug
This test is motivated by a bug found in btrfs when fsync'ing a directory. The issue was that if a directory entry is both found in the persisted metadata and in the fsync log, at log replay time the directory got set with a wrong i_size. This was fixed in btrfs with the following linux kernel patch: Btrfs: fix directory recovery from fsync log Signed-off-by: Filipe Manana fdman...@suse.com --- tests/generic/326 | 95 +++ tests/generic/326.out | 2 ++ tests/generic/group | 1 + 3 files changed, 98 insertions(+) create mode 100755 tests/generic/326 create mode 100644 tests/generic/326.out diff --git a/tests/generic/326 b/tests/generic/326 new file mode 100755 index 000..47f4558 --- /dev/null +++ b/tests/generic/326 @@ -0,0 +1,95 @@ +#! /bin/bash +# FS QA Test No. 326 +# +# This test is motivated by a bug found in btrfs when fsync'ing a directory. +# The issue was that if a directory entry is both found in the persisted +# metadata and in the fsync log, at log replay time the directory got set +# with a wrong i_size. This was fixed in btrfs with the following linux +# kernel patch: +# +# Btrfs: fix directory recovery from fsync log +# +#--- +# Copyright (C) 2014 SUSE Linux Products GmbH. All Rights Reserved. +# Author: Filipe Manana fdman...@suse.com +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq + +here=`pwd` +status=1 # failure is the default! + +_cleanup() +{ + _cleanup_flakey +} +trap _cleanup; exit \$status 0 1 2 3 15 + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter +. ./common/dmflakey + +# real QA test starts here +_supported_fs generic +_supported_os Linux +_need_to_be_root +_require_scratch +_require_dm_flakey + +rm -f $seqres.full + +_scratch_mkfs $seqres.full 21 + +_init_flakey +_mount_flakey + +touch $SCRATCH_MNT/foo + +# Invoke sync here because it's necessary to trigger the original bug in btrfs. +# The intention is that at log recovery time we have a dir entry for 'foo' both +# in the fs/subvol tree and in the log tree - this is necessary to trigger the +# bug on btrfs. +sync + +touch $SCRATCH_MNT/bar +xfs_io -c fsync $SCRATCH_MNT +xfs_io -c fsync $SCRATCH_MNT/bar + +_load_flakey_table $FLAKEY_DROP_WRITES +_unmount_flakey +_check_scratch_fs $FLAKEY_DEV + +_load_flakey_table $FLAKEY_ALLOW_WRITES +_mount_flakey + +[ -f $SCRATCH_MNT/foo ] || echo file foo is missing +[ -f $SCRATCH_MNT/bar ] || echo file bar is missing + +_unmount_flakey + +# In the original btrfs bug, the filesystem consistency check failed here +# because the directory inode got set with a wrong i_size by the log replay +# at mount time (dentry 'foo' was accounted for twice). 
+_check_scratch_fs $FLAKEY_DEV + +echo Silence is golden + +status=0 +exit diff --git a/tests/generic/326.out b/tests/generic/326.out new file mode 100644 index 000..4ac0db5 --- /dev/null +++ b/tests/generic/326.out @@ -0,0 +1,2 @@ +QA output created by 326 +Silence is golden diff --git a/tests/generic/group b/tests/generic/group index bdcfd9d..cc5fecc 100644 --- a/tests/generic/group +++ b/tests/generic/group @@ -145,3 +145,4 @@ 323 auto aio stress 324 auto fsr quick 325 auto quick data log +326 auto quick metadata log -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5] Btrfs: fix fsync data loss after a ranged fsync
While we're doing a full fsync (when the inode has the flag BTRFS_INODE_NEEDS_FULL_SYNC set) that is ranged too (covers only a portion of the file), we might have ordered operations that are started before or while we're logging the inode and that fall outside the fsync range. Therefore when a full ranged fsync finishes don't remove every extent map from the list of modified extent maps - as for some of them, that fall outside our fsync range, their respective ordered operation hasn't finished yet, meaning the corresponding file extent item wasn't inserted into the fs/subvol tree yet and therefore we didn't log it, and we must let the next fast fsync (one that checks only the modified list) see this extent map and log a matching file extent item to the log btree and wait for its ordered operation to finish (if it's still ongoing). A test case for xfstests follows. Signed-off-by: Filipe Manana fdman...@suse.com --- V2: No code change, only updated the changelog and the comment, to make them more clear and accurate. V3: Added missing condition to exclude extent map from the modified list and ensure btrfs_log_inode() is called for the next fsync if the modified list didn't get empty after logging the inode. This time this follows with a test case for xfstests that is better than my previous local test and benefits everyone. V4: Simplified em exclusion logic. V5: Removed the hack that doesn't set the inode's logged_trans and last_log_commit if the list of modified extent maps isn't empty. This prevented an unlink in the same transaction from removing the dentry from the log tree (if an fsync against the parent dir was made before). 
fs/btrfs/btrfs_inode.h | 13 ++-- fs/btrfs/file.c| 2 +- fs/btrfs/tree-log.c| 55 -- fs/btrfs/tree-log.h| 2 ++ 4 files changed, 58 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 74ff403..3511031 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -246,8 +246,17 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) BTRFS_I(inode)-last_sub_trans = BTRFS_I(inode)-last_log_commit BTRFS_I(inode)-last_sub_trans = - BTRFS_I(inode)-root-last_log_commit) - return 1; + BTRFS_I(inode)-root-last_log_commit) { + /* +* After a ranged fsync we might have left some extent maps +* (that fall outside the fsync's range). So return false +* here if the list isn't empty, to make sure btrfs_log_inode() +* will be called and process those extent maps. +*/ + smp_mb(); + if (list_empty(BTRFS_I(inode)-extent_tree.modified_extents)) + return 1; + } return 0; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 66c4076..e5534c1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1979,7 +1979,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) btrfs_init_log_ctx(ctx); - ret = btrfs_log_dentry_safe(trans, root, dentry, ctx); + ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, ctx); if (ret 0) { /* Fallthrough and commit/free transaction. */ ret = 1; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 5a917a6..cf4ead8 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -94,8 +94,10 @@ #define LOG_WALK_REPLAY_ALL 3 static int btrfs_log_inode(struct btrfs_trans_handle *trans, -struct btrfs_root *root, struct inode *inode, -int inode_only); + struct btrfs_root *root, struct inode *inode, + int inode_only, + const loff_t start, + const loff_t end); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); @@ -3856,8 +3858,10 @@ process: * This handles both files and directories. 
*/ static int btrfs_log_inode(struct btrfs_trans_handle *trans, -struct btrfs_root *root, struct inode *inode, -int inode_only) + struct btrfs_root *root, struct inode *inode, + int inode_only, + const loff_t start, + const loff_t end) { struct btrfs_path *path; struct btrfs_path *dst_path; @@ -4050,8 +4054,30 @@ log_extents: struct extent_map *em, *n; write_lock(tree-lock); - list_for_each_entry_safe(em, n, tree-modified_extents, list
[PATCH v2] xfstests: generic: add dir fsync test, motivated by a btrfs bug
This test is motivated by a bug found in btrfs when fsync'ing a directory. The issue was that if a directory entry is both found in the persisted metadata and in the fsync log, at log replay time the directory got set with a wrong i_size. This was fixed in btrfs with the following linux kernel patch: Btrfs: fix directory recovery from fsync log Signed-off-by: Filipe Manana fdman...@suse.com --- V2: Replaced direct use of xfs_io with $XFS_IO_PROG. tests/generic/326 | 95 +++ tests/generic/326.out | 2 ++ tests/generic/group | 1 + 3 files changed, 98 insertions(+) create mode 100755 tests/generic/326 create mode 100644 tests/generic/326.out diff --git a/tests/generic/326 b/tests/generic/326 new file mode 100755 index 000..886c856 --- /dev/null +++ b/tests/generic/326 @@ -0,0 +1,95 @@ +#! /bin/bash +# FS QA Test No. 326 +# +# This test is motivated by a bug found in btrfs when fsync'ing a directory. +# The issue was that if a directory entry is both found in the persisted +# metadata and in the fsync log, at log replay time the directory got set +# with a wrong i_size. This was fixed in btrfs with the following linux +# kernel patch: +# +# Btrfs: fix directory recovery from fsync log +# +#--- +# Copyright (C) 2014 SUSE Linux Products GmbH. All Rights Reserved. +# Author: Filipe Manana fdman...@suse.com +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq + +here=`pwd` +status=1 # failure is the default! + +_cleanup() +{ + _cleanup_flakey +} +trap _cleanup; exit \$status 0 1 2 3 15 + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter +. ./common/dmflakey + +# real QA test starts here +_supported_fs generic +_supported_os Linux +_need_to_be_root +_require_scratch +_require_dm_flakey + +rm -f $seqres.full + +_scratch_mkfs $seqres.full 21 + +_init_flakey +_mount_flakey + +touch $SCRATCH_MNT/foo + +# Invoke sync here because it's necessary to trigger the original bug in btrfs. +# The intention is that at log recovery time we have a dir entry for 'foo' both +# in the fs/subvol tree and in the log tree - this is necessary to trigger the +# bug on btrfs. +sync + +touch $SCRATCH_MNT/bar +$XFS_IO_PROG -c fsync $SCRATCH_MNT +$XFS_IO_PROG -c fsync $SCRATCH_MNT/bar + +_load_flakey_table $FLAKEY_DROP_WRITES +_unmount_flakey +_check_scratch_fs $FLAKEY_DEV + +_load_flakey_table $FLAKEY_ALLOW_WRITES +_mount_flakey + +[ -f $SCRATCH_MNT/foo ] || echo file foo is missing +[ -f $SCRATCH_MNT/bar ] || echo file bar is missing + +_unmount_flakey + +# In the original btrfs bug, the filesystem consistency check failed here +# because the directory inode got set with a wrong i_size by the log replay +# at mount time (dentry 'foo' was accounted for twice). 
+_check_scratch_fs $FLAKEY_DEV + +echo Silence is golden + +status=0 +exit diff --git a/tests/generic/326.out b/tests/generic/326.out new file mode 100644 index 000..4ac0db5 --- /dev/null +++ b/tests/generic/326.out @@ -0,0 +1,2 @@ +QA output created by 326 +Silence is golden diff --git a/tests/generic/group b/tests/generic/group index bdcfd9d..cc5fecc 100644 --- a/tests/generic/group +++ b/tests/generic/group @@ -145,3 +145,4 @@ 323 auto aio stress 324 auto fsr quick 325 auto quick data log +326 auto quick metadata log -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: add missing compression property remove in btrfs_ioctl_setflags
The behaviour of a 'chattr -c' consists of getting the current flags, clearing the FS_COMPR_FL bit and then sending the result to the set flags ioctl - this means the bit FS_NOCOMP_FL isn't set in the flags passed to the ioctl. This results in the compression property not being cleared from the inode - it was cleared only if the bit FS_NOCOMP_FL was set in the received flags. Reproducer: $ mkfs.btrfs -f /dev/sdd $ mount /dev/sdd /mnt cd /mnt $ mkdir a $ chattr +c a $ touch a/file $ lsattr a/file c--- a/file $ chattr -c a $ touch a/file2 $ lsattr a/file2 c--- a/file2 $ lsattr -d a a Reported-by: Andreas Schneider a...@cryptomilk.org Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/ioctl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a010c44..8e6950c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -333,6 +333,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } else { ip-flags = ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); + ret = btrfs_set_prop(inode, btrfs.compression, NULL, 0, 0); + if (ret ret != -ENODATA) + goto out_drop; } trans = btrfs_start_transaction(root, 1); -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2] Btrfs: add missing compression property remove in btrfs_ioctl_setflags
The behaviour of a 'chattr -c' consists of getting the current flags, clearing the FS_COMPR_FL bit and then sending the result to the set flags ioctl - this means the bit FS_NOCOMP_FL isn't set in the flags passed to the ioctl. This results in the compression property not being cleared from the inode - it was cleared only if the bit FS_NOCOMP_FL was set in the received flags. Reproducer: $ mkfs.btrfs -f /dev/sdd $ mount /dev/sdd /mnt cd /mnt $ mkdir a $ chattr +c a $ touch a/file $ lsattr a/file c--- a/file $ chattr -c a $ touch a/file2 $ lsattr a/file2 c--- a/file2 $ lsattr -d a a Reported-by: Andreas Schneider a...@cryptomilk.org Signed-off-by: Filipe Manana fdman...@suse.com --- V2: Ensure BTRFS_INODE_NOCOMPRESS isn't set (unless the bit FS_NOCOMP_FL is set). fs/btrfs/ioctl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a010c44..a46c169 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -332,6 +332,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) goto out_drop; } else { + ret = btrfs_set_prop(inode, btrfs.compression, NULL, 0, 0); + if (ret ret != -ENODATA) + goto out_drop; ip-flags = ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] xfstests: btrfs: add test regarding clearing compression flag/property
Regression test for btrfs where removing the flag FS_COMPR_FL (chattr -c) from an inode wouldn't clear its compression property. This was fixed in the following linux kernel patch: Btrfs: add missing compression property remove in btrfs_ioctl_setflags Signed-off-by: Filipe Manana fdman...@suse.com --- tests/btrfs/059 | 85 + tests/btrfs/059.out | 11 +++ tests/btrfs/group | 1 + 3 files changed, 97 insertions(+) create mode 100755 tests/btrfs/059 create mode 100644 tests/btrfs/059.out diff --git a/tests/btrfs/059 b/tests/btrfs/059 new file mode 100755 index 000..3379ead --- /dev/null +++ b/tests/btrfs/059 @@ -0,0 +1,85 @@ +#! /bin/bash +# FS QA Test No. btrfs/059 +# +# Regression test for btrfs where removing the flag FS_COMPR_FL (chattr -c) +# from an inode wouldn't clear its compression property. +# This was fixed in the following linux kernel patch: +# +# Btrfs: add missing compression property remove in btrfs_ioctl_setflags +# +#--- +# Copyright (C) 2014 SUSE Linux Products GmbH. All Rights Reserved. +# Author: Filipe Manana fdman...@suse.com +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq + +here=`pwd` +tmp=/tmp/$$ + +status=1 # failure is the default! +trap _cleanup; exit \$status 0 1 2 3 15 + +_cleanup() +{ + rm -fr $tmp +} + +# get standard environment, filters and checks +. ./common/rc +. 
./common/filter + +# real QA test starts here +_supported_fs btrfs +_supported_os Linux +_require_test +_require_scratch +_require_btrfs property +_need_to_be_root + +rm -f $seqres.full + +_scratch_mkfs $seqres.full 21 +_scratch_mount + +mkdir $SCRATCH_MNT/testdir +echo Setting compression flag in the directory... +chattr +c $SCRATCH_MNT/testdir +echo Directory compression property value: +$BTRFS_UTIL_PROG property get $SCRATCH_MNT/testdir compression + +touch $SCRATCH_MNT/testdir/file1 +echo file1 compression property value: +$BTRFS_UTIL_PROG property get $SCRATCH_MNT/testdir/file1 compression + +echo Clearing compression flag from directory... +chattr -c $SCRATCH_MNT/testdir +echo Directory compression property value: +$BTRFS_UTIL_PROG property get $SCRATCH_MNT/testdir compression + +touch $SCRATCH_MNT/testdir/file2 +echo file2 compression property value: +$BTRFS_UTIL_PROG property get $SCRATCH_MNT/testdir/file2 compression + +touch $SCRATCH_MNT/testdir/file1 +echo file1 compression property value: +$BTRFS_UTIL_PROG property get $SCRATCH_MNT/testdir/file1 compression + +status=0 +exit diff --git a/tests/btrfs/059.out b/tests/btrfs/059.out new file mode 100644 index 000..9ec9a53 --- /dev/null +++ b/tests/btrfs/059.out @@ -0,0 +1,11 @@ +QA output created by 059 +Setting compression flag in the directory... +Directory compression property value: +compression=zlib +file1 compression property value: +compression=zlib +Clearing compression flag from directory... 
+Directory compression property value: +file2 compression property value: +file1 compression property value: +compression=zlib diff --git a/tests/btrfs/group b/tests/btrfs/group index 3fa9778..68b5c79 100644 --- a/tests/btrfs/group +++ b/tests/btrfs/group @@ -61,3 +61,4 @@ 056 auto quick 057 auto quick 058 auto quick +059 auto quick -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: set inode's logged_trans/last_log_commit after ranged fsync
When a ranged fsync finishes if there are still extent maps in the modified list, still set the inode's logged_trans and last_log_commit. This is important in case an inode is fsync'ed and unlinked in the same transaction, to ensure its inode ref gets deleted from the log and the respective dentries in its parent are deleted too from the log (if the parent directory was fsync'ed in the same transaction). Instead make btrfs_inode_in_log() return false if the list of modified extent maps isn't empty. This is an incremental on top of the v4 version of the patch: Btrfs: fix fsync data loss after a ranged fsync which was added to its v5, but didn't make it on time. Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/btrfs_inode.h | 13 +++-- fs/btrfs/tree-log.c| 14 ++ 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 74ff403..3511031 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -246,8 +246,17 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) BTRFS_I(inode)-last_sub_trans = BTRFS_I(inode)-last_log_commit BTRFS_I(inode)-last_sub_trans = - BTRFS_I(inode)-root-last_log_commit) - return 1; + BTRFS_I(inode)-root-last_log_commit) { + /* +* After a ranged fsync we might have left some extent maps +* (that fall outside the fsync's range). So return false +* here if the list isn't empty, to make sure btrfs_log_inode() +* will be called and process those extent maps. 
+*/ + smp_mb(); + if (list_empty(BTRFS_I(inode)-extent_tree.modified_extents)) + return 1; + } return 0; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 82db14f..d7c1459 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4090,18 +4090,8 @@ log_extents: } } - write_lock(em_tree-lock); - /* -* If we're doing a ranged fsync and there are still modified extents -* in the list, we must run on the next fsync call as it might cover -* those extents (a full fsync or an fsync for other range). -*/ - if (list_empty(em_tree-modified_extents)) { - BTRFS_I(inode)-logged_trans = trans-transid; - BTRFS_I(inode)-last_log_commit = - BTRFS_I(inode)-last_sub_trans; - } - write_unlock(em_tree-lock); + BTRFS_I(inode)-logged_trans = trans-transid; + BTRFS_I(inode)-last_log_commit = BTRFS_I(inode)-last_sub_trans; out_unlock: if (unlikely(err)) btrfs_put_logged_extents(logged_list); -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/2] Btrfs: add missing end_page_writeback on submit_extent_page failure
If submit_extent_page() fails in write_one_eb(), we end up with the current page not marked dirty anymore, unlocked and marked for writeback. But we never end up calling end_page_writeback() against the page, which will make calls to filemap_fdatawait_range (e.g. at transaction commit time) hang forever waiting for the writeback bit to be cleared from the page. Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/extent_io.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3af4966..91f866c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3668,6 +3668,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, if (ret) { set_bit(EXTENT_BUFFER_IOERR, eb-bflags); SetPageError(p); + end_page_writeback(p); if (atomic_sub_and_test(num_pages - i, eb-io_pages)) end_extent_buffer_writeback(eb); ret = -EIO; -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/2] Btrfs: be aware of btree inode write errors to avoid fs corruption
While we have a transaction ongoing, the VM might decide at any time to call btree_inode->i_mapping->a_ops->writepages(), which will start writeback of dirty pages belonging to btree nodes/leafs. This call might return an error or the writeback might finish with an error before we attempt to commit the running transaction. If this happens, we might have no way of knowing that such error happened when we are committing the transaction - because the pages might no longer be marked dirty nor tagged for writeback (if a subsequent modification to the extent buffer didn't happen before the transaction commit) which makes filemap_fdata[write|wait]_range unable to find such pages (even if they're marked with SetPageError). So if this happens we must abort the transaction, otherwise we commit a super block with btree roots that point to btree nodes/leafs whose content on disk is invalid - either garbage or the content of some node/leaf from a past generation that got cowed or deleted and is no longer valid (for this latter case we end up getting error messages like "parent transid verify failed on 10826481664 wanted 25748 found 29562" when reading btree nodes/leafs from disk). Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's i_mapping would not be enough because we need to distinguish between log tree extents (not fatal) vs non-log tree extents (fatal) and because the next call to filemap_fdatawait_range() will catch and clear such errors in the mapping - and that call might be from a log sync and not from a transaction commit, which means we would not know about the error at transaction commit time. Also, checking for the eb flag EXTENT_BUFFER_IOERR at transaction commit time isn't done and would not be completely reliable, as the eb might be removed from memory and read back when trying to get it, which clears that flag right before reading the eb's pages from disk, making us not know about the previous write error. 
Using the BTRFS_INODE_BTREE_IO_ERR and BTRFS_INODE_BTREE_LOG_IO_ERR inode flags also makes us achieve the goal of AS_EIO/AS_ENOSPC when writepages() returns success, started writeback for all dirty pages and before filemap_fdatawait_range() is called, the writeback for all dirty pages had already finished with errors - because we were not using AS_EIO/AS_ENOSPC, filemap_fdatawait_range() would return success, as it could not know that writeback errors happened (the pages were no longer tagged for writeback). Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/btrfs_inode.h | 2 ++ fs/btrfs/extent_io.c | 69 +++--- fs/btrfs/transaction.c | 20 --- fs/btrfs/transaction.h | 3 +-- fs/btrfs/tree-log.c| 13 ++ 5 files changed, 93 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 3511031..dbe37dc 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -44,6 +44,8 @@ #define BTRFS_INODE_IN_DELALLOC_LIST 9 #define BTRFS_INODE_READDIO_NEED_LOCK 10 #define BTRFS_INODE_HAS_PROPS 11 +#define BTRFS_INODE_BTREE_IO_ERR 12 +#define BTRFS_INODE_BTREE_LOG_IO_ERR 13 /* in memory btrfs inode */ struct btrfs_inode { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 91f866c..33b113b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -20,6 +20,7 @@ #include locking.h #include rcu-string.h #include backref.h +#include transaction.h static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -3606,6 +3607,68 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb) wake_up_bit(eb-bflags, EXTENT_BUFFER_WRITEBACK); } +static void set_btree_ioerr(struct page *page, int err) +{ + struct extent_buffer *eb = (struct extent_buffer *)page-private; + const u64 start = eb-start; + const u64 end = eb-start + eb-len - 1; + struct btrfs_fs_info *fs_info = eb-fs_info; + int ret; + + set_bit(EXTENT_BUFFER_IOERR, eb-bflags); + SetPageError(page); + + /* +* If writeback 
for a btree extent that doesn't belong to a log tree +* failed, set the bit BTRFS_INODE_BTREE_IO_ERR in the inode btree. +* We do this because while the transaction is running and before it's +* committing (when we call filemap_fdata[write|wait]_range against +* the btree inode), we might have +* btree_inode-i_mapping-a_ops-writepages() called by the VM - if it +* returns an error or an error happens during writeback, when we're +* committing the transaction we wouldn't know about it, since the pages +* can be no longer dirty nor marked anymore for writeback (if a +* subsequent modification to the extent buffer didn't happen before the +* transaction commit), which makes filemap_fdata[write|wait]_range not +* able to find
[PATCH 2/2 v2] Btrfs: be aware of btree inode write errors to avoid fs corruption
While we have a transaction ongoing, the VM might decide at any time to call btree_inode->i_mapping->a_ops->writepages(), which will start writeback of dirty pages belonging to btree nodes/leafs. This call might return an error or the writeback might finish with an error before we attempt to commit the running transaction. If this happens, we might have no way of knowing that such error happened when we are committing the transaction - because the pages might no longer be marked dirty nor tagged for writeback (if a subsequent modification to the extent buffer didn't happen before the transaction commit) which makes filemap_fdata[write|wait]_range unable to find such pages (even if they're marked with SetPageError). So if this happens we must abort the transaction, otherwise we commit a super block with btree roots that point to btree nodes/leafs whose content on disk is invalid - either garbage or the content of some node/leaf from a past generation that got cowed or deleted and is no longer valid (for this latter case we end up getting error messages like "parent transid verify failed on 10826481664 wanted 25748 found 29562" when reading btree nodes/leafs from disk). Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's i_mapping would not be enough because we need to distinguish between log tree extents (not fatal) vs non-log tree extents (fatal) and because the next call to filemap_fdatawait_range() will catch and clear such errors in the mapping - and that call might be from a log sync and not from a transaction commit, which means we would not know about the error at transaction commit time. Also, checking for the eb flag EXTENT_BUFFER_IOERR at transaction commit time isn't done and would not be completely reliable, as the eb might be removed from memory and read back when trying to get it, which clears that flag right before reading the eb's pages from disk, making us not know about the previous write error. 
Using the new counters eb_write_errors and log_eb_write_errors in the transaction also makes us achieve the goal of AS_EIO/AS_ENOSPC when writepages() returns success, started writeback for all dirty pages and before filemap_fdatawait_range() is called, the writeback for all dirty pages had already finished with errors - because we were not using AS_EIO/AS_ENOSPC, filemap_fdatawait_range() would return success, as it could not know that writeback errors happened (the pages were no longer tagged for writeback). Signed-off-by: Filipe Manana fdman...@suse.com --- V2: If an extent buffer's write failed but it's also deleted from the tree before the transaction commits, don't abort the transaction with -EIO, since the unwritten node/leaf it represents can't be pointed to by any other node in a tree. fs/btrfs/disk-io.c | 4 +-- fs/btrfs/extent-tree.c | 8 ++ fs/btrfs/extent_io.c | 76 +- fs/btrfs/extent_io.h | 3 +- fs/btrfs/transaction.c | 22 +-- fs/btrfs/transaction.h | 5 ++-- fs/btrfs/tree-log.c| 13 + 7 files changed, 111 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 23393ec..8b54acf 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -610,7 +610,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, goto err; eb-read_mirror = mirror; - if (test_bit(EXTENT_BUFFER_IOERR, eb-bflags)) { + if (test_bit(EXTENT_BUFFER_READ_ERR, eb-bflags)) { ret = -EIO; goto err; } @@ -683,7 +683,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) struct btrfs_root *root = BTRFS_I(page-mapping-host)-root; eb = (struct extent_buffer *)page-private; - set_bit(EXTENT_BUFFER_IOERR, eb-bflags); + set_bit(EXTENT_BUFFER_READ_ERR, eb-bflags); eb-read_mirror = failed_mirror; atomic_dec(eb-io_pages); if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, eb-bflags)) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ef0845d..608814b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ 
-6221,6 +6221,14 @@ out: * anymore. */ clear_bit(EXTENT_BUFFER_CORRUPT, buf-bflags); + /* +* The unwritten node/leaf (due to an IO error) isn't pointed to by any +* other node in a tree, so it's safe to forget about the write error +* and avoid a transaction abort. +*/ + if (test_and_clear_bit(EXTENT_BUFFER_WRITE_ERR, buf-bflags)) + atomic_dec(trans-transaction-eb_write_errors); + btrfs_put_block_group(cache); } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 91f866c..e21f200 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -20,6 +20,7 @@ #include locking.h #include rcu-string.h #include backref.h +#include transaction.h static struct kmem_cache
[PATCH 2/2 v3] Btrfs: be aware of btree inode write errors to avoid fs corruption
While we have a transaction ongoing, the VM might decide at any time to call btree_inode->i_mapping->a_ops->writepages(), which will start writeback of dirty pages belonging to btree nodes/leafs. This call might return an error or the writeback might finish with an error before we attempt to commit the running transaction. If this happens, we might have no way of knowing that such error happened when we are committing the transaction - because the pages might no longer be marked dirty nor tagged for writeback (if a subsequent modification to the extent buffer didn't happen before the transaction commit) which makes filemap_fdata[write|wait]_range unable to find such pages (even if they're marked with SetPageError). So if this happens we must abort the transaction, otherwise we commit a super block with btree roots that point to btree nodes/leafs whose content on disk is invalid - either garbage or the content of some node/leaf from a past generation that got cowed or deleted and is no longer valid (for this latter case we end up getting error messages like "parent transid verify failed on 10826481664 wanted 25748 found 29562" when reading btree nodes/leafs from disk). Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's i_mapping would not be enough because we need to distinguish between log tree extents (not fatal) vs non-log tree extents (fatal) and because the next call to filemap_fdatawait_range() will catch and clear such errors in the mapping - and that call might be from a log sync and not from a transaction commit, which means we would not know about the error at transaction commit time. Also, checking for the eb flag EXTENT_BUFFER_IOERR at transaction commit time isn't done and would not be completely reliable, as the eb might be removed from memory and read back when trying to get it, which clears that flag right before reading the eb's pages from disk, making us not know about the previous write error. 
Using the new counters eb_write_errors and log_eb_write_errors in the transaction also makes us achieve the goal of AS_EIO/AS_ENOSPC when writepages() returns success, started writeback for all dirty pages and before filemap_fdatawait_range() is called, the writeback for all dirty pages had already finished with errors - because we were not using AS_EIO/AS_ENOSPC, filemap_fdatawait_range() would return success, as it could not know that writeback errors happened (the pages were no longer tagged for writeback). Signed-off-by: Filipe Manana fdman...@suse.com --- V2: If an extent buffer's write failed but it's also deleted from the tree before the transaction commits, don't abort the transaction with -EIO, since the unwritten node/leaf it represents can't be pointed to by any other node in a tree. V3: Correct V2, missed unstaged changes. fs/btrfs/disk-io.c | 4 +-- fs/btrfs/extent-tree.c | 8 ++ fs/btrfs/extent_io.c | 76 +- fs/btrfs/extent_io.h | 3 +- fs/btrfs/transaction.c | 22 +-- fs/btrfs/transaction.h | 5 ++-- fs/btrfs/tree-log.c| 13 + 7 files changed, 111 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 23393ec..8b54acf 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -610,7 +610,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, goto err; eb-read_mirror = mirror; - if (test_bit(EXTENT_BUFFER_IOERR, eb-bflags)) { + if (test_bit(EXTENT_BUFFER_READ_ERR, eb-bflags)) { ret = -EIO; goto err; } @@ -683,7 +683,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) struct btrfs_root *root = BTRFS_I(page-mapping-host)-root; eb = (struct extent_buffer *)page-private; - set_bit(EXTENT_BUFFER_IOERR, eb-bflags); + set_bit(EXTENT_BUFFER_READ_ERR, eb-bflags); eb-read_mirror = failed_mirror; atomic_dec(eb-io_pages); if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, eb-bflags)) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ef0845d..608814b 100644 --- 
a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6221,6 +6221,14 @@ out: * anymore. */ clear_bit(EXTENT_BUFFER_CORRUPT, buf-bflags); + /* +* The unwritten node/leaf (due to an IO error) isn't pointed to by any +* other node in a tree, so it's safe to forget about the write error +* and avoid a transaction abort. +*/ + if (test_and_clear_bit(EXTENT_BUFFER_WRITE_ERR, buf-bflags)) + atomic_dec(trans-transaction-eb_write_errors); + btrfs_put_block_group(cache); } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 91f866c..da1706f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -20,6 +20,7 @@ #include locking.h #include rcu-string.h #include backref.h +#include transaction.h
[PATCH 2/2 v4] Btrfs: be aware of btree inode write errors to avoid fs corruption
While we have a transaction ongoing, the VM might decide at any time to call btree_inode->i_mapping->a_ops->writepages(), which will start writeback of dirty pages belonging to btree nodes/leafs. This call might return an error or the writeback might finish with an error before we attempt to commit the running transaction. If this happens, we might have no way of knowing that such error happened when we are committing the transaction - because the pages might no longer be marked dirty nor tagged for writeback (if a subsequent modification to the extent buffer didn't happen before the transaction commit) which makes filemap_fdata[write|wait]_range unable to find such pages (even if they're marked with SetPageError). So if this happens we must abort the transaction, otherwise we commit a super block with btree roots that point to btree nodes/leafs whose content on disk is invalid - either garbage or the content of some node/leaf from a past generation that got cowed or deleted and is no longer valid (for this latter case we end up getting error messages like "parent transid verify failed on 10826481664 wanted 25748 found 29562" when reading btree nodes/leafs from disk). Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's i_mapping would not be enough because we need to distinguish between log tree extents (not fatal) vs non-log tree extents (fatal) and because the next call to filemap_fdatawait_range() will catch and clear such errors in the mapping - and that call might be from a log sync and not from a transaction commit, which means we would not know about the error at transaction commit time. Also, checking for the eb flag EXTENT_BUFFER_IOERR at transaction commit time isn't done and would not be completely reliable, as the eb might be removed from memory and read back when trying to get it, which clears that flag right before reading the eb's pages from disk, making us not know about the previous write error. 
Using the new counters eb_write_errors and log_eb_write_errors in the transaction also makes us achieve the goal of AS_EIO/AS_ENOSPC when writepages() returns success, started writeback for all dirty pages and before filemap_fdatawait_range() is called, the writeback for all dirty pages had already finished with errors - because we were not using AS_EIO/AS_ENOSPC, filemap_fdatawait_range() would return success, as it could not know that writeback errors happened (the pages were no longer tagged for writeback). Signed-off-by: Filipe Manana fdman...@suse.com --- V2: If an extent buffer's write failed but it's also deleted from the tree before the transaction commits, don't abort the transaction with -EIO, since the unwritten node/leaf it represents can't be pointed to by any other node in a tree. V3: Correct V2, missed unstaged changes. V4: Use root's key to figure out which counter to update. fs/btrfs/disk-io.c | 4 +-- fs/btrfs/extent-tree.c | 12 + fs/btrfs/extent_io.c | 71 +- fs/btrfs/extent_io.h | 3 ++- fs/btrfs/transaction.c | 22 +--- fs/btrfs/transaction.h | 5 ++-- fs/btrfs/tree-log.c| 13 + 7 files changed, 110 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 23393ec..8b54acf 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -610,7 +610,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, goto err; eb-read_mirror = mirror; - if (test_bit(EXTENT_BUFFER_IOERR, eb-bflags)) { + if (test_bit(EXTENT_BUFFER_READ_ERR, eb-bflags)) { ret = -EIO; goto err; } @@ -683,7 +683,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) struct btrfs_root *root = BTRFS_I(page-mapping-host)-root; eb = (struct extent_buffer *)page-private; - set_bit(EXTENT_BUFFER_IOERR, eb-bflags); + set_bit(EXTENT_BUFFER_READ_ERR, eb-bflags); eb-read_mirror = failed_mirror; atomic_dec(eb-io_pages); if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, eb-bflags)) diff --git a/fs/btrfs/extent-tree.c 
b/fs/btrfs/extent-tree.c index ef0845d..bdacd33 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6221,6 +6221,18 @@ out: * anymore. */ clear_bit(EXTENT_BUFFER_CORRUPT, buf-bflags); + /* +* The unwritten node/leaf (due to an IO error) isn't pointed to by any +* other node in a tree, so it's safe to forget about the write error +* and avoid a transaction abort. +*/ + if (test_and_clear_bit(EXTENT_BUFFER_WRITE_ERR, buf-bflags)) { + if (root-root_key.objectid == BTRFS_TREE_LOG_OBJECTID) + atomic_dec(trans-transaction-log_eb_write_errors); + else + atomic_dec(trans-transaction-eb_write_errors); + } + btrfs_put_block_group(cache
[PATCH 2/2 v5] Btrfs: be aware of btree inode write errors to avoid fs corruption
While we have a transaction ongoing, the VM might decide at any time to call btree_inode->i_mapping->a_ops->writepages(), which will start writeback of dirty pages belonging to btree nodes/leafs. This call might return an error or the writeback might finish with an error before we attempt to commit the running transaction. If this happens, we might have no way of knowing that such an error happened when we are committing the transaction - because the pages might no longer be marked dirty nor tagged for writeback (if a subsequent modification to the extent buffer didn't happen before the transaction commit) which makes filemap_fdata[write|wait]_range unable to find such pages (even if they're marked with SetPageError). So if this happens we must abort the transaction, otherwise we commit a super block with btree roots that point to btree nodes/leafs whose content on disk is invalid - either garbage or the content of some node/leaf from a past generation that got cowed or deleted and is no longer valid (for this latter case we end up getting error messages like "parent transid verify failed on 10826481664 wanted 25748 found 29562" when reading btree nodes/leafs from disk). Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's i_mapping would not be enough because we need to distinguish between log tree extents (not fatal) vs non-log tree extents (fatal) and because the next call to filemap_fdatawait_range() will catch and clear such errors in the mapping - and that call might be from a log sync and not from a transaction commit, which means we would not know about the error at transaction commit time. Also, checking for the eb flag EXTENT_BUFFER_IOERR at transaction commit time isn't done and would not be completely reliable, as the eb might be removed from memory and read back when trying to get it, which clears that flag right before reading the eb's pages from disk, making us not know about the previous write error. 
Using the new counters eb_write_errors and log_eb_write_errors in the transaction also makes us achieve the goal of AS_EIO/AS_ENOSPC when writepages() returns success, started writeback for all dirty pages and before filemap_fdatawait_range() is called, the writeback for all dirty pages had already finished with errors - because we were not using AS_EIO/AS_ENOSPC, filemap_fdatawait_range() would return success, as it could not know that writeback errors happened (the pages were no longer tagged for writeback). Signed-off-by: Filipe Manana fdman...@suse.com --- V2: If an extent buffer's write failed but it's also deleted from the tree before the transaction commits, don't abort the transaction with -EIO, since the unwritten node/leaf it represents can't be pointed to by any other node in a tree. V3: Correct V2, missed unstaged changes. V4: Use root's key to figure out which counter to update. V5: Decrement the error counters too when an eb is made dirty again (the next write attempt might succeed). 
fs/btrfs/disk-io.c | 4 +-- fs/btrfs/extent-tree.c | 1 + fs/btrfs/extent_io.c | 98 ++ fs/btrfs/extent_io.h | 4 ++- fs/btrfs/transaction.c | 21 +-- fs/btrfs/transaction.h | 5 +-- fs/btrfs/tree-log.c| 13 --- 7 files changed, 126 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 23393ec..8b54acf 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -610,7 +610,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, goto err; eb-read_mirror = mirror; - if (test_bit(EXTENT_BUFFER_IOERR, eb-bflags)) { + if (test_bit(EXTENT_BUFFER_READ_ERR, eb-bflags)) { ret = -EIO; goto err; } @@ -683,7 +683,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) struct btrfs_root *root = BTRFS_I(page-mapping-host)-root; eb = (struct extent_buffer *)page-private; - set_bit(EXTENT_BUFFER_IOERR, eb-bflags); + set_bit(EXTENT_BUFFER_READ_ERR, eb-bflags); eb-read_mirror = failed_mirror; atomic_dec(eb-io_pages); if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, eb-bflags)) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ef0845d..ec185b5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6221,6 +6221,7 @@ out: * anymore. */ clear_bit(EXTENT_BUFFER_CORRUPT, buf-bflags); + clear_extent_buffer_write_err(buf); btrfs_put_block_group(cache); } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 91f866c..ed1be9c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -20,6 +20,7 @@ #include locking.h #include rcu-string.h #include backref.h +#include transaction.h static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -3606,6 +3607,63 @@ static void
[PATCH 2/2 v6] Btrfs: be aware of btree inode write errors to avoid fs corruption
While we have a transaction ongoing, the VM might decide at any time to call btree_inode->i_mapping->a_ops->writepages(), which will start writeback of dirty pages belonging to btree nodes/leafs. This call might return an error or the writeback might finish with an error before we attempt to commit the running transaction. If this happens, we might have no way of knowing that such an error happened when we are committing the transaction - because the pages might no longer be marked dirty nor tagged for writeback (if a subsequent modification to the extent buffer didn't happen before the transaction commit) which makes filemap_fdata[write|wait]_range unable to find such pages (even if they're marked with SetPageError). So if this happens we must abort the transaction, otherwise we commit a super block with btree roots that point to btree nodes/leafs whose content on disk is invalid - either garbage or the content of some node/leaf from a past generation that got cowed or deleted and is no longer valid (for this latter case we end up getting error messages like "parent transid verify failed on 10826481664 wanted 25748 found 29562" when reading btree nodes/leafs from disk). Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's i_mapping would not be enough because we need to distinguish between log tree extents (not fatal) vs non-log tree extents (fatal) and because the next call to filemap_fdatawait_range() will catch and clear such errors in the mapping - and that call might be from a log sync and not from a transaction commit, which means we would not know about the error at transaction commit time. Also, checking for the eb flag EXTENT_BUFFER_IOERR at transaction commit time isn't done and would not be completely reliable, as the eb might be removed from memory and read back when trying to get it, which clears that flag right before reading the eb's pages from disk, making us not know about the previous write error. 
Using the new counters eb_write_errors and log_eb_write_errors in the transaction also makes us achieve the goal of AS_EIO/AS_ENOSPC when writepages() returns success, started writeback for all dirty pages and before filemap_fdatawait_range() is called, the writeback for all dirty pages had already finished with errors - because we were not using AS_EIO/AS_ENOSPC, filemap_fdatawait_range() would return success, as it could not know that writeback errors happened (the pages were no longer tagged for writeback). Signed-off-by: Filipe Manana fdman...@suse.com --- V2: If an extent buffer's write failed but it's also deleted from the tree before the transaction commits, don't abort the transaction with -EIO, since the unwritten node/leaf it represents can't be pointed to by any other node in a tree. V3: Correct V2, missed unstaged changes. V4: Use root's key to figure out which counter to update. V5: Decrement the error counters too when an eb is made dirty again (the next write attempt might succeed). V6: Moved counters from transaction struct to fs_info struct, because there's a (short) time window where fs_info-running_transaction is NULL. There's now 2 counters for log extent buffers too, each one representing a different log transaction. fs/btrfs/ctree.h | 3 ++ fs/btrfs/disk-io.c | 7 +++- fs/btrfs/extent-tree.c | 1 + fs/btrfs/extent_io.c | 105 + fs/btrfs/extent_io.h | 4 +- fs/btrfs/transaction.c | 18 + fs/btrfs/tree-log.c| 23 +++ fs/btrfs/tree-log.h| 2 + 8 files changed, 153 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f20b60d..0e5ca39 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1510,6 +1510,9 @@ struct btrfs_fs_info { atomic_t async_delalloc_pages; atomic_t open_ioctl_trans; + atomic_t eb_write_errors; + atomic_t log_eb_write_errors[2]; + /* * this is used to protect the following list -- ordered_roots. 
*/ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 23393ec..8f1deca 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -610,7 +610,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, goto err; eb-read_mirror = mirror; - if (test_bit(EXTENT_BUFFER_IOERR, eb-bflags)) { + if (test_bit(EXTENT_BUFFER_READ_ERR, eb-bflags)) { ret = -EIO; goto err; } @@ -683,7 +683,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) struct btrfs_root *root = BTRFS_I(page-mapping-host)-root; eb = (struct extent_buffer *)page-private; - set_bit(EXTENT_BUFFER_IOERR, eb-bflags); + set_bit(EXTENT_BUFFER_READ_ERR, eb-bflags); eb-read_mirror = failed_mirror; atomic_dec(eb-io_pages); if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, eb-bflags)) @@ -2271,6 +2271,9 @@ int
[PATCH 2/2 v7] Btrfs: be aware of btree inode write errors to avoid fs corruption
While we have a transaction ongoing, the VM might decide at any time to call btree_inode->i_mapping->a_ops->writepages(), which will start writeback of dirty pages belonging to btree nodes/leafs. This call might return an error or the writeback might finish with an error before we attempt to commit the running transaction. If this happens, we might have no way of knowing that such an error happened when we are committing the transaction - because the pages might no longer be marked dirty nor tagged for writeback (if a subsequent modification to the extent buffer didn't happen before the transaction commit) which makes filemap_fdata[write|wait]_range unable to find such pages (even if they're marked with SetPageError). So if this happens we must abort the transaction, otherwise we commit a super block with btree roots that point to btree nodes/leafs whose content on disk is invalid - either garbage or the content of some node/leaf from a past generation that got cowed or deleted and is no longer valid (for this latter case we end up getting error messages like "parent transid verify failed on 10826481664 wanted 25748 found 29562" when reading btree nodes/leafs from disk). Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's i_mapping would not be enough because we need to distinguish between log tree extents (not fatal) vs non-log tree extents (fatal) and because the next call to filemap_fdatawait_range() will catch and clear such errors in the mapping - and that call might be from a log sync and not from a transaction commit, which means we would not know about the error at transaction commit time. Also, checking for the eb flag EXTENT_BUFFER_IOERR at transaction commit time isn't done and would not be completely reliable, as the eb might be removed from memory and read back when trying to get it, which clears that flag right before reading the eb's pages from disk, making us not know about the previous write error. 
Using the new counters eb_write_errors and log_eb_write_errors in the transaction also makes us achieve the goal of AS_EIO/AS_ENOSPC when writepages() returns success, started writeback for all dirty pages and before filemap_fdatawait_range() is called, the writeback for all dirty pages had already finished with errors - because we were not using AS_EIO/AS_ENOSPC, filemap_fdatawait_range() would return success, as it could not know that writeback errors happened (the pages were no longer tagged for writeback). Signed-off-by: Filipe Manana fdman...@suse.com --- V2: If an extent buffer's write failed but it's also deleted from the tree before the transaction commits, don't abort the transaction with -EIO, since the unwritten node/leaf it represents can't be pointed to by any other node in a tree. V3: Correct V2, missed unstaged changes. V4: Use root's key to figure out which counter to update. V5: Decrement the error counters too when an eb is made dirty again (the next write attempt might succeed). V6: Moved counters from transaction struct to fs_info struct, because there's a (short) time window where fs_info-running_transaction is NULL. There's now 2 counters for log extent buffers too, each one representing a different log transaction. V7: Track the eb's log index in the eb itself, otherwise it wasn't possible to find it when writeback triggered from a transaction commit. fs/btrfs/ctree.h | 3 ++ fs/btrfs/disk-io.c | 7 +++- fs/btrfs/extent-tree.c | 4 +- fs/btrfs/extent_io.c | 101 + fs/btrfs/extent_io.h | 6 ++- fs/btrfs/transaction.c | 18 + 6 files changed, 128 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f20b60d..0e5ca39 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1510,6 +1510,9 @@ struct btrfs_fs_info { atomic_t async_delalloc_pages; atomic_t open_ioctl_trans; + atomic_t eb_write_errors; + atomic_t log_eb_write_errors[2]; + /* * this is used to protect the following list -- ordered_roots. 
*/ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 23393ec..8f1deca 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -610,7 +610,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, goto err; eb-read_mirror = mirror; - if (test_bit(EXTENT_BUFFER_IOERR, eb-bflags)) { + if (test_bit(EXTENT_BUFFER_READ_ERR, eb-bflags)) { ret = -EIO; goto err; } @@ -683,7 +683,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) struct btrfs_root *root = BTRFS_I(page-mapping-host)-root; eb = (struct extent_buffer *)page-private; - set_bit(EXTENT_BUFFER_IOERR, eb-bflags); + set_bit(EXTENT_BUFFER_READ_ERR, eb-bflags); eb-read_mirror = failed_mirror; atomic_dec(eb-io_pages
[PATCH 2/2 v8] Btrfs: be aware of btree inode write errors to avoid fs corruption
While we have a transaction ongoing, the VM might decide at any time to call btree_inode->i_mapping->a_ops->writepages(), which will start writeback of dirty pages belonging to btree nodes/leafs. This call might return an error or the writeback might finish with an error before we attempt to commit the running transaction. If this happens, we might have no way of knowing that such an error happened when we are committing the transaction - because the pages might no longer be marked dirty nor tagged for writeback (if a subsequent modification to the extent buffer didn't happen before the transaction commit) which makes filemap_fdata[write|wait]_range unable to find such pages (even if they're marked with SetPageError). So if this happens we must abort the transaction, otherwise we commit a super block with btree roots that point to btree nodes/leafs whose content on disk is invalid - either garbage or the content of some node/leaf from a past generation that got cowed or deleted and is no longer valid (for this latter case we end up getting error messages like "parent transid verify failed on 10826481664 wanted 25748 found 29562" when reading btree nodes/leafs from disk). Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's i_mapping would not be enough because we need to distinguish between log tree extents (not fatal) vs non-log tree extents (fatal) and because the next call to filemap_fdatawait_range() will catch and clear such errors in the mapping - and that call might be from a log sync and not from a transaction commit, which means we would not know about the error at transaction commit time. Also, checking for the eb flag EXTENT_BUFFER_IOERR at transaction commit time isn't done and would not be completely reliable, as the eb might be removed from memory and read back when trying to get it, which clears that flag right before reading the eb's pages from disk, making us not know about the previous write error. 
Using the new counters eb_write_errors and log_eb_write_errors in the transaction also makes us achieve the goal of AS_EIO/AS_ENOSPC when writepages() returns success, started writeback for all dirty pages and before filemap_fdatawait_range() is called, the writeback for all dirty pages had already finished with errors - because we were not using AS_EIO/AS_ENOSPC, filemap_fdatawait_range() would return success, as it could not know that writeback errors happened (the pages were no longer tagged for writeback). Signed-off-by: Filipe Manana fdman...@suse.com --- V2: If an extent buffer's write failed but it's also deleted from the tree before the transaction commits, don't abort the transaction with -EIO, since the unwritten node/leaf it represents can't be pointed to by any other node in a tree. V3: Correct V2, missed unstaged changes. V4: Use root's key to figure out which counter to update. V5: Decrement the error counters too when an eb is made dirty again (the next write attempt might succeed). V6: Moved counters from transaction struct to fs_info struct, because there's a (short) time window where fs_info-running_transaction is NULL. There's now 2 counters for log extent buffers too, each one representing a different log transaction. V7: Track the eb's log index in the eb itself, otherwise it wasn't possible to find it when writeback triggered from a transaction commit. V8: Track the log eb write errors per root instead, and reset them on a transaction commit. 
fs/btrfs/ctree.h | 2 + fs/btrfs/disk-io.c | 7 +++- fs/btrfs/extent-tree.c | 4 +- fs/btrfs/extent_io.c | 100 + fs/btrfs/extent_io.h | 8 +++- fs/btrfs/transaction.c | 18 + fs/btrfs/tree-log.c| 2 + 7 files changed, 129 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f20b60d..96f5186 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1509,6 +1509,7 @@ struct btrfs_fs_info { atomic_t nr_async_bios; atomic_t async_delalloc_pages; atomic_t open_ioctl_trans; + atomic_t eb_write_errors; /* * this is used to protect the following list -- ordered_roots. @@ -1790,6 +1791,7 @@ struct btrfs_root { atomic_t log_writers; atomic_t log_commit[2]; atomic_t log_batch; + atomic_t log_eb_write_errors[2]; int log_transid; /* No matter the commit succeeds or not*/ int log_transid_committed; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 23393ec..e792ee3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -610,7 +610,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, goto err; eb-read_mirror = mirror; - if (test_bit(EXTENT_BUFFER_IOERR, eb-bflags)) { + if (test_bit(EXTENT_BUFFER_READ_ERR, eb-bflags)) { ret = -EIO; goto err; } @@ -683,7 +683,7
[PATCH v3] xfstests: generic: add dir fsync test, motivated by a btrfs bug
This test is motivated by a bug found in btrfs when replaying a directory from the fsync log. The issue was that if a directory entry is both found in the persisted metadata and in the fsync log, at log replay time the directory got set with a wrong i_size. This had the consequence of not being able to rmdir empty directories (failed with errno ENOTEMPTY). This was fixed in btrfs with the following linux kernel patch: Btrfs: fix directory recovery from fsync log Signed-off-by: Filipe Manana fdman...@suse.com --- V2: Replaced direct use of xfs_io with $XFS_IO_PROG. V3: Made the explanations more clear and mention what's the consequence of getting a directory's inode with an incorrect i_size on btrfs. Changed the test to not call the filesystem check (since it's already done by the test framework) and verify that an rmdir against an empty directory succeeds. tests/generic/326 | 105 ++ tests/generic/326.out | 2 + tests/generic/group | 1 + 3 files changed, 108 insertions(+) create mode 100755 tests/generic/326 create mode 100644 tests/generic/326.out diff --git a/tests/generic/326 b/tests/generic/326 new file mode 100755 index 000..e59c560 --- /dev/null +++ b/tests/generic/326 @@ -0,0 +1,105 @@ +#! /bin/bash +# FS QA Test No. 326 +# +# This test is motivated by a bug found in btrfs when replaying a directory +# from the fsync log. The issue was that if a directory entry is both found +# in the persisted metadata and in the fsync log, at log replay time the +# directory got set with a wrong i_size. This had the consequence of not being +# able to rmdir empty directories (failed with errno ENOTEMPTY). +# This was fixed in btrfs with the following linux kernel patch: +# +# Btrfs: fix directory recovery from fsync log +# +#--- +# Copyright (C) 2014 SUSE Linux Products GmbH. All Rights Reserved. 
+# Author: Filipe Manana fdman...@suse.com +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq + +here=`pwd` +status=1 # failure is the default! + +_cleanup() +{ + _cleanup_flakey +} +trap _cleanup; exit \$status 0 1 2 3 15 + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter +. ./common/dmflakey + +# real QA test starts here +_supported_fs generic +_supported_os Linux +_need_to_be_root +_require_scratch +_require_dm_flakey + +rm -f $seqres.full + +_scratch_mkfs $seqres.full 21 + +_init_flakey +_mount_flakey + +mkdir $SCRATCH_MNT/test_dir +touch $SCRATCH_MNT/test_dir/foo + +# Invoke sync here because it's necessary to trigger the original bug in btrfs. +# The intention is that at log recovery time we have a dir entry for 'foo' both +# in the fs/subvol tree and in the log tree - this is necessary to trigger the +# bug on btrfs. 
+sync + +touch $SCRATCH_MNT/test_dir/bar +$XFS_IO_PROG -c fsync $SCRATCH_MNT/test_dir +$XFS_IO_PROG -c fsync $SCRATCH_MNT/test_dir/bar + +_load_flakey_table $FLAKEY_DROP_WRITES +_unmount_flakey + +_load_flakey_table $FLAKEY_ALLOW_WRITES +# In the original btrfs bug, log replay would update the directory's inode +# i_size incorrectly - it would sum again the size of dentry 'foo' (3) to +# the inode's i_size, which is incorrect because the dentry was already +# persisted before (in the fs/subvol tree). +_mount_flakey + +[ -f $SCRATCH_MNT/test_dir/foo ] || echo file foo is missing +[ -f $SCRATCH_MNT/test_dir/bar ] || echo file bar is missing + +rm -f $SCRATCH_MNT/test_dir/foo +rm -f $SCRATCH_MNT/test_dir/bar + +# In btrfs removing all entries from a directory should set the directory's +# inode i_size to 0, but with this bug that didn't happen and this made +# an rmdir fail with errno ENOTEMPTY (even though the directory had no more +# entries in it). +rmdir $SCRATCH_MNT/test_dir +[ -d $SCRATCH_MNT/test_dir ] echo rmdir didn't succeed + +_unmount_flakey + +echo Silence is golden + +status=0 +exit diff --git a/tests/generic/326.out b/tests/generic/326.out new file mode 100644 index 000..4ac0db5 --- /dev/null +++ b/tests/generic/326.out @@ -0,0 +1,2 @@ +QA output created by 326 +Silence is golden diff --git a/tests/generic/group b/tests/generic/group index bdcfd9d
[PATCH 2/2 v9] Btrfs: be aware of btree inode write errors to avoid fs corruption
While we have a transaction ongoing, the VM might decide at any time to call btree_inode->i_mapping->a_ops->writepages(), which will start writeback of dirty pages belonging to btree nodes/leafs. This call might return an error or the writeback might finish with an error before we attempt to commit the running transaction. If this happens, we might have no way of knowing that such an error happened when we are committing the transaction - because the pages might no longer be marked dirty nor tagged for writeback (if a subsequent modification to the extent buffer didn't happen before the transaction commit) which makes filemap_fdata[write|wait]_range unable to find such pages (even if they're marked with SetPageError). So if this happens we must abort the transaction, otherwise we commit a super block with btree roots that point to btree nodes/leafs whose content on disk is invalid - either garbage or the content of some node/leaf from a past generation that got cowed or deleted and is no longer valid (for this latter case we end up getting error messages like "parent transid verify failed on 10826481664 wanted 25748 found 29562" when reading btree nodes/leafs from disk). Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's i_mapping would not be enough because we need to distinguish between log tree extents (not fatal) vs non-log tree extents (fatal) and because the next call to filemap_fdatawait_range() will catch and clear such errors in the mapping - and that call might be from a log sync and not from a transaction commit, which means we would not know about the error at transaction commit time. Also, checking for the eb flag EXTENT_BUFFER_IOERR at transaction commit time isn't done and would not be completely reliable, as the eb might be removed from memory and read back when trying to get it, which clears that flag right before reading the eb's pages from disk, making us not know about the previous write error. 
Using the new counters eb_write_errors and log_eb_write_errors in the transaction also makes us achieve the goal of AS_EIO/AS_ENOSPC when writepages() returns success, started writeback for all dirty pages and before filemap_fdatawait_range() is called, the writeback for all dirty pages had already finished with errors - because we were not using AS_EIO/AS_ENOSPC, filemap_fdatawait_range() would return success, as it could not know that writeback errors happened (the pages were no longer tagged for writeback). Signed-off-by: Filipe Manana fdman...@suse.com --- V2: If an extent buffer's write failed but it's also deleted from the tree before the transaction commits, don't abort the transaction with -EIO, since the unwritten node/leaf it represents can't be pointed to by any other node in a tree. V3: Correct V2, missed unstaged changes. V4: Use root's key to figure out which counter to update. V5: Decrement the error counters too when an eb is made dirty again (the next write attempt might succeed). V6: Moved counters from transaction struct to fs_info struct, because there's a (short) time window where fs_info-running_transaction is NULL. There's now 2 counters for log extent buffers too, each one representing a different log transaction. V7: Track the eb's log index in the eb itself, otherwise it wasn't possible to find it when writeback triggered from a transaction commit. V8: Track the log eb write errors per root instead, and reset them on a transaction commit. V9: Don't decrement the error counters if the eb is deleted or re-written. It is not safe because there's a time window when committing a transaction, between setting fs_info-current_transaction to NULL and checking the error counters in btrfs_write_and_wait_transaction(), where a new transaction can start and delete or re-write an eb that has the write error flag set. If this happens it means the previous transaction can write a superblock that refers to trees that point to unwritten nodes. 
Replaced the counters with simple flags in the btree inode's runtime flags - essentially back to V1 but accounting for the 2 different log sub-transactions. Removed access to an eb's parent root through BTRFS_I(eb->pages[0]->mapping->host)->root since it was not correct, as this always gives us the btree inode's root (objectid 1ULL). Instead use the field eb->log_index to know whether it's a log btree eb (and which sub-transaction) or a non-log btree eb. fs/btrfs/btrfs_inode.h | 11 fs/btrfs/disk-io.c | 4 +-- fs/btrfs/extent-tree.c | 4 ++- fs/btrfs/extent_io.c | 76 +- fs/btrfs/extent_io.h | 8 -- fs/btrfs/transaction.c | 22 +++ fs/btrfs/tree-log.c| 6 7 files changed, 119 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 3511031..aee4050 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs
[PATCH 2/2 v10] Btrfs: be aware of btree inode write errors to avoid fs corruption
While we have a transaction ongoing, the VM might decide at any time to call btree_inode->i_mapping->a_ops->writepages(), which will start writeback of dirty pages belonging to btree nodes/leafs. This call might return an error or the writeback might finish with an error before we attempt to commit the running transaction. If this happens, we might have no way of knowing that such an error happened when we are committing the transaction - because the pages might no longer be marked dirty nor tagged for writeback (if a subsequent modification to the extent buffer didn't happen before the transaction commit) which makes filemap_fdata[write|wait]_range unable to find such pages (even if they're marked with SetPageError). So if this happens we must abort the transaction, otherwise we commit a super block with btree roots that point to btree nodes/leafs whose content on disk is invalid - either garbage or the content of some node/leaf from a past generation that got cowed or deleted and is no longer valid (for this latter case we end up getting error messages like "parent transid verify failed on 10826481664 wanted 25748 found 29562" when reading btree nodes/leafs from disk). Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's i_mapping would not be enough because we need to distinguish between log tree extents (not fatal) vs non-log tree extents (fatal) and because the next call to filemap_fdatawait_range() will catch and clear such errors in the mapping - and that call might be from a log sync and not from a transaction commit, which means we would not know about the error at transaction commit time. Also, checking for the eb flag EXTENT_BUFFER_IOERR at transaction commit time isn't done and would not be completely reliable, as the eb might be removed from memory and read back when trying to get it, which clears that flag right before reading the eb's pages from disk, making us not know about the previous write error. 
Using the new counters eb_write_errors and log_eb_write_errors in the transaction also makes us achieve the goal of AS_EIO/AS_ENOSPC when writepages() returns success, started writeback for all dirty pages and before filemap_fdatawait_range() is called, the writeback for all dirty pages had already finished with errors - because we were not using AS_EIO/AS_ENOSPC, filemap_fdatawait_range() would return success, as it could not know that writeback errors happened (the pages were no longer tagged for writeback). Signed-off-by: Filipe Manana fdman...@suse.com --- V2: If an extent buffer's write failed but it's also deleted from the tree before the transaction commits, don't abort the transaction with -EIO, since the unwritten node/leaf it represents can't be pointed to by any other node in a tree. V3: Correct V2, missed unstaged changes. V4: Use root's key to figure out which counter to update. V5: Decrement the error counters too when an eb is made dirty again (the next write attempt might succeed). V6: Moved counters from transaction struct to fs_info struct, because there's a (short) time window where fs_info-running_transaction is NULL. There's now 2 counters for log extent buffers too, each one representing a different log transaction. V7: Track the eb's log index in the eb itself, otherwise it wasn't possible to find it when writeback triggered from a transaction commit. V8: Track the log eb write errors per root instead, and reset them on a transaction commit. V9: Don't decrement the error counters if the eb is deleted or re-written. It is not safe because there's a time window when committing a transaction, between setting fs_info-current_transaction to NULL and checking the error counters in btrfs_write_and_wait_transaction(), where a new transaction can start and delete or re-write an eb that has the write error flag set. If this happens it means the previous transaction can write a superblock that refers to trees that point to unwritten nodes. 
Replaced the counters with simple flags in the btree inode's runtime flags - essentially back to V1 but accounting for the 2 different log sub-transactions. Removed access to an eb's parent root through BTRFS_I(eb->pages[0]->mapping->host)->root since it was not correct, as this always gives us the btree inode's root (objectid 1ULL). Instead use the field eb->log_index to know whether it's a log btree eb (and which sub-transaction) or a non-log btree eb. V10: Clear the log eb write error flags in a more logical place (transaction commit function). fs/btrfs/btrfs_inode.h | 11 fs/btrfs/disk-io.c | 4 +-- fs/btrfs/extent-tree.c | 4 ++- fs/btrfs/extent_io.c | 76 +- fs/btrfs/extent_io.h | 8 -- fs/btrfs/transaction.c | 26 + 6 files changed, 117 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index
[PATCH 2/2 v11] Btrfs: be aware of btree inode write errors to avoid fs corruption
While we have a transaction ongoing, the VM might decide at any time to call btree_inode->i_mapping->a_ops->writepages(), which will start writeback of dirty pages belonging to btree nodes/leafs. This call might return an error or the writeback might finish with an error before we attempt to commit the running transaction. If this happens, we might have no way of knowing that such an error happened when we are committing the transaction - because the pages might no longer be marked dirty nor tagged for writeback (if a subsequent modification to the extent buffer didn't happen before the transaction commit) which makes filemap_fdata[write|wait]_range unable to find such pages (even if they're marked with SetPageError). So if this happens we must abort the transaction, otherwise we commit a super block with btree roots that point to btree nodes/leafs whose content on disk is invalid - either garbage or the content of some node/leaf from a past generation that got cowed or deleted and is no longer valid (for this latter case we end up getting error messages like "parent transid verify failed on 10826481664 wanted 25748 found 29562" when reading btree nodes/leafs from disk). Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's i_mapping would not be enough because we need to distinguish between log tree extents (not fatal) vs non-log tree extents (fatal) and because the next call to filemap_fdatawait_range() will catch and clear such errors in the mapping - and that call might be from a log sync and not from a transaction commit, which means we would not know about the error at transaction commit time. Also, checking for the eb flag EXTENT_BUFFER_IOERR at transaction commit time isn't done and would not be completely reliable, as the eb might be removed from memory and read back when trying to get it, which clears that flag right before reading the eb's pages from disk, making us not know about the previous write error. 
Using the new 3 flags for the btree inode also makes us achieve the goal of AS_EIO/AS_ENOSPC when writepages() returns success, started writeback for all dirty pages and before filemap_fdatawait_range() is called, the writeback for all dirty pages had already finished with errors - because we were not using AS_EIO/AS_ENOSPC, filemap_fdatawait_range() would return success, as it could not know that writeback errors happened (the pages were no longer tagged for writeback). Signed-off-by: Filipe Manana fdman...@suse.com --- V2: If an extent buffer's write failed but it's also deleted from the tree before the transaction commits, don't abort the transaction with -EIO, since the unwritten node/leaf it represents can't be pointed to by any other node in a tree. V3: Correct V2, missed unstaged changes. V4: Use root's key to figure out which counter to update. V5: Decrement the error counters too when an eb is made dirty again (the next write attempt might succeed). V6: Moved counters from transaction struct to fs_info struct, because there's a (short) time window where fs_info->running_transaction is NULL. There are now 2 counters for log extent buffers too, each one representing a different log transaction. V7: Track the eb's log index in the eb itself, otherwise it wasn't possible to find it when writeback triggered from a transaction commit. V8: Track the log eb write errors per root instead, and reset them on a transaction commit. V9: Don't decrement the error counters if the eb is deleted or re-written. It is not safe because there's a time window when committing a transaction, between setting fs_info->current_transaction to NULL and checking the error counters in btrfs_write_and_wait_transaction(), where a new transaction can start and delete or re-write an eb that has the write error flag set. If this happens it means the previous transaction can write a superblock that refers to trees that point to unwritten nodes. 
Replaced the counters with simple flags in the btree inode's runtime flags - essentially back to V1 but accounting for the 2 different log sub-transactions. Removed access to an eb's parent root through BTRFS_I(eb->pages[0]->mapping->host)->root since it was not correct, as this always gives us the btree inode's root (objectid 1ULL). Instead use the field eb->log_index to know whether it's a log btree eb (and which sub-transaction) or a non-log btree eb. V10: Clear the log eb write error flags in a more logical place (transaction commit function). V11: Updated commit message and a comment, replaced an ASSERT() with a BUG() and changed eb->lock_nested to a short to keep the structure size. fs/btrfs/btrfs_inode.h | 11 fs/btrfs/disk-io.c | 4 +-- fs/btrfs/extent-tree.c | 4 ++- fs/btrfs/extent_io.c | 76 +- fs/btrfs/extent_io.h | 8 -- fs/btrfs/transaction.c | 26 + 6 files changed
[PATCH 2/2 v12] Btrfs: be aware of btree inode write errors to avoid fs corruption
While we have a transaction ongoing, the VM might decide at any time to call btree_inode->i_mapping->a_ops->writepages(), which will start writeback of dirty pages belonging to btree nodes/leafs. This call might return an error or the writeback might finish with an error before we attempt to commit the running transaction. If this happens, we might have no way of knowing that such an error happened when we are committing the transaction - because the pages might no longer be marked dirty nor tagged for writeback (if a subsequent modification to the extent buffer didn't happen before the transaction commit) which makes filemap_fdata[write|wait]_range unable to find such pages (even if they're marked with SetPageError). So if this happens we must abort the transaction, otherwise we commit a super block with btree roots that point to btree nodes/leafs whose content on disk is invalid - either garbage or the content of some node/leaf from a past generation that got cowed or deleted and is no longer valid (for this latter case we end up getting error messages like "parent transid verify failed on 10826481664 wanted 25748 found 29562" when reading btree nodes/leafs from disk). Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's i_mapping would not be enough because we need to distinguish between log tree extents (not fatal) vs non-log tree extents (fatal) and because the next call to filemap_fdatawait_range() will catch and clear such errors in the mapping - and that call might be from a log sync and not from a transaction commit, which means we would not know about the error at transaction commit time. Also, checking for the eb flag EXTENT_BUFFER_IOERR at transaction commit time isn't done and would not be completely reliable, as the eb might be removed from memory and read back when trying to get it, which clears that flag right before reading the eb's pages from disk, making us not know about the previous write error. 
Using the new 3 flags for the btree inode also makes us achieve the goal of AS_EIO/AS_ENOSPC when writepages() returns success, started writeback for all dirty pages and before filemap_fdatawait_range() is called, the writeback for all dirty pages had already finished with errors - because we were not using AS_EIO/AS_ENOSPC, filemap_fdatawait_range() would return success, as it could not know that writeback errors happened (the pages were no longer tagged for writeback). Signed-off-by: Filipe Manana fdman...@suse.com --- V2: If an extent buffer's write failed but it's also deleted from the tree before the transaction commits, don't abort the transaction with -EIO, since the unwritten node/leaf it represents can't be pointed to by any other node in a tree. V3: Correct V2, missed unstaged changes. V4: Use root's key to figure out which counter to update. V5: Decrement the error counters too when an eb is made dirty again (the next write attempt might succeed). V6: Moved counters from transaction struct to fs_info struct, because there's a (short) time window where fs_info->running_transaction is NULL. There are now 2 counters for log extent buffers too, each one representing a different log transaction. V7: Track the eb's log index in the eb itself, otherwise it wasn't possible to find it when writeback triggered from a transaction commit. V8: Track the log eb write errors per root instead, and reset them on a transaction commit. V9: Don't decrement the error counters if the eb is deleted or re-written. It is not safe because there's a time window when committing a transaction, between setting fs_info->current_transaction to NULL and checking the error counters in btrfs_write_and_wait_transaction(), where a new transaction can start and delete or re-write an eb that has the write error flag set. If this happens it means the previous transaction can write a superblock that refers to trees that point to unwritten nodes. 
Replaced the counters with simple flags in the btree inode's runtime flags - essentially back to V1 but accounting for the 2 different log sub-transactions. Removed access to an eb's parent root through BTRFS_I(eb->pages[0]->mapping->host)->root since it was not correct, as this always gives us the btree inode's root (objectid 1ULL). Instead use the field eb->log_index to know whether it's a log btree eb (and which sub-transaction) or a non-log btree eb. V10: Clear the log eb write error flags in a more logical place (transaction commit function). V11: Updated commit message and a comment, replaced an ASSERT() with a BUG() and changed eb->lock_nested to a short to keep the structure size. V12: Removed leftovers from previous versions (no longer necessary #include and prototype in extent_io.h of no longer existing function) and updated parts from a comment that apply only to some past versions. Rebased against latest integration branch (didn't apply
[PATCH] Btrfs: send, don't delay dir move if there's a new parent inode
If between two snapshots we rename an existing directory named X to Y and make it a child (direct or not) of a new inode named X, we were delaying the move/rename of the former directory unnecessarily, which would result in attempting to rename the new directory from its orphan name to name X prematurely. Minimal reproducer: $ mkfs.btrfs -f /dev/vdd $ mount /dev/vdd /mnt $ mkdir -p /mnt/merlin/RC/OSD/Source $ btrfs subvolume snapshot -r /mnt /mnt/mysnap1 $ mkdir /mnt/OSD $ mv /mnt/merlin/RC/OSD /mnt/OSD/OSD-Plane_788 $ mv /mnt/OSD /mnt/merlin/RC $ btrfs subvolume snapshot -r /mnt /mnt/mysnap2 $ btrfs send /mnt/mysnap1 -f /tmp/1.snap $ btrfs send -p /mnt/mysnap1 /mnt/mysnap2 -f /tmp/2.snap $ mkfs.btrfs -f /dev/vdc $ mount /dev/vdc /mnt2 $ btrfs receive /mnt2 -f /tmp/1.snap $ btrfs receive /mnt2 -f /tmp/2.snap The second receive (from an incremental send) failed with the following error message: rename o261-7-0 - merlin/RC/OSD failed. This is a regression introduced in the 3.16 kernel. A test case for xfstests follows. Reported-by: Marc Merlin m...@merlins.org Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 7edfc7c..b9c27aa 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3327,7 +3327,7 @@ static int wait_for_parent_move(struct send_ctx *sctx, if (ret 0 ret != -ENOENT) { goto out; } else if (ret == -ENOENT) { - ret = 1; + ret = 0; break; } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] fstests: regression test for btrfs incremental send
This is a regression test for a btrfs incremental send issue. If between two snapshots we rename an existing directory named X to Y and make it a child (direct or not) of a new inode named X, we were delaying the move/rename of the former directory unnecessarily, which would result in attempting to rename the new directory from its orphan name to name X prematurely. This made btrfs receive fail with an error message like the following: rename o261-7-0 - merlin/RC/OSD failed This issue was a regression in the 3.16 kernel and got fixed by the following linux kernel btrfs patch: Btrfs: send, don't delay dir move if there's a new parent inode Signed-off-by: Filipe Manana fdman...@suse.com --- tests/btrfs/060 | 148 tests/btrfs/060.out | 2 + tests/btrfs/group | 1 + 3 files changed, 151 insertions(+) create mode 100755 tests/btrfs/060 create mode 100644 tests/btrfs/060.out diff --git a/tests/btrfs/060 b/tests/btrfs/060 new file mode 100755 index 000..20dc0ad --- /dev/null +++ b/tests/btrfs/060 @@ -0,0 +1,148 @@ +#! /bin/bash +# FS QA Test No. btrfs/060 +# +# Regression test for a btrfs incremental send issue. +# If between two snapshots we rename an existing directory named X to Y and +# make it a child (direct or not) of a new inode named X, we were delaying +# the move/rename of the former directory unnecessarily, which would result +# in attempting to rename the new directory from its orphan name to name X +# prematurely. This made btrfs receive fail with an error message like the +# following: +# +# rename o261-7-0 - merlin/RC/OSD failed +# +# This issue was a regression in the 3.16 kernel and got fixed by the following +# linux kernel btrfs patch: +# +# Btrfs: send, don't delay dir move if there's a new parent inode +# +#--- +# Copyright (C) 2014 SUSE Linux Products GmbH. All Rights Reserved. 
+# Author: Filipe Manana fdman...@suse.com +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq + +tmp=`mktemp -d` +status=1 # failure is the default! +trap _cleanup; exit \$status 0 1 2 3 15 + +_cleanup() +{ +rm -fr $send_files_dir +rm -fr $tmp +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter + +# real QA test starts here +_supported_fs btrfs +_supported_os Linux +_require_scratch +_require_fssum +_need_to_be_root + +send_files_dir=$TEST_DIR/btrfs-test-$seq + +rm -f $seqres.full +rm -fr $send_files_dir +mkdir $send_files_dir + +_scratch_mkfs $seqres.full 21 +_scratch_mount + +mkdir -p $SCRATCH_MNT/merlin/RC/OSD/Source +mkdir -p $SCRATCH_MNT/fdm/RCz/OSDz/Sourcez +mkdir -p $SCRATCH_MNT/Z/Z2 + +# Filesystem looks like: +# +# . 
(ino 256) +# | merlin/ (ino 257) +# || RC/(ino 258) +# | |- OSD/(ino 259) +# | | Source/ (ino 260) +# | +# | fdm/(ino 261) +# | | RCz/(ino 262) +# | |- OSDz/(ino 263) +# | | Sourcez/ (ino 264) +# | +# | Z/ (ino 265) +# | Z2/ (ino 266) +# +_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT $SCRATCH_MNT/mysnap1 + +mkdir $SCRATCH_MNT/OSD +mv $SCRATCH_MNT/merlin/RC/OSD $SCRATCH_MNT/OSD/OSD-Plane_788 +mv $SCRATCH_MNT/OSD $SCRATCH_MNT/merlin/RC + +mkdir $SCRATCH_MNT/OSDz +mv $SCRATCH_MNT/Z/Z2 $SCRATCH_MNT/OSDz/xz2 +mv $SCRATCH_MNT/Z $SCRATCH_MNT/OSDz/xz2/xz +mv $SCRATCH_MNT/fdm/RCz/OSDz $SCRATCH_MNT/OSDz/xz2/xz/OSD-Plane_788z +mv $SCRATCH_MNT/OSDz $SCRATCH_MNT/fdm/RCz + +# Filesystem now looks like: +# +# +# . (ino 256) +# | merlin/ (ino 257) +# | | RC/ (ino 258) +# | |- OSD/ (ino 267
Re: FIBMAP unsupported
On 10/02/2014 11:11 PM, Marc Dietrich wrote: Am Donnerstag 02 Oktober 2014, 21:55:55 schrieb Marc Dietrich: Will try to restore the file using btrfs restore ok, restore worked. I did some more tests. This is unrelated to CoW. It seems that the fallocate -n in combination with dd conv=notrunc using large files (10G) triggers it. Maybe this rings some bells. Just tried it and I confirm filefrag's call to ioctl FS_IOC_FIEMAP fails with -EEXIST. It's actually a known issue affecting any of the 3.17 RCs (except RC1). The extent map manipulation/merging is broken for some cases. Try with these 2 patches on top of 3.17-rcX: https://patchwork.kernel.org/patch/4929981/ https://patchwork.kernel.org/patch/4945191/ Or, alternatively, reverting this patch: https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=51f395ad4058883e4273b02fdebe98072dbdc0d2 Someone else reported on this list a write/pwrite/writev failure with errno EEXIST too (and apparently caused by the same reason). This broken extent map handling is serious IMHO, it can make fsync log bogus extent items for example, amongst other possible bad and weird things. Marc -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/5] Btrfs: process all async extents on compressed write failure
If we had an error when processing one of the async extents from our list, we were not processing the remaining async extents, meaning we would leak those async_extent structs, never release the pages with the compressed data and never unlock and clear the dirty flag from the inode's pages (those that correspond to the uncompressed content). Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/inode.c | 6 +- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a3e2330..8636499 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -839,13 +839,9 @@ retry: } alloc_hint = ins.objectid + ins.offset; kfree(async_extent); - if (ret) - goto out; cond_resched(); } - ret = 0; -out: - return ret; + return 0; out_free_reserve: btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_free: -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/5] Btrfs: set page and mapping error on compressed write failure
If we fail in submit_compressed_extents() before calling btrfs_submit_compressed_write(), we start and end the writeback for the pages (clear their dirty flag, unlock them, etc) but we don't tag the pages, nor the inode's mapping, with an error. This makes it impossible for a caller of filemap_fdatawait_range() (e.g. fsync or transaction commit) to know that there was an error. Note that the return value of submit_compressed_extents() is useless, as that function is executed by a workqueue task and not directly by the fill_delalloc callback. This means the writepage/s callbacks of the inode's address space operations don't get that return value. Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/extent_io.c | 5 + fs/btrfs/extent_io.h | 1 + fs/btrfs/inode.c | 3 ++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9cc757f..865594c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1746,6 +1746,9 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, if (page_ops == 0) return 0; + if ((page_ops PAGE_SET_ERROR) nr_pages 0) + mapping_set_error(inode-i_mapping, -EIO); + while (nr_pages 0) { ret = find_get_pages_contig(inode-i_mapping, index, min_t(unsigned long, @@ -1763,6 +1766,8 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, clear_page_dirty_for_io(pages[i]); if (page_ops PAGE_SET_WRITEBACK) set_page_writeback(pages[i]); + if (page_ops PAGE_SET_ERROR) + SetPageError(pages[i]); if (page_ops PAGE_END_WRITEBACK) end_page_writeback(pages[i]); if (page_ops PAGE_UNLOCK) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 06f030c..5654e14 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -52,6 +52,7 @@ #define PAGE_SET_WRITEBACK (1 2) #define PAGE_END_WRITEBACK (1 3) #define PAGE_SET_PRIVATE2 (1 4) +#define PAGE_SET_ERROR (1 5) /* * page-private values. 
Every page that is controlled by the extent diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 344a322..cefa618 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -832,7 +832,8 @@ out_free: NULL, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | -PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); +PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | +PAGE_SET_ERROR); kfree(async_extent); goto again; } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/5] Btrfs: fix hang on compressed write error
In inode.c:submit_compressed_extents(), before calling btrfs_submit_compressed_write() we start writeback for all pages, clear their dirty flag, unlock them, etc, but if btrfs_submit_compressed_write() fails (at the moment it can only fail with -ENOMEM), we never end the writeback on the pages, so any filemap_fdatawait_range() call will hang forever. We were also not calling the writepage end io hook, which means the corresponding ordered extent will never complete and all its waiters will block forever, such as a full fsync (via btrfs_wait_ordered_range()). Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/inode.c | 14 ++ 1 file changed, 14 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cefa618..e2c4650 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -814,6 +814,20 @@ retry: ins.objectid, ins.offset, async_extent-pages, async_extent-nr_pages); + if (ret) { + struct extent_io_tree *tree = BTRFS_I(inode)-io_tree; + struct page *p = async_extent-pages[0]; + const u64 start = async_extent-start; + const u64 end = start + async_extent-ram_size - 1; + + p-mapping = inode-i_mapping; + tree-ops-writepage_end_io_hook(p, start, end, +NULL, 0); + p-mapping = NULL; + extent_clear_unlock_delalloc(inode, start, end, NULL, 0, +PAGE_END_WRITEBACK | +PAGE_SET_ERROR); + } alloc_hint = ins.objectid + ins.offset; kfree(async_extent); if (ret) -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/5] Proper error handling for the compressed write path
This patchset fixes several issues in inode.c:submit_compressed_extents() when one of the functions it calls fails. These issues range from hangs and missing error reporting (silent failure) to memory leaks and pages not getting released. Filipe Manana (5): Btrfs: set page and mapping error on compressed write failure Btrfs: fix hang on compressed write error Btrfs: don't leak pages and memory on compressed write error Btrfs: process all async extents on compressed write failure Btrfs: make inode.c:submit_compressed_extents() return void fs/btrfs/extent_io.c | 5 + fs/btrfs/extent_io.h | 1 + fs/btrfs/inode.c | 56 ++-- 3 files changed, 43 insertions(+), 19 deletions(-) -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/5] Btrfs: don't leak pages and memory on compressed write error
In inode.c:submit_compressed_extents(), if we fail before calling btrfs_submit_compressed_write(), or when that function fails, we were freeing the async_extent structure without releasing its pages and freeing the pages array. Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/inode.c | 28 +++- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e2c4650..a3e2330 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -633,6 +633,22 @@ free_pages_out: goto out; } +static void free_async_extent_pages(struct async_extent *async_extent) +{ + int i; + + if (!async_extent-pages) + return; + + for (i = 0; i async_extent-nr_pages; i++) { + WARN_ON(async_extent-pages[i]-mapping); + page_cache_release(async_extent-pages[i]); + } + kfree(async_extent-pages); + async_extent-nr_pages = 0; + async_extent-pages = NULL; +} + /* * phase two of compressed writeback. This is the ordered portion * of the code, which only gets called in the order the work was @@ -709,15 +725,7 @@ retry: async_extent-compressed_size, 0, alloc_hint, ins, 1, 1); if (ret) { - int i; - - for (i = 0; i async_extent-nr_pages; i++) { - WARN_ON(async_extent-pages[i]-mapping); - page_cache_release(async_extent-pages[i]); - } - kfree(async_extent-pages); - async_extent-nr_pages = 0; - async_extent-pages = NULL; + free_async_extent_pages(async_extent); if (ret == -ENOSPC) { unlock_extent(io_tree, async_extent-start, @@ -827,6 +835,7 @@ retry: extent_clear_unlock_delalloc(inode, start, end, NULL, 0, PAGE_END_WRITEBACK | PAGE_SET_ERROR); + free_async_extent_pages(async_extent); } alloc_hint = ins.objectid + ins.offset; kfree(async_extent); @@ -848,6 +857,7 @@ out_free: PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | PAGE_SET_ERROR); + free_async_extent_pages(async_extent); kfree(async_extent); goto again; } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to 
majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/5] Btrfs: make inode.c:submit_compressed_extents() return void
Its return value is completely ignored by its single caller and it's useless anyway, since errors are indicated through SetPageError and the bit AS_EIO set in the flags of the inode's mapping. The caller can't do anything with the value, as it's invoked from a workqueue task and not by the task calling filemap_fdatawrite_range (which calls the writepages address space callback, which in turn calls the inode's fill_delalloc callback). Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/inode.c | 7 ++- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8636499..7635b1d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -655,7 +655,7 @@ static void free_async_extent_pages(struct async_extent *async_extent) * queued. We walk all the async extents created by compress_file_range * and send them down to the disk. */ -static noinline int submit_compressed_extents(struct inode *inode, +static noinline void submit_compressed_extents(struct inode *inode, struct async_cow *async_cow) { struct async_extent *async_extent; @@ -667,9 +667,6 @@ static noinline int submit_compressed_extents(struct inode *inode, struct extent_io_tree *io_tree; int ret = 0; - if (list_empty(async_cow-extents)) - return 0; - again: while (!list_empty(async_cow-extents)) { async_extent = list_entry(async_cow-extents.next, @@ -841,7 +838,7 @@ retry: kfree(async_extent); cond_resched(); } - return 0; + return; out_free_reserve: btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_free: -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: don't ignore compressed bio write errors
Our compressed bio write end callback was essentially ignoring the error parameter. When a write error happens, it must pass a value of 0 to the inode's writepage_end_io_hook callback, call SetPageError on the respective pages and set AS_EIO in the inode's mapping flags, so that a call to filemap_fdatawait_range() / filemap_fdatawait() can find out that errors happened (we surely don't want silent failures on fsync for example). Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/compression.c | 18 -- 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 138..9f0e882 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -224,16 +224,19 @@ out: * Clear the writeback bits on all of the file * pages for a compressed write */ -static noinline void end_compressed_writeback(struct inode *inode, u64 start, - unsigned long ram_size) +static noinline void end_compressed_writeback(struct inode *inode, + const struct compressed_bio *cb) { - unsigned long index = start PAGE_CACHE_SHIFT; - unsigned long end_index = (start + ram_size - 1) PAGE_CACHE_SHIFT; + unsigned long index = cb-start PAGE_CACHE_SHIFT; + unsigned long end_index = (cb-start + cb-len - 1) PAGE_CACHE_SHIFT; struct page *pages[16]; unsigned long nr_pages = end_index - index + 1; int i; int ret; + if (cb-errors) + mapping_set_error(inode-i_mapping, -EIO); + while (nr_pages 0) { ret = find_get_pages_contig(inode-i_mapping, index, min_t(unsigned long, @@ -244,6 +247,8 @@ static noinline void end_compressed_writeback(struct inode *inode, u64 start, continue; } for (i = 0; i ret; i++) { + if (cb-errors) + SetPageError(pages[i]); end_page_writeback(pages[i]); page_cache_release(pages[i]); } @@ -287,10 +292,11 @@ static void end_compressed_bio_write(struct bio *bio, int err) tree-ops-writepage_end_io_hook(cb-compressed_pages[0], cb-start, cb-start + cb-len - 1, -NULL, 1); +NULL, +err ? 
0 : 1); cb-compressed_pages[0]-mapping = NULL; - end_compressed_writeback(inode, cb-start, cb-len); + end_compressed_writeback(inode, cb); /* note, our inode could be gone now */ /* -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/2] Btrfs: make inode.c:compress_file_range() return void
Its return value is useless, its single caller ignores it and can't do anything with it anyway, since it's a workqueue task and not the task calling filemap_fdatawrite_range (writepages) nor filemap_fdatawait_range(). Failure is communicated to such functions via start and end of writeback with the respective pages tagged with an error and the AS_EIO flag set in the inode's i_mapping. Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/inode.c | 7 ++- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b91a171..aef0fa3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -382,7 +382,7 @@ static inline int inode_need_compress(struct inode *inode) * are written in the same order that the flusher thread sent them * down. */ -static noinline int compress_file_range(struct inode *inode, +static noinline void compress_file_range(struct inode *inode, struct page *locked_page, u64 start, u64 end, struct async_cow *async_cow, @@ -621,8 +621,7 @@ cleanup_and_bail_uncompressed: *num_added += 1; } -out: - return ret; + return; free_pages_out: for (i = 0; i nr_pages_ret; i++) { @@ -630,8 +629,6 @@ free_pages_out: page_cache_release(pages[i]); } kfree(pages); - - goto out; } static void free_async_extent_pages(struct async_extent *async_extent) -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/2] Btrfs: report error after failure inlining extent in compressed write path
If cow_file_range_inline() failed, when called from compress_file_range(), we were tagging the locked page for writeback, end its writeback and unlock it, but not marking it with an error nor setting AS_EIO in inode's mapping flags. This made it impossible for a caller of filemap_fdatawrite_range (writepages) or filemap_fdatawait_range() to know that an error happened. And the return value of compress_file_range() is useless because it's returned to a workqueue task and not to the task calling filemap_fdatawrite_range (writepages). This change applies on top of the previous patchset starting at the patch titled: [1/5] Btrfs: set page and mapping error on compressed write failure Which changed extent_clear_unlock_delalloc() to use SetPageError and mapping_set_error(). Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/inode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7635b1d..b91a171 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -538,6 +538,7 @@ cont: clear_flags, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | +PAGE_SET_ERROR | PAGE_END_WRITEBACK); goto free_pages_out; } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/2] Btrfs: correctly flush compressed data before/after direct IO
For compressed writes, after doing the first filemap_fdatawrite_range() we don't get the pages tagged for writeback immediately. Instead we create a workqueue task, which is run by other kthread, and keep the pages locked. That other kthread compresses data, creates the respective ordered extent/s, tags the pages for writeback and unlocks them. Therefore we need a second call to filemap_fdatawrite_range() if we have compressed writes, as this second call will wait for the pages to become unlocked, then see they became tagged for writeback and finally wait for the writeback to finish. Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/file.c | 12 +++- fs/btrfs/inode.c | 16 +--- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 29b147d..82c7229 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1692,8 +1692,18 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, err = written_buffered; goto out; } + /* +* Ensure all data is persisted. We want the next direct IO read to be +* able to read what was just written. 
+*/ endbyte = pos + written_buffered - 1; - err = filemap_write_and_wait_range(file-f_mapping, pos, endbyte); + err = filemap_fdatawrite_range(file-f_mapping, pos, endbyte); + if (!err test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, +BTRFS_I(file_inode(file))-runtime_flags)) + err = filemap_fdatawrite_range(file-f_mapping, pos, endbyte); + if (err) + goto out; + err = filemap_fdatawait_range(file-f_mapping, pos, endbyte); if (err) goto out; written += written_buffered; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index aef0fa3..752ff18 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7052,9 +7052,19 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, btrfs_put_ordered_extent(ordered); } else { /* Screw you mmap */ - ret = filemap_write_and_wait_range(inode-i_mapping, - lockstart, - lockend); + ret = filemap_fdatawrite_range(inode-i_mapping, + lockstart, + lockend); + if (!ret test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, +BTRFS_I(inode)-runtime_flags)) + ret = filemap_fdatawrite_range(inode-i_mapping, + lockstart, + lockend); + if (ret) + break; + ret = filemap_fdatawait_range(inode-i_mapping, + lockstart, + lockend); if (ret) break; -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/2] Btrfs: add helper btrfs_fdatawrite_range
To avoid duplicating this double filemap_fdatawrite_range() call for inodes with async extents (compressed writes) so often. Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/ctree.h| 1 + fs/btrfs/file.c | 36 fs/btrfs/inode.c| 9 + fs/btrfs/ordered-data.c | 24 ++-- 4 files changed, 32 insertions(+), 38 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 089f6da..4e0ad8c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3896,6 +3896,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached); +int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 82c7229..2df1dce 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1697,10 +1697,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, * able to read what was just written. 
*/ endbyte = pos + written_buffered - 1; - err = filemap_fdatawrite_range(file-f_mapping, pos, endbyte); - if (!err test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, -BTRFS_I(file_inode(file))-runtime_flags)) - err = filemap_fdatawrite_range(file-f_mapping, pos, endbyte); + err = btrfs_fdatawrite_range(file-f_mapping, pos, endbyte); if (err) goto out; err = filemap_fdatawait_range(file-f_mapping, pos, endbyte); @@ -1864,10 +1861,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) int ret; atomic_inc(BTRFS_I(inode)-sync_writers); - ret = filemap_fdatawrite_range(inode-i_mapping, start, end); - if (!ret test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, -BTRFS_I(inode)-runtime_flags)) - ret = filemap_fdatawrite_range(inode-i_mapping, start, end); + ret = btrfs_fdatawrite_range(inode-i_mapping, start, end); atomic_dec(BTRFS_I(inode)-sync_writers); return ret; @@ -2820,3 +2814,29 @@ int btrfs_auto_defrag_init(void) return 0; } + +int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end) +{ + int ret; + + /* +* So with compression we will find and lock a dirty page and clear the +* first one as dirty, setup an async extent, and immediately return +* with the entire range locked but with nobody actually marked with +* writeback. So we can't just filemap_write_and_wait_range() and +* expect it to work since it will just kick off a thread to do the +* actual work. So we need to call filemap_fdatawrite_range _again_ +* since it will wait on the page lock, which won't be unlocked until +* after the pages have been marked as writeback and so we're good to go +* from there. We have to do this otherwise we'll miss the ordered +* extents and that results in badness. Please Josef, do not think you +* know better and pull this out at some point in the future, it is +* right and you are wrong. 
+*/ + ret = filemap_fdatawrite_range(inode-i_mapping, start, end); + if (!ret test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, +BTRFS_I(inode)-runtime_flags)) + ret = filemap_fdatawrite_range(inode-i_mapping, start, end); + + return ret; +} diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 752ff18..be955481 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7052,14 +7052,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, btrfs_put_ordered_extent(ordered); } else { /* Screw you mmap */ - ret = filemap_fdatawrite_range(inode-i_mapping, - lockstart, - lockend); - if (!ret test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, -BTRFS_I(inode)-runtime_flags)) - ret = filemap_fdatawrite_range(inode-i_mapping, - lockstart, - lockend); + ret = btrfs_fdatawrite_range(inode, lockstart, lockend); if (ret) break; ret = filemap_fdatawait_range(inode-i_mapping, diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index ac734ec..1401b1a 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -725,30
[PATCH 2/2 v2] Btrfs: add helper btrfs_fdatawrite_range
To avoid duplicating this double filemap_fdatawrite_range() call for inodes with async extents (compressed writes) so often. Signed-off-by: Filipe Manana fdman...@suse.com --- V2: Pass right arguments to the new helper. Missed unstaged changes. fs/btrfs/ctree.h| 1 + fs/btrfs/file.c | 39 ++- fs/btrfs/inode.c| 9 + fs/btrfs/ordered-data.c | 24 ++-- 4 files changed, 34 insertions(+), 39 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 089f6da..4e0ad8c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3896,6 +3896,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached); +int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 82c7229..bbd474b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1676,6 +1676,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, loff_t pos) { struct file *file = iocb-ki_filp; + struct inode *inode = file_inode(file); ssize_t written; ssize_t written_buffered; loff_t endbyte; @@ -1697,13 +1698,10 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, * able to read what was just written. 
*/ endbyte = pos + written_buffered - 1; - err = filemap_fdatawrite_range(file-f_mapping, pos, endbyte); - if (!err test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, -BTRFS_I(file_inode(file))-runtime_flags)) - err = filemap_fdatawrite_range(file-f_mapping, pos, endbyte); + err = btrfs_fdatawrite_range(inode, pos, endbyte); if (err) goto out; - err = filemap_fdatawait_range(file-f_mapping, pos, endbyte); + err = filemap_fdatawait_range(inode-i_mapping, pos, endbyte); if (err) goto out; written += written_buffered; @@ -1864,10 +1862,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) int ret; atomic_inc(BTRFS_I(inode)-sync_writers); - ret = filemap_fdatawrite_range(inode-i_mapping, start, end); - if (!ret test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, -BTRFS_I(inode)-runtime_flags)) - ret = filemap_fdatawrite_range(inode-i_mapping, start, end); + ret = btrfs_fdatawrite_range(inode, start, end); atomic_dec(BTRFS_I(inode)-sync_writers); return ret; @@ -2820,3 +2815,29 @@ int btrfs_auto_defrag_init(void) return 0; } + +int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end) +{ + int ret; + + /* +* So with compression we will find and lock a dirty page and clear the +* first one as dirty, setup an async extent, and immediately return +* with the entire range locked but with nobody actually marked with +* writeback. So we can't just filemap_write_and_wait_range() and +* expect it to work since it will just kick off a thread to do the +* actual work. So we need to call filemap_fdatawrite_range _again_ +* since it will wait on the page lock, which won't be unlocked until +* after the pages have been marked as writeback and so we're good to go +* from there. We have to do this otherwise we'll miss the ordered +* extents and that results in badness. Please Josef, do not think you +* know better and pull this out at some point in the future, it is +* right and you are wrong. 
+*/ + ret = filemap_fdatawrite_range(inode-i_mapping, start, end); + if (!ret test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, +BTRFS_I(inode)-runtime_flags)) + ret = filemap_fdatawrite_range(inode-i_mapping, start, end); + + return ret; +} diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 752ff18..be955481 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7052,14 +7052,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, btrfs_put_ordered_extent(ordered); } else { /* Screw you mmap */ - ret = filemap_fdatawrite_range(inode-i_mapping, - lockstart, - lockend); - if (!ret test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, -BTRFS_I(inode)-runtime_flags)) - ret = filemap_fdatawrite_range(inode-i_mapping
[PATCH 1/2 v2] Btrfs: report error after failure inlining extent in compressed write path
If cow_file_range_inline() failed, when called from compress_file_range(), we were tagging the locked page for writeback, end its writeback and unlock it, but not marking it with an error nor setting AS_EIO in inode's mapping flags. This made it impossible for a caller of filemap_fdatawrite_range (writepages) or filemap_fdatawait_range() to know that an error happened. And the return value of compress_file_range() is useless because it's returned to a workqueue task and not to the task calling filemap_fdatawrite_range (writepages). This change applies on top of the previous patchset starting at the patch titled: [1/5] Btrfs: set page and mapping error on compressed write failure Which changed extent_clear_unlock_delalloc() to use SetPageError and mapping_set_error(). Signed-off-by: Filipe Manana fdman...@suse.com --- V2: Use SET_PAGE_ERROR only if ret 0, obviously. Thanks btrfs/056. fs/btrfs/inode.c | 4 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7635b1d..2b09425 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -527,7 +527,10 @@ cont: if (ret = 0) { unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DEFRAG; + unsigned long page_error_op; + clear_flags |= (ret 0) ? EXTENT_DO_ACCOUNTING : 0; + page_error_op = ret 0 ? PAGE_SET_ERROR : 0; /* * inline extent creation worked or returned error, @@ -538,6 +541,7 @@ cont: clear_flags, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | +page_error_op | PAGE_END_WRITEBACK); goto free_pages_out; } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/3] Btrfs: deal with convert_extent_bit errors to avoid fs corruption
When committing a transaction or a log, we look for btree extents that need to be durably persisted by searching for ranges in an io tree that have some bits set (EXTENT_DIRTY or EXTENT_NEW). We then attempt to clear those bits and set the EXTENT_NEED_WAIT bit, with calls to the function convert_extent_bit, and then start writeback for the extents. That function however can return an error (at the moment only -ENOMEM is possible, especially when it does GFP_ATOMIC allocation requests through alloc_extent_state_atomic) - that means the ranges didn't get the EXTENT_NEED_WAIT bit set (or at least not for the whole range), which in turn means a call to btrfs_wait_marked_extents() won't find those ranges for which we started writeback, causing a transaction commit or a log commit to persist a new superblock without waiting for the writeback of extents in that range to finish first. Therefore if a crash happens after persisting the new superblock and before writeback finishes, we have a superblock pointing to roots that weren't fully persisted or roots that point to nodes or leafs that weren't fully persisted, causing all sorts of unexpected/bad behaviour as we end up reading garbage from disk or the content of some node/leaf from a past generation that got cowed or deleted and is no longer valid (for this latter case we end up getting error messages like "parent transid verify failed on X wanted Y found Z" when reading btree nodes/leafs from disk). 
Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/transaction.c | 92 +- fs/btrfs/transaction.h | 2 -- 2 files changed, 76 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8f1a408..cb673d4 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -76,6 +76,32 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) } } +static void clear_btree_io_tree(struct extent_io_tree *tree) +{ + spin_lock(tree-lock); + while (!RB_EMPTY_ROOT(tree-state)) { + struct rb_node *node; + struct extent_state *state; + + node = rb_first(tree-state); + state = rb_entry(node, struct extent_state, rb_node); + rb_erase(state-rb_node, tree-state); + RB_CLEAR_NODE(state-rb_node); + /* +* btree io trees aren't supposed to have tasks waiting for +* changes in the flags of extent states ever. +*/ + ASSERT(!waitqueue_active(state-wq)); + free_extent_state(state); + if (need_resched()) { + spin_unlock(tree-lock); + cond_resched(); + spin_lock(tree-lock); + } + } + spin_unlock(tree-lock); +} + static noinline void switch_commit_roots(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info) { @@ -89,6 +115,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans, root-commit_root = btrfs_root_node(root); if (is_fstree(root-objectid)) btrfs_unpin_free_ino(root); + clear_btree_io_tree(root-dirty_log_pages); } up_write(fs_info-commit_root_sem); } @@ -827,17 +854,38 @@ int btrfs_write_marked_extents(struct btrfs_root *root, while (!find_first_extent_bit(dirty_pages, start, start, end, mark, cached_state)) { - convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, - mark, cached_state, GFP_NOFS); - cached_state = NULL; - err = filemap_fdatawrite_range(mapping, start, end); + bool wait_writeback = false; + + err = convert_extent_bit(dirty_pages, start, end, +EXTENT_NEED_WAIT, +mark, cached_state, GFP_NOFS); + /* +* convert_extent_bit can return -ENOMEM, which is most of the +* time a 
temporary error. So when it happens, ignore the error +* and wait for writeback of this range to finish - because we +* failed to set the bit EXTENT_NEED_WAIT for the range, a call +* to btrfs_wait_marked_extents() would not know that writeback +* for this range started and therefore wouldn't wait for it to +* finish - we don't want to commit a superblock that points to +* btree nodes/leafs for which writeback hasn't finished yet +* (and without errors). +* We cleanup any entries left in the io tree when committing +* the transaction (through clear_btree_io_tree()). +*/ + if (err == -ENOMEM
[PATCH 3/3] Btrfs: avoid returning -ENOMEM in convert_extent_bit() too early
We try to allocate an extent state before acquiring the tree's spinlock just in case we end up needing to split an existing extent state into two. If that allocation failed, we would return -ENOMEM. However, our only single caller (transaction/log commit code), passes in an extent state that was cached from a call to find_first_extent_bit() and that has a very high chance to match exactly the input range (always true for a transaction commit and very often, but not always, true for a log commit) - in this case we end up not needing at all that initial extent state used for an eventual split. Therefore just don't return -ENOMEM if we can't allocate the temporary extent state, since we might not need it at all, and if we end up needing one, we'll do it later anyway. Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/extent_io.c | 11 ++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0d931b1..654ed3d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1066,13 +1066,21 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int err = 0; u64 last_start; u64 last_end; + bool first_iteration = true; btrfs_debug_check_extent_io_range(tree, start, end); again: if (!prealloc (mask __GFP_WAIT)) { + /* +* Best effort, don't worry if extent state allocation fails +* here for the first iteration. We might have a cached state +* that matches exactly the target range, in which case no +* extent state allocations are needed. We'll only know this +* after locking the tree. 
+*/ prealloc = alloc_extent_state(mask); - if (!prealloc) + if (!prealloc !first_iteration) return -ENOMEM; } @@ -1242,6 +1250,7 @@ search_again: spin_unlock(tree-lock); if (mask __GFP_WAIT) cond_resched(); + first_iteration = false; goto again; } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/3] Btrfs: make find_first_extent_bit be able to cache any state
Right now the only caller of find_first_extent_bit() that is interested in caching extent states (transaction or log commit) never gets an extent state cached. This is because find_first_extent_bit() only caches states that have at least one of the flags EXTENT_IOBITS or EXTENT_BOUNDARY, and the transaction/log commit caller always passes a tree that never has extent states with any of those flags (they can only have one of the following flags: EXTENT_DIRTY, EXTENT_NEW or EXTENT_NEED_WAIT). This change together with the following one in the patch series (titled "Btrfs: avoid returning -ENOMEM in convert_extent_bit() too early") will help significantly reduce the chances of calls to convert_extent_bit() failing with -ENOMEM when called from the transaction/log commit code. Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/extent_io.c | 16 fs/btrfs/transaction.c | 3 +++ 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 420fe26..0d931b1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -796,17 +796,25 @@ static void set_state_bits(struct extent_io_tree *tree, state-state |= bits_to_set; } -static void cache_state(struct extent_state *state, - struct extent_state **cached_ptr) +static void cache_state_if_flags(struct extent_state *state, +struct extent_state **cached_ptr, +const u64 flags) { if (cached_ptr !(*cached_ptr)) { - if (state-state (EXTENT_IOBITS | EXTENT_BOUNDARY)) { + if (!flags || (state-state flags)) { *cached_ptr = state; atomic_inc(state-refs); } } } +static void cache_state(struct extent_state *state, + struct extent_state **cached_ptr) +{ + return cache_state_if_flags(state, cached_ptr, + EXTENT_IOBITS | EXTENT_BOUNDARY); +} + /* * set some bits on a range in the tree. This may require allocations or * sleeping, so the gfp mask is used to indicate what is allowed. 
@@ -1482,7 +1490,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, state = find_first_extent_bit_state(tree, start, bits); got_it: if (state) { - cache_state(state, cached_state); + cache_state_if_flags(state, cached_state, 0); *start_ret = state-start; *end_ret = state-end; ret = 0; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index cb673d4..396ae8b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -882,6 +882,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root, werr = err; else if (wait_writeback) werr = filemap_fdatawait_range(mapping, start, end); + free_extent_state(cached_state); cached_state = NULL; cond_resched(); start = end + 1; @@ -926,6 +927,8 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, err = filemap_fdatawait_range(mapping, start, end); if (err) werr = err; + free_extent_state(cached_state); + cached_state = NULL; cond_resched(); start = end + 1; } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Revert "Btrfs: race free update of commit root for ro snapshots"
This reverts commit 9c3b306e1c9e6be4be09e99a8fe2227d1005effc. Switching only one commit root during a transaction is wrong because it leads the fs into an inconsistent state. All commit roots should be switched at once, at transaction commit time, otherwise backref walking can often miss important references that were only accessible through the old commit root. Plus, the root item for the snapshot's root wasn't getting updated and preventing the next transaction commit to do it. This made several users get into random corruption issues after creation of readonly snapshots. A regression test for xfstests will follow soon. Cc: sta...@vger.kernel.org # 3.17 Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/inode.c | 36 fs/btrfs/ioctl.c | 33 + 2 files changed, 33 insertions(+), 36 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fc9c043..d23362f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5261,42 +5261,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) iput(inode); inode = ERR_PTR(ret); } - /* -* If orphan cleanup did remove any orphans, it means the tree -* was modified and therefore the commit root is not the same as -* the current root anymore. This is a problem, because send -* uses the commit root and therefore can see inode items that -* don't exist in the current root anymore, and for example make -* calls to btrfs_iget, which will do tree lookups based on the -* current root and not on the commit root. Those lookups will -* fail, returning a -ESTALE error, and making send fail with -* that error. So make sure a send does not see any orphans we -* have just removed, and that it will see the same inodes -* regardless of whether a transaction commit happened before -* it started (meaning that the commit root will be the same as -* the current root) or not. 
-*/ - if (sub_root-node != sub_root-commit_root) { - u64 sub_flags = btrfs_root_flags(sub_root-root_item); - - if (sub_flags BTRFS_ROOT_SUBVOL_RDONLY) { - struct extent_buffer *eb; - - /* -* Assert we can't have races between dentry -* lookup called through the snapshot creation -* ioctl and the VFS. -*/ - ASSERT(mutex_is_locked(dir-i_mutex)); - - down_write(root-fs_info-commit_root_sem); - eb = sub_root-commit_root; - sub_root-commit_root = - btrfs_root_node(sub_root); - up_write(root-fs_info-commit_root_sem); - free_extent_buffer(eb); - } - } } return inode; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e732274..33c80f5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -713,6 +713,39 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto fail; + ret = btrfs_orphan_cleanup(pending_snapshot-snap); + if (ret) + goto fail; + + /* +* If orphan cleanup did remove any orphans, it means the tree was +* modified and therefore the commit root is not the same as the +* current root anymore. This is a problem, because send uses the +* commit root and therefore can see inode items that don't exist +* in the current root anymore, and for example make calls to +* btrfs_iget, which will do tree lookups based on the current root +* and not on the commit root. Those lookups will fail, returning a +* -ESTALE error, and making send fail with that error. So make sure +* a send does not see any orphans we have just removed, and that it +* will see the same inodes regardless of whether a transaction +* commit happened before it started (meaning that the commit root +* will be the same as the current root) or not. +*/ + if (readonly pending_snapshot-snap-node != + pending_snapshot-snap-commit_root) { + trans = btrfs_join_transaction(pending_snapshot-snap); + if (IS_ERR(trans) PTR_ERR(trans) != -ENOENT) { + ret = PTR_ERR(trans); + goto fail; + } + if (!IS_ERR(trans)) { + ret
[PATCH] fstests: btrfs: regression test for ro snapshot creation
Regression test for a btrfs issue where creation of readonly snapshots caused the filesystem to get into an inconsistent state. This regression was introduced in the 3.17 kernel and fixed by reverting the following linux kernel commit: Btrfs: race free update of commit root for ro snapshots 9c3b306e1c9e6be4be09e99a8fe2227d1005effc Signed-off-by: Filipe Manana fdman...@suse.com --- tests/btrfs/078 | 85 + tests/btrfs/078.out | 2 ++ tests/btrfs/group | 1 + 3 files changed, 88 insertions(+) create mode 100755 tests/btrfs/078 create mode 100644 tests/btrfs/078.out diff --git a/tests/btrfs/078 b/tests/btrfs/078 new file mode 100755 index 000..48de357 --- /dev/null +++ b/tests/btrfs/078 @@ -0,0 +1,85 @@ +#! /bin/bash +# FSQA Test No. 078 +# +# Regression test for a btrfs issue where creation of readonly snapshots caused +# the filesystem to get into an inconsistent state. +# +# This regression was introduced in the 3.17 kernel and fixed by reverting the +# following linux kernel commit: +# +# Btrfs: race free update of commit root for ro snapshots +# 9c3b306e1c9e6be4be09e99a8fe2227d1005effc +# +#--- +# +# Copyright (C) 2014 SUSE Linux Products GmbH. All Rights Reserved. +# Author: Filipe Manana fdman...@suse.com +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq + +tmp=`mktemp -d` +status=1 # failure is the default! +trap _cleanup; exit \$status 0 1 2 3 15 + +_cleanup() +{ + rm -fr $tmp +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter + +# real QA test starts here +_need_to_be_root +_supported_fs btrfs +_supported_os Linux +_require_scratch + +rm -f $seqres.full + +workout() +{ + ops=$1 + procs=$2 + num_snapshots=$3 + + _scratch_mkfs $seqres.full 21 + _scratch_mount + + snapshot_cmd=$BTRFS_UTIL_PROG subvolume snapshot -r $SCRATCH_MNT + snapshot_cmd=$snapshot_cmd $SCRATCH_MNT/snap_\`date +'%H_%M_%S_%N'\` + run_check $FSSTRESS_PROG -p $procs \ + -x $snapshot_cmd -X $num_snapshots -d $SCRATCH_MNT -n $ops +} + +ops=8000 +procs=4 +snapshots=100 +workout $ops $procs $snapshots + +# The fstests framework runs a file system check against the scratch device +# automatically when a test case finishes (if the test calls _require_scratch). +# That filesystem check (btrfsck, btrfs.fsck) failed reporting several fs +# inconsistencies. Therefore there's no need to call _check_scratch_fs here. 
+ +echo Silence is golden +status=0 +exit diff --git a/tests/btrfs/078.out b/tests/btrfs/078.out new file mode 100644 index 000..b8acea8 --- /dev/null +++ b/tests/btrfs/078.out @@ -0,0 +1,2 @@ +QA output created by 078 +Silence is golden diff --git a/tests/btrfs/group b/tests/btrfs/group index 9adf862..40e7430 100644 --- a/tests/btrfs/group +++ b/tests/btrfs/group @@ -80,3 +80,4 @@ 075 auto quick subvol 076 auto quick 077 auto quick +078 auto -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs-progs: check, fix return value check of is_child_root()
The following commit: btrfs-progs: fsck: remove unfriendly BUG_ON() for searching tree failure f495a2ac66116f0a1b15e73380c8cbca6e0a4ca0 introduced a regression, detected through xfstests/btrfs/054, where previously a negative return value (-1) was used to mean a particular root didn't have any parent root, and now, after that change, a negative value is also used to mean that an error happened. That change also made the only caller of is_child_root() interpret any negative return value as an error and therefore incorrectly made the caller leave with an error, instead of jumping to its skip label. Since the return value that means the root with id child_root_id doesn't have any parent root isn't used by the only caller of is_child_root(), just get rid of it and make is_child_root() return 0 if parent_root_id isn't a parent of child_root_id, return 1 if it is, and a negative value on error. This affects only the 3.17 release candidates (3.16 and older releases don't have this issue). Signed-off-by: Filipe Manana fdman...@suse.com --- cmds-check.c | 5 + 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cmds-check.c b/cmds-check.c index 99d1a94..002d3e9 100644 --- a/cmds-check.c +++ b/cmds-check.c @@ -901,7 +901,6 @@ static int is_child_root(struct btrfs_root *root, u64 parent_root_id, struct btrfs_path path; struct btrfs_key key; struct extent_buffer *leaf; - int has_parent = 0; int ret; btrfs_init_path(path); @@ -939,8 +938,6 @@ static int is_child_root(struct btrfs_root *root, u64 parent_root_id, key.type != BTRFS_ROOT_BACKREF_KEY) break; - has_parent = 1; - if (key.offset == parent_root_id) { btrfs_release_path(path); return 1; @@ -952,7 +949,7 @@ out: btrfs_release_path(path); if (ret < 0) return ret; - return has_parent? 
0 : -1; + return 0; } static int process_dir_item(struct btrfs_root *root, -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2] Btrfs-progs: check, fix return value check of is_child_root()
The following commit: btrfs-progs: fsck: remove unfriendly BUG_ON() for searching tree failure f495a2ac66116f0a1b15e73380c8cbca6e0a4ca0 introduced a regression, detected through xfstests/btrfs/054, where previously a negative return value (-1) was used to mean a particular root didn't have any parent root, and now, after that change, a negative value is also used to mean that an error happened. That change also made the only caller of is_child_root() interpret any negative return value as an error and therefore incorrectly made the caller leave with an error, instead of continuing. This affects only the 3.17 release candidates (3.16 and older releases don't have this issue). Signed-off-by: Filipe Manana fdman...@suse.com --- V2: Made it return 2 (instead of -1) when the root child_root_id doesn't have any parent roots, in order to behave exactly like the code pre-commit f495a2ac66116f0a1b15e73380c8cbca6e0a4ca0. cmds-check.c | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/cmds-check.c b/cmds-check.c index 99d1a94..310eb2a 100644 --- a/cmds-check.c +++ b/cmds-check.c @@ -895,6 +895,14 @@ static int leave_shared_node(struct btrfs_root *root, return 0; } +/* + * Returns: + * < 0 - on error + * 1 - if the root with id child_root_id is a child of root parent_root_id + * 0 - if the root child_root_id isn't a child of the root parent_root_id but + * has other root(s) as parent(s) + * 2 - if the root child_root_id doesn't have any parent roots + */ static int is_child_root(struct btrfs_root *root, u64 parent_root_id, u64 child_root_id) { @@ -952,7 +960,7 @@ out: btrfs_release_path(&path); if (ret < 0) return ret; - return has_parent? 0 : -1; + return has_parent ? 0 : 2; } static int process_dir_item(struct btrfs_root *root, -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs-progs: check, ability to detect and fix outdated snapshot root items
This change adds code to detect and fix the issue introduced in the kernel release 3.17, where creation of read-only snapshots led to a corrupted filesystem if they were created at a moment when the source subvolume/snapshot had orphan items. The issue was that the on-disk root items became incorrect, referring to the pre orphan cleanup root node instead of the post orphan cleanup root node. A test filesystem can be generated with the test case recently submitted for xfstests/fstests, which is essentially the following (bash script): workout() { ops=$1 procs=$2 num_snapshots=$3 _scratch_mkfs >> $seqres.full 2>&1 _scratch_mount snapshot_cmd=$BTRFS_UTIL_PROG subvolume snapshot -r $SCRATCH_MNT snapshot_cmd=$snapshot_cmd $SCRATCH_MNT/snap_\`date +'%H_%M_%S_%N'\` run_check $FSSTRESS_PROG -p $procs \ -x $snapshot_cmd -X $num_snapshots -d $SCRATCH_MNT -n $ops } ops=1 procs=4 snapshots=500 workout $ops $procs $snapshots Example of btrfsck's (btrfs check) behaviour against such filesystem: $ btrfsck /dev/loop0 root item for root 311, current bytenr 44630016, current gen 60, current level 1, new bytenr 44957696, new gen 61, new level 1 root item for root 1480, current bytenr 1003569152, current gen 1271, current level 1, new bytenr 1004175360, new gen 1272, new level 1 root item for root 1509, current bytenr 1037434880, current gen 1300, current level 1, new bytenr 1038467072, new gen 1301, new level 1 root item for root 1562, current bytenr 33636352, current gen 1354, current level 1, new bytenr 3442, new gen 1355, new level 1 root item for root 3094, current bytenr 1011712000, current gen 2935, current level 1, new bytenr 1008484352, new gen 2936, new level 1 root item for root 3716, current bytenr 80805888, current gen 3578, current level 1, new bytenr 73515008, new gen 3579, new level 1 root item for root 4085, current bytenr 714031104, current gen 3958, current level 1, new bytenr 716816384, new gen 3959, new level 1 Found 7 roots with an outdated root item. 
Please run a filesystem check with the option --repair to fix them. $ echo $? 1 $ btrfsck --repair /dev/loop0 enabling repair mode fixing root item for root 311, current bytenr 44630016, current gen 60, current level 1, new bytenr 44957696, new gen 61, new level 1 fixing root item for root 1480, current bytenr 1003569152, current gen 1271, current level 1, new bytenr 1004175360, new gen 1272, new level 1 fixing root item for root 1509, current bytenr 1037434880, current gen 1300, current level 1, new bytenr 1038467072, new gen 1301, new level 1 fixing root item for root 1562, current bytenr 33636352, current gen 1354, current level 1, new bytenr 3442, new gen 1355, new level 1 fixing root item for root 3094, current bytenr 1011712000, current gen 2935, current level 1, new bytenr 1008484352, new gen 2936, new level 1 fixing root item for root 3716, current bytenr 80805888, current gen 3578, current level 1, new bytenr 73515008, new gen 3579, new level 1 fixing root item for root 4085, current bytenr 714031104, current gen 3958, current level 1, new bytenr 716816384, new gen 3959, new level 1 Fixed 7 roots. Checking filesystem on /dev/loop0 UUID: 2186e9b9-c977-4a35-9c7b-69c6609d4620 checking extents checking free space cache cache and super generation don't match, space cache will be invalidated checking fs roots checking csums checking root refs found 618537000 bytes used err is 0 total csum bytes: 130824 total tree bytes: 601620480 total fs tree bytes: 580288512 total extent tree bytes: 18464768 btree space waste bytes: 136939144 file data blocks allocated: 34150318080 referenced 27815415808 Btrfs v3.17-rc3-2-gbbe1dd8 $ echo $? 
0 Signed-off-by: Filipe Manana fdman...@suse.com --- cmds-check.c | 356 + disk-io.c | 2 + extent-tree.c | 22 +- tests/fsck-tests.sh| 15 +- tests/fsck-tests/006-bad_root_items_fs.tar.xz | Bin 0 - 24980 bytes .../fsck-tests/007-bad_root_items_fs_skinny.tar.xz | Bin 0 - 26520 bytes utils.c| 21 ++ utils.h| 2 + 8 files changed, 395 insertions(+), 23 deletions(-) create mode 100644 tests/fsck-tests/006-bad_root_items_fs.tar.xz create mode 100644 tests/fsck-tests/007-bad_root_items_fs_skinny.tar.xz diff --git a/cmds-check.c b/cmds-check.c index 310eb2a..2a5f823 100644 --- a/cmds-check.c +++ b/cmds-check.c @@ -7198,6 +7198,345 @@ static int fill_csum_tree(struct btrfs_trans_handle *trans, return ret; } +struct root_item_info { + /* level of the root */ + u8 level; + /* number of nodes at this level, must be 1
Re: [GIT PULL] Btrfs for stable (mostly 3.17)
On 10/20/2014 12:13 AM, Greg KH wrote: On Sun, Oct 19, 2014 at 09:55:11PM +0200, Greg KH wrote: On Sun, Oct 19, 2014 at 06:01:16AM -0400, Chris Mason wrote: Hi everyone, I've pulled out some of the btrfs commits from the merge window that we'd like to see in stable. The full list of sha's from Linus is below, you can see 4 of them are only needed on 3.17 2fad4e83e12591eb3bd213875b9edc2d18e93383 0b4699dcb65c2cff793210b07f40b98c2d423a43 # v3.17 12b894cb288d57292b01cf158177b6d5c89a6272 78a017a2c92df9b571db0a55a016280f9019c65e 4d1a40c66bed0b3fa43b9da5fbd5cbe332e4eccf e6c4efd87ab04e5ead363f24e6ac35ed3506d401 # v3.17 f6acfd50110b335c7af636cf1fc8e55319cae5fc 1d52c78afbbf80b58299e076a159617d6b42fe3c 75bfb9aff45e44625260f52a5fd581b92ace3e62 bbe9051441effce51c9a533d2c56440df64db2d7 32be3a1ac6d09576c57063c6c350ca36eaebdbd3 # v3.17 42383020beb1cfb05f5d330cc311931bc4917a97 d37973082b453ba6b89ec07eb7b84305895d35e1 # v3.17 I'm confused, the others not marked with a # v3.17 need to go on older kernels as well? I've picked up the ones that apply and build for the older stable kernels I maintain now, thanks for the list. May I suggest porting the following commit to 3.14 too? https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=766b5e5ae78dd04a93a275690a49e23d7dcb1f39 It fixes a data corruption issue for an incremental send. Particularly important, IMHO, as the corruption happens silently (no errors returned to user space nor any sort of warnings/errors in syslog, etc). It affects only 3.14, and the change applies cleanly on 3.14.22. Thanks greg k-h -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: ensure send always works on roots without orphans
Move the logic from the snapshot creation ioctl into send. This avoids doing the transaction commit if send isn't used, and ensures that if a crash/reboot happens after the transaction commit that created the snapshot and before the transaction commit that switched the commit root, send will not get a commit root that differs from the main root (that has orphan items). Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/ioctl.c | 29 - fs/btrfs/send.c | 49 + 2 files changed, 49 insertions(+), 29 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 33c80f5..994c573 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -717,35 +717,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto fail; - /* -* If orphan cleanup did remove any orphans, it means the tree was -* modified and therefore the commit root is not the same as the -* current root anymore. This is a problem, because send uses the -* commit root and therefore can see inode items that don't exist -* in the current root anymore, and for example make calls to -* btrfs_iget, which will do tree lookups based on the current root -* and not on the commit root. Those lookups will fail, returning a -* -ESTALE error, and making send fail with that error. So make sure -* a send does not see any orphans we have just removed, and that it -* will see the same inodes regardless of whether a transaction -* commit happened before it started (meaning that the commit root -* will be the same as the current root) or not. 
-*/ - if (readonly pending_snapshot-snap-node != - pending_snapshot-snap-commit_root) { - trans = btrfs_join_transaction(pending_snapshot-snap); - if (IS_ERR(trans) PTR_ERR(trans) != -ENOENT) { - ret = PTR_ERR(trans); - goto fail; - } - if (!IS_ERR(trans)) { - ret = btrfs_commit_transaction(trans, - pending_snapshot-snap); - if (ret) - goto fail; - } - } - inode = btrfs_lookup_dentry(dentry-d_parent-d_inode, dentry); if (IS_ERR(inode)) { ret = PTR_ERR(inode); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 874828d..804432d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5507,6 +5507,51 @@ out: return ret; } +/* + * If orphan cleanup did remove any orphans from a root, it means the tree + * was modified and therefore the commit root is not the same as the current + * root anymore. This is a problem, because send uses the commit root and + * therefore can see inode items that don't exist in the current root anymore, + * and for example make calls to btrfs_iget, which will do tree lookups based + * on the current root and not on the commit root. Those lookups will fail, + * returning a -ESTALE error, and making send fail with that error. So make + * sure a send does not see any orphans we have just removed, and that it will + * see the same inodes regardless of whether a transaction commit happened + * before it started (meaning that the commit root will be the same as the + * current root) or not. + */ +static int ensure_commit_roots_uptodate(struct send_ctx *sctx) +{ + int i; + struct btrfs_trans_handle *trans = NULL; + +again: + if (sctx-parent_root + sctx-parent_root-node != sctx-parent_root-commit_root) + goto commit_trans; + + for (i = 0; i sctx-clone_roots_cnt; i++) + if (sctx-clone_roots[i].root-node != + sctx-clone_roots[i].root-commit_root) + goto commit_trans; + + if (trans) + return btrfs_end_transaction(trans, sctx-send_root); + + return 0; + +commit_trans: + /* Use any root, all fs roots will get their commit roots updated. 
*/ + if (!trans) { + trans = btrfs_join_transaction(sctx-send_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + goto again; + } + + return btrfs_commit_transaction(trans, sctx-send_root); +} + static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) { spin_lock(root-root_item_lock); @@ -5728,6 +5773,10 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) NULL); sort_clone_roots = 1; + ret = ensure_commit_roots_uptodate(sctx); + if (ret) + goto out; + current-journal_info = BTRFS_SEND_TRANS_STUB; ret = send_subvol(sctx); current-journal_info = NULL; -- 1.9.1 -- To unsubscribe from this list
[PATCH] Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a file followed by a truncate, with both operations increasing the file's size, we can get a snapshot tree that reflects a state of the source subvolume's tree where the file truncation happened but the write operation didn't. This leaves a gap between 2 file extent items of the inode, which makes btrfs' fsck complain about it. For example, if we perform the following file operations: $ mkfs.btrfs -f /dev/vdd $ mount /dev/vdd /mnt $ xfs_io -f \ -c pwrite -S 0xaa -b 32K 0 32K \ -c fsync \ -c pwrite -S 0xbb -b 32770 16K 32770 \ -c truncate 90123 \ /mnt/foobar and the snapshot creation ioctl was just called before the second write, we often can get the following inode items in the snapshot's btree: item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160 inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0 item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20 inode ref index 282 namelen 10 name: foobar item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53 extent data disk byte 1104855040 nr 32768 extent data offset 0 nr 32768 ram 32768 extent compression 0 item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53 extent data disk byte 0 nr 0 extent data offset 0 nr 40960 ram 40960 extent compression 0 There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[ for which there's no file extent item covering it. This is because the file write and file truncate operations happened both right after the snapshot creation ioctl called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the ordered extent that matches the write and, in btrfs_setsize(), we were able to call btrfs_cont_expand() before being able to commit the current transaction in the snapshot creation ioctl. 
So this made it possible to insert the hole file extent item in the source subvolume (which represents the region added by the truncate) right before the transaction commit from the snapshot creation ioctl. Btrfs' fsck tool complains about such cases with a message like the following: root 331 inode 257 errors 100, file extent discount From a user perspective, the expectation when a snapshot is created while those file operations are being performed is that the snapshot will have a file that either: 1) is empty 2) only the first write was captured 3) only the 2 writes were captured 4) both writes and the truncation were captured But never capture a state where only the first write and the truncation were captured (since the second write was performed before the truncation). A test case for xfstests follows. Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/inode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0d41741..c28b78f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4622,6 +4622,9 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) } if (newsize > oldsize) { + ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); + if (ret) + return ret; truncate_pagecache(inode, newsize); ret = btrfs_cont_expand(inode, oldsize, newsize); if (ret) -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] fstests: btrfs, add test for snapshotting after file write + truncate
Regression test for a btrfs issue where if right after the snapshot creation ioctl started, a file write followed by a file truncate happened, with both operations increasing the file's size, the created snapshot would capture an inconsistent state of the file system tree. That state reflected the file truncation but it didn't reflect the write operation, and left a gap between two file extent items (and that gap corresponded to the total or a partial area of the write operation's range). This issue was fixed by the following linux kernel patch: Btrfs: fix snapshot inconsistency after a file write followed by truncate Signed-off-by: Filipe Manana fdman...@suse.com --- tests/btrfs/080 | 152 tests/btrfs/080.out | 2 + tests/btrfs/group | 1 + 3 files changed, 155 insertions(+) create mode 100755 tests/btrfs/080 create mode 100644 tests/btrfs/080.out diff --git a/tests/btrfs/080 b/tests/btrfs/080 new file mode 100755 index 000..a73e534 --- /dev/null +++ b/tests/btrfs/080 @@ -0,0 +1,152 @@ +#! /bin/bash +# FSQA Test No. 080 +# +# Regression test for a btrfs issue where if right after the snapshot creation +# ioctl started, a file write followed by a file truncate happened, with both +# operations increasing the file's size, the created snapshot would capture an +# inconsistent state of the file system tree. That state reflected the file +# truncation but it didn't reflect the write operation, and left a gap between +# two file extent items (and that gap corresponded to the total or a partial +# area of the write operation's range). +# +# This issue was fixed by the following linux kernel patch: +# +# Btrfs: fix snapshot inconsistency after a file write followed by truncate +# +#--- +# +# Copyright (C) 2014 SUSE Linux Products GmbH. All Rights Reserved. +# Author: Filipe Manana fdman...@suse.com +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. 
+# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq +tmp=/tmp/$$ +status=1 # failure is the default! +trap _cleanup; exit \$status 0 1 2 3 15 + +_cleanup() +{ + rm -f $tmp.* +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter + +# real QA test starts here +_need_to_be_root +_supported_fs btrfs +_supported_os Linux +_require_scratch_nocheck + +rm -f $seqres.full + +create_snapshots() +{ + count=$1 + ts=`date +'%H_%M_%S_%N'` + + for ((i = 1; i = $count; i++)); do + _run_btrfs_util_prog subvolume snapshot -r \ + $SCRATCH_MNT $SCRATCH_MNT/${ts}_snap_$i + done +} + +create_file() +{ + name=$1 + + run_check $XFS_IO_PROG -f \ + -c pwrite -S 0xaa -b 32K 0 32K \ + -c fsync \ + -c pwrite -S 0xbb -b 32770 16K 32770 \ + -c truncate 90123 \ + $SCRATCH_MNT/$name +} + +workout() +{ + name=$1 + snapshots=$2 + + create_file $name + fpid=$! + create_snapshots $snapshots + spid=$! + wait $fpid + create_ret=$? + wait $spid + snap_ret=$? + if [ $create_ret != 0 -o $snap_ret != 0 ]; then + _fail Failure creating file or snapshots, check $seqres.full for details + fi +} + +# If the installed btrfs mkfs supports the no-holes feature, make sure the +# created fs doesn't get that feature enabled. With it enabled, the below fsck +# call wouldn't fail. This feature hasn't been enabled by default since it was +# introduced, but be safe and explicitly disable it. +_scratch_mkfs -O list-all 21 | grep -q '\bno\-holes\b' +if [ $? 
-eq 0 ]; then + mkfs_options=-O ^no-holes +fi +_scratch_mkfs $mkfs_options $seqres.full 21 + +_scratch_mount +for ((i = 1; i = 100; i++)); do + workout foobar_$i 1 +done + +for f in $(find $SCRATCH_MNT -name 'foobar_*'); do + digest=`md5sum $f | cut -d ' ' -f 1` + case $digest in + d41d8cd98f00b204e9800998ecf8427e) + # ok, empty file + ;; + c28418534a020122aca59fd3ff9581b5) + # ok, only first write captured + ;; + cd0032da89254cdc498fda396e6a9b54) + # ok, only 2
[PATCH] fstests: btrfs, add regression test for clone ioctl
Regression test for a btrfs clone ioctl issue where races between a clone operation and concurrent target file reads would result in leaving stale data in the page cache. After the clone operation finished, reading from the clone target file would return the old and no longer valid data. This affected only buffered reads (i.e. didn't affect direct IO reads). This issue was fixed by the following linux kernel patch: Btrfs: ensure readers see new data after a clone operation (commit c125b8bff1d9f6c8c91ce4eb8bd5616058c7d510) Signed-off-by: Filipe Manana fdman...@suse.com --- tests/btrfs/081 | 131 tests/btrfs/081.out | 4 ++ tests/btrfs/group | 1 + 3 files changed, 136 insertions(+) create mode 100755 tests/btrfs/081 create mode 100644 tests/btrfs/081.out diff --git a/tests/btrfs/081 b/tests/btrfs/081 new file mode 100755 index 000..d2e3767 --- /dev/null +++ b/tests/btrfs/081 @@ -0,0 +1,131 @@ +#! /bin/bash +# FSQA Test No. 081 +# +# Regression test for a btrfs clone ioctl issue where races between +# a clone operation and concurrent target file reads would result in +# leaving stale data in the page cache. After the clone operation +# finished, reading from the clone target file would return the old +# and no longer valid data. This affected only buffered reads (i.e. +# didn't affect direct IO reads). +# +# This issue was fixed by the following linux kernel patch: +# +# Btrfs: ensure readers see new data after a clone operation +# (commit c125b8bff1d9f6c8c91ce4eb8bd5616058c7d510) +# +#--- +# +# Copyright (C) 2014 SUSE Linux Products GmbH. All Rights Reserved. +# Author: Filipe Manana fdman...@suse.com +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq +tmp=/tmp/$$ +status=1 # failure is the default! +trap _cleanup; exit \$status 0 1 2 3 15 + +_cleanup() +{ + rm -f $tmp.* +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter + +# real QA test starts here +_need_to_be_root +_supported_fs btrfs +_supported_os Linux +_require_scratch +_require_btrfs_cloner + +rm -f $seqres.full + +num_extents=100 +extent_size=8192 + +create_source_file() +{ + name=$1 + + # Create a file with $num_extents extents, each with a size of + # $extent_size bytes. + touch $SCRATCH_MNT/$name + for ((i = 0; i $num_extents; i++)); do + off=$((i * $extent_size)) + run_check $XFS_IO_PROG \ + -c pwrite -S $i -b $extent_size $off $extent_size \ + -c fsync $SCRATCH_MNT/$name + done +} + +create_target_file() +{ + name=$1 + file_size=$(($num_extents * $extent_size)) + + run_check $XFS_IO_PROG -f -c pwrite -S 0xff 0 $file_size \ + -c fsync $SCRATCH_MNT/$name +} + +reader_loop() +{ + name=$1 + + while true; do + cat $SCRATCH_MNT/$name /dev/null + done +} + +_scratch_mkfs $seqres.full 21 +_scratch_mount + +create_source_file foo +create_target_file bar + +reader_loop bar +reader_pid=$! + +$CLONER_PROG -s 0 -d 0 -l $(($num_extents * $extent_size)) \ + $SCRATCH_MNT/foo $SCRATCH_MNT/bar + +kill $reader_pid /dev/null 21 + +# Now both foo and bar should have exactly the same content. +# This didn't use to be the case before the btrfs kernel fix mentioned +# above. 
The clone ioctl was racy, as it removed bar's pages from the +# page cache and only after it would update bar's metadata to point to +# the same extents that foo's metadata points to - and this was done in +# an unprotected way, so that a file read request done right after the +# clone ioctl removed the pages from the page cache and before it updated +# bar's metadata, would result in populating the page cache with stale +# data. Therefore a file read after the clone operation finished would +# not get the cloned data but it would get instead the old and no longer +# valid data. +md5sum $SCRATCH_MNT/foo | _filter_scratch +md5sum $SCRATCH_MNT/bar | _filter_scratch + +# Validate the content of bar still matches foo's content even after +# clearing all of bar's data from the page cache
[PATCH v2] fstests: btrfs, add test for snapshotting after file write + truncate
Regression test for a btrfs issue where if right after the snapshot creation ioctl started, a file write followed by a file truncate happened, with both operations increasing the file's size, the created snapshot would capture an inconsistent state of the file system tree. That state reflected the file truncation but it didn't reflect the write operation, and left a gap between two file extent items (and that gap corresponded to the total or a partial area of the write operation's range). This issue was fixed by the following linux kernel patch: Btrfs: fix snapshot inconsistency after a file write followed by truncate Signed-off-by: Filipe Manana fdman...@suse.com --- V2: Added some background processes to cause some cpu load. This makes the test fail always on environments with a non-debug kernel and where no other significant load (other than the test itself) is running. tests/btrfs/080 | 169 tests/btrfs/080.out | 2 + tests/btrfs/group | 1 + 3 files changed, 172 insertions(+) create mode 100755 tests/btrfs/080 create mode 100644 tests/btrfs/080.out diff --git a/tests/btrfs/080 b/tests/btrfs/080 new file mode 100755 index 000..a5d3b38 --- /dev/null +++ b/tests/btrfs/080 @@ -0,0 +1,169 @@ +#! /bin/bash +# FSQA Test No. 080 +# +# Regression test for a btrfs issue where if right after the snapshot creation +# ioctl started, a file write followed by a file truncate happened, with both +# operations increasing the file's size, the created snapshot would capture an +# inconsistent state of the file system tree. That state reflected the file +# truncation but it didn't reflect the write operation, and left a gap between +# two file extent items (and that gap corresponded to the total or a partial +# area of the write operation's range). +# +# This issue was fixed by the following linux kernel patch: +# +# Btrfs: fix snapshot inconsistency after a file write followed by truncate +# +#--- +# +# Copyright (C) 2014 SUSE Linux Products GmbH. All Rights Reserved. 
+# Author: Filipe Manana fdman...@suse.com +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq +tmp=/tmp/$$ +status=1 # failure is the default! +trap _cleanup; exit \$status 0 1 2 3 15 + +_cleanup() +{ + for p in ${cpu_stress_pids[*]}; do + kill $p /dev/null + done + rm -f $tmp.* +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter + +# real QA test starts here +_need_to_be_root +_supported_fs btrfs +_supported_os Linux +_require_scratch_nocheck + +rm -f $seqres.full + +create_snapshot() +{ + local ts=`date +'%H_%M_%S_%N'` + + _run_btrfs_util_prog subvolume snapshot -r \ + $SCRATCH_MNT $SCRATCH_MNT/${ts}_snap +} + +create_file() +{ + local name=$1 + + run_check $XFS_IO_PROG -f \ + -c pwrite -S 0xaa -b 32K 0 32K \ + -c fsync \ + -c pwrite -S 0xbb -b 32770 16K 32770 \ + -c truncate 90123 \ + $SCRATCH_MNT/$name +} + +workout() +{ + local name=$1 + + create_file $name + fpid=$! + create_snapshot + spid=$! + wait $fpid + create_ret=$? + wait $spid + snap_ret=$? + if [ $create_ret != 0 -o $snap_ret != 0 ]; then + _fail Failure creating file or snapshot, check $seqres.full for details + fi +} + +# If the installed btrfs mkfs supports the no-holes feature, make sure the +# created fs doesn't get that feature enabled. With it enabled, the below fsck +# call wouldn't fail. 
This feature hasn't been enabled by default since it was +# introduced, but be safe and explicitly disable it. +_scratch_mkfs -O list-all 21 | grep -q '\bno\-holes\b' +if [ $? -eq 0 ]; then + mkfs_options=-O ^no-holes +fi +_scratch_mkfs $mkfs_options $seqres.full 21 + +_scratch_mount + +# Run some background load in order to make the issue easier to trigger. +# Specially needed when testing with non-debug kernels and there isn't +# any other significant load on the test machine other than this test. +num_cpus=`$here/src/feature -o` +num_procs
[PATCH] Btrfs: fix invalid leaf slot access in btrfs_lookup_extent()
If we couldn't find our extent item, we accessed the current slot (path->slots[0]) to check if it corresponds to an equivalent skinny metadata item. However this slot could be beyond our last item in the leaf (i.e. path->slots[0] >= btrfs_header_nritems(leaf)), in which case we shouldn't process it. Since btrfs_lookup_extent() is only used to find extent items for data extents, fix this by removing completely the logic that looks up an equivalent skinny metadata item, since it cannot exist. Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/extent-tree.c | 8 +--- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0d599ba..9141b2b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -710,7 +710,7 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) rcu_read_unlock(); } -/* simple helper to search for an existing extent at a given offset */ +/* simple helper to search for an existing data extent at a given offset */ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) { int ret; @@ -726,12 +726,6 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) key.type = BTRFS_EXTENT_ITEM_KEY; ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, 0, 0); - if (ret > 0) { - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid == start && - key.type == BTRFS_METADATA_ITEM_KEY) - ret = 0; - } btrfs_free_path(path); return ret; } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: fix race that makes btrfs_lookup_extent_info miss skinny extent items
We have a race that can lead us to miss skinny extent items in the function btrfs_lookup_extent_info() when the skinny metadata feature is enabled. So basically the sequence of steps is: 1) We search in the extent tree for the skinny extent, which returns > 0 (not found); 2) We check the previous item in the returned leaf for a non-skinny extent, and we don't find it; 3) Because we didn't find the non-skinny extent in step 2), we release our path to search the extent tree again, but this time for a non-skinny extent key; 4) Right after we released our path in step 3), a skinny extent was inserted in the extent tree (delayed refs were run) - our second extent tree search will miss it, because it's not looking for a skinny extent; 5) After the second search returned (with ret > 0), we look for any delayed ref for our extent's bytenr (and we do it while holding a read lock on the leaf), but we won't find any, as such delayed ref had just run and completed after we released our path in step 3) before doing the second search. Fix this by completely removing the path release and re-search logic. This is safe, because if we search for a metadata item and we don't find it, we have the guarantee that the returned leaf is the one where the item would be inserted, and so path->slots[0] > 0 and path->slots[0] - 1 must be the slot where the non-skinny extent item is if it exists. The only case where path->slots[0] is zero is when there are no smaller keys in the tree (i.e. no left siblings for our leaf), in which case the re-search logic isn't needed either. This race has been present since the introduction of skinny metadata (change 3173a18f70554fe7880bb2d85c7da566e364eb3c). 
Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/extent-tree.c | 8 1 file changed, 8 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9141b2b..2cedd06 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -780,7 +780,6 @@ search_again: else key.type = BTRFS_EXTENT_ITEM_KEY; -again: ret = btrfs_search_slot(trans, root-fs_info-extent_root, key, path, 0, 0); if (ret 0) @@ -796,13 +795,6 @@ again: key.offset == root-nodesize) ret = 0; } - if (ret) { - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = root-nodesize; - btrfs_release_path(path); - goto again; - } } if (ret == 0) { -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2] Btrfs: fix invalid leaf slot access in btrfs_lookup_extent()
If we couldn't find our extent item, we accessed the current slot (path-slots[0]) to check if it corresponds to an equivalent skinny metadata item. However this slot could be beyond our last item in the leaf (i.e. path-slots[0] = btrfs_header_nritems(leaf)), in which case we shouldn't process it. Since btrfs_lookup_extent() is only used to find extent items for data extents, fix this by removing completely the logic that looks up for an equivalent skinny metadata item, since it can not exist. Signed-off-by: Filipe Manana fdman...@suse.com --- V2: Renamed btrfs_lookup_extent() to btrfs_lookup_data_extent(). fs/btrfs/ctree.h | 2 +- fs/btrfs/extent-tree.c | 10 ++ fs/btrfs/tree-log.c| 2 +- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index dd8b275..b72b358 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3276,7 +3276,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); int btrfs_async_run_delayed_refs(struct btrfs_root *root, unsigned long count, int wait); -int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 offset, int metadata, u64 *refs, u64 *flags); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0d599ba..87c0b46f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -710,8 +710,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) rcu_read_unlock(); } -/* simple helper to search for an existing extent at a given offset */ -int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) +/* simple helper to search for an existing data extent at a given offset */ +int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len) { int ret; struct btrfs_key key; @@ -726,12 +726,6 @@ int 
btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) key.type = BTRFS_EXTENT_ITEM_KEY; ret = btrfs_search_slot(NULL, root-fs_info-extent_root, key, path, 0, 0); - if (ret 0) { - btrfs_item_key_to_cpu(path-nodes[0], key, path-slots[0]); - if (key.objectid == start - key.type == BTRFS_METADATA_ITEM_KEY) - ret = 0; - } btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 2b26dad..6d58d72 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -672,7 +672,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, * is this extent already allocated in the extent * allocation tree? If so, just add a reference */ - ret = btrfs_lookup_extent(root, ins.objectid, + ret = btrfs_lookup_data_extent(root, ins.objectid, ins.offset); if (ret == 0) { ret = btrfs_inc_extent_ref(trans, root, -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2] Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a file followed by a truncate, with both operations increasing the file's size, we can get a snapshot tree that reflects a state of the source subvolume's tree where the file truncation happened but the write operation didn't. This leaves a gap between 2 file extent items of the inode, which makes btrfs' fsck complain about it. For example, if we perform the following file operations: $ mkfs.btrfs -f /dev/vdd $ mount /dev/vdd /mnt $ xfs_io -f \ -c pwrite -S 0xaa -b 32K 0 32K \ -c fsync \ -c pwrite -S 0xbb -b 32770 16K 32770 \ -c truncate 90123 \ /mnt/foobar and the snapshot creation ioctl was just called before the second write, we often can get the following inode items in the snapshot's btree: item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160 inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0 item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20 inode ref index 282 namelen 10 name: foobar item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53 extent data disk byte 1104855040 nr 32768 extent data offset 0 nr 32768 ram 32768 extent compression 0 item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53 extent data disk byte 0 nr 0 extent data offset 0 nr 40960 ram 40960 extent compression 0 There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[ for which there's no file extent item covering it. This is because the file write and file truncate operations happened both right after the snapshot creation ioctl called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the ordered extent that matches the write and, in btrfs_setsize(), we were able to call btrfs_cont_expand() before being able to commit the current transaction in the snapshot creation ioctl. 
So this made it possible to insert the hole file extent item in the source subvolume (which represents the region added by the truncate) right before the transaction commit from the snapshot creation ioctl. Btrfs' fsck tool complains about such cases with a message like the following: root 331 inode 257 errors 100, file extent discount From a user perspective, the expectation when a snapshot is created while those file operations are being performed is that the snapshot will have a file that either: 1) is empty 2) only the first write was captured 3) only the 2 writes were captured 4) both writes and the truncation were captured But never capture a state where only the first write and the truncation were captured (since the second write was performed before the truncation). A test case for xfstests follows. Signed-off-by: Filipe Manana fdman...@suse.com --- V2: Use a different approach to solve the problem. Don't start and wait for all delalloc to finish after every expanding truncate, instead add an additional flush at transaction commit time if we're doing a transaction commit that creates snapshots. fs/btrfs/transaction.c | 59 ++ 1 file changed, 59 insertions(+) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 396ae8b..18c356e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1714,12 +1714,65 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) btrfs_wait_ordered_roots(fs_info, -1); } +static int +start_pending_snapshot_roots_delalloc(struct btrfs_trans_handle *trans, + struct list_head *splice) +{ + struct btrfs_pending_snapshot *pending_snapshot; + int ret = 0; + + if (btrfs_test_opt(trans-root, FLUSHONCOMMIT)) + return 0; + + spin_lock(trans-root-fs_info-trans_lock); + list_splice_init(trans-transaction-pending_snapshots, splice); + spin_unlock(trans-root-fs_info-trans_lock); + + /* +* Start again delalloc for the roots our pending snapshots are made +* from. 
We did it before starting/joining a transaction and we do it +* here again because new inode operations might have happened since +* then and we want to make sure the snapshot captures a fully +* consistent state of the source root tree. For example, if after the +* first delalloc flush a write is made against an inode followed by +* an expanding truncate, we want to make sure the snapshot captured +* both the write and the truncation, and not just the truncation. +* Here we shouldn't have much delalloc work to do, as the bulk of it +* was done before and outside the transaction. +*/ + list_for_each_entry(pending_snapshot, splice
[PATCH v3] Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a file followed by a truncate, with both operations increasing the file's size, we can get a snapshot tree that reflects a state of the source subvolume's tree where the file truncation happened but the write operation didn't. This leaves a gap between 2 file extent items of the inode, which makes btrfs' fsck complain about it. For example, if we perform the following file operations: $ mkfs.btrfs -f /dev/vdd $ mount /dev/vdd /mnt $ xfs_io -f \ -c pwrite -S 0xaa -b 32K 0 32K \ -c fsync \ -c pwrite -S 0xbb -b 32770 16K 32770 \ -c truncate 90123 \ /mnt/foobar and the snapshot creation ioctl was just called before the second write, we often can get the following inode items in the snapshot's btree: item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160 inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0 item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20 inode ref index 282 namelen 10 name: foobar item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53 extent data disk byte 1104855040 nr 32768 extent data offset 0 nr 32768 ram 32768 extent compression 0 item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53 extent data disk byte 0 nr 0 extent data offset 0 nr 40960 ram 40960 extent compression 0 There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[ for which there's no file extent item covering it. This is because the file write and file truncate operations happened both right after the snapshot creation ioctl called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the ordered extent that matches the write and, in btrfs_setsize(), we were able to call btrfs_cont_expand() before being able to commit the current transaction in the snapshot creation ioctl. 
So this made it possible to insert the hole file extent item in the source subvolume (which represents the region added by the truncate) right before the transaction commit from the snapshot creation ioctl. Btrfs' fsck tool complains about such cases with a message like the following: root 331 inode 257 errors 100, file extent discount From a user perspective, the expectation when a snapshot is created while those file operations are being performed is that the snapshot will have a file that either: 1) is empty 2) only the first write was captured 3) only the 2 writes were captured 4) both writes and the truncation were captured But never capture a state where only the first write and the truncation were captured (since the second write was performed before the truncation). A test case for xfstests follows. Signed-off-by: Filipe Manana fdman...@suse.com --- V2: Use a different approach to solve the problem. Don't start and wait for all delalloc to finish after every expanding truncate, instead add an additional flush at transaction commit time if we're doing a transaction commit that creates snapshots. V3: Removed useless test condition in +wait_pending_snapshot_roots_delalloc(). 
fs/btrfs/transaction.c | 59 ++ 1 file changed, 59 insertions(+) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 396ae8b..5e7f004 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1714,12 +1714,65 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) btrfs_wait_ordered_roots(fs_info, -1); } +static int +start_pending_snapshot_roots_delalloc(struct btrfs_trans_handle *trans, + struct list_head *splice) +{ + struct btrfs_pending_snapshot *pending_snapshot; + int ret = 0; + + if (btrfs_test_opt(trans-root, FLUSHONCOMMIT)) + return 0; + + spin_lock(trans-root-fs_info-trans_lock); + list_splice_init(trans-transaction-pending_snapshots, splice); + spin_unlock(trans-root-fs_info-trans_lock); + + /* +* Start again delalloc for the roots our pending snapshots are made +* from. We did it before starting/joining a transaction and we do it +* here again because new inode operations might have happened since +* then and we want to make sure the snapshot captures a fully +* consistent state of the source root tree. For example, if after the +* first delalloc flush a write is made against an inode followed by +* an expanding truncate, we want to make sure the snapshot captured +* both the write and the truncation, and not just the truncation. +* Here we shouldn't have much delalloc work to do, as the bulk of it +* was done before and outside
[PATCH v4] Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a file followed by a truncate, with both operations increasing the file's size, we can get a snapshot tree that reflects a state of the source subvolume's tree where the file truncation happened but the write operation didn't. This leaves a gap between 2 file extent items of the inode, which makes btrfs' fsck complain about it. For example, if we perform the following file operations: $ mkfs.btrfs -f /dev/vdd $ mount /dev/vdd /mnt $ xfs_io -f \ -c pwrite -S 0xaa -b 32K 0 32K \ -c fsync \ -c pwrite -S 0xbb -b 32770 16K 32770 \ -c truncate 90123 \ /mnt/foobar and the snapshot creation ioctl was just called before the second write, we often can get the following inode items in the snapshot's btree: item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160 inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0 item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20 inode ref index 282 namelen 10 name: foobar item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53 extent data disk byte 1104855040 nr 32768 extent data offset 0 nr 32768 ram 32768 extent compression 0 item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53 extent data disk byte 0 nr 0 extent data offset 0 nr 40960 ram 40960 extent compression 0 There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[ for which there's no file extent item covering it. This is because the file write and file truncate operations happened both right after the snapshot creation ioctl called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the ordered extent that matches the write and, in btrfs_setsize(), we were able to call btrfs_cont_expand() before being able to commit the current transaction in the snapshot creation ioctl. 
So this made it possible to insert the hole file extent item in the source subvolume (which represents the region added by the truncate) right before the transaction commit from the snapshot creation ioctl. Btrfs' fsck tool complains about such cases with a message like the following: root 331 inode 257 errors 100, file extent discount From a user perspective, the expectation when a snapshot is created while those file operations are being performed is that the snapshot will have a file that either: 1) is empty 2) only the first write was captured 3) only the 2 writes were captured 4) both writes and the truncation were captured But never capture a state where only the first write and the truncation were captured (since the second write was performed before the truncation). A test case for xfstests follows. Signed-off-by: Filipe Manana fdman...@suse.com --- V2: Use a different approach to solve the problem. Don't start and wait for all delalloc to finish after every expanding truncate, instead add an additional flush at transaction commit time if we're doing a transaction commit that creates snapshots. V3: Removed useless test condition in +wait_pending_snapshot_roots_delalloc(). V4: Use another approach that doesn't imply starting delalloc work and wait for it to finish at transaction commit time. 
fs/btrfs/ctree.h | 4 ++-- fs/btrfs/extent-tree.c | 16 +--- fs/btrfs/file.c| 10 +- fs/btrfs/inode.c | 47 --- fs/btrfs/ioctl.c | 7 --- 5 files changed, 60 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b72b358..36f82ba 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3427,8 +3427,8 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int __get_raid_index(u64 flags); -int btrfs_start_nocow_write(struct btrfs_root *root); -void btrfs_end_nocow_write(struct btrfs_root *root); +int btrfs_start_write_no_snapshoting(struct btrfs_root *root); +void btrfs_end_write_no_snapshoting(struct btrfs_root *root); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a84e00d..9ba886c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -9657,12 +9657,14 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) } /* - * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(), - * they are used to prevent the some tasks writing data into the page cache - * by nocow before the subvolume is snapshoted, but flush the data into - * the disk
[PATCH] Btrfs: fix freeing used extent after removing empty block group
Due to ignoring errors returned by clear_extent_bits (at the moment only -ENOMEM is possible), we can end up freeing an extent that is actually in use (i.e. return the extent to the free space cache). The sequence of steps that lead to this: 1) Cleaner thread starts execution and calls btrfs_delete_unused_bgs(), with the goal of freeing empty block groups; 2) btrfs_delete_unused_bgs() finds an empty block group, joins the current transaction (or starts a new one if none is running) and attempts to clear the EXTENT_DIRTY bit for the block group's range from freed_extents[0] and freed_extents[1] (of which one corresponds to fs_info-pinned_extents); 3) Clearing the EXTENT_DIRTY bit (via clear_extent_bits()) fails with -ENOMEM, but such error is ignored and btrfs_delete_unused_bgs() proceeds to delete the block group and the respective chunk, while pinned_extents remains with that bit set for the whole (or a part of the) range covered by the block group; 4) Later while the transaction is still running, the chunk ends up being reused for a new block group (maybe for different purpose, data or metadata), and extents belonging to the new block group are allocated for file data or btree nodes/leafs; 5) The current transaction is committed, meaning that we unpinned one or more extents from the new block group (through btrfs_finish_extent_commit() and unpin_extent_range()) which are now being used for new file data or new metadata (through btrfs_finish_extent_commit() and unpin_extent_range()). And unpinning means we returned the extents to the free space cache of the new block group, which implies those extents can be used for future allocations while they're still in use. Alternatively, we can hit a BUG_ON() when doing a lookup for a block group's cache object in unpin_extent_range() if a new block group didn't end up being allocated for the same chunk (step 4 above). Fix this by not freeing the block group and chunk if we fail to clear the dirty bit. 
Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/extent-tree.c | 13 +++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9ba886c..744b580 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -9523,10 +9523,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) */ start = block_group-key.objectid; end = start + block_group-key.offset - 1; - clear_extent_bits(fs_info-freed_extents[0], start, end, + ret = clear_extent_bits(fs_info-freed_extents[0], start, end, EXTENT_DIRTY, GFP_NOFS); - clear_extent_bits(fs_info-freed_extents[1], start, end, + if (ret) { + btrfs_set_block_group_rw(root, block_group); + goto end_trans; + } + ret = clear_extent_bits(fs_info-freed_extents[1], start, end, EXTENT_DIRTY, GFP_NOFS); + if (ret) { + btrfs_set_block_group_rw(root, block_group); + goto end_trans; + } /* Reset pinned so btrfs_put_block_group doesn't complain */ block_group-pinned = 0; @@ -9537,6 +9545,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) */ ret = btrfs_remove_chunk(trans, root, block_group-key.objectid); +end_trans: btrfs_end_transaction(trans, root); next: btrfs_put_block_group(block_group); -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: avoid premature -ENOMEM in clear_extent_bit()
We try to allocate an extent state structure before acquiring the extent state tree's spinlock as we might need a new one later and therefore avoid doing later an atomic allocation while holding the tree's spinlock. However we returned -ENOMEM if that initial non-atomic allocation failed, which is a bit excessive since we might end up not needing the pre-allocated extent state at all - for the case where the tree doesn't have any extent states that cover the input range and cover too any other range. Therefore don't return -ENOMEM if that pre-allocation fails. Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/extent_io.c | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 654ed3d..4ebabd2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -595,9 +595,14 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, clear = 1; again: if (!prealloc (mask __GFP_WAIT)) { + /* +* Don't care for allocation failure here because we might end +* up not needing the pre-allocated extent state at all, which +* is the case if we only have in the tree extent states that +* cover our input range and don't cover too any other range. +* If we end up needing a new extent state we allocate it later. +*/ prealloc = alloc_extent_state(mask); - if (!prealloc) - return -ENOMEM; } spin_lock(tree-lock); -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: fix race when cleaning unused block groups
We have a race while deleting unused block groups that causes extents written by past generations/transactions to be rewritten by the current transaction before that transaction is committed. The steps that lead to this issue: 1) At transaction N one or more block groups became unused and we added them to the list fs_info->unused_bgs; 2) While still at transaction N we write btree extents to block group X and the transaction is committed; 3) The cleaner kthread is awakened and calls btrfs_delete_unused_bgs() to go through the list fs_info->unused_bgs and remove unused block groups; 4) Transaction N + 1 starts; 5) At transaction N + 1, block group X becomes unused and is added to the list fs_info->unused_bgs - this implies delayed refs were run, so we had the following function calls: btrfs_run_delayed_refs() -> __btrfs_free_extent() -> update_block_group(). The update_block_group() function grabs the lock fs_info->unused_bgs_lock, adds block group X to fs_info->unused_bgs and releases that lock; 6) The cleaner kthread, while at btrfs_delete_unused_bgs(), sees block group X added by transaction N + 1 because it's doing a loop that finishes only when the list fs_info->unused_bgs is empty and locks and unlocks the spinlock fs_info->unused_bgs_lock on each iteration. So it deletes the block group and its respective chunk is released. 
Even if it didn't do the lock/unlock per iteration, it could still see block group X in the list, because the cleaner kthread might call btrfs_delete_unused_bgs() multiple times (for example if there are several snapshots to delete); 7) A new block group X' is created for data, and it's associated to the same chunk that block group X was associated to; 8) Extents from block group X' are allocated for file data and for example an fsync makes the file data be effectively written to disk; 9) A crash/reboot happens before transaction N + 1 is committed; 10) On the next mount, we will read extents from block group/chunk X but they no longer have valid btree nodes/leafs - they have instead file data, and therefore all sorts of errors will happen. So fix this by ensuring the cleaner kthread can never delete a block group that became unused in the current transaction, that is, only delete block groups that were added to the unused_bgs list by past transactions. Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 1 + fs/btrfs/extent-tree.c | 5 +++-- fs/btrfs/transaction.c | 5 + 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 36f82ba..a5e471a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1726,6 +1726,7 @@ struct btrfs_fs_info { spinlock_t unused_bgs_lock; struct list_head unused_bgs; + struct list_head unused_bgs_to_clean; /* For btrfs to record security options */ struct security_mnt_opts security_opts; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2409718..702bbdf 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2243,6 +2243,7 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(fs_info-space_info); INIT_LIST_HEAD(fs_info-tree_mod_seq_list); INIT_LIST_HEAD(fs_info-unused_bgs); + INIT_LIST_HEAD(fs_info-unused_bgs_to_clean); btrfs_mapping_init(fs_info-mapping_tree); btrfs_init_block_rsv(fs_info-global_block_rsv, BTRFS_BLOCK_RSV_GLOBAL); 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 744b580..bc1c0b7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8858,6 +8858,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) up_write(info-commit_root_sem); spin_lock(info-unused_bgs_lock); + list_splice_init(info-unused_bgs_to_clean, info-unused_bgs); while (!list_empty(info-unused_bgs)) { block_group = list_first_entry(info-unused_bgs, struct btrfs_block_group_cache, @@ -9466,10 +9467,10 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) return; spin_lock(fs_info-unused_bgs_lock); - while (!list_empty(fs_info-unused_bgs)) { + while (!list_empty(fs_info-unused_bgs_to_clean)) { u64 start, end; - block_group = list_first_entry(fs_info-unused_bgs, + block_group = list_first_entry(fs_info-unused_bgs_to_clean, struct btrfs_block_group_cache, bg_list); space_info = block_group-space_info; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 396ae8b..86d7cf5 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1937,6 +1937,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans
[PATCH] Btrfs: avoid crash when overflowing a dir_item
When attempting to insert a new dir_item, we were calling btrfs_extend_item() without checking if the leaf has enough space to extend the item. This made btrfs_extend_item() crash through a BUG() call. Therefore do the check and return -ENOSPC if the leaf doesn't have enough space. Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/dir-item.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index fc8df86..65bf60e 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -53,6 +53,8 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle di = btrfs_match_dir_item_name(root, path, name, name_len); if (di) return ERR_PTR(-EEXIST); + if (btrfs_leaf_free_space(root, path-nodes[0]) data_size) + return ERR_PTR(-ENOSPC); btrfs_extend_item(root, path, data_size); } else if (ret 0) return ERR_PTR(ret); -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: make xattr replace operations atomic
Replacing a xattr consists of doing a lookup for its existing value, deleting the current value from the respective leaf, releasing the search path and then finally inserting the new value. This leaves a time window where readers (getxattr, listxattrs) won't see any value for the xattr. Xattrs are used to store ACLs, so this has security implications. This change also fixes 2 other existing issues which were: *) Deleting the old xattr value without verifying first if the new xattr will fit in the existing leaf item (in case multiple xattrs are packed in the same item due to name hash collision); *) Returning -EEXIST when the flag XATTR_CREATE is given and the xattr doesn't exist but we have an existing item that packs multiple xattrs with the same name hash as the input xattr. In this case we should return -ENOSPC. A test case for xfstests follows soon. Thanks to Alexandre Oliva for reporting the non-atomicity of the xattr replace implementation. Reported-by: Alexandre Oliva ol...@gnu.org Signed-off-by: Filipe Manana fdman...@suse.com --- fs/btrfs/ctree.h| 4 ++ fs/btrfs/dir-item.c | 10 ++-- fs/btrfs/xattr.c| 142 ++-- 3 files changed, 88 insertions(+), 68 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a5e471a..9a47dfe 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3687,6 +3687,10 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, int verify_dir_item(struct btrfs_root *root, struct extent_buffer *leaf, struct btrfs_dir_item *dir_item); +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, +struct btrfs_path *path, +const char *name, +int name_len); /* orphan.c */ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 65bf60e..c14e682 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -21,10 +21,6 @@ #include hash.h #include transaction.h -static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root 
*root, - struct btrfs_path *path, - const char *name, int name_len); - /* * insert a name into a directory, doing overflow properly if there is a hash * collision. data_size indicates how big the item inserted should be. On @@ -385,9 +381,9 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, * this walks through all the entries in a dir item and finds one * for a specific name. */ -static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len) +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, +struct btrfs_path *path, +const char *name, int name_len) { struct btrfs_dir_item *dir_item; unsigned long name_ptr; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index dcf2013..3c8fba1 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -91,7 +91,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const void *value, size_t size, int flags) { - struct btrfs_dir_item *di; + struct btrfs_dir_item *di = NULL; struct btrfs_root *root = BTRFS_I(inode)-root; struct btrfs_path *path; size_t name_len = strlen(name); @@ -104,83 +104,103 @@ static int do_setxattr(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - if (flags XATTR_REPLACE) { - di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name, - name_len, -1); - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; - } else if (!di) { + if (!value ((flags XATTR_REPLACE) || !flags)) { + /* +* We're deleting only a xattr (no replace). +* Don't follow the path below because it could leave a leaf +* empty. +*/ + di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), + name, name_len, -1); + if (!di (flags XATTR_REPLACE)) ret = -ENODATA; - goto out; - } - ret = btrfs_delete_one_dir_name(trans, root, path, di); - if (ret) - goto out; - btrfs_release_path(path); + else if (di
[PATCH] fstests: add generic test to verify xattr replace operations are atomic
This test verifies that replacing a xattr's value is an atomic operation. This is motivated by an issue in btrfs where replacing a xattr's value wasn't an atomic operation, it consisted of removing the old value and then inserting the new value in a btree. This meant readers (getxattr and listxattrs) could get neither the old nor the new value during a short time window. Signed-off-by: Filipe Manana fdman...@suse.com --- tests/generic/326 | 107 ++ tests/generic/326.out | 2 + tests/generic/group | 1 + 3 files changed, 110 insertions(+) create mode 100755 tests/generic/326 create mode 100644 tests/generic/326.out diff --git a/tests/generic/326 b/tests/generic/326 new file mode 100755 index 000..c110fc0 --- /dev/null +++ b/tests/generic/326 @@ -0,0 +1,107 @@ +#! /bin/bash +# FSQA Test No. 326 +# +# Verify that replacing a xattr's value is an atomic operation. +# This is motivated by an issue in btrfs where replacing a xattr's value +# wasn't an atomic operation, it consisted of removing the old value and +# then inserting the new value in a btree. This meant readers (getxattr +# and listxattrs) could get neither the old nor the new value during +# a short time window. +# +# The btrfs issue was fixed by the following linux kernel patch: +# +#Btrfs: make xattr replace operations atomic +# +#--- +# +# Copyright (C) 2014 SUSE Linux Products GmbH. All Rights Reserved. +# Author: Filipe Manana fdman...@suse.com +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq +tmp=/tmp/$$ +status=1 # failure is the default! +trap _cleanup; exit \$status 0 1 2 3 15 + +_cleanup() +{ + if [ ! -z $setter_pid ]; then + kill $setter_pid /dev/null + fi + rm -f $tmp.* +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter +. ./common/attr + +# real QA test starts here +_need_to_be_root +_supported_fs generic +_supported_os Linux +_require_scratch +_require_attrs + +rm -f $seqres.full + +xattr_name=user.something +xattr_value1=foobar +xattr_value2=rabbit_hole + +set_xattr_loop() +{ + local name=$1 + + local cur_val=$xattr_value1 + while true; do + $SETFATTR_PROG -n $xattr_name -v $cur_val $SCRATCH_MNT/$name + if [ $cur_val == $xattr_value1 ]; then + cur_val=$xattr_value2 + else + cur_val=$xattr_value1 + fi + done +} + +_scratch_mkfs $seqres.full 21 +_scratch_mount + +test_file=test_xattr_replace +touch $SCRATCH_MNT/$test_file +$SETFATTR_PROG -n $xattr_name -v $xattr_value1 $SCRATCH_MNT/$test_file + +set_xattr_loop $test_file +setter_pid=$! 
+ +for ((i = 0; i 1000; i++)); do + xattr_val=$($GETFATTR_PROG --absolute-names -n $xattr_name \ + $SCRATCH_MNT/$test_file | grep $xattr_name= | cut -d '=' -f 2) + if [ $xattr_val != \$xattr_value1\ -a \ + $xattr_val != \$xattr_value2\ ]; then + _fail Missing or unexpected xattr value: $xattr_val + fi +done + +kill $setter_pid /dev/null +unset setter_pid + +echo Silence is golden +status=0 +exit diff --git a/tests/generic/326.out b/tests/generic/326.out new file mode 100644 index 000..4ac0db5 --- /dev/null +++ b/tests/generic/326.out @@ -0,0 +1,2 @@ +QA output created by 326 +Silence is golden diff --git a/tests/generic/group b/tests/generic/group index 9c82a6f..01f442d 100644 --- a/tests/generic/group +++ b/tests/generic/group @@ -152,3 +152,4 @@ 323 auto aio stress 324 auto fsr quick 325 auto quick data log +326 auto quick xattr -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html