from:"Filipe Manana"

Re: [PATCH 1/5] Btrfs-progs: fix closing of devices

2013-06-11 Thread Filipe Manana

On 11/06/2013, at 16:00, David Sterba dste...@suse.cz wrote:

 On Mon, Jun 10, 2013 at 09:07:55PM +0100, Filipe David Borba Manana wrote:
 --- a/disk-io.c
 +++ b/disk-io.c
 @@ -1267,12 +1267,12 @@ static int close_all_devices(struct btrfs_fs_info 
 *fs_info)
while (!list_empty(list)) {
device = list_entry(list-next, struct btrfs_device, dev_list);
list_del_init(device-dev_list);
 -if (device-fd) {
 +if (device-fd  0) {
 
 -device = kmalloc(sizeof(*device), GFP_NOFS);
 +device = kzalloc(sizeof(*device), GFP_NOFS);
if (!device)
return -ENOMEM;
 -device-total_ios = 0;
 
 Can you please describe the incremental change between the two patches?
 Why do you exclude fd == 0 ?

Sorry, I messed up my git send-email foo.

I excluded 0 because the original code did it too, however it was logically
incorrect. The last patch version (v2, 3rd email) does not exclude 0 and
ensures that -1 is used everywhere as a marker for invalid fd.

(Hopefully now I figured out how to use git send-email to update a patch
correctly)

Thanks


 
 david
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] Btrfs: read lock extent buffer while walking backrefs

2014-07-02 Thread Filipe Manana

Before processing the extent buffer, acquire a read lock on it, so
that we're safe against concurrent updates on the extent buffer.

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/backref.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index e25564b..a1efd39 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1001,8 +1001,11 @@ again:
ret = -EIO;
goto out;
}
+   btrfs_tree_read_lock(eb);
+   btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
ret = find_extent_in_eb(eb, bytenr,
*extent_item_pos, eie);
+   btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
if (ret  0)
goto out;
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] Btrfs: reduce size of struct extent_state

2014-07-06 Thread Filipe Manana

The tree field of struct extent_state was only used to figure out if
an extent state was connected to an inode's io tree or not. For this
we can just use the rb_node field itself.

On an x86_64 system with this change the sizeof(struct extent_state) is
reduced from 96 bytes down to 88 bytes, meaning that with a page size
of 4096 bytes we can now store 46 extent states per page instead of 42.

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/extent_io.c | 37 -
 fs/btrfs/extent_io.h |  1 -
 2 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 23398ad..7e44e18 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -25,6 +25,11 @@ static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
 static struct bio_set *btrfs_bioset;
 
+static inline bool extent_state_in_tree(const struct extent_state *state)
+{
+   return !RB_EMPTY_NODE(state-rb_node);
+}
+
 #ifdef CONFIG_BTRFS_DEBUG
 static LIST_HEAD(buffers);
 static LIST_HEAD(states);
@@ -59,9 +64,9 @@ void btrfs_leak_debug_check(void)
 
while (!list_empty(states)) {
state = list_entry(states.next, struct extent_state, leak_list);
-   printk(KERN_ERR BTRFS: state leak: start %llu end %llu 
-  state %lu in tree %p refs %d\n,
-  state-start, state-end, state-state, state-tree,
+   pr_err(BTRFS: state leak: start %llu end %llu state %lu in 
tree %d refs %d\n,
+  state-start, state-end, state-state,
+  extent_state_in_tree(state),
   atomic_read(state-refs));
list_del(state-leak_list);
kmem_cache_free(extent_state_cache, state);
@@ -209,7 +214,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
return state;
state-state = 0;
state-private = 0;
-   state-tree = NULL;
+   RB_CLEAR_NODE(state-rb_node);
btrfs_leak_debug_add(state-leak_list, states);
atomic_set(state-refs, 1);
init_waitqueue_head(state-wq);
@@ -222,7 +227,7 @@ void free_extent_state(struct extent_state *state)
if (!state)
return;
if (atomic_dec_and_test(state-refs)) {
-   WARN_ON(state-tree);
+   WARN_ON(extent_state_in_tree(state));
btrfs_leak_debug_del(state-leak_list);
trace_free_extent_state(state, _RET_IP_);
kmem_cache_free(extent_state_cache, state);
@@ -371,8 +376,8 @@ static void merge_state(struct extent_io_tree *tree,
other-state == state-state) {
merge_cb(tree, state, other);
state-start = other-start;
-   other-tree = NULL;
rb_erase(other-rb_node, tree-state);
+   RB_CLEAR_NODE(other-rb_node);
free_extent_state(other);
}
}
@@ -383,8 +388,8 @@ static void merge_state(struct extent_io_tree *tree,
other-state == state-state) {
merge_cb(tree, state, other);
state-end = other-end;
-   other-tree = NULL;
rb_erase(other-rb_node, tree-state);
+   RB_CLEAR_NODE(other-rb_node);
free_extent_state(other);
}
}
@@ -442,7 +447,6 @@ static int insert_state(struct extent_io_tree *tree,
   found-start, found-end, start, end);
return -EEXIST;
}
-   state-tree = tree;
merge_state(tree, state);
return 0;
 }
@@ -486,7 +490,6 @@ static int split_state(struct extent_io_tree *tree, struct 
extent_state *orig,
free_extent_state(prealloc);
return -EEXIST;
}
-   prealloc-tree = tree;
return 0;
 }
 
@@ -524,9 +527,9 @@ static struct extent_state *clear_state_bit(struct 
extent_io_tree *tree,
wake_up(state-wq);
if (state-state == 0) {
next = next_state(state);
-   if (state-tree) {
+   if (extent_state_in_tree(state)) {
rb_erase(state-rb_node, tree-state);
-   state-tree = NULL;
+   RB_CLEAR_NODE(state-rb_node);
free_extent_state(state);
} else {
WARN_ON(1);
@@ -606,8 +609,8 @@ again:
cached_state = NULL;
}
 
-   if (cached  cached-tree  cached-start = start 
-   cached-end  start) {
+   if (cached  extent_state_in_tree(cached) 
+   cached-start = start  cached-end  start) {
if (clear)
atomic_dec(cached-refs

[PATCH] Btrfs: set error return value in btrfs_get_blocks_direct

2014-07-07 Thread Filipe Manana

We were returning with 0 (success) because we weren't extracting the
error code from em (PTR_ERR(em)). Fix it.

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/inode.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6b65fab..8a946c0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6998,8 +6998,10 @@ static int btrfs_get_blocks_direct(struct inode *inode, 
sector_t iblock,
   block_start, len,
   orig_block_len,
   ram_bytes, type);
-   if (IS_ERR(em))
+   if (IS_ERR(em)) {
+   ret = PTR_ERR(em);
goto unlock_err;
+   }
}
 
ret = btrfs_add_ordered_extent_dio(inode, start,
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] Btrfs: race free update of commit root for ro snapshots

2014-07-28 Thread Filipe Manana

This is a better solution for the problem addressed in the following
commit:

Btrfs: update commit root on snapshot creation after orphan cleanup
(3821f348889e506efbd268cc8149e0ebfa47c4e5)

The previous solution wasn't the best because of 2 reasons:

1) It added another full transaction commit, which is more expensive
   than just swapping the commit root with the root;

2) It was not completely race-free. As soon as the transaction commits, the
   snapshot becomes visible from user space, and before we do the
   orphan cleanup, user space can ask for a send operation that uses
   the new snapshot.

This change addresses those 2 issues. Special thanks to Alex Lyakas for
spotting the second issue.

Cc: Alex Lyakas alex.bt...@zadarastorage.com
Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/inode.c | 29 +
 fs/btrfs/ioctl.c | 29 -
 2 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1d5f0b3..982a8f7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5227,6 +5227,35 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, 
struct dentry *dentry)
iput(inode);
inode = ERR_PTR(ret);
}
+   /*
+* If orphan cleanup did remove any orphans, it means the tree
+* was modified and therefore the commit root is not the same as
+* the current root anymore. This is a problem, because send
+* uses the commit root and therefore can see inode items that
+* don't exist in the current root anymore, and for example make
+* calls to btrfs_iget, which will do tree lookups based on the
+* current root and not on the commit root. Those lookups will
+* fail, returning a -ESTALE error, and making send fail with
+* that error. So make sure a send does not see any orphans we
+* have just removed, and that it will see the same inodes
+* regardless of whether a transaction commit happened before
+* it started (meaning that the commit root will be the same as
+* the current root) or not.
+*/
+   if (sub_root-node != sub_root-commit_root) {
+   u64 sub_flags = btrfs_root_flags(sub_root-root_item);
+
+   if (sub_flags  BTRFS_ROOT_SUBVOL_RDONLY) {
+   struct extent_buffer *eb;
+
+   down_write(root-fs_info-commit_root_sem);
+   eb = sub_root-commit_root;
+   sub_root-commit_root =
+   btrfs_root_node(sub_root);
+   up_write(root-fs_info-commit_root_sem);
+   free_extent_buffer(eb);
+   }
+   }
}
 
return inode;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2a30ac1..d44abc0 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -715,35 +715,6 @@ static int create_snapshot(struct btrfs_root *root, struct 
inode *dir,
if (ret)
goto fail;
 
-   /*
-* If orphan cleanup did remove any orphans, it means the tree was
-* modified and therefore the commit root is not the same as the
-* current root anymore. This is a problem, because send uses the
-* commit root and therefore can see inode items that don't exist
-* in the current root anymore, and for example make calls to
-* btrfs_iget, which will do tree lookups based on the current root
-* and not on the commit root. Those lookups will fail, returning a
-* -ESTALE error, and making send fail with that error. So make sure
-* a send does not see any orphans we have just removed, and that it
-* will see the same inodes regardless of whether a transaction
-* commit happened before it started (meaning that the commit root
-* will be the same as the current root) or not.
-*/
-   if (readonly  pending_snapshot-snap-node !=
-   pending_snapshot-snap-commit_root) {
-   trans = btrfs_join_transaction(pending_snapshot-snap);
-   if (IS_ERR(trans)  PTR_ERR(trans) != -ENOENT) {
-   ret = PTR_ERR(trans);
-   goto fail;
-   }
-   if (!IS_ERR(trans)) {
-   ret = btrfs_commit_transaction(trans,
-  pending_snapshot-snap);
-   if (ret)
-   goto fail;
-   }
-   }
-
inode = btrfs_lookup_dentry(dentry-d_parent-d_inode, dentry);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode

[PATCH] Btrfs: unlock nodes earlier when inserting items in a btree

2014-07-28 Thread Filipe Manana

In ctree.c:setup_items_for_insert(), we can unlock all nodes in our
path before we process the leaf (shift items and data, adjust data
offsets, etc). This allows for better btree concurrency, as we're
often holding a write lock on at least the node at level 1.

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/ctree.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 263145b..bd0ae3e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -4738,6 +4738,12 @@ void setup_items_for_insert(struct btrfs_root *root, 
struct btrfs_path *path,
int slot;
struct btrfs_map_token token;
 
+   if (path-slots[0] == 0) {
+   btrfs_cpu_key_to_disk(disk_key, cpu_key);
+   fixup_low_keys(root, path, disk_key, 1);
+   }
+   btrfs_unlock_up_safe(path, 1);
+
btrfs_init_map_token(token);
 
leaf = path-nodes[0];
@@ -4798,12 +4804,6 @@ void setup_items_for_insert(struct btrfs_root *root, 
struct btrfs_path *path,
}
 
btrfs_set_header_nritems(leaf, nritems + nr);
-
-   if (slot == 0) {
-   btrfs_cpu_key_to_disk(disk_key, cpu_key);
-   fixup_low_keys(root, path, disk_key, 1);
-   }
-   btrfs_unlock_up_safe(path, 1);
btrfs_mark_buffer_dirty(leaf);
 
if (btrfs_leaf_free_space(root, leaf)  0) {
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] Btrfs: avoid unnecessary switch of path locks to blocking mode

2014-07-28 Thread Filipe Manana

If we need to cow a node, increase the write lock level and retry the
tree search, there's no point in changing the node locks in our path
to blocking mode, as we only waste time and unnecessarily wake up other
tasks waiting on the spinning locks (just to block them again shortly
after) because we release our path before repeating the tree search.

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/ctree.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index bd0ae3e..783ea3b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2792,8 +2792,6 @@ again:
if (!should_cow_block(trans, root, b))
goto cow_done;
 
-   btrfs_set_path_blocking(p);
-
/*
 * must have write locks on this node and the
 * parent
@@ -2807,6 +2805,7 @@ again:
goto again;
}
 
+   btrfs_set_path_blocking(p);
err = btrfs_cow_block(trans, root, b,
  p-nodes[level + 1],
  p-slots[level + 1], b);
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] Btrfs: return path with unlocked nodes in btrfs_next_leaf

2014-07-31 Thread Filipe Manana

Calling unlock_up() to release our new path doesn't release the
read lock on the node at level 1, because our return path has
path->slots[0] == 0, which makes unlock_up() skip unlocking that
node. Since we don't need to return that node locked, call
btrfs_unlock_up_safe() instead of unlock_up(), which will release
all nodes in the path (except the leaf of course).
For any level N >= 2, the corresponding node lock isn't released by
unlock_up() either if path->slots[N - 1] == 0.

Releasing the read lock immediately will allow concurrent writers
to write lock that node at level 1 (or higher levels if applicable)
while the btrfs_next_leaf() caller processes the leaf.

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/ctree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 783ea3b..8ca6761 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -5833,7 +5833,7 @@ again:
}
ret = 0;
 done:
-   unlock_up(path, 0, 1, 0, NULL);
+   btrfs_unlock_up_safe(path, 1);
path-leave_spinning = old_spinning;
if (!old_spinning)
btrfs_set_path_blocking(path);
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v2] Btrfs: race free update of commit root for ro snapshots

2014-07-31 Thread Filipe Manana

This is a better solution for the problem addressed in the following
commit:

Btrfs: update commit root on snapshot creation after orphan cleanup
(3821f348889e506efbd268cc8149e0ebfa47c4e5)

The previous solution wasn't the best because of 2 reasons:

1) It added another full transaction commit, which is more expensive
   than just swapping the commit root with the root;

2) If a reboot happened after the first transaction commit (the one
   that creates the snapshot) and before the second transaction commit,
   then we would end up with the same problem if a send using that
   snapshot was requested before the first transaction commit after
   the reboot.

This change addresses those 2 issues. The second issue is addressed by
switching the commit root in the dentry lookup VFS callback, which is
also called by the snapshot/subvol creation ioctl and performs orphan
cleanup if needed. Like the vfs, the ioctl locks the parent inode too,
preventing race issues between a dentry lookup and snapshot creation.

Cc: Alex Lyakas alex.bt...@zadarastorage.com
Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: Updated commit message, as the original second issue was not correct.
Removed redundant btrfs_orphan_cleanup() call in the snapshot creation
ioctl, as it's performed by btrfs_lookup_dentry() which is called by
the ioctl.

 fs/btrfs/inode.c | 36 
 fs/btrfs/ioctl.c | 33 -
 2 files changed, 36 insertions(+), 33 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1d5f0b3..4f35c6c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5227,6 +5227,42 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, 
struct dentry *dentry)
iput(inode);
inode = ERR_PTR(ret);
}
+   /*
+* If orphan cleanup did remove any orphans, it means the tree
+* was modified and therefore the commit root is not the same as
+* the current root anymore. This is a problem, because send
+* uses the commit root and therefore can see inode items that
+* don't exist in the current root anymore, and for example make
+* calls to btrfs_iget, which will do tree lookups based on the
+* current root and not on the commit root. Those lookups will
+* fail, returning a -ESTALE error, and making send fail with
+* that error. So make sure a send does not see any orphans we
+* have just removed, and that it will see the same inodes
+* regardless of whether a transaction commit happened before
+* it started (meaning that the commit root will be the same as
+* the current root) or not.
+*/
+   if (sub_root-node != sub_root-commit_root) {
+   u64 sub_flags = btrfs_root_flags(sub_root-root_item);
+
+   if (sub_flags  BTRFS_ROOT_SUBVOL_RDONLY) {
+   struct extent_buffer *eb;
+
+   /*
+* Assert we can't have races between dentry
+* lookup called through the snapshot creation
+* ioctl and the VFS.
+*/
+   ASSERT(mutex_is_locked(dir-i_mutex));
+
+   down_write(root-fs_info-commit_root_sem);
+   eb = sub_root-commit_root;
+   sub_root-commit_root =
+   btrfs_root_node(sub_root);
+   up_write(root-fs_info-commit_root_sem);
+   free_extent_buffer(eb);
+   }
+   }
}
 
return inode;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2a30ac1..ef2e073 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -711,39 +711,6 @@ static int create_snapshot(struct btrfs_root *root, struct 
inode *dir,
if (ret)
goto fail;
 
-   ret = btrfs_orphan_cleanup(pending_snapshot-snap);
-   if (ret)
-   goto fail;
-
-   /*
-* If orphan cleanup did remove any orphans, it means the tree was
-* modified and therefore the commit root is not the same as the
-* current root anymore. This is a problem, because send uses the
-* commit root and therefore can see inode items that don't exist
-* in the current root anymore, and for example make calls to
-* btrfs_iget, which will do tree lookups based on the current root
-* and not on the commit root. Those lookups will fail, returning a
-* -ESTALE error, and making send fail with that error. So make sure
-* a send does

[PATCH] Btrfs: ensure tmpfile inode is always persisted with link count of 0

2014-07-31 Thread Filipe Manana

If we open a file with O_TMPFILE, don't do any further operation on
it (so that the inode item isn't updated) and then force a transaction
commit, we get a persisted inode item with a link count of 1, and not 0
as it should be.

Steps to reproduce it (requires a modern xfs_io with -T support):

$ mkfs.btrfs -f /dev/sdd
$ mount /dev/sdd /mnt
$ xfs_io -T /mnt &
$ sync

Then btrfs-debug-tree shows the inode item with a link count of 1:

$ btrfs-debug-tree /dev/sdd
(...)
fs tree key (FS_TREE ROOT_ITEM 0)
leaf 29556736 items 4 free space 15851 generation 6 owner 5
fs uuid f164d01b-1b92-481d-a4e4-435fb0f843d0
chunk uuid 0e3d0e56-bcca-4a1c-aa5f-cec2c6f4f7a6
item 0 key (256 INODE_ITEM 0) itemoff 16123 itemsize 160
inode generation 3 transid 6 size 0 block group 0 mode 40755 
links 1
item 1 key (256 INODE_REF 256) itemoff 16111 itemsize 12
inode ref index 0 namelen 2 name: ..
item 2 key (257 INODE_ITEM 0) itemoff 15951 itemsize 160
inode generation 6 transid 6 size 0 block group 0 mode 100600 
links 1
item 3 key (ORPHAN ORPHAN_ITEM 257) itemoff 15951 itemsize 0
orphan item
checksum tree key (CSUM_TREE ROOT_ITEM 0)
(...)

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/inode.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4f35c6c..8ad3ea9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5688,6 +5688,13 @@ static struct inode *btrfs_new_inode(struct 
btrfs_trans_handle *trans,
}
 
/*
+* O_TMPFILE, set link count to 0, so that after this point,
+* we fill in an inode item with the correct link count.
+*/
+   if (!name)
+   set_nlink(inode, 0);
+
+   /*
 * we have to initialize this early, so we can reclaim the inode
 * number if we fail afterwards in this function.
 */
@@ -9133,6 +9140,14 @@ static int btrfs_tmpfile(struct inode *dir, struct 
dentry *dentry, umode_t mode)
if (ret)
goto out;
 
+   /*
+* We set number of links to 0 in btrfs_new_inode(), and here we set
+* it to 1 because d_tmpfile() will issue a warning if the count is 0,
+* through:
+*
+*d_tmpfile() - inode_dec_link_count() - drop_nlink()
+*/
+   set_nlink(inode, 1);
d_tmpfile(dentry, inode);
mark_inode_dirty(inode);
 
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] xfstests: add regression test for btrfs send with orphans

2014-07-31 Thread Filipe Manana

Regression test for a btrfs issue where we create a RO snapshot
to use for a send operation, which fails with a -ESTALE error,
due to the presence of orphan inodes accessible through the
snapshot's commit root but no longer present through the main
root.

This issue is fixed by the following linux kernel btrfs patch:

  Btrfs: update commit root on snapshot creation after orphan cleanup

Signed-off-by: Filipe Manana fdman...@suse.com
---
 tests/btrfs/057 | 81 +
 tests/btrfs/057.out |  1 +
 tests/btrfs/group   |  1 +
 3 files changed, 83 insertions(+)
 create mode 100755 tests/btrfs/057
 create mode 100644 tests/btrfs/057.out

diff --git a/tests/btrfs/057 b/tests/btrfs/057
new file mode 100755
index 000..2174077
--- /dev/null
+++ b/tests/btrfs/057
@@ -0,0 +1,81 @@
+#! /bin/bash
+# FS QA Test No. btrfs/057
+#
+# Regression test for a btrfs issue where we create a RO snapshot to use for
+# a send operation which fails with a -ESTALE error, due to the presence of
+# orphan inodes accessible through the snapshot's commit root but no longer
+# present through the main root.
+#
+# This issue is fixed by the following linux kernel btrfs patch:
+#
+#Btrfs: update commit root on snapshot creation after orphan cleanup
+#
+#---
+# Copyright (C) 2014 SUSE Linux Products GmbH. All Rights Reserved.
+# Author: Filipe Manana fdman...@suse.com
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo QA output created by $seq
+
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap _cleanup; exit \$status 0 1 2 3 15
+
+_cleanup()
+{
+   if [ ! -z $XFS_IO_PID ]; then
+   kill $XFS_IO_PID  /dev/null 21
+   fi
+   rm -fr $tmp
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# real QA test starts here
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch
+# Requiring flink command tests for the presence of the -T option used
+# to pass O_TMPFILE to open(2).
+_require_xfs_io_command flink
+_need_to_be_root
+
+rm -f $seqres.full
+
+_scratch_mkfs /dev/null 21
+_scratch_mount
+
+# Create a tmpfile file, write some data to it and leave it open, so that our
+# main subvolume has an orphan inode item.
+$XFS_IO_PROG -T $SCRATCH_MNT $seqres.full 21  (
+   echo pwrite 0 65536
+   read
+) 
+XFS_IO_PID=$!
+
+# With the tmpfile open, create a RO snapshot and use it for a send operation.
+# The send operation used to fail with -ESTALE due to the presence of the
+# orphan inode.
+_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT $SCRATCH_MNT/mysnap
+_run_btrfs_util_prog send $SCRATCH_MNT/mysnap -f /dev/null
+
+status=0
+exit
diff --git a/tests/btrfs/057.out b/tests/btrfs/057.out
new file mode 100644
index 000..b26eefe
--- /dev/null
+++ b/tests/btrfs/057.out
@@ -0,0 +1 @@
+QA output created by 057
diff --git a/tests/btrfs/group b/tests/btrfs/group
index 2da7127..ebc38c5 100644
--- a/tests/btrfs/group
+++ b/tests/btrfs/group
@@ -59,3 +59,4 @@
 054 auto quick
 055 auto quick
 056 auto quick
+057 auto quick
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v2] xfstests: add regression test for btrfs send with orphans

2014-08-01 Thread Filipe Manana

Regression test for a btrfs issue where we create a RO snapshot
to use for a send operation, which fails with a -ESTALE error,
due to the presence of orphan inodes accessible through the
snapshot's commit root but no longer present through the main
root.

This issue is fixed by the following linux kernel btrfs patch:

  Btrfs: update commit root on snapshot creation after orphan cleanup

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: Replaced a  redirect with a  redirect to $seqres.full, and added a
sleep.

 tests/btrfs/057 | 84 +
 tests/btrfs/057.out |  1 +
 tests/btrfs/group   |  1 +
 3 files changed, 86 insertions(+)
 create mode 100755 tests/btrfs/057
 create mode 100644 tests/btrfs/057.out

diff --git a/tests/btrfs/057 b/tests/btrfs/057
new file mode 100755
index 000..1e313e9
--- /dev/null
+++ b/tests/btrfs/057
@@ -0,0 +1,84 @@
+#! /bin/bash
+# FS QA Test No. btrfs/057
+#
+# Regression test for a btrfs issue where we create a RO snapshot to use for
+# a send operation which fails with a -ESTALE error, due to the presence of
+# orphan inodes accessible through the snapshot's commit root but no longer
+# present through the main root.
+#
+# This issue is fixed by the following linux kernel btrfs patch:
+#
+#Btrfs: update commit root on snapshot creation after orphan cleanup
+#
+#---
+# Copyright (C) 2014 SUSE Linux Products GmbH. All Rights Reserved.
+# Author: Filipe Manana fdman...@suse.com
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo QA output created by $seq
+
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap _cleanup; exit \$status 0 1 2 3 15
+
+_cleanup()
+{
+   if [ ! -z $XFS_IO_PID ]; then
+   kill $XFS_IO_PID  /dev/null 21
+   fi
+   rm -fr $tmp
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# real QA test starts here
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch
+# Requiring flink command tests for the presence of the -T option used
+# to pass O_TMPFILE to open(2).
+_require_xfs_io_command flink
+_need_to_be_root
+
+rm -f $seqres.full
+
+_scratch_mkfs /dev/null 21
+_scratch_mount
+
+# Create a tmpfile file, write some data to it and leave it open, so that our
+# main subvolume has an orphan inode item.
+$XFS_IO_PROG -T $SCRATCH_MNT $seqres.full 21  (
+   echo pwrite 0 65536
+   read
+) 
+XFS_IO_PID=$!
+
+# Give it some time to the xfs_io process to create the tmpfile.
+sleep 3
+
+# With the tmpfile open, create a RO snapshot and use it for a send operation.
+# The send operation used to fail with -ESTALE due to the presence of the
+# orphan inode.
+_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT $SCRATCH_MNT/mysnap
+_run_btrfs_util_prog send $SCRATCH_MNT/mysnap -f /dev/null
+
+status=0
+exit
diff --git a/tests/btrfs/057.out b/tests/btrfs/057.out
new file mode 100644
index 000..b26eefe
--- /dev/null
+++ b/tests/btrfs/057.out
@@ -0,0 +1 @@
+QA output created by 057
diff --git a/tests/btrfs/group b/tests/btrfs/group
index 2da7127..ebc38c5 100644
--- a/tests/btrfs/group
+++ b/tests/btrfs/group
@@ -59,3 +59,4 @@
 054 auto quick
 055 auto quick
 056 auto quick
+057 auto quick
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] Btrfs: make btrfs_search_forward return with nodes unlocked

2014-08-04 Thread Filipe Manana

None of the uses of btrfs_search_forward() need to have the path
nodes (level >= 1) read locked, only the leaf needs to be locked
while the caller processes it. Therefore make it return a path
with all nodes unlocked, except for the leaf.

This change is motivated by the observation that during a file
fsync we repeatedly call btrfs_search_forward() and process the
returned leaf while upper nodes of the returned path (level >= 1)
are read locked, which unnecessarily blocks other tasks that want
to write to the same fs/subvol btree.
Therefore instead of modifying the fsync code to unlock all nodes
with level >= 1 immediately after calling btrfs_search_forward(),
change btrfs_search_forward() to do it, so that it benefits all
callers.

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/ctree.c | 11 +++
 fs/btrfs/ioctl.c |  5 -
 fs/btrfs/tree-log.c  |  3 ---
 fs/btrfs/uuid-tree.c |  1 -
 fs/btrfs/volumes.c   |  2 --
 5 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 8ca6761..993d81b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -5144,8 +5144,9 @@ int btrfs_search_forward(struct btrfs_root *root, struct 
btrfs_key *min_key,
u32 nritems;
int level;
int ret = 1;
+   int keep_locks = path-keep_locks;
 
-   WARN_ON(!path-keep_locks);
+   path-keep_locks = 1;
 again:
cur = btrfs_read_lock_root_node(root);
level = btrfs_header_level(cur);
@@ -5209,7 +5210,6 @@ find_next_key:
path-slots[level] = slot;
if (level == path-lowest_level) {
ret = 0;
-   unlock_up(path, level, 1, 0, NULL);
goto out;
}
btrfs_set_path_blocking(path);
@@ -5224,9 +5224,12 @@ find_next_key:
btrfs_clear_path_blocking(path, NULL, 0);
}
 out:
-   if (ret == 0)
+   path-keep_locks = keep_locks;
+   if (ret == 0) {
+   btrfs_unlock_up_safe(path, path-lowest_level + 1);
+   btrfs_set_path_blocking(path);
memcpy(min_key, found_key, sizeof(found_key));
-   btrfs_set_path_blocking(path);
+   }
return ret;
 }
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ef2e073..d490abd 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -936,12 +936,9 @@ static int find_new_extents(struct btrfs_root *root,
min_key.offset = *off;
 
while (1) {
-   path-keep_locks = 1;
ret = btrfs_search_forward(root, min_key, path, newer_than);
if (ret != 0)
goto none;
-   path-keep_locks = 0;
-   btrfs_unlock_up_safe(path, 1);
 process_slot:
if (min_key.objectid != ino)
goto none;
@@ -2083,8 +2080,6 @@ static noinline int search_ioctl(struct inode *inode,
key.type = sk-min_type;
key.offset = sk-min_offset;
 
-   path-keep_locks = 1;
-
while (1) {
ret = btrfs_search_forward(root, key, path, sk-min_transid);
if (ret != 0) {
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 6e0fa17..df332dd 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2981,8 +2981,6 @@ static noinline int log_dir_items(struct 
btrfs_trans_handle *trans,
min_key.type = key_type;
min_key.offset = min_offset;
 
-   path-keep_locks = 1;
-
ret = btrfs_search_forward(root, min_key, path, trans-transid);
 
/*
@@ -3950,7 +3948,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle 
*trans,
err = ret;
goto out_unlock;
}
-   path-keep_locks = 1;
 
while (1) {
ins_nr = 0;
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index f6a4c03..7782829 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -279,7 +279,6 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
key.offset = 0;
 
 again_search_slot:
-   path-keep_locks = 1;
ret = btrfs_search_forward(root, key, path, 0);
if (ret) {
if (ret  0)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0daf748..73e4d30 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3629,8 +3629,6 @@ static int btrfs_uuid_scan_kthread(void *data)
max_key.type = BTRFS_ROOT_ITEM_KEY;
max_key.offset = (u64)-1;
 
-   path-keep_locks = 1;
-
while (1) {
ret = btrfs_search_forward(root, key, path, 0);
if (ret) {
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] Btrfs: fix hole detection during file fsync

2014-08-06 Thread Filipe Manana

The file hole detection logic during a file fsync wasn't correct,
because it didn't look back (in a previous leaf) for the last file
extent item that can be in a leaf to the left of our leaf and that
has a generation lower than the current transaction id. This made it
assume that a hole exists when it really doesn't exist in the file.

Such false positive hole detection happens in the following scenario:

* We have a file that has many file extent items, covering 3 or more
  btree leafs (the first leaf must contain non file extent items too).

* Two ranges of the file are modified, with their extent items being
  located at 2 different leafs and those leafs aren't consecutive.

* When processing the second leaf, we weren't checking if some file
  extent item exists that is located in some leaf that is between
  our 2 leafs, and therefore assumed the range defined between the
  last file extent item in first leaf and the first file extent item
  in the second leaf matched a hole.

Fortunately this didn't result in overriding the log with wrong data,
instead it made the last loop in copy_items() attempt to insert a
duplicated key (for a hole file extent item), which makes the file
fsync code return with -EEXIST to file.c:btrfs_sync_file() which in
turn ends up doing a full transaction commit.

I could trigger this issue with the following test for xfstests (which
never fails, either without or with this patch). The last fsync call
results in a full transaction commit, due to the -EEXIST error mentioned
above. I could also observe this behaviour happening frequently when
running xfstests/generic/075 in a loop.

Test:

_cleanup()
{
_cleanup_flakey
rm -fr $tmp
}

# get standard environment, filters and checks
. ./common/rc
. ./common/filter
. ./common/dmflakey

# real QA test starts here
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_dm_flakey
_need_to_be_root

rm -f $seqres.full

# Create a file with many file extent items, each representing a 4Kb extent.
# These items span 3 btree leaves, of 16Kb each (default mkfs.btrfs leaf 
size
# as of btrfs-progs 3.12).
_scratch_mkfs -l 16384 /dev/null 21
_init_flakey
SAVE_MOUNT_OPTIONS=$MOUNT_OPTIONS
MOUNT_OPTIONS=$MOUNT_OPTIONS -o commit=999
_mount_flakey

# First fsync, inode has BTRFS_INODE_NEEDS_FULL_SYNC flag set.
$XFS_IO_PROG -f -c pwrite -S 0x01 -b 4096 0 4096 -c fsync \
$SCRATCH_MNT/foo | _filter_xfs_io

# For any of the following fsync calls, inode doesn't have the flag
# BTRFS_INODE_NEEDS_FULL_SYNC set.
for ((i = 1; i = 500; i++)); do
OFFSET=$((4096 * i))
LEN=4096
$XFS_IO_PROG -c pwrite -S 0x01 $OFFSET $LEN -c fsync \
$SCRATCH_MNT/foo | _filter_xfs_io
done

# Commit transaction and bump next transaction's id (to 7).
sync

# Truncate will set the BTRFS_INODE_NEEDS_FULL_SYNC flag in the btrfs's
# inode runtime flags.
$XFS_IO_PROG -c truncate 2048000 $SCRATCH_MNT/foo

# Commit transaction and bump next transaction's id (to 8).
sync

# Touch 1 extent item from the first leaf and 1 from the last leaf. The leaf
# in the middle, containing only file extent items, isn't touched. So the
# next fsync, when calling btrfs_search_forward(), won't visit that middle
# leaf. First and 3rd leaf have generation 6, while middle one has 
generation 8.
$XFS_IO_PROG \
-c pwrite -S 0xee -b 4096 0 4096 \
-c pwrite -S 0xff -b 4096 2043904 4096 \
-c fsync \
$SCRATCH_MNT/foo | _filter_xfs_io

_load_flakey_table $FLAKEY_DROP_WRITES
md5sum $SCRATCH_MNT/foo | _filter_scratch
_unmount_flakey

_load_flakey_table $FLAKEY_ALLOW_WRITES
# During mount, we'll replay the log created by the fsync above, and the 
file's
# md5 digest should be the same we got before the unmount.
_mount_flakey
md5sum $SCRATCH_MNT/foo | _filter_scratch
_unmount_flakey
MOUNT_OPTIONS=$SAVE_MOUNT_OPTIONS

status=0
exit

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/tree-log.c | 17 +
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index df332dd..5a917a6 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3296,7 +3296,7 @@ static noinline int copy_items(struct btrfs_trans_handle 
*trans,
struct list_head ordered_sums;
int skip_csum = BTRFS_I(inode)-flags  BTRFS_INODE_NODATASUM;
bool has_extents = false;
-   bool need_find_last_extent = (*last_extent == 0);
+   bool need_find_last_extent = true;
bool done = false;
 
INIT_LIST_HEAD(ordered_sums);
@@ -3350,8 +3350,7 @@ static noinline int copy_items(struct btrfs_trans_handle 
*trans,
 */
if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
has_extents = true

[PATCH v2] Btrfs: fix hole detection during file fsync

2014-08-07 Thread Filipe Manana

The file hole detection logic during a file fsync wasn't correct,
because it didn't look back (in a previous leaf) for the last file
extent item that can be in a leaf to the left of our leaf and that
has a generation lower than the current transaction id. This made it
assume that a hole exists when it really doesn't exist in the file.

Such false positive hole detection happens in the following scenario:

* We have a file that has many file extent items, covering 3 or more
  btree leafs (the first leaf must contain non file extent items too).

* Two ranges of the file are modified, with their extent items being
  located at 2 different leafs and those leafs aren't consecutive.

* When processing the second modified leaf, we weren't checking if
  some file extent item exists that is located in some leaf that is
  between our 2 modified leafs, and therefore assumed the range defined
  between the last file extent item in the first leaf and the first file
  extent item in the second leaf matched a hole.

Fortunately this didn't result in overriding the log with wrong data,
instead it made the last loop in copy_items() attempt to insert a
duplicated key (for a hole file extent item), which makes the file
fsync code return with -EEXIST to file.c:btrfs_sync_file() which in
turn ends up doing a full transaction commit, which is much more
expensive than writing only to the log tree and waiting for it to be
durably persisted (as well as the file's modified extents/pages).
Therefore fix the hole detection logic, so that we don't pay the
cost of doing full transaction commits.

I could trigger this issue with the following test for xfstests (which
never fails, either without or with this patch). The last fsync call
results in a full transaction commit, due to the -EEXIST error mentioned
above. I could also observe this behaviour happening frequently when
running xfstests/generic/075 in a loop.

Test:

_cleanup()
{
_cleanup_flakey
rm -fr $tmp
}

# get standard environment, filters and checks
. ./common/rc
. ./common/filter
. ./common/dmflakey

# real QA test starts here
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_dm_flakey
_need_to_be_root

rm -f $seqres.full

# Create a file with many file extent items, each representing a 4Kb extent.
# These items span 3 btree leaves, of 16Kb each (default mkfs.btrfs leaf 
size
# as of btrfs-progs 3.12).
_scratch_mkfs -l 16384 /dev/null 21
_init_flakey
SAVE_MOUNT_OPTIONS=$MOUNT_OPTIONS
MOUNT_OPTIONS=$MOUNT_OPTIONS -o commit=999
_mount_flakey

# First fsync, inode has BTRFS_INODE_NEEDS_FULL_SYNC flag set.
$XFS_IO_PROG -f -c pwrite -S 0x01 -b 4096 0 4096 -c fsync \
$SCRATCH_MNT/foo | _filter_xfs_io

# For any of the following fsync calls, inode doesn't have the flag
# BTRFS_INODE_NEEDS_FULL_SYNC set.
for ((i = 1; i = 500; i++)); do
OFFSET=$((4096 * i))
LEN=4096
$XFS_IO_PROG -c pwrite -S 0x01 $OFFSET $LEN -c fsync \
$SCRATCH_MNT/foo | _filter_xfs_io
done

# Commit transaction and bump next transaction's id (to 7).
sync

# Truncate will set the BTRFS_INODE_NEEDS_FULL_SYNC flag in the btrfs's
# inode runtime flags.
$XFS_IO_PROG -c truncate 2048000 $SCRATCH_MNT/foo

# Commit transaction and bump next transaction's id (to 8).
sync

# Touch 1 extent item from the first leaf and 1 from the last leaf. The leaf
# in the middle, containing only file extent items, isn't touched. So the
# next fsync, when calling btrfs_search_forward(), won't visit that middle
# leaf. First and 3rd leaf have now a generation with value 8, while the
# middle leaf remains with a generation with value 6.
$XFS_IO_PROG \
-c pwrite -S 0xee -b 4096 0 4096 \
-c pwrite -S 0xff -b 4096 2043904 4096 \
-c fsync \
$SCRATCH_MNT/foo | _filter_xfs_io

_load_flakey_table $FLAKEY_DROP_WRITES
md5sum $SCRATCH_MNT/foo | _filter_scratch
_unmount_flakey

_load_flakey_table $FLAKEY_ALLOW_WRITES
# During mount, we'll replay the log created by the fsync above, and the 
file's
# md5 digest should be the same we got before the unmount.
_mount_flakey
md5sum $SCRATCH_MNT/foo | _filter_scratch
_unmount_flakey
MOUNT_OPTIONS=$SAVE_MOUNT_OPTIONS

status=0
exit

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: Updated commit message, fixed a couple typos, grammar and a more
clear explanation of the problem.

 fs/btrfs/tree-log.c | 17 +
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index df332dd..5a917a6 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3296,7 +3296,7 @@ static noinline int copy_items(struct btrfs_trans_handle 
*trans,
struct list_head ordered_sums;
int skip_csum = BTRFS_I(inode)-flags

[PATCH] Btrfs: don't monopolize a core when evicting inode

2014-08-07 Thread Filipe Manana

If an inode has a very large number of extent maps, we can spend
a lot of time freeing them, which triggers a soft lockup warning.
Therefore reschedule if we need to when freeing the extent maps
while evicting the inode.

I could trigger this all the time by running xfstests/generic/299 on
a file system with the no-holes feature enabled. That test creates
an inode with 11386677 extent maps.

$ mkfs.btrfs -f -O no-holes $TEST_DEV
$ MKFS_OPTIONS=-O no-holes ./check generic/299
generic/299 382s ...
Message from syslogd@debian-vm3 at Aug  7 10:44:29 ...
 kernel:[85304.208017] BUG: soft lockup - CPU#0 stuck for 22s! 
[umount:25330]
 384s
Ran: generic/299
Passed all 1 tests

$ dmesg
(...)
[86304.300017] BUG: soft lockup - CPU#0 stuck for 23s! [umount:25330]
(...)
[86304.300036] Call Trace:
[86304.300036]  [81698ba9] __slab_free+0x54/0x295
[86304.300036]  [a02ee9cc] ? free_extent_map+0x5c/0xb0 [btrfs]
[86304.300036]  [811a6cd2] kmem_cache_free+0x282/0x2a0
[86304.300036]  [a02ee9cc] free_extent_map+0x5c/0xb0 [btrfs]
[86304.300036]  [a02e3775] btrfs_evict_inode+0xd5/0x660 [btrfs]
[86304.300036]  [811e7c8d] ? __inode_wait_for_writeback+0x6d/0xc0
[86304.300036]  [816a389b] ? _raw_spin_unlock+0x2b/0x40
[86304.300036]  [811d8cbb] evict+0xab/0x180
[86304.300036]  [811d8dce] dispose_list+0x3e/0x60
[86304.300036]  [811d9b04] evict_inodes+0xf4/0x110
[86304.300036]  [811bd953] generic_shutdown_super+0x53/0x110
[86304.300036]  [811bdaa6] kill_anon_super+0x16/0x30
[86304.300036]  [a02a78ba] btrfs_kill_super+0x1a/0xa0 [btrfs]
[86304.300036]  [811bd3a9] deactivate_locked_super+0x59/0x80
[86304.300036]  [811be44e] deactivate_super+0x4e/0x70
[86304.300036]  [811dec14] mntput_no_expire+0x174/0x1f0
[86304.300036]  [811deab7] ? mntput_no_expire+0x17/0x1f0
[86304.300036]  [811e0517] SyS_umount+0x97/0x100
(...)

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/inode.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8ad3ea9..00b4bd3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4718,6 +4718,11 @@ static void evict_inode_truncate_pages(struct inode 
*inode)
clear_bit(EXTENT_FLAG_LOGGING, em-flags);
remove_extent_mapping(map_tree, em);
free_extent_map(em);
+   if (need_resched()) {
+   write_unlock(map_tree-lock);
+   cond_resched();
+   write_lock(map_tree-lock);
+   }
}
write_unlock(map_tree-lock);
 
@@ -4740,6 +4745,7 @@ static void evict_inode_truncate_pages(struct inode 
*inode)
 cached_state, GFP_NOFS);
free_extent_state(state);
 
+   cond_resched();
spin_lock(io_tree-lock);
}
spin_unlock(io_tree-lock);
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] Btrfs: fix csum tree corruption, duplicate and outdated checksums

2014-08-09 Thread Filipe Manana

Under rare circumstances we can end up leaving 2 versions of a checksum
for the same file extent range.

The reason for this is that after calling btrfs_next_leaf we process
slot 0 of the leaf it returns, instead of processing the slot set in
path->slots[0]. Most of the time (by far) path->slots[0] is 0, but after
btrfs_next_leaf() releases the path and before it searches for the next
leaf, another task might cause a split of the next leaf, which migrates
some of its keys to the leaf we were processing before calling
btrfs_next_leaf(). In this case btrfs_next_leaf() returns again the
same leaf but with path->slots[0] having a slot number corresponding
to the first new key it got, that is, a slot number that didn't exist
before calling btrfs_next_leaf(), as the leaf now has more keys than
it had before. So we must really process the returned leaf starting at
path->slots[0] always, as it isn't always 0, and the key at slot 0 can
have an offset much lower than our search offset/bytenr.

For example, consider the following scenario, where we have:

sums->bytenr: 40157184, sums->len: 16384, sums end: 40173568
four 4kb file data blocks with offsets 40157184, 40161280, 40165376, 40169472

  Leaf N:

slot = 0   slot = btrfs_header_nritems() - 1
  |---|
  | [(CSUM CSUM 39239680), size 8] ... [(CSUM CSUM 40116224), size 4] |
  |---|

  Leaf N + 1:

  slot = 0  slot = btrfs_header_nritems() - 1
  ||
  | [(CSUM CSUM 40161280), size 32] ... [((CSUM CSUM 40615936), size 8 |
  ||

Because we are at the last slot of leaf N, we call btrfs_next_leaf() to
find the next highest key, which releases the current path and then searches
for that next key. However after releasing the path and before finding that
next key, the item at slot 0 of leaf N + 1 gets moved to leaf N, due to a call
to ctree.c:push_leaf_left() (via ctree.c:split_leaf()), and therefore
btrfs_next_leaf() will return us a path again with leaf N but with the slot
pointing to its new last key (CSUM CSUM 40161280). This new version of leaf N
is then:

slot = 0slot = btrfs_header_nritems() - 2  slot = 
btrfs_header_nritems() - 1
  
||
  | [(CSUM CSUM 39239680), size 8] ... [(CSUM CSUM 40116224), size 4]  [(CSUM 
CSUM 40161280), size 32] |
  
||

And incorrectly using slot 0 makes us set next_offset to 39239680 and we jump
into the insert: label, which will set tmp to:

tmp = min((sums->len - total_bytes) >> blocksize_bits,
(next_offset - file_key.offset) >> blocksize_bits) =
min((16384 - 0) >> 12, (39239680 - 40157184) >> 12) =
min(4, (u64)-917504 = 18446744073708634112 >> 12) = 4

and

   ins_size = csum_size * tmp = 4 * 4 = 16 bytes.

In other words, we insert a new csum item in the tree with key
(CSUM_OBJECTID CSUM_KEY 40157184 = sums->bytenr) that contains the checksums
for all the data (4 blocks of 4096 bytes each = sums->len). Which is wrong,
because the item with key (CSUM CSUM 40161280) (the one that was moved from
leaf N + 1 to the end of leaf N) contains the old checksums of the last 12288
bytes of our data and won't get those old checksums removed.

So this leaves us 2 different checksums for 3 4kb blocks of data in the tree,
and breaks the logical rule:

   Key_N+1.offset = Key_N.offset + length_of_data_its_checksums_cover

An obvious bad effect of this is that a subsequent csum tree lookup to get
the checksum of any of the blocks with logical offset of 40161280, 40165376
or 40169472 (the last 3 4kb blocks of file data), will get the old checksums.

Cc: sta...@vger.kernel.org
Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/file-item.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a1f97de..7897dcd 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -746,7 +746,7 @@ again:
found_next = 1;
if (ret != 0)
goto insert;
-   slot = 0;
+   slot = path-slots[0];
}
btrfs_item_key_to_cpu(path-nodes[0], found_key, slot);
if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] Btrfs: send, lower mem requirements for processing xattrs

2014-08-10 Thread Filipe Manana

Maximum xattr size can be up to nearly the leaf size. For an fs with a
leaf size larger than the page size, using kmalloc requires allocating
multiple pages that are contiguous, which might not be possible if
there's heavy memory fragmentation. Therefore fall back to vmalloc if
we fail to allocate with kmalloc. Also start with a smaller buffer size,
since xattr values typically are smaller than a page.

Reported-by: Chris Murphy li...@colorremedies.com
Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/send.c | 41 +
 1 file changed, 33 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 3c63b29..215064d 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -997,6 +997,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct 
btrfs_path *path,
struct btrfs_key di_key;
char *buf = NULL;
int buf_len;
+   bool contig_buf;
u32 name_len;
u32 data_len;
u32 cur;
@@ -1006,11 +1007,13 @@ static int iterate_dir_item(struct btrfs_root *root, 
struct btrfs_path *path,
int num;
u8 type;
 
-   if (found_key-type == BTRFS_XATTR_ITEM_KEY)
-   buf_len = BTRFS_MAX_XATTR_SIZE(root);
-   else
-   buf_len = PATH_MAX;
-
+   /*
+* Start with a small buffer (1 page). If later we end up needing more
+* space, which can happen for xattrs on a fs with a leaf size  4Kb,
+* attempt to increase the buffer. Typically xattr values are small.
+*/
+   buf_len = PATH_MAX;
+   contig_buf = true;
buf = kmalloc(buf_len, GFP_NOFS);
if (!buf) {
ret = -ENOMEM;
@@ -1037,7 +1040,7 @@ static int iterate_dir_item(struct btrfs_root *root, 
struct btrfs_path *path,
ret = -ENAMETOOLONG;
goto out;
}
-   if (name_len + data_len  buf_len) {
+   if (name_len + data_len  BTRFS_MAX_XATTR_SIZE(root)) {
ret = -E2BIG;
goto out;
}
@@ -1045,12 +1048,31 @@ static int iterate_dir_item(struct btrfs_root *root, 
struct btrfs_path *path,
/*
 * Path too long
 */
-   if (name_len + data_len  buf_len) {
+   if (name_len + data_len  PATH_MAX) {
ret = -ENAMETOOLONG;
goto out;
}
}
 
+   if (name_len + data_len  buf_len) {
+   if (contig_buf)
+   kfree(buf);
+   else
+   vfree(buf);
+   buf = NULL;
+   buf_len = name_len + data_len;
+   if (contig_buf)
+   buf = kmalloc(buf_len, GFP_NOFS);
+   if (!buf) {
+   buf = vmalloc(buf_len);
+   if (!buf) {
+   ret = -ENOMEM;
+   goto out;
+   }
+   contig_buf = false;
+   }
+   }
+
read_extent_buffer(eb, buf, (unsigned long)(di + 1),
name_len + data_len);
 
@@ -1071,7 +1093,10 @@ static int iterate_dir_item(struct btrfs_root *root, 
struct btrfs_path *path,
}
 
 out:
-   kfree(buf);
+   if (contig_buf)
+   kfree(buf);
+   else
+   vfree(buf);
return ret;
 }
 
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v2] Btrfs: send, lower mem requirements for processing xattrs

2014-08-10 Thread Filipe Manana

Maximum xattr size can be up to nearly the leaf size. For an fs with a
leaf size larger than the page size, using kmalloc requires allocating
multiple pages that are contiguous, which might not be possible if
there's heavy memory fragmentation. Therefore fall back to vmalloc if
we fail to allocate with kmalloc. Also start with a smaller buffer size,
since xattr values typically are smaller than a page.

Reported-by: Chris Murphy li...@colorremedies.com
Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: Use is_vmalloc_addr() instead of keeping a boolean variable around.

 fs/btrfs/send.c | 39 +++
 1 file changed, 31 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 3c63b29..a7ce318 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1006,11 +1006,13 @@ static int iterate_dir_item(struct btrfs_root *root, 
struct btrfs_path *path,
int num;
u8 type;
 
-   if (found_key-type == BTRFS_XATTR_ITEM_KEY)
-   buf_len = BTRFS_MAX_XATTR_SIZE(root);
-   else
-   buf_len = PATH_MAX;
-
+   /*
+* Start with a small buffer (1 page). If later we end up needing more
+* space, which can happen for xattrs on a fs with a leaf size greater
+* then the page size, attempt to increase the buffer. Typically xattr
+* values are small.
+*/
+   buf_len = PATH_MAX;
buf = kmalloc(buf_len, GFP_NOFS);
if (!buf) {
ret = -ENOMEM;
@@ -1037,7 +1039,7 @@ static int iterate_dir_item(struct btrfs_root *root, 
struct btrfs_path *path,
ret = -ENAMETOOLONG;
goto out;
}
-   if (name_len + data_len  buf_len) {
+   if (name_len + data_len  BTRFS_MAX_XATTR_SIZE(root)) {
ret = -E2BIG;
goto out;
}
@@ -1045,12 +1047,30 @@ static int iterate_dir_item(struct btrfs_root *root, 
struct btrfs_path *path,
/*
 * Path too long
 */
-   if (name_len + data_len  buf_len) {
+   if (name_len + data_len  PATH_MAX) {
ret = -ENAMETOOLONG;
goto out;
}
}
 
+   if (name_len + data_len  buf_len) {
+   buf_len = name_len + data_len;
+   if (is_vmalloc_addr(buf)) {
+   vfree(buf);
+   buf = NULL;
+   } else {
+   kfree(buf);
+   buf = kmalloc(buf_len, GFP_NOFS);
+   }
+   if (!buf) {
+   buf = vmalloc(buf_len);
+   if (!buf) {
+   ret = -ENOMEM;
+   goto out;
+   }
+   }
+   }
+
read_extent_buffer(eb, buf, (unsigned long)(di + 1),
name_len + data_len);
 
@@ -1071,7 +1091,10 @@ static int iterate_dir_item(struct btrfs_root *root, 
struct btrfs_path *path,
}
 
 out:
-   kfree(buf);
+   if (is_vmalloc_addr(buf))
+   vfree(buf);
+   else
+   kfree(buf);
return ret;
 }
 
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v3] Btrfs: send, lower mem requirements for processing xattrs

2014-08-10 Thread Filipe Manana

Maximum xattr size can be up to nearly the leaf size. For an fs with a
leaf size larger than the page size, using kmalloc requires allocating
multiple pages that are contiguous, which might not be possible if
there's heavy memory fragmentation. Therefore fall back to vmalloc if
we fail to allocate with kmalloc. Also start with a smaller buffer size,
since xattr values typically are smaller than a page.

Reported-by: Chris Murphy li...@colorremedies.com
Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: Use is_vmalloc_addr() instead of keeping a boolean variable around.
V3: Use krealloc instead of kfree + kmalloc.

 fs/btrfs/send.c | 41 +
 1 file changed, 33 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 3c63b29..8b2780d 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1006,11 +1006,13 @@ static int iterate_dir_item(struct btrfs_root *root, 
struct btrfs_path *path,
int num;
u8 type;
 
-   if (found_key-type == BTRFS_XATTR_ITEM_KEY)
-   buf_len = BTRFS_MAX_XATTR_SIZE(root);
-   else
-   buf_len = PATH_MAX;
-
+   /*
+* Start with a small buffer (1 page). If later we end up needing more
+* space, which can happen for xattrs on a fs with a leaf size greater
+* then the page size, attempt to increase the buffer. Typically xattr
+* values are small.
+*/
+   buf_len = PATH_MAX;
buf = kmalloc(buf_len, GFP_NOFS);
if (!buf) {
ret = -ENOMEM;
@@ -1037,7 +1039,7 @@ static int iterate_dir_item(struct btrfs_root *root, 
struct btrfs_path *path,
ret = -ENAMETOOLONG;
goto out;
}
-   if (name_len + data_len  buf_len) {
+   if (name_len + data_len  BTRFS_MAX_XATTR_SIZE(root)) {
ret = -E2BIG;
goto out;
}
@@ -1045,12 +1047,32 @@ static int iterate_dir_item(struct btrfs_root *root, 
struct btrfs_path *path,
/*
 * Path too long
 */
-   if (name_len + data_len  buf_len) {
+   if (name_len + data_len  PATH_MAX) {
ret = -ENAMETOOLONG;
goto out;
}
}
 
+   if (name_len + data_len  buf_len) {
+   buf_len = name_len + data_len;
+   if (is_vmalloc_addr(buf)) {
+   vfree(buf);
+   buf = NULL;
+   } else {
+   char *tmp = krealloc(buf, buf_len, GFP_NOFS);
+   if (!tmp)
+   kfree(buf);
+   buf = tmp;
+   }
+   if (!buf) {
+   buf = vmalloc(buf_len);
+   if (!buf) {
+   ret = -ENOMEM;
+   goto out;
+   }
+   }
+   }
+
read_extent_buffer(eb, buf, (unsigned long)(di + 1),
name_len + data_len);
 
@@ -1071,7 +1093,10 @@ static int iterate_dir_item(struct btrfs_root *root, 
struct btrfs_path *path,
}
 
 out:
-   kfree(buf);
+   if (is_vmalloc_addr(buf))
+   vfree(buf);
+   else
+   kfree(buf);
return ret;
 }
 
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v4] Btrfs: send, lower mem requirements for processing xattrs

2014-08-10 Thread Filipe Manana

Maximum xattr size can be up to nearly the leaf size. For an fs with a
leaf size larger than the page size, using kmalloc requires allocating
multiple pages that are contiguous, which might not be possible if
there's heavy memory fragmentation. Therefore fall back to vmalloc if
we fail to allocate with kmalloc. Also start with a smaller buffer size,
since xattr values typically are smaller than a page.

Reported-by: Chris Murphy li...@colorremedies.com
Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: Use is_vmalloc_addr() instead of keeping a boolean variable around.
V3: Use krealloc instead of kfree + kmalloc.
V4: Fixed a checkpatch warning about missing blank line after var declaration.

 fs/btrfs/send.c | 42 ++
 1 file changed, 34 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 3c63b29..b29fc5c 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1006,11 +1006,13 @@ static int iterate_dir_item(struct btrfs_root *root, 
struct btrfs_path *path,
int num;
u8 type;
 
-   if (found_key-type == BTRFS_XATTR_ITEM_KEY)
-   buf_len = BTRFS_MAX_XATTR_SIZE(root);
-   else
-   buf_len = PATH_MAX;
-
+   /*
+* Start with a small buffer (1 page). If later we end up needing more
+* space, which can happen for xattrs on a fs with a leaf size greater
+* then the page size, attempt to increase the buffer. Typically xattr
+* values are small.
+*/
+   buf_len = PATH_MAX;
buf = kmalloc(buf_len, GFP_NOFS);
if (!buf) {
ret = -ENOMEM;
@@ -1037,7 +1039,7 @@ static int iterate_dir_item(struct btrfs_root *root, 
struct btrfs_path *path,
ret = -ENAMETOOLONG;
goto out;
}
-   if (name_len + data_len  buf_len) {
+   if (name_len + data_len  BTRFS_MAX_XATTR_SIZE(root)) {
ret = -E2BIG;
goto out;
}
@@ -1045,12 +1047,33 @@ static int iterate_dir_item(struct btrfs_root *root, 
struct btrfs_path *path,
/*
 * Path too long
 */
-   if (name_len + data_len  buf_len) {
+   if (name_len + data_len  PATH_MAX) {
ret = -ENAMETOOLONG;
goto out;
}
}
 
+   if (name_len + data_len  buf_len) {
+   buf_len = name_len + data_len;
+   if (is_vmalloc_addr(buf)) {
+   vfree(buf);
+   buf = NULL;
+   } else {
+   char *tmp = krealloc(buf, buf_len, GFP_NOFS);
+
+   if (!tmp)
+   kfree(buf);
+   buf = tmp;
+   }
+   if (!buf) {
+   buf = vmalloc(buf_len);
+   if (!buf) {
+   ret = -ENOMEM;
+   goto out;
+   }
+   }
+   }
+
read_extent_buffer(eb, buf, (unsigned long)(di + 1),
name_len + data_len);
 
@@ -1071,7 +1094,10 @@ static int iterate_dir_item(struct btrfs_root *root, 
struct btrfs_path *path,
}
 
 out:
-   kfree(buf);
+   if (is_vmalloc_addr(buf))
+   vfree(buf);
+   else
+   kfree(buf);
return ret;
 }
 
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] Btrfs: shrink further sizeof(struct extent_buffer)

2014-08-14 Thread Filipe Manana

The map_start and map_len fields aren't used anywhere, so just remove
them. On a x86_64 system, this reduced sizeof(struct extent_buffer)
from 296 bytes to 280 bytes, and therefore 14 extent_buffer structs can
now fit into a page instead of 13.

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/extent_io.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index ce02cc9..5e91fb9 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -125,8 +125,6 @@ struct extent_state {
 struct extent_buffer {
u64 start;
unsigned long len;
-   unsigned long map_start;
-   unsigned long map_len;
unsigned long bflags;
struct btrfs_fs_info *fs_info;
spinlock_t refs_lock;
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v5] Btrfs: send, lower mem requirements for processing xattrs

2014-08-20 Thread Filipe Manana

Maximum xattr size can be up to nearly the leaf size. For an fs with a
leaf size larger than the page size, using kmalloc requires allocating
multiple pages that are contiguous, which might not be possible if
there's heavy memory fragmentation. Therefore fall back to vmalloc if
we fail to allocate with kmalloc. Also start with a smaller buffer size,
since xattr values typically are smaller than a page.

Reported-by: Chris Murphy li...@colorremedies.com
Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: Use is_vmalloc_addr() instead of keeping a boolean variable around.
V3: Use krealloc instead of kfree + kmalloc.
V4: Fixed a checkpatch warning about missing blank line after var declaration.
V5: Use kvfree() and pass __GFP_NOWARN to krealloc().

 fs/btrfs/send.c | 40 
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 3c63b29..3290da9 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1006,11 +1006,13 @@ static int iterate_dir_item(struct btrfs_root *root, 
struct btrfs_path *path,
int num;
u8 type;
 
-   if (found_key-type == BTRFS_XATTR_ITEM_KEY)
-   buf_len = BTRFS_MAX_XATTR_SIZE(root);
-   else
-   buf_len = PATH_MAX;
-
+   /*
+* Start with a small buffer (1 page). If later we end up needing more
+* space, which can happen for xattrs on a fs with a leaf size greater
+* then the page size, attempt to increase the buffer. Typically xattr
+* values are small.
+*/
+   buf_len = PATH_MAX;
buf = kmalloc(buf_len, GFP_NOFS);
if (!buf) {
ret = -ENOMEM;
@@ -1037,7 +1039,7 @@ static int iterate_dir_item(struct btrfs_root *root, 
struct btrfs_path *path,
ret = -ENAMETOOLONG;
goto out;
}
-   if (name_len + data_len  buf_len) {
+   if (name_len + data_len  BTRFS_MAX_XATTR_SIZE(root)) {
ret = -E2BIG;
goto out;
}
@@ -1045,12 +1047,34 @@ static int iterate_dir_item(struct btrfs_root *root, 
struct btrfs_path *path,
/*
 * Path too long
 */
-   if (name_len + data_len  buf_len) {
+   if (name_len + data_len  PATH_MAX) {
ret = -ENAMETOOLONG;
goto out;
}
}
 
+   if (name_len + data_len  buf_len) {
+   buf_len = name_len + data_len;
+   if (is_vmalloc_addr(buf)) {
+   vfree(buf);
+   buf = NULL;
+   } else {
+   char *tmp = krealloc(buf, buf_len,
+GFP_NOFS | __GFP_NOWARN);
+
+   if (!tmp)
+   kfree(buf);
+   buf = tmp;
+   }
+   if (!buf) {
+   buf = vmalloc(buf_len);
+   if (!buf) {
+   ret = -ENOMEM;
+   goto out;
+   }
+   }
+   }
+
read_extent_buffer(eb, buf, (unsigned long)(di + 1),
name_len + data_len);
 
@@ -1071,7 +1095,7 @@ static int iterate_dir_item(struct btrfs_root *root, 
struct btrfs_path *path,
}
 
 out:
-   kfree(buf);
+   kvfree(buf);
return ret;
 }
 
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] Btrfs: improve free space cache management and space allocation

2014-08-25 Thread Filipe Manana

While under random IO, a block group's free space cache eventually reaches
a state where it has a mix of extent entries and bitmap entries representing
free space regions.

As later free space regions are returned to the cache, some of them are merged
with existing extent entries if they are contiguous with them. But others are
not merged: despite the existence of adjacent free space regions in the cache,
the merging doesn't happen because those existing free space regions are
represented in bitmap entries. Even when new free space regions are merged
with existing extent entries (enlarging the free space range they represent),
we create chances of ending up with an enlarged region that is contiguous with
some other region represented in a bitmap entry.

Both clustered and non-clustered space allocation work by iterating over our
extent and bitmap entries and skipping any that represents a region smaller
than the allocation request (and giving preference to extent entries before
bitmap entries). By having a contiguous free space region that is represented
by 2 (or more) entries (mix of extent and bitmap entries), we end up not
satisfying an allocation request with a size larger than the size of any of
the entries but no larger than the sum of their sizes. This makes the caller
assume we're under an ENOSPC condition or forces it to allocate multiple
smaller space regions (as we do for file data writes), which adds extra
overhead and more chances of causing fragmentation due to the smaller regions
being all spread apart from each other (more likely when under concurrency).

For example, if we have the following in the cache:

* extent entry representing free space range: [128Mb - 256Kb, 128Mb[

* bitmap entry covering the range [128Mb, 256Mb[, but only with the bits
  representing the range [128Mb, 128Mb + 768Kb[ set - that is, only that
  space in this 128Mb area is marked as free

An allocation request for 1Mb, starting at an offset not greater than
128Mb - 256Kb, would fail before, despite the existence of such a contiguous
free space area in the cache. The caller could only allocate up to 768Kb of
space at once and later another 256Kb (or vice-versa). In between each smaller
allocation request, another task working on a different file/inode might come
in and take that space, preventing the former task from getting a contiguous
1Mb region of free space.

Therefore this change implements the ability to move free space from bitmap
entries into existing and new free space regions represented with extent
entries. This is done when a space region is added to the cache.

A test was added to the sanity tests that explains in detail the issue too.

Some performance test results with compilebench on a 4 cores machine, with
32Gb of ram and using an HDD follow.

Test: compilebench -D /mnt -i 30 -r 1000 --makej

Before this change:

   intial create total runs 30 avg 69.02 MB/s (user 0.28s sys 0.57s)
   compile total runs 30 avg 314.96 MB/s (user 0.12s sys 0.25s)
   read compiled tree total runs 3 avg 27.14 MB/s (user 1.52s sys 0.90s)
   delete compiled tree total runs 30 avg 3.14 seconds (user 0.15s sys 0.66s)

After this change:

   intial create total runs 30 avg 68.37 MB/s (user 0.29s sys 0.55s)
   compile total runs 30 avg 382.83 MB/s (user 0.12s sys 0.24s)
   read compiled tree total runs 3 avg 27.82 MB/s (user 1.45s sys 0.97s)
   delete compiled tree total runs 30 avg 3.18 seconds (user 0.17s sys 0.65s)

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/free-space-cache.c   | 149 ++-
 fs/btrfs/tests/free-space-tests.c | 514 ++
 2 files changed, 662 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 2f0fe10..23632ba 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1951,6 +1951,137 @@ out:
return ret;
 }
 
+static void steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
+struct btrfs_free_space *info,
+bool update_stat)
+{
+   struct btrfs_free_space *bitmap;
+   u64 bitmap_offset = info-offset;
+   unsigned long i;
+   unsigned long j;
+   const u64 end = info-offset + info-bytes;
+   u64 bytes;
+
+again:
+   bitmap = tree_search_offset(ctl, offset_to_bitmap(ctl, bitmap_offset),
+   1, 0);
+   if (!bitmap)
+   goto out;
+
+   if (end  bitmap-offset || (bitmap-offset + bitmap-bytes  end))
+   return;
+
+   i = offset_to_bit(bitmap-offset, ctl-unit, end);
+   j = find_next_zero_bit(bitmap-bitmap, BITS_PER_BITMAP, i);
+   if (j == i)
+   return;
+   bytes = (j - i) * ctl-unit;
+   info-bytes += bytes;
+
+   if (update_stat)
+   bitmap_clear_bits(ctl, bitmap, end, bytes);
+   else
+   __bitmap_clear_bits(ctl, bitmap, end, bytes

[PATCH] Btrfs: fix corruption after write/fsync failure + fsync + log recovery

2014-08-25 Thread Filipe Manana

While writing to a file, in inode.c:cow_file_range() (and same applies to
submit_compressed_extents()), after reserving an extent for the file data,
we create a new extent map for the written range and insert it into the
extent map cache. After that, we create an ordered operation, but if it
fails (due to a transient/temporary-ENOMEM), we return without dropping
that extent map, which points to a reserved extent that is freed when we
return. A subsequent incremental fsync (when the btrfs inode doesn't have
the flag BTRFS_INODE_NEEDS_FULL_SYNC) considers this extent map valid and
logs a file extent item based on that extent map, which points to a disk
extent that doesn't contain valid data - it was freed by us earlier, at this
point it might contain any random/garbage data.

Therefore, if we reach an error condition when cowing a file range after
we added the new extent map to the cache, drop it from the cache before
returning.

Some sequence of steps that lead to this:

$ mkfs.btrfs -f /dev/sdd
$ mount -o commit= /dev/sdd /mnt
$ cd /mnt

$ xfs_io -f -c pwrite -S 0x01 -b 4096 0 4096 -c fsync foo
$ xfs_io -c pwrite -S 0x02 -b 4096 4096 4096
$ sync

$ od -t x1 foo
000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
*
001 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02
*
002

$ xfs_io -c pwrite -S 0xa1 -b 4096 0 4096 foo

# Now this write + fsync fail with -ENOMEM, which was returned by
# btrfs_add_ordered_extent() in inode.c:cow_file_range().
$ xfs_io -c pwrite -S 0xff -b 4096 4096 4096 foo
$ xfs_io -c fsync foo
fsync: Cannot allocate memory

# Now do a new write + fsync, which will succeed. Our previous
# -ENOMEM was a transient/temporary error.
$ xfs_io -c pwrite -S 0xee -b 4096 16384 4096 foo
$ xfs_io -c fsync foo

# Our file content (in page cache) is now:
$ od -t x1 foo
000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1
*
001 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
*
002 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
004 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
005

# Now reboot the machine, and mount the fs, so that fsync log replay
# takes place.

# The file content is now weird, in particular the first 8Kb, which
# do not match our data before nor after the sync command above.
$ od -t x1 foo
000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
001 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
*
002 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
004 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
005

# In fact these first 4Kb are a duplicate of the last 4kb block.
# The last write got an extent map/file extent item that points to
# the same disk extent that we got in the write+fsync that failed
# with the -ENOMEM error. btrfs-debug-tree and btrfsck allow us to
# verify that:

$ btrfs-debug-tree /dev/sdd
(...)
item 6 key (257 EXTENT_DATA 0) itemoff 15819 itemsize 53
extent data disk byte 12582912 nr 8192
extent data offset 0 nr 8192 ram 8192
item 7 key (257 EXTENT_DATA 8192) itemoff 15766 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 8192 ram 8192
item 8 key (257 EXTENT_DATA 16384) itemoff 15713 itemsize 53
extent data disk byte 12582912 nr 4096
extent data offset 0 nr 4096 ram 4096

$ umount /dev/sdd
$ btrfsck /dev/sdd
Checking filesystem on /dev/sdd
UUID: db5e60e1-050d-41e6-8c7f-3d742dea5d8f
checking extents
extent item 12582912 has multiple extent items
ref mismatch on [12582912 4096] extent item 1, found 2
Backref bytes do not match extent backref, bytenr=12582912, ref bytes=4096, 
backref bytes=8192
backpointer mismatch on [12582912 4096]
Errors found in extent allocation tree or chunk allocation
checking free space cache
checking fs roots
root 5 inode 257 errors 1000, some csum missing
found 131074 bytes used err is 1
total csum bytes: 4
total tree bytes: 131072
total fs tree bytes: 32768
total extent tree bytes: 16384
btree space waste bytes: 123404
file data blocks allocated: 274432
 referenced 274432
Btrfs v3.14.1-96-gcc7fd5a-dirty

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/inode.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c678dea..16e8146 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -792,8 +792,12 @@ retry:
ins.offset,
BTRFS_ORDERED_COMPRESSED,
async_extent-compress_type);
-   if (ret)
+   if (ret

[PATCH v3] Btrfs: improve free space cache management and space allocation

2014-08-29 Thread Filipe Manana

While under random IO, a block group's free space cache eventually reaches
a state where it has a mix of extent entries and bitmap entries representing
free space regions.

As later free space regions are returned to the cache, some of them are merged
with existing extent entries if they are contiguous with them. But others are
not merged: despite the existence of adjacent free space regions in the cache,
the merging doesn't happen because those existing free space regions are
represented in bitmap entries. Even when new free space regions are merged
with existing extent entries (enlarging the free space range they represent),
we create chances of ending up with an enlarged region that is contiguous with
some other region represented in a bitmap entry.

Both clustered and non-clustered space allocation work by iterating over our
extent and bitmap entries and skipping any that represents a region smaller
than the allocation request (and giving preference to extent entries before
bitmap entries). By having a contiguous free space region that is represented
by 2 (or more) entries (mix of extent and bitmap entries), we end up not
satisfying an allocation request with a size larger than the size of any of
the entries but no larger than the sum of their sizes. This makes the caller
assume we're under an ENOSPC condition or forces it to allocate multiple
smaller space regions (as we do for file data writes), which adds extra
overhead and more chances of causing fragmentation due to the smaller regions
being all spread apart from each other (more likely when under concurrency).

For example, if we have the following in the cache:

* extent entry representing free space range: [128Mb - 256Kb, 128Mb[

* bitmap entry covering the range [128Mb, 256Mb[, but only with the bits
  representing the range [128Mb, 128Mb + 768Kb[ set - that is, only that
  space in this 128Mb area is marked as free

An allocation request for 1Mb, starting at an offset not greater than
128Mb - 256Kb, would fail before, despite the existence of such a contiguous
free space area in the cache. The caller could only allocate up to 768Kb of
space at once and later another 256Kb (or vice-versa). In between each smaller
allocation request, another task working on a different file/inode might come
in and take that space, preventing the former task from getting a contiguous
1Mb region of free space.

Therefore this change implements the ability to move free space from bitmap
entries into existing and new free space regions represented with extent
entries. This is done when a space region is added to the cache.

A test was added to the sanity tests that explains in detail the issue too.

Some performance test results with compilebench on a 4 cores machine, with
32Gb of ram and using an HDD follow.

Test: compilebench -D /mnt -i 30 -r 1000 --makej

Before this change:

   intial create total runs 30 avg 69.02 MB/s (user 0.28s sys 0.57s)
   compile total runs 30 avg 314.96 MB/s (user 0.12s sys 0.25s)
   read compiled tree total runs 3 avg 27.14 MB/s (user 1.52s sys 0.90s)
   delete compiled tree total runs 30 avg 3.14 seconds (user 0.15s sys 0.66s)

After this change:

   intial create total runs 30 avg 68.37 MB/s (user 0.29s sys 0.55s)
   compile total runs 30 avg 382.83 MB/s (user 0.12s sys 0.24s)
   read compiled tree total runs 3 avg 27.82 MB/s (user 1.45s sys 0.97s)
   delete compiled tree total runs 30 avg 3.18 seconds (user 0.17s sys 0.65s)

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: Simplified bitmap search logic, shorter and cleaner now, and one
less rbtree search.

V3: Fixed a corner case where all bits in the bitmap to the left of our
range were set but we didn't claim the first bit.
Attempt regular extent merge if we were able to steal free space from
a bitmap into our new extent.

 fs/btrfs/free-space-cache.c   | 140 ++-
 fs/btrfs/tests/free-space-tests.c | 514 ++
 2 files changed, 653 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 2f0fe10..3384819 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1997,6 +1997,128 @@ static bool try_merge_free_space(struct 
btrfs_free_space_ctl *ctl,
return merged;
 }
 
+static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
+struct btrfs_free_space *info,
+bool update_stat)
+{
+   struct btrfs_free_space *bitmap;
+   unsigned long i;
+   unsigned long j;
+   const u64 end = info-offset + info-bytes;
+   const u64 bitmap_offset = offset_to_bitmap(ctl, end);
+   u64 bytes;
+
+   bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
+   if (!bitmap)
+   return false;
+
+   i = offset_to_bit(bitmap-offset, ctl-unit, end);
+   j = find_next_zero_bit(bitmap-bitmap, BITS_PER_BITMAP, i);
+   if (j == i

[PATCH] Btrfs: fix crash while a ranged msync() is ongoing

2014-08-29 Thread Filipe Manana

After the commit 7fc34a62ca4434a79c68e23e70ed26111b7a4cf8 (titled
"mm/msync.c: sync only the requested range in msync()"), our fsync
callback can be called with a range that covers only part of the
file and not the whole file anymore. Under certain circumstances
this leads to crashes that produce the following trace:

[41074.641913] invalid opcode:  [#1] SMP DEBUG_PAGEALLOC
(...)
[41074.642692] CPU: 0 PID: 24580 Comm: fsx Not tainted 
3.16.0-fdm-btrfs-next-45+ #1
(...)
[41074.643886] RIP: 0010:[a01ecc99]  [a01ecc99] 
btrfs_ordered_update_i_size+0x279/0x2b0 [btrfs]
(...)
[41074.644919] Stack:
(...)
[41074.644919] Call Trace:
[41074.644919]  [a01db531] btrfs_truncate_inode_items+0x3f1/0xa10 
[btrfs]
[41074.644919]  [a01eb54f] ? btrfs_get_logged_extents+0x4f/0x80 
[btrfs]
[41074.644919]  [a02137a9] btrfs_log_inode+0x2f9/0x970 [btrfs]
[41074.644919]  [81090875] ? sched_clock_local+0x25/0xa0
[41074.644919]  [8164a55e] ? mutex_unlock+0xe/0x10
[41074.644919]  [810af51d] ? trace_hardirqs_on+0xd/0x10
[41074.644919]  [a0214b4f] btrfs_log_inode_parent+0x1ef/0x560 [btrfs]
[41074.644919]  [811d0c55] ? dget_parent+0x5/0x180
[41074.644919]  [a0215d11] btrfs_log_dentry_safe+0x51/0x80 [btrfs]
[41074.644919]  [a01e2d1a] btrfs_sync_file+0x1ba/0x3e0 [btrfs]
[41074.644919]  [811eda6b] vfs_fsync_range+0x1b/0x30
(...)

The necessary conditions that lead to such crash are:

* an incremental fsync (when the inode doesn't have the
  BTRFS_INODE_NEEDS_FULL_SYNC flag set) happened for our file and it logged
  a file extent item ending at offset X;

* the file got the flag BTRFS_INODE_NEEDS_FULL_SYNC set in its inode, due
  to a file truncate operation that reduces the file to a size smaller
  than X;

* an msync call happens, with a range that doesn't cover the whole file
  and the end of this range, let's call it Y, is smaller than X;

* btrfs_log_inode, sees the flag BTRFS_INODE_NEEDS_FULL_SYNC set and
  calls btrfs_truncate_inode_items() to remove all items from the log
  tree that are associated with our file;

* btrfs_truncate_inode_items() removes all of the inode's items, and the lowest
  file extent item it removed is the one ending at offset X, where X > 0 and
  X > Y - before returning, it calls btrfs_ordered_update_i_size() with an
  offset parameter set to X;

* btrfs_ordered_update_i_size() sees that X is greater than the current ordered
  size (btrfs_inode's disk_i_size) and then it assumes there can't be any
  ongoing ordered operation with a range covering the offset X, calling a
  BUG_ON() if such ordered operation exists. This assumption is made because
  the disk_i_size is only increased after the corresponding file extent item
  is added to the btree (btrfs_finish_ordered_io);

* But because our msync/fsync covers only a limited range, such an ordered 
extent
  might exist, and our fsync callback (btrfs_sync_file) doesn't wait for such
  ordered extent to finish when calling btrfs_wait_ordered_range();

And then by the time btrfs_ordered_update_i_size() is called, via:

   btrfs_sync_file() ->
   btrfs_log_dentry_safe() ->
   btrfs_log_inode_parent() ->
   btrfs_log_inode() ->
   btrfs_truncate_inode_items() ->
   btrfs_ordered_update_i_size()

We hit the BUG_ON(), which could never happen when msync() used the whole file
range when calling fsync (i.e. before 7fc34a62ca4434a79c68e23e70ed26111b7a4cf8).

So just don't call btrfs_ordered_update_i_size() if we're removing inode items
from a log tree, which isn't supposed to change the in memory inode's 
disk_i_size,
and never did before commit 7fc34a62ca4434a79c68e23e70ed26111b7a4cf8 because we
used to wait for all ordered extents (and therefore the end of any extent found
by btrfs_truncate_inode_items was always smaller than the in memory inode's
disk_i_size).

Issue found while running xfstests/generic/127 (happens very rarely for me).

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 16e8146..c5ef9eb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4296,7 +4296,8 @@ out:
btrfs_abort_transaction(trans, root, ret);
}
 error:
-   if (last_size != (u64)-1)
+   if (last_size != (u64)-1 
+   root-root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
btrfs_ordered_update_i_size(inode, last_size, NULL);
btrfs_free_path(path);
return err;
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v2] Btrfs: fix crash while doing a ranged fsync

2014-08-29 Thread Filipe Manana

While doing a ranged fsync, that is, one whose range doesn't cover the
whole possible file range (0 to LLONG_MAX), we can crash under certain
circumstances with a trace like the following:

[41074.641913] invalid opcode:  [#1] SMP DEBUG_PAGEALLOC
(...)
[41074.642692] CPU: 0 PID: 24580 Comm: fsx Not tainted 
3.16.0-fdm-btrfs-next-45+ #1
(...)
[41074.643886] RIP: 0010:[a01ecc99]  [a01ecc99] 
btrfs_ordered_update_i_size+0x279/0x2b0 [btrfs]
(...)
[41074.644919] Stack:
(...)
[41074.644919] Call Trace:
[41074.644919]  [a01db531] btrfs_truncate_inode_items+0x3f1/0xa10 
[btrfs]
[41074.644919]  [a01eb54f] ? btrfs_get_logged_extents+0x4f/0x80 
[btrfs]
[41074.644919]  [a02137a9] btrfs_log_inode+0x2f9/0x970 [btrfs]
[41074.644919]  [81090875] ? sched_clock_local+0x25/0xa0
[41074.644919]  [8164a55e] ? mutex_unlock+0xe/0x10
[41074.644919]  [810af51d] ? trace_hardirqs_on+0xd/0x10
[41074.644919]  [a0214b4f] btrfs_log_inode_parent+0x1ef/0x560 [btrfs]
[41074.644919]  [811d0c55] ? dget_parent+0x5/0x180
[41074.644919]  [a0215d11] btrfs_log_dentry_safe+0x51/0x80 [btrfs]
[41074.644919]  [a01e2d1a] btrfs_sync_file+0x1ba/0x3e0 [btrfs]
[41074.644919]  [811eda6b] vfs_fsync_range+0x1b/0x30
(...)

The necessary conditions that lead to such crash are:

* an incremental fsync (when the inode doesn't have the
  BTRFS_INODE_NEEDS_FULL_SYNC flag set) happened for our file and it logged
  a file extent item ending at offset X;

* the file got the flag BTRFS_INODE_NEEDS_FULL_SYNC set in its inode, due
  to a file truncate operation that reduces the file to a size smaller
  than X;

* a ranged fsync call happens (via an msync for example), with a range that
  doesn't cover the whole file and the end of this range, let's call it Y, is
  smaller than X;

* btrfs_log_inode, sees the flag BTRFS_INODE_NEEDS_FULL_SYNC set and
  calls btrfs_truncate_inode_items() to remove all items from the log
  tree that are associated with our file;

* btrfs_truncate_inode_items() removes all of the inode's items, and the lowest
  file extent item it removed is the one ending at offset X, where X > 0 and
  X > Y - before returning, it calls btrfs_ordered_update_i_size() with an
  offset parameter set to X;

* btrfs_ordered_update_i_size() sees that X is greater than the current ordered
  size (btrfs_inode's disk_i_size) and then it assumes there can't be any
  ongoing ordered operation with a range covering the offset X, calling a
  BUG_ON() if such ordered operation exists. This assumption is made because
  the disk_i_size is only increased after the corresponding file extent item
  is added to the btree (btrfs_finish_ordered_io);

* But because our fsync covers only a limited range, such an ordered extent 
might
  exist, and our fsync callback (btrfs_sync_file) doesn't wait for such ordered
  extent to finish when calling btrfs_wait_ordered_range();

And then by the time btrfs_ordered_update_i_size() is called, via:

   btrfs_sync_file() ->
   btrfs_log_dentry_safe() ->
   btrfs_log_inode_parent() ->
   btrfs_log_inode() ->
   btrfs_truncate_inode_items() ->
   btrfs_ordered_update_i_size()

We hit the BUG_ON(), which could never happen if the fsync range covered the 
whole
possible file range (0 to LLONG_MAX), as we would wait for all ordered extents 
to
finish before calling btrfs_truncate_inode_items().

So just don't call btrfs_ordered_update_i_size() if we're removing the inode's 
items
from a log tree, which isn't supposed to change the in memory inode's 
disk_i_size.

Issue found while running xfstests/generic/127 (happens very rarely for me), 
more
specifically via the fsx calls that use memory mapped IO (and issue msync 
calls).

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: Updated commit message, such that it reflects the fact that ranged fsyncs 
are
not used only by msync.

 fs/btrfs/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 16e8146..c5ef9eb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4296,7 +4296,8 @@ out:
btrfs_abort_transaction(trans, root, ret);
}
 error:
-   if (last_size != (u64)-1)
+   if (last_size != (u64)-1 
+   root-root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
btrfs_ordered_update_i_size(inode, last_size, NULL);
btrfs_free_path(path);
return err;
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] Btrfs: fix fsync data loss after a ranged fsync

2014-08-30 Thread Filipe Manana

While we're doing a full fsync (when the inode has the flag
BTRFS_INODE_NEEDS_FULL_SYNC set) that is ranged too (covers only a
portion of the file), we might have ordered operations that are started
while we're logging the inode and that fall outside the fsync range.

This means we can get extent maps outside our range added to the
inode's extent map tree's modified list for which the corresponding
ordered operation wasn't captured by our call to btrfs_get_logged_extents() -
the fill delalloc callbacks, inode.c:cow_file_range() and
inode.c:submit_compressed_extents() add an extent map to the modified
list before creating the respective ordered operation - and they do this
without holding the inode's mutex nor the inode's log mutex.

Therefore when a full ranged fsync finishes don't remove every extent
map from the modified list of extent maps - as for some of them, that fall
outside our fsync range, we might have not waited for their respective
ordered operation to finish (meaning the corresponding file extent item
wasn't inserted into the fs/subvol tree yet), and we must let the next
fsync (very likely a fast one that checks only the modified list) see this
extent map and log a matching file extent item to the log btree and wait for
its ordered operation to finish (if it's still ongoing).

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/file.c |  2 +-
 fs/btrfs/tree-log.c | 50 --
 fs/btrfs/tree-log.h |  2 ++
 3 files changed, 43 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 66c4076..e5534c1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1979,7 +1979,7 @@ int btrfs_sync_file(struct file *file, loff_t start, 
loff_t end, int datasync)
 
btrfs_init_log_ctx(ctx);
 
-   ret = btrfs_log_dentry_safe(trans, root, dentry, ctx);
+   ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, ctx);
if (ret  0) {
/* Fallthrough and commit/free transaction. */
ret = 1;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 5a917a6..8b18a2d 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -94,8 +94,10 @@
 #define LOG_WALK_REPLAY_ALL 3
 
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
-struct btrfs_root *root, struct inode *inode,
-int inode_only);
+  struct btrfs_root *root, struct inode *inode,
+  int inode_only,
+  const loff_t start,
+  const loff_t end);
 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 struct btrfs_root *root,
 struct btrfs_path *path, u64 objectid);
@@ -3856,8 +3858,10 @@ process:
  * This handles both files and directories.
  */
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
-struct btrfs_root *root, struct inode *inode,
-int inode_only)
+  struct btrfs_root *root, struct inode *inode,
+  int inode_only,
+  const loff_t start,
+  const loff_t end)
 {
struct btrfs_path *path;
struct btrfs_path *dst_path;
@@ -4050,8 +4054,27 @@ log_extents:
struct extent_map *em, *n;
 
write_lock(tree-lock);
-   list_for_each_entry_safe(em, n, tree-modified_extents, list)
+   /*
+* We can't just remove every em if we're called for a ranged
+* fsync - that is, one that doesn't cover the whole possible
+* file range (0 to LLONG_MAX). This is because we can have
+* em's that fall outside the range and therefore their ordered
+* operations haven't completed yet (btrfs_finish_ordered_io()
+* not invoked yet). Their ordered operations might have started
+* after we called btrfs_get_logged_extents() too, so we don't
+* end up waiting for them to complete when syncing the log.
+* Removing every em outside the range would make a subsequent
+* fsync that does a fast search (BTRFS_INODE_NEEDS_FULL_SYNC
+* flag not set) not log the extent represented by an em,
+* therefore making us lose data after a log replay.
+*/
+   list_for_each_entry_safe(em, n, tree-modified_extents, list) {
+   if (em-mod_start  end)
+   continue;
+   if (em-mod_start + em-mod_len = start)
+   continue;
list_del_init(em-list);
+   }
write_unlock(tree-lock);
}
 
@@ -4158,7 +4181,10 @@ out:
  */
 static int btrfs_log_inode_parent

[PATCH] Btrfs: fix fsync race leading to invalid data after log replay

2014-08-30 Thread Filipe Manana

When the fsync callback (btrfs_sync_file) starts, it first waits for
the writeback of any dirty pages to start and finish without holding
the inode's mutex (to reduce contention). After this it acquires the
inode's mutex and repeats that process via btrfs_wait_ordered_range
only if we're doing a full sync (BTRFS_INODE_NEEDS_FULL_SYNC flag
is set on the inode).

This is not safe for a non-full sync: any pages that were made dirty
after that first step, but before we acquired the inode's mutex, also
need to have their writeback started and waited on. Why this is needed
is explained by the following comment added to btrfs_sync_file:

Right before acquiring the inode's mutex, we might have new writes
 dirtying pages, which won't immediately start the respective ordered
 operations - that is done through the fill_delalloc callbacks invoked
 from the writepage and writepages address space operations. So make
 sure we start all ordered operations before starting to log our
 inode. Not doing this means that while logging the inode, writeback
 could start and invoke writepage/writepages, which would call the
 fill_delalloc callbacks (cow_file_range, submit_compressed_extents).
 These callbacks add first an extent map to the modified list of
 extents and then create the respective ordered operation, which means
 in tree-log.c:btrfs_log_inode() we might capture all existing ordered
 operations (with btrfs_get_logged_extents()) before the fill_delalloc
 callback adds its ordered operation, and by the time we visit the
 modified list of extent maps (with btrfs_log_changed_extents()), we
 see and process the extent map they created. We then use their extent
 map to construct a file extent item for logging without waiting for
 the respective ordered operation to finish - these file extent items
 point to a disk location that might not have yet been written to,
 containing random data - so after a crash a log replay will make our
 inode have file extent items that point to disk locations containing
 invalid data.

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/file.c | 33 +++--
 1 file changed, 27 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e5534c1..5e9d108 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1912,12 +1912,33 @@ int btrfs_sync_file(struct file *file, loff_t start, 
loff_t end, int datasync)
atomic_inc(root-log_batch);
full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
 BTRFS_I(inode)-runtime_flags);
-   if (full_sync) {
-   ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
-   if (ret) {
-   mutex_unlock(inode-i_mutex);
-   goto out;
-   }
+   /*
+* Right before acquiring the inode's mutex, we might have new writes
+* dirtying pages, which won't immediately start the respective ordered
+* operations - that is done through the fill_delalloc callbacks invoked
+* from the writepage and writepages address space operations. So make
+* sure we start all ordered operations before starting to log our
+* inode. Not doing this means that while logging the inode, writeback
+* could start and invoke writepage/writepages, which would call the
+* fill_delalloc callbacks (cow_file_range, submit_compressed_extents).
+* These callbacks add first an extent map to the modified list of
+* extents and then create the respective ordered operation, which means
+* in tree-log.c:btrfs_log_inode() we might capture all existing ordered
+* operations (with btrfs_get_logged_extents()) before the fill_delalloc
+* callback adds its ordered operation, and by the time we visit the
+* modified list of extent maps (with btrfs_log_changed_extents()), we
+* see and process the extent map they created. We then use their extent
+* map to construct a file extent item for logging without waiting for
+* the respective ordered operation to finish - these file extent items
+* point to a disk location that might not have yet been written to,
+* containing random data - so after a crash a log replay will make our
+* inode have file extent items that point to disk locations containing
+* invalid data.
+*/
+   ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
+   if (ret) {
+   mutex_unlock(inode-i_mutex);
+   goto out;
}
atomic_inc(root-log_batch);
 
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v2] Btrfs: fix fsync race leading to invalid data after log replay

2014-09-02 Thread Filipe Manana

When the fsync callback (btrfs_sync_file) starts, it first waits for
the writeback of any dirty pages to start and finish without holding
the inode's mutex (to reduce contention). After this it acquires the
inode's mutex and repeats that process via btrfs_wait_ordered_range
only if we're doing a full sync (BTRFS_INODE_NEEDS_FULL_SYNC flag
is set on the inode).

This is not safe for a non full sync - we need to start and wait for
writeback to finish for any pages that might have been made dirty
before acquiring the inode's mutex and after that first step mentioned
before. Why this is needed is explained by the following comment added
to btrfs_sync_file:

  Right before acquiring the inode's mutex, we might have new
   writes dirtying pages, which won't immediately start the
   respective ordered operations - that is done through the
   fill_delalloc callbacks invoked from the writepage and
   writepages address space operations. So make sure we start
   all ordered operations before starting to log our inode. Not
   doing this means that while logging the inode, writeback
   could start and invoke writepage/writepages, which would call
   the fill_delalloc callbacks (cow_file_range,
   submit_compressed_extents). These callbacks add first an
   extent map to the modified list of extents and then create
   the respective ordered operation, which means in
   tree-log.c:btrfs_log_inode() we might capture all existing
   ordered operations (with btrfs_get_logged_extents()) before
   the fill_delalloc callback adds its ordered operation, and by
   the time we visit the modified list of extent maps (with
   btrfs_log_changed_extents()), we see and process the extent
   map they created. We then use the extent map to construct a
   file extent item for logging without waiting for the
   respective ordered operation to finish - this file extent
   item points to a disk location that might not have yet been
   written to, containing random data - so after a crash a log
   replay will make our inode have file extent items that point
   to disk locations containing invalid data, as we returned
   success to userspace without waiting for the respective
   ordered operation to finish, because it wasn't captured by
   btrfs_get_logged_extents().

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: Better comments and for the non full sync case, start only the
ordered operations, instead of starting them and waiting for them
to complete. Waiting for their completion is already done later by
btrfs_sync_log(), so like this we can reduce some latency as some
ordered operations might complete (or get closer to) while writing
to the log tree (btrfs_log_dentry_safe).

 fs/btrfs/file.c | 78 ++---
 1 file changed, 63 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e5534c1..5427ba8 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1865,6 +1865,20 @@ int btrfs_release_file(struct inode *inode, struct file 
*filp)
return 0;
 }
 
+static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
+{
+   int ret;
+
+   atomic_inc(BTRFS_I(inode)-sync_writers);
+   ret = filemap_fdatawrite_range(inode-i_mapping, start, end);
+   if (!ret  test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+BTRFS_I(inode)-runtime_flags))
+   ret = filemap_fdatawrite_range(inode-i_mapping, start, end);
+   atomic_dec(BTRFS_I(inode)-sync_writers);
+
+   return ret;
+}
+
 /*
  * fsync call for both files and directories.  This logs the inode into
  * the tree log instead of forcing full commits whenever possible.
@@ -1894,30 +1908,64 @@ int btrfs_sync_file(struct file *file, loff_t start, 
loff_t end, int datasync)
 * multi-task, and make the performance up.  See
 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
 */
-   atomic_inc(BTRFS_I(inode)-sync_writers);
-   ret = filemap_fdatawrite_range(inode-i_mapping, start, end);
-   if (!ret  test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-BTRFS_I(inode)-runtime_flags))
-   ret = filemap_fdatawrite_range(inode-i_mapping, start, end);
-   atomic_dec(BTRFS_I(inode)-sync_writers);
+   ret = start_ordered_ops(inode, start, end);
if (ret)
return ret;
 
mutex_lock(inode-i_mutex);
-
-   /*
-* We flush the dirty pages again to avoid some dirty pages in the
-* range being left.
-*/
atomic_inc(root-log_batch);
full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
 BTRFS_I(inode)-runtime_flags);
+   /*
+* We might have had more pages made dirty after calling
+* start_ordered_ops and before acquiring the inode's i_mutex.
+*/
if (full_sync) {
+   /*
+* For a full sync, we need

[PATCH v2] Btrfs: fix fsync data loss after a ranged fsync

2014-09-02 Thread Filipe Manana

While we're doing a full fsync (when the inode has the flag
BTRFS_INODE_NEEDS_FULL_SYNC set) that is ranged too (covers only a
portion of the file), we might have ordered operations that are started
before or while we're logging the inode and that fall outside the fsync
range.

Therefore when a full ranged fsync finishes don't remove every extent
map from the list of modified extent maps - as for some of them, that
fall outside our fsync range, their respective ordered operation hasn't
finished yet, meaning the corresponding file extent item wasn't inserted
into the fs/subvol tree yet and therefore we didn't log it, and we must
let the next fast fsync (one that checks only the modified list) see this
extent map and log a matching file extent item to the log btree and wait
for its ordered operation to finish (if it's still ongoing).

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: No code change, only updated the changelog and the comment, to make
them more clear and accurate.

 fs/btrfs/file.c |  2 +-
 fs/btrfs/tree-log.c | 54 +++--
 fs/btrfs/tree-log.h |  2 ++
 3 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 66c4076..e5534c1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1979,7 +1979,7 @@ int btrfs_sync_file(struct file *file, loff_t start, 
loff_t end, int datasync)
 
btrfs_init_log_ctx(ctx);
 
-   ret = btrfs_log_dentry_safe(trans, root, dentry, ctx);
+   ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, ctx);
if (ret  0) {
/* Fallthrough and commit/free transaction. */
ret = 1;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 5a917a6..93d3c16 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -94,8 +94,10 @@
 #define LOG_WALK_REPLAY_ALL 3
 
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
-struct btrfs_root *root, struct inode *inode,
-int inode_only);
+  struct btrfs_root *root, struct inode *inode,
+  int inode_only,
+  const loff_t start,
+  const loff_t end);
 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 struct btrfs_root *root,
 struct btrfs_path *path, u64 objectid);
@@ -3856,8 +3858,10 @@ process:
  * This handles both files and directories.
  */
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
-struct btrfs_root *root, struct inode *inode,
-int inode_only)
+  struct btrfs_root *root, struct inode *inode,
+  int inode_only,
+  const loff_t start,
+  const loff_t end)
 {
struct btrfs_path *path;
struct btrfs_path *dst_path;
@@ -4050,8 +4054,31 @@ log_extents:
struct extent_map *em, *n;
 
write_lock(tree-lock);
-   list_for_each_entry_safe(em, n, tree-modified_extents, list)
+   /*
+* We can't just remove every em if we're called for a ranged
+* fsync - that is, one that doesn't cover the whole possible
+* file range (0 to LLONG_MAX). This is because we can have
+* em's that fall outside the range we're logging and therefore
+* their ordered operations haven't completed yet
+* (btrfs_finish_ordered_io() not invoked yet). This means we
+* didn't get their respective file extent item in the fs/subvol
+* tree yet, and need to let the next fast fsync (one which
+* consults the list of modified extent maps) find the em so
+* that it logs a matching file extent item and waits for the
+* respective ordered operation to complete (if it's still
+* running).
+*
+* Removing every em outside the range we're logging would make
+* the next fast fsync not log their matching file extent items,
+* therefore making us lose data after a log replay.
+*/
+   list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
+   if (em->mod_start > end)
+   continue;
+   if (em->mod_start + em->mod_len <= start)
+   continue;
list_del_init(&em->list);
+   }
write_unlock(tree-lock);
}
 
@@ -4158,7 +4185,10 @@ out:
  */
 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
  struct btrfs_root *root, struct inode *inode,
- struct dentry

[PATCH] Btrfs: fix data corruption after fast fsync and writeback error

2014-09-05 Thread Filipe Manana

When we do a fast fsync, we start all ordered operations and then while
they're running in parallel we visit the list of modified extent maps
and construct their matching file extent items and write them to the
log btree. After that, in btrfs_sync_log() we wait for all the ordered
operations to finish (via btrfs_wait_logged_extents).

The problem with this is that we were completely ignoring errors that
can happen in the extent write path, such as -ENOSPC, a temporary -ENOMEM
or -EIO errors for example. When such error happens, it means we have parts
of the on disk extent that weren't written to, and so we end up logging
file extent items that point to these extents that contain garbage/random
data - so after a crash/reboot plus log replay, we get our inode's metadata
pointing to those extents.

This worked in contrast with the full (non-fast) fsync path, where we
start all ordered operations, wait for them to finish and then write
to the log btree. In this path, after each ordered operation completes
we check if it's flagged with an error (BTRFS_ORDERED_IOERR) and return
-EIO if so (via btrfs_wait_ordered_range).

So if an error happens with any ordered operation, just return a -EIO
error to userspace, so that it knows that not all of its previous writes
were durably persisted and the application can take proper action (such as
redoing the writes) - and definitely do not leave any file extent items
in the log that refer to extents that were not fully written.

Signed-off-by: Filipe Manana fdman...@suse.com
---

This patch applies on top of the patches with the titles:

Btrfs: fix fsync data loss after a ranged fsync
Btrfs: fix fsync race leading to invalid data after log replay

 fs/btrfs/file.c |  19 
 fs/btrfs/tree-log.c | 247 ++--
 fs/btrfs/tree-log.h |   2 +
 3 files changed, 166 insertions(+), 102 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 5427ba8..4494b4e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2045,6 +2045,25 @@ int btrfs_sync_file(struct file *file, loff_t start, 
loff_t end, int datasync)
 */
mutex_unlock(inode-i_mutex);
 
+   /*
+* If any of the ordered extents had an error, just return it to user
+* space, so that the application knows some writes didn't succeed and
+* can take proper action (retry for e.g.). Blindly committing the
+* transaction in this case, would fool userspace that everything was
+* successful. And we also want to make sure our log doesn't contain
+* file extent items pointing to extents that weren't fully written to -
+* just like in the non fast fsync path, where we check for the ordered
+* operation's error flag before writing to the log tree and return -EIO
+* if any of them had this flag set (btrfs_wait_ordered_range) -
+* therefore we need to check for errors in the ordered operations,
+* which are indicated by ctx.io_err.
+*/
+   if (ctx.io_err) {
+   btrfs_end_transaction(trans, root);
+   ret = ctx.io_err;
+   goto out;
+   }
+
if (ret != BTRFS_NO_LOG_SYNC) {
if (!ret) {
ret = btrfs_sync_log(trans, root, ctx);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 93d3c16..128f301 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -97,7 +97,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
   struct btrfs_root *root, struct inode *inode,
   int inode_only,
   const loff_t start,
-  const loff_t end);
+  const loff_t end,
+  struct btrfs_log_ctx *ctx);
 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 struct btrfs_root *root,
 struct btrfs_path *path, u64 objectid);
@@ -3571,107 +3572,33 @@ static int extent_cmp(void *priv, struct list_head *a, 
struct list_head *b)
return 0;
 }
 
-static int log_one_extent(struct btrfs_trans_handle *trans,
- struct inode *inode, struct btrfs_root *root,
- struct extent_map *em, struct btrfs_path *path,
- struct list_head *logged_list)
+static int wait_ordered_extents(struct btrfs_trans_handle *trans,
+   struct inode *inode,
+   struct btrfs_root *root,
+   const struct extent_map *em,
+   const struct list_head *logged_list,
+   bool *ordered_io_error)
 {
-   struct btrfs_root *log = root-log_root;
-   struct btrfs_file_extent_item *fi;
-   struct extent_buffer *leaf;
struct btrfs_ordered_extent *ordered;
-   struct list_head ordered_sums;
-   struct

[PATCH v3] Btrfs: fix fsync data loss after a ranged fsync

2014-09-05 Thread Filipe Manana

While we're doing a full fsync (when the inode has the flag
BTRFS_INODE_NEEDS_FULL_SYNC set) that is ranged too (covers only a
portion of the file), we might have ordered operations that are started
before or while we're logging the inode and that fall outside the fsync
range.

Therefore when a full ranged fsync finishes don't remove every extent
map from the list of modified extent maps - as for some of them, that
fall outside our fsync range, their respective ordered operation hasn't
finished yet, meaning the corresponding file extent item wasn't inserted
into the fs/subvol tree yet and therefore we didn't log it, and we must
let the next fast fsync (one that checks only the modified list) see this
extent map and log a matching file extent item to the log btree and wait
for its ordered operation to finish (if it's still ongoing).

A test case for xfstests follows.

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: No code change, only updated the changelog and the comment, to make
them more clear and accurate.

V3: Added missing condition to exclude extent map from the modified list
and ensure btrfs_log_inode() is called for the next fsync if the
modified list didn't get empty after logging the inode. This time this
follows with a test case for xfstests that is better than my previous
local test and benefits everyone.

 fs/btrfs/file.c |  2 +-
 fs/btrfs/tree-log.c | 78 ++---
 fs/btrfs/tree-log.h |  2 ++
 3 files changed, 66 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 66c4076..e5534c1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1979,7 +1979,7 @@ int btrfs_sync_file(struct file *file, loff_t start, 
loff_t end, int datasync)
 
btrfs_init_log_ctx(ctx);
 
-   ret = btrfs_log_dentry_safe(trans, root, dentry, ctx);
+   ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, ctx);
if (ret  0) {
/* Fallthrough and commit/free transaction. */
ret = 1;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 5a917a6..6d774c9 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -94,8 +94,10 @@
 #define LOG_WALK_REPLAY_ALL 3
 
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
-struct btrfs_root *root, struct inode *inode,
-int inode_only);
+  struct btrfs_root *root, struct inode *inode,
+  int inode_only,
+  const loff_t start,
+  const loff_t end);
 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 struct btrfs_root *root,
 struct btrfs_path *path, u64 objectid);
@@ -3856,8 +3858,10 @@ process:
  * This handles both files and directories.
  */
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
-struct btrfs_root *root, struct inode *inode,
-int inode_only)
+  struct btrfs_root *root, struct inode *inode,
+  int inode_only,
+  const loff_t start,
+  const loff_t end)
 {
struct btrfs_path *path;
struct btrfs_path *dst_path;
@@ -3874,6 +3878,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle 
*trans,
int ins_nr;
bool fast_search = false;
u64 ino = btrfs_ino(inode);
+   struct extent_map_tree *em_tree = BTRFS_I(inode)-extent_tree;
 
path = btrfs_alloc_path();
if (!path)
@@ -4046,13 +4051,38 @@ log_extents:
goto out_unlock;
}
} else if (inode_only == LOG_INODE_ALL) {
-   struct extent_map_tree *tree = BTRFS_I(inode)-extent_tree;
struct extent_map *em, *n;
 
-   write_lock(tree-lock);
-   list_for_each_entry_safe(em, n, tree-modified_extents, list)
+   write_lock(em_tree-lock);
+   /*
+* We can't just remove every em if we're called for a ranged
+* fsync - that is, one that doesn't cover the whole possible
+* file range (0 to LLONG_MAX). This is because we can have
+* em's that fall outside the range we're logging and therefore
+* their ordered operations haven't completed yet
+* (btrfs_finish_ordered_io() not invoked yet). This means we
+* didn't get their respective file extent item in the fs/subvol
+* tree yet, and need to let the next fast fsync (one which
+* consults the list of modified extent maps) find the em so
+* that it logs a matching file extent item and waits for the
+* respective ordered operation to complete (if it's still
+* running

[PATCH] xfstests: generic: add test for double msync, motivated by a btrfs bug

2014-09-05 Thread Filipe Manana

This test is motivated by a btrfs issue where a ranged fsync would
prevent a subsequent fsync from persisting any extents that were
dirty at the time of the first fsync but that were outside the range
of that first fsync (which should have been persisted by the second
fsync).

This bug in btrfs is fixed by the following linux kernel patch:

 Btrfs: fix fsync data loss after a ranged fsync

Signed-off-by: Filipe Manana fdman...@suse.com
---
 tests/generic/325 | 106 ++
 tests/generic/325.out |  19 +
 tests/generic/group   |   1 +
 3 files changed, 126 insertions(+)
 create mode 100755 tests/generic/325
 create mode 100644 tests/generic/325.out

diff --git a/tests/generic/325 b/tests/generic/325
new file mode 100755
index 000..c47e372
--- /dev/null
+++ b/tests/generic/325
@@ -0,0 +1,106 @@
+#! /bin/bash
+# FS QA Test No. 325
+#
+# Make some pages/extents of a file dirty, do a ranged fsync that covers
+# only some of the dirty pages/extents, and then do a regular fsync (or
+# another ranged fsync that covers the remaining dirty pages/extents).
+# Verify after that all extents were persisted.
+#
+# This test is motivated by a btrfs issue where the first ranged fsync
+# would prevent the following fsync from persisting the remaining dirty
+# pages/extents. This was fixed by the following btrfs kernel patch:
+#
+# Btrfs: fix fsync data loss after a ranged fsync
+#
+#---
+# Copyright (C) 2014 SUSE Linux Products GmbH. All Rights Reserved.
+# Author: Filipe Manana fdman...@suse.com
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo QA output created by $seq
+
+here=`pwd`
+status=1   # failure is the default!
+
+_cleanup()
+{
+   _cleanup_flakey
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/dmflakey
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+_need_to_be_root
+_require_scratch
+_require_dm_flakey
+
+rm -f $seqres.full
+
+_scratch_mkfs >> $seqres.full 2>&1
+
+_init_flakey
+_mount_flakey
+
+# Create the file first.
+$XFS_IO_PROG -f -c pwrite -S 0xff 0 64K $SCRATCH_MNT/foo | _filter_xfs_io
+
+# Now sync the file data to disk using 'sync' and not an fsync. This is because
+# in btrfs the first fsync clears the btrfs inode full fsync flag, which must
+# be set when the first msync below happens in order to trigger the bug.
+sync
+
+# Now update the first 4Kb and the last 4Kb of the file, using memory mapped IO
+# because an msync(), since the linux kernel commit
+# 7fc34a62ca4434a79c68e23e70ed26111b7a4cf8, invokes a ranged fsync.
+#
+# After those writes, msync a range covering the first 4Kb and then after
+# perform a msync with a range covering the last 4Kb of the file.
+# This second msync() used to be a no-op for that btrfs bug (and the first fsync
+# didn't log the last 4Kb extent as expected too).
+$XFS_IO_PROG \
+   -c mmap -w 0 64K \
+   -c mwrite -S 0xaa 0 4K   \
+   -c mwrite -S 0xbb 60K 4K \
+   -c msync -s 0K 16K   \
+   -c msync -s 32K 32K  \
+   -c munmap\
+   $SCRATCH_MNT/foo | _filter_xfs_io
+
+echo File content before crash/reboot:
+od -t x1 $SCRATCH_MNT/foo
+
+_load_flakey_table $FLAKEY_DROP_WRITES
+_unmount_flakey
+
+_load_flakey_table $FLAKEY_ALLOW_WRITES
+_mount_flakey
+
+echo File content after crash/reboot and fs mount:
+od -t x1 $SCRATCH_MNT/foo
+
+_unmount_flakey
+
+status=0
+exit
diff --git a/tests/generic/325.out b/tests/generic/325.out
new file mode 100644
index 000..9a78c3e
--- /dev/null
+++ b/tests/generic/325.out
@@ -0,0 +1,19 @@
+QA output created by 325
+wrote 65536/65536 bytes at offset 0
+XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+File content before crash/reboot:
+000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa
+*
+001 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
+*
+017 bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb
+*
+020
+File content after crash/reboot and fs mount:
+000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa
+*
+001 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
+*
+017 bb bb bb bb bb

[PATCH v4] Btrfs: fix fsync data loss after a ranged fsync

2014-09-06 Thread Filipe Manana

While we're doing a full fsync (when the inode has the flag
BTRFS_INODE_NEEDS_FULL_SYNC set) that is ranged too (covers only a
portion of the file), we might have ordered operations that are started
before or while we're logging the inode and that fall outside the fsync
range.

Therefore when a full ranged fsync finishes don't remove every extent
map from the list of modified extent maps - as for some of them, that
fall outside our fsync range, their respective ordered operation hasn't
finished yet, meaning the corresponding file extent item wasn't inserted
into the fs/subvol tree yet and therefore we didn't log it, and we must
let the next fast fsync (one that checks only the modified list) see this
extent map and log a matching file extent item to the log btree and wait
for its ordered operation to finish (if it's still ongoing).

A test case for xfstests follows.

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: No code change, only updated the changelog and the comment, to make
them more clear and accurate.

V3: Added missing condition to exclude extent map from the modified list
and ensure btrfs_log_inode() is called for the next fsync if the
modified list didn't get empty after logging the inode. This time this
follows with a test case for xfstests that is better than my previous
local test and benefits everyone.

V4: Simplified em exclusion logic.

 fs/btrfs/file.c |  2 +-
 fs/btrfs/tree-log.c | 77 ++---
 fs/btrfs/tree-log.h |  2 ++
 3 files changed, 64 insertions(+), 17 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 66c4076..e5534c1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1979,7 +1979,7 @@ int btrfs_sync_file(struct file *file, loff_t start, 
loff_t end, int datasync)
 
btrfs_init_log_ctx(ctx);
 
-   ret = btrfs_log_dentry_safe(trans, root, dentry, ctx);
+   ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, ctx);
if (ret  0) {
/* Fallthrough and commit/free transaction. */
ret = 1;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 5a917a6..82db14f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -94,8 +94,10 @@
 #define LOG_WALK_REPLAY_ALL 3
 
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
-struct btrfs_root *root, struct inode *inode,
-int inode_only);
+  struct btrfs_root *root, struct inode *inode,
+  int inode_only,
+  const loff_t start,
+  const loff_t end);
 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 struct btrfs_root *root,
 struct btrfs_path *path, u64 objectid);
@@ -3856,8 +3858,10 @@ process:
  * This handles both files and directories.
  */
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
-struct btrfs_root *root, struct inode *inode,
-int inode_only)
+  struct btrfs_root *root, struct inode *inode,
+  int inode_only,
+  const loff_t start,
+  const loff_t end)
 {
struct btrfs_path *path;
struct btrfs_path *dst_path;
@@ -3874,6 +3878,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle 
*trans,
int ins_nr;
bool fast_search = false;
u64 ino = btrfs_ino(inode);
+   struct extent_map_tree *em_tree = BTRFS_I(inode)-extent_tree;
 
path = btrfs_alloc_path();
if (!path)
@@ -4046,13 +4051,35 @@ log_extents:
goto out_unlock;
}
} else if (inode_only == LOG_INODE_ALL) {
-   struct extent_map_tree *tree = BTRFS_I(inode)-extent_tree;
struct extent_map *em, *n;
 
-   write_lock(tree-lock);
-   list_for_each_entry_safe(em, n, tree-modified_extents, list)
-   list_del_init(em-list);
-   write_unlock(tree-lock);
+   write_lock(em_tree-lock);
+   /*
+* We can't just remove every em if we're called for a ranged
+* fsync - that is, one that doesn't cover the whole possible
+* file range (0 to LLONG_MAX). This is because we can have
+* em's that fall outside the range we're logging and therefore
+* their ordered operations haven't completed yet
+* (btrfs_finish_ordered_io() not invoked yet). This means we
+* didn't get their respective file extent item in the fs/subvol
+* tree yet, and need to let the next fast fsync (one which
+* consults the list of modified extent maps) find the em so
+* that it logs a matching file extent item

[PATCH] Btrfs: fix directory recovery from fsync log

2014-09-08 Thread Filipe Manana

When replaying a directory from the fsync log, if a directory entry
exists both in the fs/subvol tree and in the log, the directory's inode
got its i_size updated incorrectly, accounting for the dentry's name
twice.

Reproducer, from a test for xfstests:

_scratch_mkfs >> $seqres.full 2>&1
_init_flakey
_mount_flakey

touch $SCRATCH_MNT/foo
sync

touch $SCRATCH_MNT/bar
xfs_io -c fsync $SCRATCH_MNT
xfs_io -c fsync $SCRATCH_MNT/bar

_load_flakey_table $FLAKEY_DROP_WRITES
_unmount_flakey

_load_flakey_table $FLAKEY_ALLOW_WRITES
_mount_flakey

[ -f $SCRATCH_MNT/foo ] || echo file foo is missing
[ -f $SCRATCH_MNT/bar ] || echo file bar is missing

_unmount_flakey
_check_scratch_fs $FLAKEY_DEV

The filesystem check at the end failed with the message:
root 5 root dir 256 error.

A test case for xfstests follows.

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/tree-log.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index faa568e..d60425c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1638,6 +1638,7 @@ static noinline int replay_one_name(struct 
btrfs_trans_handle *trans,
found_key.type == log_key.type 
found_key.offset == log_key.offset 
btrfs_dir_type(path-nodes[0], dst_di) == log_type) {
+   update_size = false;
goto out;
}
 
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] xfstests: generic: add dir fsync test, motivated by a btrfs bug

2014-09-08 Thread Filipe Manana

This test is motivated by a bug found in btrfs when fsync'ing a
directory. The issue was that if a directory entry is both found
in the persisted metadata and in the fsync log, at log replay time
the directory got set with a wrong i_size. This was fixed in btrfs
with the following linux kernel patch:

 Btrfs: fix directory recovery from fsync log

Signed-off-by: Filipe Manana fdman...@suse.com
---
 tests/generic/326 | 95 +++
 tests/generic/326.out |  2 ++
 tests/generic/group   |  1 +
 3 files changed, 98 insertions(+)
 create mode 100755 tests/generic/326
 create mode 100644 tests/generic/326.out

diff --git a/tests/generic/326 b/tests/generic/326
new file mode 100755
index 000..47f4558
--- /dev/null
+++ b/tests/generic/326
@@ -0,0 +1,95 @@
+#! /bin/bash
+# FS QA Test No. 326
+#
+# This test is motivated by a bug found in btrfs when fsync'ing a directory.
+# The issue was that if a directory entry is both found in the persisted
+# metadata and in the fsync log, at log replay time the directory got set
+# with a wrong i_size. This was fixed in btrfs with the following linux
+# kernel patch:
+#
+# Btrfs: fix directory recovery from fsync log
+#
+#---
+# Copyright (C) 2014 SUSE Linux Products GmbH. All Rights Reserved.
+# Author: Filipe Manana fdman...@suse.com
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo QA output created by $seq
+
+here=`pwd`
+status=1   # failure is the default!
+
+_cleanup()
+{
+   _cleanup_flakey
+}
+trap _cleanup; exit \$status 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/dmflakey
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+_need_to_be_root
+_require_scratch
+_require_dm_flakey
+
+rm -f $seqres.full
+
+_scratch_mkfs  $seqres.full 21
+
+_init_flakey
+_mount_flakey
+
+touch $SCRATCH_MNT/foo
+
+# Invoke sync here because it's necessary to trigger the original bug in btrfs.
+# The intention is that at log recovery time we have a dir entry for 'foo' both
+# in the fs/subvol tree and in the log tree - this is necessary to trigger the
+# bug on btrfs.
+sync
+
+touch $SCRATCH_MNT/bar
+xfs_io -c fsync $SCRATCH_MNT
+xfs_io -c fsync $SCRATCH_MNT/bar
+
+_load_flakey_table $FLAKEY_DROP_WRITES
+_unmount_flakey
+_check_scratch_fs $FLAKEY_DEV
+
+_load_flakey_table $FLAKEY_ALLOW_WRITES
+_mount_flakey
+
+[ -f $SCRATCH_MNT/foo ] || echo file foo is missing
+[ -f $SCRATCH_MNT/bar ] || echo file bar is missing
+
+_unmount_flakey
+
+# In the original btrfs bug, the filesystem consistency check failed here
+# because the directory inode got set with a wrong i_size by the log replay
+# at mount time (dentry 'foo' was accounted for twice).
+_check_scratch_fs $FLAKEY_DEV
+
+echo Silence is golden
+
+status=0
+exit
diff --git a/tests/generic/326.out b/tests/generic/326.out
new file mode 100644
index 000..4ac0db5
--- /dev/null
+++ b/tests/generic/326.out
@@ -0,0 +1,2 @@
+QA output created by 326
+Silence is golden
diff --git a/tests/generic/group b/tests/generic/group
index bdcfd9d..cc5fecc 100644
--- a/tests/generic/group
+++ b/tests/generic/group
@@ -145,3 +145,4 @@
 323 auto aio stress
 324 auto fsr quick
 325 auto quick data log
+326 auto quick metadata log
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v5] Btrfs: fix fsync data loss after a ranged fsync

2014-09-08 Thread Filipe Manana

While we're doing a full fsync (when the inode has the flag
BTRFS_INODE_NEEDS_FULL_SYNC set) that is ranged too (covers only a
portion of the file), we might have ordered operations that are started
before or while we're logging the inode and that fall outside the fsync
range.

Therefore when a full ranged fsync finishes don't remove every extent
map from the list of modified extent maps - as for some of them, that
fall outside our fsync range, their respective ordered operation hasn't
finished yet, meaning the corresponding file extent item wasn't inserted
into the fs/subvol tree yet and therefore we didn't log it, and we must
let the next fast fsync (one that checks only the modified list) see this
extent map and log a matching file extent item to the log btree and wait
for its ordered operation to finish (if it's still ongoing).

A test case for xfstests follows.

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: No code change, only updated the changelog and the comment, to make
them more clear and accurate.

V3: Added missing condition to exclude extent map from the modified list
and ensure btrfs_log_inode() is called for the next fsync if the
modified list didn't get empty after logging the inode. This time this
follows with a test case for xfstests that is better than my previous
local test and benefits everyone.

V4: Simplified em exclusion logic.

V5: Removed the hack that doesn't set the inode's logged_trans and
last_log_commit if the list of modified extent maps isn't empty.
This prevented an unlink in the same transaction from removing
the dentry from the log tree (if an fsync against the parent dir
was made before).

 fs/btrfs/btrfs_inode.h | 13 ++--
 fs/btrfs/file.c|  2 +-
 fs/btrfs/tree-log.c| 55 --
 fs/btrfs/tree-log.h|  2 ++
 4 files changed, 58 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 74ff403..3511031 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -246,8 +246,17 @@ static inline int btrfs_inode_in_log(struct inode *inode, 
u64 generation)
BTRFS_I(inode)-last_sub_trans =
BTRFS_I(inode)-last_log_commit 
BTRFS_I(inode)-last_sub_trans =
-   BTRFS_I(inode)-root-last_log_commit)
-   return 1;
+   BTRFS_I(inode)-root-last_log_commit) {
+   /*
+* After a ranged fsync we might have left some extent maps
+* (that fall outside the fsync's range). So return false
+* here if the list isn't empty, to make sure btrfs_log_inode()
+* will be called and process those extent maps.
+*/
+   smp_mb();
+   if (list_empty(BTRFS_I(inode)-extent_tree.modified_extents))
+   return 1;
+   }
return 0;
 }
 
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 66c4076..e5534c1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1979,7 +1979,7 @@ int btrfs_sync_file(struct file *file, loff_t start, 
loff_t end, int datasync)
 
btrfs_init_log_ctx(ctx);
 
-   ret = btrfs_log_dentry_safe(trans, root, dentry, ctx);
+   ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, ctx);
if (ret  0) {
/* Fallthrough and commit/free transaction. */
ret = 1;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 5a917a6..cf4ead8 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -94,8 +94,10 @@
 #define LOG_WALK_REPLAY_ALL 3
 
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
-struct btrfs_root *root, struct inode *inode,
-int inode_only);
+  struct btrfs_root *root, struct inode *inode,
+  int inode_only,
+  const loff_t start,
+  const loff_t end);
 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 struct btrfs_root *root,
 struct btrfs_path *path, u64 objectid);
@@ -3856,8 +3858,10 @@ process:
  * This handles both files and directories.
  */
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
-struct btrfs_root *root, struct inode *inode,
-int inode_only)
+  struct btrfs_root *root, struct inode *inode,
+  int inode_only,
+  const loff_t start,
+  const loff_t end)
 {
struct btrfs_path *path;
struct btrfs_path *dst_path;
@@ -4050,8 +4054,30 @@ log_extents:
struct extent_map *em, *n;
 
write_lock(tree-lock);
-   list_for_each_entry_safe(em, n, tree-modified_extents, list

[PATCH v2] xfstests: generic: add dir fsync test, motivated by a btrfs bug

2014-09-08 Thread Filipe Manana

This test is motivated by a bug found in btrfs when fsync'ing a
directory. The issue was that if a directory entry is both found
in the persisted metadata and in the fsync log, at log replay time
the directory got set with a wrong i_size. This was fixed in btrfs
with the following linux kernel patch:

 Btrfs: fix directory recovery from fsync log

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: Replaced direct use of xfs_io with $XFS_IO_PROG.

 tests/generic/326 | 95 +++
 tests/generic/326.out |  2 ++
 tests/generic/group   |  1 +
 3 files changed, 98 insertions(+)
 create mode 100755 tests/generic/326
 create mode 100644 tests/generic/326.out

diff --git a/tests/generic/326 b/tests/generic/326
new file mode 100755
index 000..886c856
--- /dev/null
+++ b/tests/generic/326
@@ -0,0 +1,95 @@
+#! /bin/bash
+# FS QA Test No. 326
+#
+# This test is motivated by a bug found in btrfs when fsync'ing a directory.
+# The issue was that if a directory entry is both found in the persisted
+# metadata and in the fsync log, at log replay time the directory got set
+# with a wrong i_size. This was fixed in btrfs with the following linux
+# kernel patch:
+#
+# Btrfs: fix directory recovery from fsync log
+#
+#---
+# Copyright (C) 2014 SUSE Linux Products GmbH. All Rights Reserved.
+# Author: Filipe Manana fdman...@suse.com
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo QA output created by $seq
+
+here=`pwd`
+status=1   # failure is the default!
+
+_cleanup()
+{
+   _cleanup_flakey
+}
+trap _cleanup; exit \$status 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/dmflakey
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+_need_to_be_root
+_require_scratch
+_require_dm_flakey
+
+rm -f $seqres.full
+
+_scratch_mkfs  $seqres.full 21
+
+_init_flakey
+_mount_flakey
+
+touch $SCRATCH_MNT/foo
+
+# Invoke sync here because it's necessary to trigger the original bug in btrfs.
+# The intention is that at log recovery time we have a dir entry for 'foo' both
+# in the fs/subvol tree and in the log tree - this is necessary to trigger the
+# bug on btrfs.
+sync
+
+touch $SCRATCH_MNT/bar
+$XFS_IO_PROG -c fsync $SCRATCH_MNT
+$XFS_IO_PROG -c fsync $SCRATCH_MNT/bar
+
+_load_flakey_table $FLAKEY_DROP_WRITES
+_unmount_flakey
+_check_scratch_fs $FLAKEY_DEV
+
+_load_flakey_table $FLAKEY_ALLOW_WRITES
+_mount_flakey
+
+[ -f $SCRATCH_MNT/foo ] || echo file foo is missing
+[ -f $SCRATCH_MNT/bar ] || echo file bar is missing
+
+_unmount_flakey
+
+# In the original btrfs bug, the filesystem consistency check failed here
+# because the directory inode got set with a wrong i_size by the log replay
+# at mount time (dentry 'foo' was accounted for twice).
+_check_scratch_fs $FLAKEY_DEV
+
+echo Silence is golden
+
+status=0
+exit
diff --git a/tests/generic/326.out b/tests/generic/326.out
new file mode 100644
index 000..4ac0db5
--- /dev/null
+++ b/tests/generic/326.out
@@ -0,0 +1,2 @@
+QA output created by 326
+Silence is golden
diff --git a/tests/generic/group b/tests/generic/group
index bdcfd9d..cc5fecc 100644
--- a/tests/generic/group
+++ b/tests/generic/group
@@ -145,3 +145,4 @@
 323 auto aio stress
 324 auto fsr quick
 325 auto quick data log
+326 auto quick metadata log
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] Btrfs: add missing compression property remove in btrfs_ioctl_setflags

2014-09-10 Thread Filipe Manana

The behaviour of a 'chattr -c' consists of getting the current flags,
clearing the FS_COMPR_FL bit and then sending the result to the set
flags ioctl - this means the bit FS_NOCOMP_FL isn't set in the flags
passed to the ioctl. This results in the compression property not being
cleared from the inode - it was cleared only if the bit FS_NOCOMP_FL
was set in the received flags.

Reproducer:

$ mkfs.btrfs -f /dev/sdd
$ mount /dev/sdd /mnt && cd /mnt
$ mkdir a
$ chattr +c a
$ touch a/file
$ lsattr a/file
c--- a/file
$ chattr -c a
$ touch a/file2
$ lsattr a/file2
c--- a/file2
$ lsattr -d a
 a

Reported-by: Andreas Schneider a...@cryptomilk.org
Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/ioctl.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a010c44..8e6950c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -333,6 +333,9 @@ static int btrfs_ioctl_setflags(struct file *file, void 
__user *arg)
 
} else {
ip-flags = ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
+   ret = btrfs_set_prop(inode, btrfs.compression, NULL, 0, 0);
+   if (ret  ret != -ENODATA)
+   goto out_drop;
}
 
trans = btrfs_start_transaction(root, 1);
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v2] Btrfs: add missing compression property remove in btrfs_ioctl_setflags

2014-09-11 Thread Filipe Manana

The behaviour of a 'chattr -c' consists of getting the current flags,
clearing the FS_COMPR_FL bit and then sending the result to the set
flags ioctl - this means the bit FS_NOCOMP_FL isn't set in the flags
passed to the ioctl. This results in the compression property not being
cleared from the inode - it was cleared only if the bit FS_NOCOMP_FL
was set in the received flags.

Reproducer:

$ mkfs.btrfs -f /dev/sdd
$ mount /dev/sdd /mnt && cd /mnt
$ mkdir a
$ chattr +c a
$ touch a/file
$ lsattr a/file
c--- a/file
$ chattr -c a
$ touch a/file2
$ lsattr a/file2
c--- a/file2
$ lsattr -d a
 a

Reported-by: Andreas Schneider a...@cryptomilk.org
Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: Ensure BTRFS_INODE_NOCOMPRESS isn't set (unless the bit FS_NOCOMP_FL is 
set).

 fs/btrfs/ioctl.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a010c44..a46c169 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -332,6 +332,9 @@ static int btrfs_ioctl_setflags(struct file *file, void 
__user *arg)
goto out_drop;
 
} else {
+   ret = btrfs_set_prop(inode, btrfs.compression, NULL, 0, 0);
+   if (ret  ret != -ENODATA)
+   goto out_drop;
ip-flags = ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
}
 
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] xfstests: btrfs: add test regarding clearing compression flag/property

2014-09-11 Thread Filipe Manana

Regression test for btrfs where removing the flag FS_COMPR_FL
(chattr -c) from an inode wouldn't clear its compression property.
This was fixed in the following linux kernel patch:

  Btrfs: add missing compression property remove in btrfs_ioctl_setflags

Signed-off-by: Filipe Manana fdman...@suse.com
---
 tests/btrfs/059 | 85 +
 tests/btrfs/059.out | 11 +++
 tests/btrfs/group   |  1 +
 3 files changed, 97 insertions(+)
 create mode 100755 tests/btrfs/059
 create mode 100644 tests/btrfs/059.out

diff --git a/tests/btrfs/059 b/tests/btrfs/059
new file mode 100755
index 000..3379ead
--- /dev/null
+++ b/tests/btrfs/059
@@ -0,0 +1,85 @@
+#! /bin/bash
+# FS QA Test No. btrfs/059
+#
+# Regression test for btrfs where removing the flag FS_COMPR_FL (chattr -c)
+# from an inode wouldn't clear its compression property.
+# This was fixed in the following linux kernel patch:
+#
+# Btrfs: add missing compression property remove in btrfs_ioctl_setflags
+#
+#---
+# Copyright (C) 2014 SUSE Linux Products GmbH. All Rights Reserved.
+# Author: Filipe Manana fdman...@suse.com
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo QA output created by $seq
+
+here=`pwd`
+tmp=/tmp/$$
+
+status=1   # failure is the default!
+trap _cleanup; exit \$status 0 1 2 3 15
+
+_cleanup()
+{
+   rm -fr $tmp
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# real QA test starts here
+_supported_fs btrfs
+_supported_os Linux
+_require_test
+_require_scratch
+_require_btrfs property
+_need_to_be_root
+
+rm -f $seqres.full
+
+_scratch_mkfs  $seqres.full 21
+_scratch_mount
+
+mkdir $SCRATCH_MNT/testdir
+echo Setting compression flag in the directory...
+chattr +c $SCRATCH_MNT/testdir
+echo Directory compression property value:
+$BTRFS_UTIL_PROG property get $SCRATCH_MNT/testdir compression
+
+touch $SCRATCH_MNT/testdir/file1
+echo file1 compression property value:
+$BTRFS_UTIL_PROG property get $SCRATCH_MNT/testdir/file1 compression
+
+echo Clearing compression flag from directory...
+chattr -c $SCRATCH_MNT/testdir
+echo Directory compression property value:
+$BTRFS_UTIL_PROG property get $SCRATCH_MNT/testdir compression
+
+touch $SCRATCH_MNT/testdir/file2
+echo file2 compression property value:
+$BTRFS_UTIL_PROG property get $SCRATCH_MNT/testdir/file2 compression
+
+touch $SCRATCH_MNT/testdir/file1
+echo file1 compression property value:
+$BTRFS_UTIL_PROG property get $SCRATCH_MNT/testdir/file1 compression
+
+status=0
+exit
diff --git a/tests/btrfs/059.out b/tests/btrfs/059.out
new file mode 100644
index 000..9ec9a53
--- /dev/null
+++ b/tests/btrfs/059.out
@@ -0,0 +1,11 @@
+QA output created by 059
+Setting compression flag in the directory...
+Directory compression property value:
+compression=zlib
+file1 compression property value:
+compression=zlib
+Clearing compression flag from directory...
+Directory compression property value:
+file2 compression property value:
+file1 compression property value:
+compression=zlib
diff --git a/tests/btrfs/group b/tests/btrfs/group
index 3fa9778..68b5c79 100644
--- a/tests/btrfs/group
+++ b/tests/btrfs/group
@@ -61,3 +61,4 @@
 056 auto quick
 057 auto quick
 058 auto quick
+059 auto quick
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] Btrfs: set inode's logged_trans/last_log_commit after ranged fsync

2014-09-11 Thread Filipe Manana

When a ranged fsync finishes, if there are still extent maps in the modified
list, still set the inode's logged_trans and last_log_commit. This is important
in case an inode is fsync'ed and unlinked in the same transaction, to ensure its
inode ref gets deleted from the log and the respective dentries in its parent
are deleted too from the log (if the parent directory was fsync'ed in the same
transaction).

Instead make btrfs_inode_in_log() return false if the list of modified extent
maps isn't empty.

This is an incremental change on top of the v4 version of the patch:

Btrfs: fix fsync data loss after a ranged fsync

which was added to its v5, but didn't make it on time.

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/btrfs_inode.h | 13 +++--
 fs/btrfs/tree-log.c| 14 ++
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 74ff403..3511031 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -246,8 +246,17 @@ static inline int btrfs_inode_in_log(struct inode *inode, 
u64 generation)
BTRFS_I(inode)-last_sub_trans =
BTRFS_I(inode)-last_log_commit 
BTRFS_I(inode)-last_sub_trans =
-   BTRFS_I(inode)-root-last_log_commit)
-   return 1;
+   BTRFS_I(inode)-root-last_log_commit) {
+   /*
+* After a ranged fsync we might have left some extent maps
+* (that fall outside the fsync's range). So return false
+* here if the list isn't empty, to make sure btrfs_log_inode()
+* will be called and process those extent maps.
+*/
+   smp_mb();
+   if (list_empty(BTRFS_I(inode)-extent_tree.modified_extents))
+   return 1;
+   }
return 0;
 }
 
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 82db14f..d7c1459 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4090,18 +4090,8 @@ log_extents:
}
}
 
-   write_lock(em_tree-lock);
-   /*
-* If we're doing a ranged fsync and there are still modified extents
-* in the list, we must run on the next fsync call as it might cover
-* those extents (a full fsync or an fsync for other range).
-*/
-   if (list_empty(em_tree-modified_extents)) {
-   BTRFS_I(inode)-logged_trans = trans-transid;
-   BTRFS_I(inode)-last_log_commit =
-   BTRFS_I(inode)-last_sub_trans;
-   }
-   write_unlock(em_tree-lock);
+   BTRFS_I(inode)-logged_trans = trans-transid;
+   BTRFS_I(inode)-last_log_commit = BTRFS_I(inode)-last_sub_trans;
 out_unlock:
if (unlikely(err))
btrfs_put_logged_extents(logged_list);
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 1/2] Btrfs: add missing end_page_writeback on submit_extent_page failure

2014-09-22 Thread Filipe Manana

If submit_extent_page() fails in write_one_eb(), we end up with the current
page not marked dirty anymore, unlocked and marked for writeback. But we never
end up calling end_page_writeback() against the page, which will make calls to
filemap_fdatawait_range (e.g. at transaction commit time) hang forever waiting
for the writeback bit to be cleared from the page.

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/extent_io.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3af4966..91f866c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3668,6 +3668,7 @@ static noinline_for_stack int write_one_eb(struct 
extent_buffer *eb,
if (ret) {
set_bit(EXTENT_BUFFER_IOERR, eb-bflags);
SetPageError(p);
+   end_page_writeback(p);
if (atomic_sub_and_test(num_pages - i, eb-io_pages))
end_extent_buffer_writeback(eb);
ret = -EIO;
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 2/2] Btrfs: be aware of btree inode write errors to avoid fs corruption

2014-09-22 Thread Filipe Manana

While we have a transaction ongoing, the VM might decide at any time
to call btree_inode-i_mapping-a_ops-writepages(), which will start
writeback of dirty pages belonging to btree nodes/leafs. This call
might return an error or the writeback might finish with an error
before we attempt to commit the running transaction. If this happens,
we might have no way of knowing that such error happened when we are
committing the transaction - because the pages might no longer be
marked dirty nor tagged for writeback (if a subsequent modification
to the extent buffer didn't happen before the transaction commit) which
makes filemap_fdata[write|wait]_range unable to find such pages (even
if they're marked with SetPageError).
So if this happens we must abort the transaction, otherwise we commit
a super block with btree roots that point to btree nodes/leafs whose
content on disk is invalid - either garbage or the content of some
node/leaf from a past generation that got cowed or deleted and is no
longer valid (for this latter case we end up getting error messages like
"parent transid verify failed on 10826481664 wanted 25748 found 29562"
when reading btree nodes/leafs from disk).

Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's
i_mapping would not be enough because we need to distinguish between
log tree extents (not fatal) vs non-log tree extents (fatal) and
because the next call to filemap_fdatawait_range() will catch and clear
such errors in the mapping - and that call might be from a log sync and
not from a transaction commit, which means we would not know about the
error at transaction commit time. Also, checking for the eb flag
EXTENT_BUFFER_IOERR at transaction commit time isn't done and would
not be completely reliable, as the eb might be removed from memory and
read back when trying to get it, which clears that flag right before
reading the eb's pages from disk, making us not know about the previous
write error.

Using the BTRFS_INODE_BTREE_IO_ERR and BTRFS_INODE_BTREE_LOG_IO_ERR
inode flags also makes us achieve the goal of AS_EIO/AS_ENOSPC when
writepages() returns success, started writeback for all dirty pages
and before filemap_fdatawait_range() is called, the writeback for
all dirty pages had already finished with errors - because we were
not using AS_EIO/AS_ENOSPC, filemap_fdatawait_range() would return
success, as it could not know that writeback errors happened (the
pages were no longer tagged for writeback).

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/btrfs_inode.h |  2 ++
 fs/btrfs/extent_io.c   | 69 +++---
 fs/btrfs/transaction.c | 20 ---
 fs/btrfs/transaction.h |  3 +--
 fs/btrfs/tree-log.c| 13 ++
 5 files changed, 93 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 3511031..dbe37dc 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -44,6 +44,8 @@
 #define BTRFS_INODE_IN_DELALLOC_LIST   9
 #define BTRFS_INODE_READDIO_NEED_LOCK  10
 #define BTRFS_INODE_HAS_PROPS  11
+#define BTRFS_INODE_BTREE_IO_ERR   12
+#define BTRFS_INODE_BTREE_LOG_IO_ERR   13
 
 /* in memory btrfs inode */
 struct btrfs_inode {
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 91f866c..33b113b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -20,6 +20,7 @@
 #include locking.h
 #include rcu-string.h
 #include backref.h
+#include transaction.h
 
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
@@ -3606,6 +3607,68 @@ static void end_extent_buffer_writeback(struct 
extent_buffer *eb)
wake_up_bit(eb-bflags, EXTENT_BUFFER_WRITEBACK);
 }
 
+static void set_btree_ioerr(struct page *page, int err)
+{
+   struct extent_buffer *eb = (struct extent_buffer *)page-private;
+   const u64 start = eb-start;
+   const u64 end = eb-start + eb-len - 1;
+   struct btrfs_fs_info *fs_info = eb-fs_info;
+   int ret;
+
+   set_bit(EXTENT_BUFFER_IOERR, eb-bflags);
+   SetPageError(page);
+
+   /*
+* If writeback for a btree extent that doesn't belong to a log tree
+* failed, set the bit BTRFS_INODE_BTREE_IO_ERR in the inode btree.
+* We do this because while the transaction is running and before it's
+* committing (when we call filemap_fdata[write|wait]_range against
+* the btree inode), we might have
+* btree_inode-i_mapping-a_ops-writepages() called by the VM - if it
+* returns an error or an error happens during writeback, when we're
+* committing the transaction we wouldn't know about it, since the pages
+* can be no longer dirty nor marked anymore for writeback (if a
+* subsequent modification to the extent buffer didn't happen before the
+* transaction commit), which makes filemap_fdata[write|wait]_range not
+* able to find

[PATCH 2/2 v2] Btrfs: be aware of btree inode write errors to avoid fs corruption

2014-09-23 Thread Filipe Manana

While we have a transaction ongoing, the VM might decide at any time
to call btree_inode-i_mapping-a_ops-writepages(), which will start
writeback of dirty pages belonging to btree nodes/leafs. This call
might return an error or the writeback might finish with an error
before we attempt to commit the running transaction. If this happens,
we might have no way of knowing that such error happened when we are
committing the transaction - because the pages might no longer be
marked dirty nor tagged for writeback (if a subsequent modification
to the extent buffer didn't happen before the transaction commit) which
makes filemap_fdata[write|wait]_range unable to find such pages (even
if they're marked with SetPageError).
So if this happens we must abort the transaction, otherwise we commit
a super block with btree roots that point to btree nodes/leafs whose
content on disk is invalid - either garbage or the content of some
node/leaf from a past generation that got cowed or deleted and is no
longer valid (for this latter case we end up getting error messages like
"parent transid verify failed on 10826481664 wanted 25748 found 29562"
when reading btree nodes/leafs from disk).

Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's
i_mapping would not be enough because we need to distinguish between
log tree extents (not fatal) vs non-log tree extents (fatal) and
because the next call to filemap_fdatawait_range() will catch and clear
such errors in the mapping - and that call might be from a log sync and
not from a transaction commit, which means we would not know about the
error at transaction commit time. Also, checking for the eb flag
EXTENT_BUFFER_IOERR at transaction commit time isn't done and would
not be completely reliable, as the eb might be removed from memory and
read back when trying to get it, which clears that flag right before
reading the eb's pages from disk, making us not know about the previous
write error.

Using the new counters eb_write_errors and log_eb_write_errors in the
transaction also makes us achieve the goal of AS_EIO/AS_ENOSPC when
writepages() returns success, started writeback for all dirty pages
and before filemap_fdatawait_range() is called, the writeback for
all dirty pages had already finished with errors - because we were
not using AS_EIO/AS_ENOSPC, filemap_fdatawait_range() would return
success, as it could not know that writeback errors happened (the
pages were no longer tagged for writeback).

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: If an extent buffer's write failed but it's also deleted from the tree
before the transaction commits, don't abort the transaction with -EIO,
since the unwritten node/leaf it represents can't be pointed to by any
other node in a tree.

 fs/btrfs/disk-io.c |  4 +--
 fs/btrfs/extent-tree.c |  8 ++
 fs/btrfs/extent_io.c   | 76 +-
 fs/btrfs/extent_io.h   |  3 +-
 fs/btrfs/transaction.c | 22 +--
 fs/btrfs/transaction.h |  5 ++--
 fs/btrfs/tree-log.c| 13 +
 7 files changed, 111 insertions(+), 20 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 23393ec..8b54acf 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -610,7 +610,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio 
*io_bio,
goto err;
 
eb-read_mirror = mirror;
-   if (test_bit(EXTENT_BUFFER_IOERR, eb-bflags)) {
+   if (test_bit(EXTENT_BUFFER_READ_ERR, eb-bflags)) {
ret = -EIO;
goto err;
}
@@ -683,7 +683,7 @@ static int btree_io_failed_hook(struct page *page, int 
failed_mirror)
struct btrfs_root *root = BTRFS_I(page-mapping-host)-root;
 
eb = (struct extent_buffer *)page-private;
-   set_bit(EXTENT_BUFFER_IOERR, eb-bflags);
+   set_bit(EXTENT_BUFFER_READ_ERR, eb-bflags);
eb-read_mirror = failed_mirror;
atomic_dec(eb-io_pages);
if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, eb-bflags))
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ef0845d..608814b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6221,6 +6221,14 @@ out:
 * anymore.
 */
clear_bit(EXTENT_BUFFER_CORRUPT, buf-bflags);
+   /*
+* The unwritten node/leaf (due to an IO error) isn't pointed to by any
+* other node in a tree, so it's safe to forget about the write error
+* and avoid a transaction abort.
+*/
+   if (test_and_clear_bit(EXTENT_BUFFER_WRITE_ERR, buf-bflags))
+   atomic_dec(trans-transaction-eb_write_errors);
+
btrfs_put_block_group(cache);
 }
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 91f866c..e21f200 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -20,6 +20,7 @@
 #include locking.h
 #include rcu-string.h
 #include backref.h
+#include transaction.h
 
 static struct kmem_cache

[PATCH 2/2 v3] Btrfs: be aware of btree inode write errors to avoid fs corruption

2014-09-23 Thread Filipe Manana

While we have a transaction ongoing, the VM might decide at any time
to call btree_inode->i_mapping->a_ops->writepages(), which will start
writeback of dirty pages belonging to btree nodes/leafs. This call
might return an error or the writeback might finish with an error
before we attempt to commit the running transaction. If this happens,
we might have no way of knowing that such error happened when we are
committing the transaction - because the pages might no longer be
marked dirty nor tagged for writeback (if a subsequent modification
to the extent buffer didn't happen before the transaction commit) which
makes filemap_fdata[write|wait]_range unable to find such pages (even
if they're marked with SetPageError).
So if this happens we must abort the transaction, otherwise we commit
a super block with btree roots that point to btree nodes/leafs whose
content on disk is invalid - either garbage or the content of some
node/leaf from a past generation that got cowed or deleted and is no
longer valid (for this latter case we end up getting error messages like
"parent transid verify failed on 10826481664 wanted 25748 found 29562"
when reading btree nodes/leafs from disk).

Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's
i_mapping would not be enough because we need to distinguish between
log tree extents (not fatal) vs non-log tree extents (fatal) and
because the next call to filemap_fdatawait_range() will catch and clear
such errors in the mapping - and that call might be from a log sync and
not from a transaction commit, which means we would not know about the
error at transaction commit time. Also, checking for the eb flag
EXTENT_BUFFER_IOERR at transaction commit time isn't done and would
not be completely reliable, as the eb might be removed from memory and
read back when trying to get it, which clears that flag right before
reading the eb's pages from disk, making us not know about the previous
write error.

Using the new counters eb_write_errors and log_eb_write_errors in the
transaction also achieves the goal of AS_EIO/AS_ENOSPC for the case where
writepages() returns success and starts writeback for all dirty pages,
but the writeback for all those pages finishes with errors before
filemap_fdatawait_range() is called. Because we were not using
AS_EIO/AS_ENOSPC, filemap_fdatawait_range() would return success in that
case, as it could not know that writeback errors happened (the pages
were no longer tagged for writeback).

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: If an extent buffer's write failed but it's also deleted from the tree
before the transaction commits, don't abort the transaction with -EIO,
since the unwritten node/leaf it represents can't be pointed to by any
other node in a tree.

V3: Correct V2, missed unstaged changes.

 fs/btrfs/disk-io.c |  4 +--
 fs/btrfs/extent-tree.c |  8 ++
 fs/btrfs/extent_io.c   | 76 +-
 fs/btrfs/extent_io.h   |  3 +-
 fs/btrfs/transaction.c | 22 +--
 fs/btrfs/transaction.h |  5 ++--
 fs/btrfs/tree-log.c| 13 +
 7 files changed, 111 insertions(+), 20 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 23393ec..8b54acf 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -610,7 +610,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio 
*io_bio,
goto err;
 
eb-read_mirror = mirror;
-   if (test_bit(EXTENT_BUFFER_IOERR, eb-bflags)) {
+   if (test_bit(EXTENT_BUFFER_READ_ERR, eb-bflags)) {
ret = -EIO;
goto err;
}
@@ -683,7 +683,7 @@ static int btree_io_failed_hook(struct page *page, int 
failed_mirror)
struct btrfs_root *root = BTRFS_I(page-mapping-host)-root;
 
eb = (struct extent_buffer *)page-private;
-   set_bit(EXTENT_BUFFER_IOERR, eb-bflags);
+   set_bit(EXTENT_BUFFER_READ_ERR, eb-bflags);
eb-read_mirror = failed_mirror;
atomic_dec(eb-io_pages);
if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, eb-bflags))
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ef0845d..608814b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6221,6 +6221,14 @@ out:
 * anymore.
 */
clear_bit(EXTENT_BUFFER_CORRUPT, buf-bflags);
+   /*
+* The unwritten node/leaf (due to an IO error) isn't pointed to by any
+* other node in a tree, so it's safe to forget about the write error
+* and avoid a transaction abort.
+*/
+   if (test_and_clear_bit(EXTENT_BUFFER_WRITE_ERR, buf-bflags))
+   atomic_dec(trans-transaction-eb_write_errors);
+
btrfs_put_block_group(cache);
 }
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 91f866c..da1706f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -20,6 +20,7 @@
 #include locking.h
 #include rcu-string.h
 #include backref.h
+#include transaction.h

[PATCH 2/2 v4] Btrfs: be aware of btree inode write errors to avoid fs corruption

2014-09-23 Thread Filipe Manana

While we have a transaction ongoing, the VM might decide at any time
to call btree_inode->i_mapping->a_ops->writepages(), which will start
writeback of dirty pages belonging to btree nodes/leafs. This call
might return an error or the writeback might finish with an error
before we attempt to commit the running transaction. If this happens,
we might have no way of knowing that such error happened when we are
committing the transaction - because the pages might no longer be
marked dirty nor tagged for writeback (if a subsequent modification
to the extent buffer didn't happen before the transaction commit) which
makes filemap_fdata[write|wait]_range unable to find such pages (even
if they're marked with SetPageError).
So if this happens we must abort the transaction, otherwise we commit
a super block with btree roots that point to btree nodes/leafs whose
content on disk is invalid - either garbage or the content of some
node/leaf from a past generation that got cowed or deleted and is no
longer valid (for this latter case we end up getting error messages like
"parent transid verify failed on 10826481664 wanted 25748 found 29562"
when reading btree nodes/leafs from disk).

Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's
i_mapping would not be enough because we need to distinguish between
log tree extents (not fatal) vs non-log tree extents (fatal) and
because the next call to filemap_fdatawait_range() will catch and clear
such errors in the mapping - and that call might be from a log sync and
not from a transaction commit, which means we would not know about the
error at transaction commit time. Also, checking for the eb flag
EXTENT_BUFFER_IOERR at transaction commit time isn't done and would
not be completely reliable, as the eb might be removed from memory and
read back when trying to get it, which clears that flag right before
reading the eb's pages from disk, making us not know about the previous
write error.

Using the new counters eb_write_errors and log_eb_write_errors in the
transaction also achieves the goal of AS_EIO/AS_ENOSPC for the case where
writepages() returns success and starts writeback for all dirty pages,
but the writeback for all those pages finishes with errors before
filemap_fdatawait_range() is called. Because we were not using
AS_EIO/AS_ENOSPC, filemap_fdatawait_range() would return success in that
case, as it could not know that writeback errors happened (the pages
were no longer tagged for writeback).

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: If an extent buffer's write failed but it's also deleted from the tree
before the transaction commits, don't abort the transaction with -EIO,
since the unwritten node/leaf it represents can't be pointed to by any
other node in a tree.

V3: Correct V2, missed unstaged changes.

V4: Use root's key to figure out which counter to update.

 fs/btrfs/disk-io.c |  4 +--
 fs/btrfs/extent-tree.c | 12 +
 fs/btrfs/extent_io.c   | 71 +-
 fs/btrfs/extent_io.h   |  3 ++-
 fs/btrfs/transaction.c | 22 +---
 fs/btrfs/transaction.h |  5 ++--
 fs/btrfs/tree-log.c| 13 +
 7 files changed, 110 insertions(+), 20 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 23393ec..8b54acf 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -610,7 +610,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio 
*io_bio,
goto err;
 
eb-read_mirror = mirror;
-   if (test_bit(EXTENT_BUFFER_IOERR, eb-bflags)) {
+   if (test_bit(EXTENT_BUFFER_READ_ERR, eb-bflags)) {
ret = -EIO;
goto err;
}
@@ -683,7 +683,7 @@ static int btree_io_failed_hook(struct page *page, int 
failed_mirror)
struct btrfs_root *root = BTRFS_I(page-mapping-host)-root;
 
eb = (struct extent_buffer *)page-private;
-   set_bit(EXTENT_BUFFER_IOERR, eb-bflags);
+   set_bit(EXTENT_BUFFER_READ_ERR, eb-bflags);
eb-read_mirror = failed_mirror;
atomic_dec(eb-io_pages);
if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, eb-bflags))
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ef0845d..bdacd33 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6221,6 +6221,18 @@ out:
 * anymore.
 */
clear_bit(EXTENT_BUFFER_CORRUPT, buf-bflags);
+   /*
+* The unwritten node/leaf (due to an IO error) isn't pointed to by any
+* other node in a tree, so it's safe to forget about the write error
+* and avoid a transaction abort.
+*/
+   if (test_and_clear_bit(EXTENT_BUFFER_WRITE_ERR, buf-bflags)) {
+   if (root-root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
+   atomic_dec(trans-transaction-log_eb_write_errors);
+   else
+   atomic_dec(trans-transaction-eb_write_errors);
+   }
+
btrfs_put_block_group(cache

[PATCH 2/2 v5] Btrfs: be aware of btree inode write errors to avoid fs corruption

2014-09-24 Thread Filipe Manana

While we have a transaction ongoing, the VM might decide at any time
to call btree_inode->i_mapping->a_ops->writepages(), which will start
writeback of dirty pages belonging to btree nodes/leafs. This call
might return an error or the writeback might finish with an error
before we attempt to commit the running transaction. If this happens,
we might have no way of knowing that such error happened when we are
committing the transaction - because the pages might no longer be
marked dirty nor tagged for writeback (if a subsequent modification
to the extent buffer didn't happen before the transaction commit) which
makes filemap_fdata[write|wait]_range unable to find such pages (even
if they're marked with SetPageError).
So if this happens we must abort the transaction, otherwise we commit
a super block with btree roots that point to btree nodes/leafs whose
content on disk is invalid - either garbage or the content of some
node/leaf from a past generation that got cowed or deleted and is no
longer valid (for this latter case we end up getting error messages like
"parent transid verify failed on 10826481664 wanted 25748 found 29562"
when reading btree nodes/leafs from disk).

Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's
i_mapping would not be enough because we need to distinguish between
log tree extents (not fatal) vs non-log tree extents (fatal) and
because the next call to filemap_fdatawait_range() will catch and clear
such errors in the mapping - and that call might be from a log sync and
not from a transaction commit, which means we would not know about the
error at transaction commit time. Also, checking for the eb flag
EXTENT_BUFFER_IOERR at transaction commit time isn't done and would
not be completely reliable, as the eb might be removed from memory and
read back when trying to get it, which clears that flag right before
reading the eb's pages from disk, making us not know about the previous
write error.

Using the new counters eb_write_errors and log_eb_write_errors in the
transaction also achieves the goal of AS_EIO/AS_ENOSPC for the case where
writepages() returns success and starts writeback for all dirty pages,
but the writeback for all those pages finishes with errors before
filemap_fdatawait_range() is called. Because we were not using
AS_EIO/AS_ENOSPC, filemap_fdatawait_range() would return success in that
case, as it could not know that writeback errors happened (the pages
were no longer tagged for writeback).

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: If an extent buffer's write failed but it's also deleted from the tree
before the transaction commits, don't abort the transaction with -EIO,
since the unwritten node/leaf it represents can't be pointed to by any
other node in a tree.

V3: Correct V2, missed unstaged changes.

V4: Use root's key to figure out which counter to update.

V5: Decrement the error counters too when an eb is made dirty again (the
next write attempt might succeed).

 fs/btrfs/disk-io.c |  4 +--
 fs/btrfs/extent-tree.c |  1 +
 fs/btrfs/extent_io.c   | 98 ++
 fs/btrfs/extent_io.h   |  4 ++-
 fs/btrfs/transaction.c | 21 +--
 fs/btrfs/transaction.h |  5 +--
 fs/btrfs/tree-log.c| 13 ---
 7 files changed, 126 insertions(+), 20 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 23393ec..8b54acf 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -610,7 +610,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio 
*io_bio,
goto err;
 
eb-read_mirror = mirror;
-   if (test_bit(EXTENT_BUFFER_IOERR, eb-bflags)) {
+   if (test_bit(EXTENT_BUFFER_READ_ERR, eb-bflags)) {
ret = -EIO;
goto err;
}
@@ -683,7 +683,7 @@ static int btree_io_failed_hook(struct page *page, int 
failed_mirror)
struct btrfs_root *root = BTRFS_I(page-mapping-host)-root;
 
eb = (struct extent_buffer *)page-private;
-   set_bit(EXTENT_BUFFER_IOERR, eb-bflags);
+   set_bit(EXTENT_BUFFER_READ_ERR, eb-bflags);
eb-read_mirror = failed_mirror;
atomic_dec(eb-io_pages);
if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, eb-bflags))
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ef0845d..ec185b5 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6221,6 +6221,7 @@ out:
 * anymore.
 */
clear_bit(EXTENT_BUFFER_CORRUPT, buf-bflags);
+   clear_extent_buffer_write_err(buf);
btrfs_put_block_group(cache);
 }
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 91f866c..ed1be9c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -20,6 +20,7 @@
 #include locking.h
 #include rcu-string.h
 #include backref.h
+#include transaction.h
 
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
@@ -3606,6 +3607,63 @@ static void

[PATCH 2/2 v6] Btrfs: be aware of btree inode write errors to avoid fs corruption

2014-09-24 Thread Filipe Manana

While we have a transaction ongoing, the VM might decide at any time
to call btree_inode->i_mapping->a_ops->writepages(), which will start
writeback of dirty pages belonging to btree nodes/leafs. This call
might return an error or the writeback might finish with an error
before we attempt to commit the running transaction. If this happens,
we might have no way of knowing that such error happened when we are
committing the transaction - because the pages might no longer be
marked dirty nor tagged for writeback (if a subsequent modification
to the extent buffer didn't happen before the transaction commit) which
makes filemap_fdata[write|wait]_range unable to find such pages (even
if they're marked with SetPageError).
So if this happens we must abort the transaction, otherwise we commit
a super block with btree roots that point to btree nodes/leafs whose
content on disk is invalid - either garbage or the content of some
node/leaf from a past generation that got cowed or deleted and is no
longer valid (for this latter case we end up getting error messages like
"parent transid verify failed on 10826481664 wanted 25748 found 29562"
when reading btree nodes/leafs from disk).

Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's
i_mapping would not be enough because we need to distinguish between
log tree extents (not fatal) vs non-log tree extents (fatal) and
because the next call to filemap_fdatawait_range() will catch and clear
such errors in the mapping - and that call might be from a log sync and
not from a transaction commit, which means we would not know about the
error at transaction commit time. Also, checking for the eb flag
EXTENT_BUFFER_IOERR at transaction commit time isn't done and would
not be completely reliable, as the eb might be removed from memory and
read back when trying to get it, which clears that flag right before
reading the eb's pages from disk, making us not know about the previous
write error.

Using the new counters eb_write_errors and log_eb_write_errors in the
transaction also achieves the goal of AS_EIO/AS_ENOSPC for the case where
writepages() returns success and starts writeback for all dirty pages,
but the writeback for all those pages finishes with errors before
filemap_fdatawait_range() is called. Because we were not using
AS_EIO/AS_ENOSPC, filemap_fdatawait_range() would return success in that
case, as it could not know that writeback errors happened (the pages
were no longer tagged for writeback).

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: If an extent buffer's write failed but it's also deleted from the tree
before the transaction commits, don't abort the transaction with -EIO,
since the unwritten node/leaf it represents can't be pointed to by any
other node in a tree.

V3: Correct V2, missed unstaged changes.

V4: Use root's key to figure out which counter to update.

V5: Decrement the error counters too when an eb is made dirty again (the
next write attempt might succeed).

V6: Moved counters from transaction struct to fs_info struct, because there's
a (short) time window where fs_info->running_transaction is NULL.
There are now 2 counters for log extent buffers too, each one representing
a different log transaction.

 fs/btrfs/ctree.h   |   3 ++
 fs/btrfs/disk-io.c |   7 +++-
 fs/btrfs/extent-tree.c |   1 +
 fs/btrfs/extent_io.c   | 105 +
 fs/btrfs/extent_io.h   |   4 +-
 fs/btrfs/transaction.c |  18 +
 fs/btrfs/tree-log.c|  23 +++
 fs/btrfs/tree-log.h|   2 +
 8 files changed, 153 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f20b60d..0e5ca39 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1510,6 +1510,9 @@ struct btrfs_fs_info {
atomic_t async_delalloc_pages;
atomic_t open_ioctl_trans;
 
+   atomic_t eb_write_errors;
+   atomic_t log_eb_write_errors[2];
+
/*
 * this is used to protect the following list -- ordered_roots.
 */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 23393ec..8f1deca 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -610,7 +610,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio 
*io_bio,
goto err;
 
eb-read_mirror = mirror;
-   if (test_bit(EXTENT_BUFFER_IOERR, eb-bflags)) {
+   if (test_bit(EXTENT_BUFFER_READ_ERR, eb-bflags)) {
ret = -EIO;
goto err;
}
@@ -683,7 +683,7 @@ static int btree_io_failed_hook(struct page *page, int 
failed_mirror)
struct btrfs_root *root = BTRFS_I(page-mapping-host)-root;
 
eb = (struct extent_buffer *)page-private;
-   set_bit(EXTENT_BUFFER_IOERR, eb-bflags);
+   set_bit(EXTENT_BUFFER_READ_ERR, eb-bflags);
eb-read_mirror = failed_mirror;
atomic_dec(eb-io_pages);
if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, eb-bflags))
@@ -2271,6 +2271,9 @@ int

[PATCH 2/2 v7] Btrfs: be aware of btree inode write errors to avoid fs corruption

2014-09-24 Thread Filipe Manana

While we have a transaction ongoing, the VM might decide at any time
to call btree_inode->i_mapping->a_ops->writepages(), which will start
writeback of dirty pages belonging to btree nodes/leafs. This call
might return an error or the writeback might finish with an error
before we attempt to commit the running transaction. If this happens,
we might have no way of knowing that such error happened when we are
committing the transaction - because the pages might no longer be
marked dirty nor tagged for writeback (if a subsequent modification
to the extent buffer didn't happen before the transaction commit) which
makes filemap_fdata[write|wait]_range unable to find such pages (even
if they're marked with SetPageError).
So if this happens we must abort the transaction, otherwise we commit
a super block with btree roots that point to btree nodes/leafs whose
content on disk is invalid - either garbage or the content of some
node/leaf from a past generation that got cowed or deleted and is no
longer valid (for this latter case we end up getting error messages like
"parent transid verify failed on 10826481664 wanted 25748 found 29562"
when reading btree nodes/leafs from disk).

Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's
i_mapping would not be enough because we need to distinguish between
log tree extents (not fatal) vs non-log tree extents (fatal) and
because the next call to filemap_fdatawait_range() will catch and clear
such errors in the mapping - and that call might be from a log sync and
not from a transaction commit, which means we would not know about the
error at transaction commit time. Also, checking for the eb flag
EXTENT_BUFFER_IOERR at transaction commit time isn't done and would
not be completely reliable, as the eb might be removed from memory and
read back when trying to get it, which clears that flag right before
reading the eb's pages from disk, making us not know about the previous
write error.

Using the new counters eb_write_errors and log_eb_write_errors in the
transaction also achieves the goal of AS_EIO/AS_ENOSPC for the case where
writepages() returns success and starts writeback for all dirty pages,
but the writeback for all those pages finishes with errors before
filemap_fdatawait_range() is called. Because we were not using
AS_EIO/AS_ENOSPC, filemap_fdatawait_range() would return success in that
case, as it could not know that writeback errors happened (the pages
were no longer tagged for writeback).

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: If an extent buffer's write failed but it's also deleted from the tree
before the transaction commits, don't abort the transaction with -EIO,
since the unwritten node/leaf it represents can't be pointed to by any
other node in a tree.

V3: Correct V2, missed unstaged changes.

V4: Use root's key to figure out which counter to update.

V5: Decrement the error counters too when an eb is made dirty again (the
next write attempt might succeed).

V6: Moved counters from transaction struct to fs_info struct, because there's
a (short) time window where fs_info->running_transaction is NULL.
There are now 2 counters for log extent buffers too, each one representing
a different log transaction.

V7: Track the eb's log index in the eb itself, otherwise it wasn't possible
to find it when writeback was triggered from a transaction commit.

 fs/btrfs/ctree.h   |   3 ++
 fs/btrfs/disk-io.c |   7 +++-
 fs/btrfs/extent-tree.c |   4 +-
 fs/btrfs/extent_io.c   | 101 +
 fs/btrfs/extent_io.h   |   6 ++-
 fs/btrfs/transaction.c |  18 +
 6 files changed, 128 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f20b60d..0e5ca39 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1510,6 +1510,9 @@ struct btrfs_fs_info {
atomic_t async_delalloc_pages;
atomic_t open_ioctl_trans;
 
+   atomic_t eb_write_errors;
+   atomic_t log_eb_write_errors[2];
+
/*
 * this is used to protect the following list -- ordered_roots.
 */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 23393ec..8f1deca 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -610,7 +610,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio 
*io_bio,
goto err;
 
eb-read_mirror = mirror;
-   if (test_bit(EXTENT_BUFFER_IOERR, eb-bflags)) {
+   if (test_bit(EXTENT_BUFFER_READ_ERR, eb-bflags)) {
ret = -EIO;
goto err;
}
@@ -683,7 +683,7 @@ static int btree_io_failed_hook(struct page *page, int 
failed_mirror)
struct btrfs_root *root = BTRFS_I(page-mapping-host)-root;
 
eb = (struct extent_buffer *)page-private;
-   set_bit(EXTENT_BUFFER_IOERR, eb-bflags);
+   set_bit(EXTENT_BUFFER_READ_ERR, eb-bflags);
eb-read_mirror = failed_mirror;
atomic_dec(eb-io_pages

[PATCH 2/2 v8] Btrfs: be aware of btree inode write errors to avoid fs corruption

2014-09-25 Thread Filipe Manana

While we have a transaction ongoing, the VM might decide at any time
to call btree_inode->i_mapping->a_ops->writepages(), which will start
writeback of dirty pages belonging to btree nodes/leafs. This call
might return an error or the writeback might finish with an error
before we attempt to commit the running transaction. If this happens,
we might have no way of knowing that such error happened when we are
committing the transaction - because the pages might no longer be
marked dirty nor tagged for writeback (if a subsequent modification
to the extent buffer didn't happen before the transaction commit) which
makes filemap_fdata[write|wait]_range unable to find such pages (even
if they're marked with SetPageError).
So if this happens we must abort the transaction, otherwise we commit
a super block with btree roots that point to btree nodes/leafs whose
content on disk is invalid - either garbage or the content of some
node/leaf from a past generation that got cowed or deleted and is no
longer valid (for this latter case we end up getting error messages like
"parent transid verify failed on 10826481664 wanted 25748 found 29562"
when reading btree nodes/leafs from disk).

Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's
i_mapping would not be enough because we need to distinguish between
log tree extents (not fatal) vs non-log tree extents (fatal) and
because the next call to filemap_fdatawait_range() will catch and clear
such errors in the mapping - and that call might be from a log sync and
not from a transaction commit, which means we would not know about the
error at transaction commit time. Also, checking for the eb flag
EXTENT_BUFFER_IOERR at transaction commit time isn't done and would
not be completely reliable, as the eb might be removed from memory and
read back when trying to get it, which clears that flag right before
reading the eb's pages from disk, making us not know about the previous
write error.

Using the new counters eb_write_errors and log_eb_write_errors in the
transaction also achieves the goal of AS_EIO/AS_ENOSPC for the case where
writepages() returns success and starts writeback for all dirty pages,
but the writeback for all those pages finishes with errors before
filemap_fdatawait_range() is called. Because we were not using
AS_EIO/AS_ENOSPC, filemap_fdatawait_range() would return success in that
case, as it could not know that writeback errors happened (the pages
were no longer tagged for writeback).

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: If an extent buffer's write failed but it's also deleted from the tree
before the transaction commits, don't abort the transaction with -EIO,
since the unwritten node/leaf it represents can't be pointed to by any
other node in a tree.

V3: Correct V2, missed unstaged changes.

V4: Use root's key to figure out which counter to update.

V5: Decrement the error counters too when an eb is made dirty again (the
next write attempt might succeed).

V6: Moved counters from transaction struct to fs_info struct, because there's
a (short) time window where fs_info->running_transaction is NULL.
There are now 2 counters for log extent buffers too, each one representing
a different log transaction.

V7: Track the eb's log index in the eb itself, otherwise it wasn't possible
to find it when writeback was triggered from a transaction commit.

V8: Track the log eb write errors per root instead, and reset them on a
transaction commit.

 fs/btrfs/ctree.h   |   2 +
 fs/btrfs/disk-io.c |   7 +++-
 fs/btrfs/extent-tree.c |   4 +-
 fs/btrfs/extent_io.c   | 100 +
 fs/btrfs/extent_io.h   |   8 +++-
 fs/btrfs/transaction.c |  18 +
 fs/btrfs/tree-log.c|   2 +
 7 files changed, 129 insertions(+), 12 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f20b60d..96f5186 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1509,6 +1509,7 @@ struct btrfs_fs_info {
atomic_t nr_async_bios;
atomic_t async_delalloc_pages;
atomic_t open_ioctl_trans;
+   atomic_t eb_write_errors;
 
/*
 * this is used to protect the following list -- ordered_roots.
@@ -1790,6 +1791,7 @@ struct btrfs_root {
atomic_t log_writers;
atomic_t log_commit[2];
atomic_t log_batch;
+   atomic_t log_eb_write_errors[2];
int log_transid;
/* No matter the commit succeeds or not*/
int log_transid_committed;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 23393ec..e792ee3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -610,7 +610,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio 
*io_bio,
goto err;
 
eb-read_mirror = mirror;
-   if (test_bit(EXTENT_BUFFER_IOERR, eb-bflags)) {
+   if (test_bit(EXTENT_BUFFER_READ_ERR, eb-bflags)) {
ret = -EIO;
goto err;
}
@@ -683,7 +683,7

[PATCH v3] xfstests: generic: add dir fsync test, motivated by a btrfs bug

2014-09-25 Thread Filipe Manana

This test is motivated by a bug found in btrfs when replaying a
directory from the fsync log. The issue was that if a directory
entry is both found in the persisted metadata and in the fsync
log, at log replay time the directory got set with a wrong i_size.
This had the consequence of not being able to rmdir empty
directories (failed with errno ENOTEMPTY).
This was fixed in btrfs with the following linux kernel patch:

 Btrfs: fix directory recovery from fsync log

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: Replaced direct use of xfs_io with $XFS_IO_PROG.

V3: Made the explanations clearer and mentioned the consequence
of getting a directory's inode with an incorrect i_size on btrfs.
Changed the test to not call the filesystem check (since it's already
done by the test framework) and to verify instead that an rmdir against
an empty directory succeeds.

 tests/generic/326 | 105 ++
 tests/generic/326.out |   2 +
 tests/generic/group   |   1 +
 3 files changed, 108 insertions(+)
 create mode 100755 tests/generic/326
 create mode 100644 tests/generic/326.out

diff --git a/tests/generic/326 b/tests/generic/326
new file mode 100755
index 000..e59c560
--- /dev/null
+++ b/tests/generic/326
@@ -0,0 +1,105 @@
+#! /bin/bash
+# FS QA Test No. 326
+#
+# This test is motivated by a bug found in btrfs when replaying a directory
+# from the fsync log. The issue was that if a directory entry is both found
+# in the persisted metadata and in the fsync log, at log replay time the
+# directory got set with a wrong i_size. This had the consequence of not being
+# able to rmdir empty directories (failed with errno ENOTEMPTY).
+# This was fixed in btrfs with the following linux kernel patch:
+#
+# Btrfs: fix directory recovery from fsync log
+#
+#---
+# Copyright (C) 2014 SUSE Linux Products GmbH. All Rights Reserved.
+# Author: Filipe Manana fdman...@suse.com
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo QA output created by $seq
+
+here=`pwd`
+status=1   # failure is the default!
+
+_cleanup()
+{
+   _cleanup_flakey
+}
+trap _cleanup; exit \$status 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/dmflakey
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+_need_to_be_root
+_require_scratch
+_require_dm_flakey
+
+rm -f $seqres.full
+
+_scratch_mkfs  $seqres.full 21
+
+_init_flakey
+_mount_flakey
+
+mkdir $SCRATCH_MNT/test_dir
+touch $SCRATCH_MNT/test_dir/foo
+
+# Invoke sync here because it's necessary to trigger the original bug in btrfs.
+# The intention is that at log recovery time we have a dir entry for 'foo' both
+# in the fs/subvol tree and in the log tree - this is necessary to trigger the
+# bug on btrfs.
+sync
+
+touch $SCRATCH_MNT/test_dir/bar
+$XFS_IO_PROG -c fsync $SCRATCH_MNT/test_dir
+$XFS_IO_PROG -c fsync $SCRATCH_MNT/test_dir/bar
+
+_load_flakey_table $FLAKEY_DROP_WRITES
+_unmount_flakey
+
+_load_flakey_table $FLAKEY_ALLOW_WRITES
+# In the original btrfs bug, log replay would update the directory's inode
+# i_size incorrectly - it would sum again the size of dentry 'foo' (3) to
+# the inode's i_size, which is incorrect because the dentry was already
+# persisted before (in the fs/subvol tree).
+_mount_flakey
+
+[ -f $SCRATCH_MNT/test_dir/foo ] || echo file foo is missing
+[ -f $SCRATCH_MNT/test_dir/bar ] || echo file bar is missing
+
+rm -f $SCRATCH_MNT/test_dir/foo
+rm -f $SCRATCH_MNT/test_dir/bar
+
+# In btrfs removing all entries from a directory should set the directory's
+# inode i_size to 0, but with this bug that didn't happen and this made
+# an rmdir fail with errno ENOTEMPTY (even though the directory had no more
+# entries in it).
+rmdir $SCRATCH_MNT/test_dir
+[ -d $SCRATCH_MNT/test_dir ]  echo rmdir didn't succeed
+
+_unmount_flakey
+
+echo Silence is golden
+
+status=0
+exit
diff --git a/tests/generic/326.out b/tests/generic/326.out
new file mode 100644
index 000..4ac0db5
--- /dev/null
+++ b/tests/generic/326.out
@@ -0,0 +1,2 @@
+QA output created by 326
+Silence is golden
diff --git a/tests/generic/group b/tests/generic/group
index bdcfd9d

[PATCH 2/2 v9] Btrfs: be aware of btree inode write errors to avoid fs corruption

2014-09-25 Thread Filipe Manana

While we have a transaction ongoing, the VM might decide at any time
to call btree_inode->i_mapping->a_ops->writepages(), which will start
writeback of dirty pages belonging to btree nodes/leafs. This call
might return an error or the writeback might finish with an error
before we attempt to commit the running transaction. If this happens,
we might have no way of knowing that such error happened when we are
committing the transaction - because the pages might no longer be
marked dirty nor tagged for writeback (if a subsequent modification
to the extent buffer didn't happen before the transaction commit) which
makes filemap_fdata[write|wait]_range unable to find such pages (even
if they're marked with SetPageError).
So if this happens we must abort the transaction, otherwise we commit
a super block with btree roots that point to btree nodes/leafs whose
content on disk is invalid - either garbage or the content of some
node/leaf from a past generation that got cowed or deleted and is no
longer valid (for this latter case we end up getting error messages like
"parent transid verify failed on 10826481664 wanted 25748 found 29562"
when reading btree nodes/leafs from disk).

Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's
i_mapping would not be enough because we need to distinguish between
log tree extents (not fatal) vs non-log tree extents (fatal) and
because the next call to filemap_fdatawait_range() will catch and clear
such errors in the mapping - and that call might be from a log sync and
not from a transaction commit, which means we would not know about the
error at transaction commit time. Also, checking for the eb flag
EXTENT_BUFFER_IOERR at transaction commit time isn't done and would
not be completely reliable, as the eb might be removed from memory and
read back when trying to get it, which clears that flag right before
reading the eb's pages from disk, making us not know about the previous
write error.

Using the new counters eb_write_errors and log_eb_write_errors in the
transaction also makes us achieve the goal of AS_EIO/AS_ENOSPC when
writepages() returns success, started writeback for all dirty pages
and before filemap_fdatawait_range() is called, the writeback for
all dirty pages had already finished with errors - because we were
not using AS_EIO/AS_ENOSPC, filemap_fdatawait_range() would return
success, as it could not know that writeback errors happened (the
pages were no longer tagged for writeback).

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: If an extent buffer's write failed but it's also deleted from the tree
before the transaction commits, don't abort the transaction with -EIO,
since the unwritten node/leaf it represents can't be pointed to by any
other node in a tree.

V3: Correct V2, missed unstaged changes.

V4: Use root's key to figure out which counter to update.

V5: Decrement the error counters too when an eb is made dirty again (the
next write attempt might succeed).

V6: Moved counters from transaction struct to fs_info struct, because there's
a (short) time window where fs_info->running_transaction is NULL.
There's now 2 counters for log extent buffers too, each one representing
a different log transaction.

V7: Track the eb's log index in the eb itself, otherwise it wasn't possible
to find it when writeback triggered from a transaction commit.

V8: Track the log eb write errors per root instead, and reset them on a
transaction commit.

V9: Don't decrement the error counters if the eb is deleted or re-written.
It is not safe because there's a time window when committing a transaction,
between setting fs_info->current_transaction to NULL and checking the
error counters in btrfs_write_and_wait_transaction(), where a new transaction
can start and delete or re-write an eb that has the write error flag set.
If this happens it means the previous transaction can write a superblock
that refers to trees that point to unwritten nodes.
Replaced the counters with simple flags in the btree inode's runtime
flags - essentially back to V1 but accounting for the 2 different log
sub-transactions.
Removed access to an eb's parent root through
BTRFS_I(eb->pages[0]->mapping->host)->root since it was not correct, as this
always gives us the btree inode's root (objectid 1ULL). Instead use the
field eb->log_index to know whether it's a log btree eb (and which
sub-transaction) or a non-log btree eb.

 fs/btrfs/btrfs_inode.h | 11 
 fs/btrfs/disk-io.c |  4 +--
 fs/btrfs/extent-tree.c |  4 ++-
 fs/btrfs/extent_io.c   | 76 +-
 fs/btrfs/extent_io.h   |  8 --
 fs/btrfs/transaction.c | 22 +++
 fs/btrfs/tree-log.c|  6 
 7 files changed, 119 insertions(+), 12 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 3511031..aee4050 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs

[PATCH 2/2 v10] Btrfs: be aware of btree inode write errors to avoid fs corruption

2014-09-25 Thread Filipe Manana

While we have a transaction ongoing, the VM might decide at any time
to call btree_inode->i_mapping->a_ops->writepages(), which will start
writeback of dirty pages belonging to btree nodes/leafs. This call
might return an error or the writeback might finish with an error
before we attempt to commit the running transaction. If this happens,
we might have no way of knowing that such error happened when we are
committing the transaction - because the pages might no longer be
marked dirty nor tagged for writeback (if a subsequent modification
to the extent buffer didn't happen before the transaction commit) which
makes filemap_fdata[write|wait]_range unable to find such pages (even
if they're marked with SetPageError).
So if this happens we must abort the transaction, otherwise we commit
a super block with btree roots that point to btree nodes/leafs whose
content on disk is invalid - either garbage or the content of some
node/leaf from a past generation that got cowed or deleted and is no
longer valid (for this latter case we end up getting error messages like
"parent transid verify failed on 10826481664 wanted 25748 found 29562"
when reading btree nodes/leafs from disk).

Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's
i_mapping would not be enough because we need to distinguish between
log tree extents (not fatal) vs non-log tree extents (fatal) and
because the next call to filemap_fdatawait_range() will catch and clear
such errors in the mapping - and that call might be from a log sync and
not from a transaction commit, which means we would not know about the
error at transaction commit time. Also, checking for the eb flag
EXTENT_BUFFER_IOERR at transaction commit time isn't done and would
not be completely reliable, as the eb might be removed from memory and
read back when trying to get it, which clears that flag right before
reading the eb's pages from disk, making us not know about the previous
write error.

Using the new counters eb_write_errors and log_eb_write_errors in the
transaction also makes us achieve the goal of AS_EIO/AS_ENOSPC when
writepages() returns success, started writeback for all dirty pages
and before filemap_fdatawait_range() is called, the writeback for
all dirty pages had already finished with errors - because we were
not using AS_EIO/AS_ENOSPC, filemap_fdatawait_range() would return
success, as it could not know that writeback errors happened (the
pages were no longer tagged for writeback).

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: If an extent buffer's write failed but it's also deleted from the tree
before the transaction commits, don't abort the transaction with -EIO,
since the unwritten node/leaf it represents can't be pointed to by any
other node in a tree.

V3: Correct V2, missed unstaged changes.

V4: Use root's key to figure out which counter to update.

V5: Decrement the error counters too when an eb is made dirty again (the
next write attempt might succeed).

V6: Moved counters from transaction struct to fs_info struct, because there's
a (short) time window where fs_info->running_transaction is NULL.
There's now 2 counters for log extent buffers too, each one representing
a different log transaction.

V7: Track the eb's log index in the eb itself, otherwise it wasn't possible
to find it when writeback triggered from a transaction commit.

V8: Track the log eb write errors per root instead, and reset them on a
transaction commit.

V9: Don't decrement the error counters if the eb is deleted or re-written.
It is not safe because there's a time window when committing a transaction,
between setting fs_info->current_transaction to NULL and checking the
error counters in btrfs_write_and_wait_transaction(), where a new transaction
can start and delete or re-write an eb that has the write error flag set.
If this happens it means the previous transaction can write a superblock
that refers to trees that point to unwritten nodes.
Replaced the counters with simple flags in the btree inode's runtime
flags - essentially back to V1 but accounting for the 2 different log
sub-transactions.
Removed access to an eb's parent root through
BTRFS_I(eb->pages[0]->mapping->host)->root since it was not correct, as this
always gives us the btree inode's root (objectid 1ULL). Instead use the
field eb->log_index to know whether it's a log btree eb (and which
sub-transaction) or a non-log btree eb.

V10: Clear the log eb write error flags in a more logical place (transaction
 commit function).

 fs/btrfs/btrfs_inode.h | 11 
 fs/btrfs/disk-io.c |  4 +--
 fs/btrfs/extent-tree.c |  4 ++-
 fs/btrfs/extent_io.c   | 76 +-
 fs/btrfs/extent_io.h   |  8 --
 fs/btrfs/transaction.c | 26 +
 6 files changed, 117 insertions(+), 12 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index

[PATCH 2/2 v11] Btrfs: be aware of btree inode write errors to avoid fs corruption

2014-09-25 Thread Filipe Manana

While we have a transaction ongoing, the VM might decide at any time
to call btree_inode->i_mapping->a_ops->writepages(), which will start
writeback of dirty pages belonging to btree nodes/leafs. This call
might return an error or the writeback might finish with an error
before we attempt to commit the running transaction. If this happens,
we might have no way of knowing that such error happened when we are
committing the transaction - because the pages might no longer be
marked dirty nor tagged for writeback (if a subsequent modification
to the extent buffer didn't happen before the transaction commit) which
makes filemap_fdata[write|wait]_range unable to find such pages (even
if they're marked with SetPageError).
So if this happens we must abort the transaction, otherwise we commit
a super block with btree roots that point to btree nodes/leafs whose
content on disk is invalid - either garbage or the content of some
node/leaf from a past generation that got cowed or deleted and is no
longer valid (for this latter case we end up getting error messages like
"parent transid verify failed on 10826481664 wanted 25748 found 29562"
when reading btree nodes/leafs from disk).

Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's
i_mapping would not be enough because we need to distinguish between
log tree extents (not fatal) vs non-log tree extents (fatal) and
because the next call to filemap_fdatawait_range() will catch and clear
such errors in the mapping - and that call might be from a log sync and
not from a transaction commit, which means we would not know about the
error at transaction commit time. Also, checking for the eb flag
EXTENT_BUFFER_IOERR at transaction commit time isn't done and would
not be completely reliable, as the eb might be removed from memory and
read back when trying to get it, which clears that flag right before
reading the eb's pages from disk, making us not know about the previous
write error.

Using the new 3 flags for the btree inode also makes us achieve the
goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
writeback for all dirty pages and before filemap_fdatawait_range() is
called, the writeback for all dirty pages had already finished with
errors - because we were not using AS_EIO/AS_ENOSPC,
filemap_fdatawait_range() would return success, as it could not know
that writeback errors happened (the pages were no longer tagged for
writeback).

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: If an extent buffer's write failed but it's also deleted from the tree
before the transaction commits, don't abort the transaction with -EIO,
since the unwritten node/leaf it represents can't be pointed to by any
other node in a tree.

V3: Correct V2, missed unstaged changes.

V4: Use root's key to figure out which counter to update.

V5: Decrement the error counters too when an eb is made dirty again (the
next write attempt might succeed).

V6: Moved counters from transaction struct to fs_info struct, because there's
a (short) time window where fs_info->running_transaction is NULL.
There's now 2 counters for log extent buffers too, each one representing
a different log transaction.

V7: Track the eb's log index in the eb itself, otherwise it wasn't possible
to find it when writeback triggered from a transaction commit.

V8: Track the log eb write errors per root instead, and reset them on a
transaction commit.

V9: Don't decrement the error counters if the eb is deleted or re-written.
It is not safe because there's a time window when committing a transaction,
between setting fs_info->current_transaction to NULL and checking the
error counters in btrfs_write_and_wait_transaction(), where a new transaction
can start and delete or re-write an eb that has the write error flag set.
If this happens it means the previous transaction can write a superblock
that refers to trees that point to unwritten nodes.
Replaced the counters with simple flags in the btree inode's runtime
flags - essentially back to V1 but accounting for the 2 different log
sub-transactions.
Removed access to an eb's parent root through
BTRFS_I(eb->pages[0]->mapping->host)->root since it was not correct, as this
always gives us the btree inode's root (objectid 1ULL). Instead use the
field eb->log_index to know whether it's a log btree eb (and which
sub-transaction) or a non-log btree eb.

V10: Clear the log eb write error flags in a more logical place (transaction
 commit function).

V11: Updated commit message and a comment, replaced an ASSERT() with a BUG()
 and changed eb-lock_nested to a short to keep the structure size.

 fs/btrfs/btrfs_inode.h | 11 
 fs/btrfs/disk-io.c |  4 +--
 fs/btrfs/extent-tree.c |  4 ++-
 fs/btrfs/extent_io.c   | 76 +-
 fs/btrfs/extent_io.h   |  8 --
 fs/btrfs/transaction.c | 26 +
 6 files changed

[PATCH 2/2 v12] Btrfs: be aware of btree inode write errors to avoid fs corruption

2014-09-26 Thread Filipe Manana

While we have a transaction ongoing, the VM might decide at any time
to call btree_inode->i_mapping->a_ops->writepages(), which will start
writeback of dirty pages belonging to btree nodes/leafs. This call
might return an error or the writeback might finish with an error
before we attempt to commit the running transaction. If this happens,
we might have no way of knowing that such error happened when we are
committing the transaction - because the pages might no longer be
marked dirty nor tagged for writeback (if a subsequent modification
to the extent buffer didn't happen before the transaction commit) which
makes filemap_fdata[write|wait]_range unable to find such pages (even
if they're marked with SetPageError).
So if this happens we must abort the transaction, otherwise we commit
a super block with btree roots that point to btree nodes/leafs whose
content on disk is invalid - either garbage or the content of some
node/leaf from a past generation that got cowed or deleted and is no
longer valid (for this latter case we end up getting error messages like
"parent transid verify failed on 10826481664 wanted 25748 found 29562"
when reading btree nodes/leafs from disk).

Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's
i_mapping would not be enough because we need to distinguish between
log tree extents (not fatal) vs non-log tree extents (fatal) and
because the next call to filemap_fdatawait_range() will catch and clear
such errors in the mapping - and that call might be from a log sync and
not from a transaction commit, which means we would not know about the
error at transaction commit time. Also, checking for the eb flag
EXTENT_BUFFER_IOERR at transaction commit time isn't done and would
not be completely reliable, as the eb might be removed from memory and
read back when trying to get it, which clears that flag right before
reading the eb's pages from disk, making us not know about the previous
write error.

Using the new 3 flags for the btree inode also makes us achieve the
goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
writeback for all dirty pages and before filemap_fdatawait_range() is
called, the writeback for all dirty pages had already finished with
errors - because we were not using AS_EIO/AS_ENOSPC,
filemap_fdatawait_range() would return success, as it could not know
that writeback errors happened (the pages were no longer tagged for
writeback).

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: If an extent buffer's write failed but it's also deleted from the tree
before the transaction commits, don't abort the transaction with -EIO,
since the unwritten node/leaf it represents can't be pointed to by any
other node in a tree.

V3: Correct V2, missed unstaged changes.

V4: Use root's key to figure out which counter to update.

V5: Decrement the error counters too when an eb is made dirty again (the
next write attempt might succeed).

V6: Moved counters from transaction struct to fs_info struct, because there's
a (short) time window where fs_info->running_transaction is NULL.
There's now 2 counters for log extent buffers too, each one representing
a different log transaction.

V7: Track the eb's log index in the eb itself, otherwise it wasn't possible
to find it when writeback triggered from a transaction commit.

V8: Track the log eb write errors per root instead, and reset them on a
transaction commit.

V9: Don't decrement the error counters if the eb is deleted or re-written.
It is not safe because there's a time window when committing a transaction,
between setting fs_info->current_transaction to NULL and checking the
error counters in btrfs_write_and_wait_transaction(), where a new transaction
can start and delete or re-write an eb that has the write error flag set.
If this happens it means the previous transaction can write a superblock
that refers to trees that point to unwritten nodes.
Replaced the counters with simple flags in the btree inode's runtime
flags - essentially back to V1 but accounting for the 2 different log
sub-transactions.
Removed access to an eb's parent root through
BTRFS_I(eb->pages[0]->mapping->host)->root since it was not correct, as this
always gives us the btree inode's root (objectid 1ULL). Instead use the
field eb->log_index to know whether it's a log btree eb (and which
sub-transaction) or a non-log btree eb.

V10: Clear the log eb write error flags in a more logical place (transaction
 commit function).

V11: Updated commit message and a comment, replaced an ASSERT() with a BUG()
 and changed eb-lock_nested to a short to keep the structure size.

V12: Removed leftovers from previous versions (no longer necessary #include and
 prototype in extent_io.h of no longer existing function) and updated parts
 from a comment that apply only to some past versions.
 Rebased against latest integration branch (didn't apply

[PATCH] Btrfs: send, don't delay dir move if there's a new parent inode

2014-10-02 Thread Filipe Manana

If between two snapshots we rename an existing directory named X to Y and
make it a child (direct or not) of a new inode named X, we were delaying
the move/rename of the former directory unnecessarily, which would result
in attempting to rename the new directory from its orphan name to name X
prematurely.

Minimal reproducer:

$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ mkdir -p /mnt/merlin/RC/OSD/Source

$ btrfs subvolume snapshot -r /mnt /mnt/mysnap1

$ mkdir /mnt/OSD
$ mv /mnt/merlin/RC/OSD /mnt/OSD/OSD-Plane_788
$ mv /mnt/OSD /mnt/merlin/RC

$ btrfs subvolume snapshot -r /mnt /mnt/mysnap2

$ btrfs send /mnt/mysnap1 -f /tmp/1.snap
$ btrfs send -p /mnt/mysnap1 /mnt/mysnap2 -f /tmp/2.snap

$ mkfs.btrfs -f /dev/vdc
$ mount /dev/vdc /mnt2

$ btrfs receive /mnt2 -f /tmp/1.snap
$ btrfs receive /mnt2 -f /tmp/2.snap

The second receive (from an incremental send) failed with the following
error message: "rename o261-7-0 -> merlin/RC/OSD failed".
This is a regression introduced in the 3.16 kernel.

A test case for xfstests follows.

Reported-by: Marc Merlin m...@merlins.org
Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/send.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 7edfc7c..b9c27aa 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -3327,7 +3327,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
if (ret  0  ret != -ENOENT) {
goto out;
} else if (ret == -ENOENT) {
-   ret = 1;
+   ret = 0;
break;
}
 
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] fstests: regression test for btrfs incremental send

2014-10-02 Thread Filipe Manana

This is a regression test for a btrfs incremental send issue.
If between two snapshots we rename an existing directory named X to Y and
make it a child (direct or not) of a new inode named X, we were delaying
the move/rename of the former directory unnecessarily, which would result
in attempting to rename the new directory from its orphan name to name X
prematurely. This made btrfs receive fail with an error message like the
following:

   rename o261-7-0 -> merlin/RC/OSD failed

This issue was a regression in the 3.16 kernel and got fixed by the
following linux kernel btrfs patch:

   Btrfs: send, don't delay dir move if there's a new parent inode

Signed-off-by: Filipe Manana fdman...@suse.com
---
 tests/btrfs/060 | 148 
 tests/btrfs/060.out |   2 +
 tests/btrfs/group   |   1 +
 3 files changed, 151 insertions(+)
 create mode 100755 tests/btrfs/060
 create mode 100644 tests/btrfs/060.out

diff --git a/tests/btrfs/060 b/tests/btrfs/060
new file mode 100755
index 000..20dc0ad
--- /dev/null
+++ b/tests/btrfs/060
@@ -0,0 +1,148 @@
+#! /bin/bash
+# FS QA Test No. btrfs/060
+#
+# Regression test for a btrfs incremental send issue.
+# If between two snapshots we rename an existing directory named X to Y and
+# make it a child (direct or not) of a new inode named X, we were delaying
+# the move/rename of the former directory unnecessarily, which would result
+# in attempting to rename the new directory from its orphan name to name X
+# prematurely. This made btrfs receive fail with an error message like the
+# following:
+#
+#   rename o261-7-0 - merlin/RC/OSD failed
+#
+# This issue was a regression in the 3.16 kernel and got fixed by the following
+# linux kernel btrfs patch:
+#
+#   Btrfs: send, don't delay dir move if there's a new parent inode
+#
+#---
+# Copyright (C) 2014 SUSE Linux Products GmbH. All Rights Reserved.
+# Author: Filipe Manana fdman...@suse.com
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo QA output created by $seq
+
+tmp=`mktemp -d`
+status=1   # failure is the default!
+trap _cleanup; exit \$status 0 1 2 3 15
+
+_cleanup()
+{
+rm -fr $send_files_dir
+rm -fr $tmp
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# real QA test starts here
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch
+_require_fssum
+_need_to_be_root
+
+send_files_dir=$TEST_DIR/btrfs-test-$seq
+
+rm -f $seqres.full
+rm -fr $send_files_dir
+mkdir $send_files_dir
+
+_scratch_mkfs $seqres.full 21
+_scratch_mount
+
+mkdir -p $SCRATCH_MNT/merlin/RC/OSD/Source
+mkdir -p $SCRATCH_MNT/fdm/RCz/OSDz/Sourcez
+mkdir -p $SCRATCH_MNT/Z/Z2
+
+# Filesystem looks like:
+#
+# . (ino 256)
+# | merlin/ (ino 257)
+# || RC/(ino 258)
+# |  |- OSD/(ino 259)
+# |  | Source/  (ino 260)
+# |
+# | fdm/(ino 261)
+# |   | RCz/(ino 262)
+# | |- OSDz/(ino 263)
+# | | Sourcez/  (ino 264)
+# |
+# | Z/  (ino 265)
+#   | Z2/   (ino 266)
+#
+_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT $SCRATCH_MNT/mysnap1
+
+mkdir $SCRATCH_MNT/OSD
+mv $SCRATCH_MNT/merlin/RC/OSD $SCRATCH_MNT/OSD/OSD-Plane_788
+mv $SCRATCH_MNT/OSD $SCRATCH_MNT/merlin/RC
+
+mkdir $SCRATCH_MNT/OSDz
+mv $SCRATCH_MNT/Z/Z2 $SCRATCH_MNT/OSDz/xz2
+mv $SCRATCH_MNT/Z $SCRATCH_MNT/OSDz/xz2/xz
+mv $SCRATCH_MNT/fdm/RCz/OSDz $SCRATCH_MNT/OSDz/xz2/xz/OSD-Plane_788z
+mv $SCRATCH_MNT/OSDz $SCRATCH_MNT/fdm/RCz
+
+# Filesystem now looks like:
+#
+#
+# .  (ino 256)
+# | merlin/  (ino 257)
+# |   | RC/  (ino 258)
+# | |- OSD/ (ino 267

Re: FIBMAP unsupported

2014-10-03 Thread Filipe Manana

On 10/02/2014 11:11 PM, Marc Dietrich wrote:
Am Donnerstag 02 Oktober 2014, 21:55:55 schrieb Marc Dietrich:
Will try to restore the file using btrfs restore

ok, restore worked. I did some more tests. This is unrelated to CoW. It seems
that the fallocate -n in combination with dd conv=notrunc using large
files (10G) triggers it. Maybe this rings some bells.

Just tried it and I confirm filefrag's call to ioctl FS_IOC_FIEMAP fails
with -EEXIST.

It's actually a known issue affecting any of the 3.17 RCs (except RC1).
The extent map manipulation/merging is broken for some cases. Try with
these 2 patches on top of 3.17-rcX:

https://patchwork.kernel.org/patch/4929981/
https://patchwork.kernel.org/patch/4945191/

Or, alternatively, reverting this patch:
https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=51f395ad4058883e4273b02fdebe98072dbdc0d2

Someone else reported on this list a write/pwrite/writev failure with
errno EEXIST too (and apparently caused by the same reason).

This broken extent map handling is serious IMHO, it can make fsync log
bogus extent items for example, amongst other possible bad and weird things.

Marc

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html

[PATCH 4/5] Btrfs: process all async extents on compressed write failure

2014-10-06 Thread Filipe Manana

If we had an error when processing one of the async extents from our list,
we were not processing the remaining async extents, meaning we would leak
those async_extent structs, never release the pages with the compressed
data and never unlock and clear the dirty flag from the inode's pages (those
that correspond to the uncompressed content).

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/inode.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a3e2330..8636499 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -839,13 +839,9 @@ retry:
}
alloc_hint = ins.objectid + ins.offset;
kfree(async_extent);
-   if (ret)
-   goto out;
cond_resched();
}
-   ret = 0;
-out:
-   return ret;
+   return 0;
 out_free_reserve:
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
 out_free:
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 1/5] Btrfs: set page and mapping error on compressed write failure

2014-10-06 Thread Filipe Manana

If we fail in submit_compressed_extents() before calling
btrfs_submit_compressed_write(), we start and end the writeback for the
pages (clear their dirty flag, unlock them, etc) but we don't tag the
pages, nor the inode's mapping, with an error. This makes it impossible
for a caller of filemap_fdatawait_range() (fsync, or transaction commit
for example) to know that there was an error.

Note that the return value of submit_compressed_extents() is useless, as
that function is executed by a workqueue task and not directly by the
fill_delalloc callback. This means the writepage/s callbacks of the
inode's address space operations don't get that return value.

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/extent_io.c | 5 +
 fs/btrfs/extent_io.h | 1 +
 fs/btrfs/inode.c | 3 ++-
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 9cc757f..865594c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1746,6 +1746,9 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 
start, u64 end,
if (page_ops == 0)
return 0;
 
+   if ((page_ops  PAGE_SET_ERROR)  nr_pages  0)
+   mapping_set_error(inode-i_mapping, -EIO);
+
while (nr_pages  0) {
ret = find_get_pages_contig(inode-i_mapping, index,
 min_t(unsigned long,
@@ -1763,6 +1766,8 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 
start, u64 end,
clear_page_dirty_for_io(pages[i]);
if (page_ops  PAGE_SET_WRITEBACK)
set_page_writeback(pages[i]);
+   if (page_ops  PAGE_SET_ERROR)
+   SetPageError(pages[i]);
if (page_ops  PAGE_END_WRITEBACK)
end_page_writeback(pages[i]);
if (page_ops  PAGE_UNLOCK)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 06f030c..5654e14 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -52,6 +52,7 @@
 #define PAGE_SET_WRITEBACK (1  2)
 #define PAGE_END_WRITEBACK (1  3)
 #define PAGE_SET_PRIVATE2  (1  4)
+#define PAGE_SET_ERROR (1  5)
 
 /*
  * page-private values.  Every page that is controlled by the extent
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 344a322..cefa618 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -832,7 +832,8 @@ out_free:
 NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
 PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
-PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
+PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
+PAGE_SET_ERROR);
kfree(async_extent);
goto again;
 }
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 2/5] Btrfs: fix hang on compressed write error

2014-10-06 Thread Filipe Manana

In inode.c:submit_compressed_extents(), before calling
btrfs_submit_compressed_write() we start writeback for all pages, clear
their dirty flag, unlock them, etc, but if btrfs_submit_compressed_write()
fails (at the moment it can only fail with -ENOMEM), we never end the
writeback on the pages, so any filemap_fdatawait_range() call will hang
forever. We were also not calling the writepage end io hook, which means
the corresponding ordered extent will never complete and all its waiters
will block forever, such as a full fsync (via btrfs_wait_ordered_range()).

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/inode.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index cefa618..e2c4650 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -814,6 +814,20 @@ retry:
ins.objectid,
ins.offset, async_extent-pages,
async_extent-nr_pages);
+   if (ret) {
+   struct extent_io_tree *tree = BTRFS_I(inode)-io_tree;
+   struct page *p = async_extent-pages[0];
+   const u64 start = async_extent-start;
+   const u64 end = start + async_extent-ram_size - 1;
+
+   p-mapping = inode-i_mapping;
+   tree-ops-writepage_end_io_hook(p, start, end,
+NULL, 0);
+   p-mapping = NULL;
+   extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
+PAGE_END_WRITEBACK |
+PAGE_SET_ERROR);
+   }
alloc_hint = ins.objectid + ins.offset;
kfree(async_extent);
if (ret)
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 0/5] Proper error handling for the compressed write path

2014-10-06 Thread Filipe Manana

This patchset fixes several issues in inode.c:submit_compressed_extents()
when one of the functions it calls fails. These issues range from hangs and
missing error reporting (silent failure) to memory leaks and pages not
getting released.

Filipe Manana (5):
  Btrfs: set page and mapping error on compressed write failure
  Btrfs: fix hang on compressed write error
  Btrfs: don't leak pages and memory on compressed write error
  Btrfs: process all async extents on compressed write failure
  Btrfs: make inode.c:submit_compressed_extents() return void

 fs/btrfs/extent_io.c |  5 +
 fs/btrfs/extent_io.h |  1 +
 fs/btrfs/inode.c | 56 ++--
 3 files changed, 43 insertions(+), 19 deletions(-)

-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 3/5] Btrfs: don't leak pages and memory on compressed write error

2014-10-06 Thread Filipe Manana

In inode.c:submit_compressed_extents(), if we fail before calling
btrfs_submit_compressed_write(), or when that function fails, we
were freeing the async_extent structure without releasing its pages
and freeing the pages array.

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/inode.c | 28 +++-
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e2c4650..a3e2330 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -633,6 +633,22 @@ free_pages_out:
goto out;
 }
 
+static void free_async_extent_pages(struct async_extent *async_extent)
+{
+   int i;
+
+   if (!async_extent-pages)
+   return;
+
+   for (i = 0; i  async_extent-nr_pages; i++) {
+   WARN_ON(async_extent-pages[i]-mapping);
+   page_cache_release(async_extent-pages[i]);
+   }
+   kfree(async_extent-pages);
+   async_extent-nr_pages = 0;
+   async_extent-pages = NULL;
+}
+
 /*
  * phase two of compressed writeback.  This is the ordered portion
  * of the code, which only gets called in the order the work was
@@ -709,15 +725,7 @@ retry:
   async_extent-compressed_size,
   0, alloc_hint, ins, 1, 1);
if (ret) {
-   int i;
-
-   for (i = 0; i  async_extent-nr_pages; i++) {
-   WARN_ON(async_extent-pages[i]-mapping);
-   page_cache_release(async_extent-pages[i]);
-   }
-   kfree(async_extent-pages);
-   async_extent-nr_pages = 0;
-   async_extent-pages = NULL;
+   free_async_extent_pages(async_extent);
 
if (ret == -ENOSPC) {
unlock_extent(io_tree, async_extent-start,
@@ -827,6 +835,7 @@ retry:
extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
 PAGE_END_WRITEBACK |
 PAGE_SET_ERROR);
+   free_async_extent_pages(async_extent);
}
alloc_hint = ins.objectid + ins.offset;
kfree(async_extent);
@@ -848,6 +857,7 @@ out_free:
 PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
 PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
 PAGE_SET_ERROR);
+   free_async_extent_pages(async_extent);
kfree(async_extent);
goto again;
 }
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 5/5] Btrfs: make inode.c:submit_compressed_extents() return void

2014-10-06 Thread Filipe Manana

Its return value is completely ignored by its single caller and it's
useless anyway, since errors are indicated through SetPageError and
the bit AS_EIO set in the flags of the inode's mapping. The caller
can't do anything with the value, as it's invoked from a workqueue
task and not by the task calling filemap_fdatawrite_range (which calls
the writepages address space callback, which in turn calls the inode's
fill_delalloc callback).

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/inode.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8636499..7635b1d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -655,7 +655,7 @@ static void free_async_extent_pages(struct async_extent 
*async_extent)
  * queued.  We walk all the async extents created by compress_file_range
  * and send them down to the disk.
  */
-static noinline int submit_compressed_extents(struct inode *inode,
+static noinline void submit_compressed_extents(struct inode *inode,
  struct async_cow *async_cow)
 {
struct async_extent *async_extent;
@@ -667,9 +667,6 @@ static noinline int submit_compressed_extents(struct inode 
*inode,
struct extent_io_tree *io_tree;
int ret = 0;
 
-   if (list_empty(async_cow-extents))
-   return 0;
-
 again:
while (!list_empty(async_cow-extents)) {
async_extent = list_entry(async_cow-extents.next,
@@ -841,7 +838,7 @@ retry:
kfree(async_extent);
cond_resched();
}
-   return 0;
+   return;
 out_free_reserve:
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
 out_free:
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] Btrfs: don't ignore compressed bio write errors

2014-10-06 Thread Filipe Manana

Our compressed bio write end callback was essentially ignoring the error
parameter. When a write error happens, it must pass a value of 0 to the
inode's writepage_end_io_hook callback, call SetPageError on the respective
pages and set AS_EIO in the inode's mapping flags, so that a call to
filemap_fdatawait_range() / filemap_fdatawait() can find out that errors
happened (we surely don't want silent failures on fsync for example).

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/compression.c | 18 --
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 138..9f0e882 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -224,16 +224,19 @@ out:
  * Clear the writeback bits on all of the file
  * pages for a compressed write
  */
-static noinline void end_compressed_writeback(struct inode *inode, u64 start,
- unsigned long ram_size)
+static noinline void end_compressed_writeback(struct inode *inode,
+ const struct compressed_bio *cb)
 {
-   unsigned long index = start  PAGE_CACHE_SHIFT;
-   unsigned long end_index = (start + ram_size - 1)  PAGE_CACHE_SHIFT;
+   unsigned long index = cb-start  PAGE_CACHE_SHIFT;
+   unsigned long end_index = (cb-start + cb-len - 1)  PAGE_CACHE_SHIFT;
struct page *pages[16];
unsigned long nr_pages = end_index - index + 1;
int i;
int ret;
 
+   if (cb-errors)
+   mapping_set_error(inode-i_mapping, -EIO);
+
while (nr_pages  0) {
ret = find_get_pages_contig(inode-i_mapping, index,
 min_t(unsigned long,
@@ -244,6 +247,8 @@ static noinline void end_compressed_writeback(struct inode 
*inode, u64 start,
continue;
}
for (i = 0; i  ret; i++) {
+   if (cb-errors)
+   SetPageError(pages[i]);
end_page_writeback(pages[i]);
page_cache_release(pages[i]);
}
@@ -287,10 +292,11 @@ static void end_compressed_bio_write(struct bio *bio, int 
err)
tree-ops-writepage_end_io_hook(cb-compressed_pages[0],
 cb-start,
 cb-start + cb-len - 1,
-NULL, 1);
+NULL,
+err ? 0 : 1);
cb-compressed_pages[0]-mapping = NULL;
 
-   end_compressed_writeback(inode, cb-start, cb-len);
+   end_compressed_writeback(inode, cb);
/* note, our inode could be gone now */
 
/*
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 2/2] Btrfs: make inode.c:compress_file_range() return void

2014-10-09 Thread Filipe Manana

Its return value is useless, its single caller ignores it and can't do
anything with it anyway, since it's a workqueue task and not the task
calling filemap_fdatawrite_range (writepages) nor filemap_fdatawait_range().
Failure is communicated to such functions via start and end of writeback
with the respective pages tagged with an error and AS_EIO flag set in the
inode's mapping.

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/inode.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b91a171..aef0fa3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -382,7 +382,7 @@ static inline int inode_need_compress(struct inode *inode)
  * are written in the same order that the flusher thread sent them
  * down.
  */
-static noinline int compress_file_range(struct inode *inode,
+static noinline void compress_file_range(struct inode *inode,
struct page *locked_page,
u64 start, u64 end,
struct async_cow *async_cow,
@@ -621,8 +621,7 @@ cleanup_and_bail_uncompressed:
*num_added += 1;
}
 
-out:
-   return ret;
+   return;
 
 free_pages_out:
for (i = 0; i  nr_pages_ret; i++) {
@@ -630,8 +629,6 @@ free_pages_out:
page_cache_release(pages[i]);
}
kfree(pages);
-
-   goto out;
 }
 
 static void free_async_extent_pages(struct async_extent *async_extent)
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 1/2] Btrfs: report error after failure inlining extent in compressed write path

2014-10-09 Thread Filipe Manana

If cow_file_range_inline() failed, when called from compress_file_range(),
we were tagging the locked page for writeback, ending its writeback and
unlocking it, but not marking it with an error nor setting AS_EIO in the
inode's mapping flags.

This made it impossible for a caller of filemap_fdatawrite_range (writepages)
or filemap_fdatawait_range() to know that an error happened. And the return
value of compress_file_range() is useless because it's returned to a workqueue
task and not to the task calling filemap_fdatawrite_range (writepages).

This change applies on top of the previous patchset starting at the patch
titled:

[1/5] Btrfs: set page and mapping error on compressed write failure

Which changed extent_clear_unlock_delalloc() to use SetPageError and
mapping_set_error().

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/inode.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7635b1d..b91a171 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -538,6 +538,7 @@ cont:
 clear_flags, PAGE_UNLOCK |
 PAGE_CLEAR_DIRTY |
 PAGE_SET_WRITEBACK |
+PAGE_SET_ERROR |
 PAGE_END_WRITEBACK);
goto free_pages_out;
}
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 1/2] Btrfs: correctly flush compressed data before/after direct IO

2014-10-09 Thread Filipe Manana

For compressed writes, after doing the first filemap_fdatawrite_range() we
don't get the pages tagged for writeback immediately. Instead we create
a workqueue task, which is run by another kthread, and keep the pages locked.
That other kthread compresses data, creates the respective ordered extent/s,
tags the pages for writeback and unlocks them. Therefore we need a second
call to filemap_fdatawrite_range() if we have compressed writes, as this
second call will wait for the pages to become unlocked, then see they became
tagged for writeback and finally wait for the writeback to finish.

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/file.c  | 12 +++-
 fs/btrfs/inode.c | 16 +---
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 29b147d..82c7229 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1692,8 +1692,18 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
err = written_buffered;
goto out;
}
+   /*
+* Ensure all data is persisted. We want the next direct IO read to be
+* able to read what was just written.
+*/
endbyte = pos + written_buffered - 1;
-   err = filemap_write_and_wait_range(file-f_mapping, pos, endbyte);
+   err = filemap_fdatawrite_range(file-f_mapping, pos, endbyte);
+   if (!err  test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+BTRFS_I(file_inode(file))-runtime_flags))
+   err = filemap_fdatawrite_range(file-f_mapping, pos, endbyte);
+   if (err)
+   goto out;
+   err = filemap_fdatawait_range(file-f_mapping, pos, endbyte);
if (err)
goto out;
written += written_buffered;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index aef0fa3..752ff18 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7052,9 +7052,19 @@ static int lock_extent_direct(struct inode *inode, u64 
lockstart, u64 lockend,
btrfs_put_ordered_extent(ordered);
} else {
/* Screw you mmap */
-   ret = filemap_write_and_wait_range(inode-i_mapping,
-  lockstart,
-  lockend);
+   ret = filemap_fdatawrite_range(inode-i_mapping,
+  lockstart,
+  lockend);
+   if (!ret  test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+BTRFS_I(inode)-runtime_flags))
+   ret = filemap_fdatawrite_range(inode-i_mapping,
+  lockstart,
+  lockend);
+   if (ret)
+   break;
+   ret = filemap_fdatawait_range(inode-i_mapping,
+ lockstart,
+ lockend);
if (ret)
break;
 
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 2/2] Btrfs: add helper btrfs_fdatawrite_range

2014-10-09 Thread Filipe Manana

Add a helper to avoid duplicating, in so many places, the double
filemap_fdatawrite_range() call needed for inodes with async extents
(compressed writes).

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/ctree.h|  1 +
 fs/btrfs/file.c | 36 
 fs/btrfs/inode.c|  9 +
 fs/btrfs/ordered-data.c | 24 ++--
 4 files changed, 32 insertions(+), 38 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 089f6da..4e0ad8c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3896,6 +3896,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct 
inode *inode,
  struct page **pages, size_t num_pages,
  loff_t pos, size_t write_bytes,
  struct extent_state **cached);
+int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
 
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 82c7229..2df1dce 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1697,10 +1697,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
 * able to read what was just written.
 */
endbyte = pos + written_buffered - 1;
-   err = filemap_fdatawrite_range(file-f_mapping, pos, endbyte);
-   if (!err  test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-BTRFS_I(file_inode(file))-runtime_flags))
-   err = filemap_fdatawrite_range(file-f_mapping, pos, endbyte);
+   err = btrfs_fdatawrite_range(file-f_mapping, pos, endbyte);
if (err)
goto out;
err = filemap_fdatawait_range(file-f_mapping, pos, endbyte);
@@ -1864,10 +1861,7 @@ static int start_ordered_ops(struct inode *inode, loff_t 
start, loff_t end)
int ret;
 
atomic_inc(BTRFS_I(inode)-sync_writers);
-   ret = filemap_fdatawrite_range(inode-i_mapping, start, end);
-   if (!ret  test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-BTRFS_I(inode)-runtime_flags))
-   ret = filemap_fdatawrite_range(inode-i_mapping, start, end);
+   ret = btrfs_fdatawrite_range(inode-i_mapping, start, end);
atomic_dec(BTRFS_I(inode)-sync_writers);
 
return ret;
@@ -2820,3 +2814,29 @@ int btrfs_auto_defrag_init(void)
 
return 0;
 }
+
+int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
+{
+   int ret;
+
+   /*
+* So with compression we will find and lock a dirty page and clear the
+* first one as dirty, setup an async extent, and immediately return
+* with the entire range locked but with nobody actually marked with
+* writeback.  So we can't just filemap_write_and_wait_range() and
+* expect it to work since it will just kick off a thread to do the
+* actual work.  So we need to call filemap_fdatawrite_range _again_
+* since it will wait on the page lock, which won't be unlocked until
+* after the pages have been marked as writeback and so we're good to go
+* from there.  We have to do this otherwise we'll miss the ordered
+* extents and that results in badness.  Please Josef, do not think you
+* know better and pull this out at some point in the future, it is
+* right and you are wrong.
+*/
+   ret = filemap_fdatawrite_range(inode-i_mapping, start, end);
+   if (!ret  test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+BTRFS_I(inode)-runtime_flags))
+   ret = filemap_fdatawrite_range(inode-i_mapping, start, end);
+
+   return ret;
+}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 752ff18..be955481 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7052,14 +7052,7 @@ static int lock_extent_direct(struct inode *inode, u64 
lockstart, u64 lockend,
btrfs_put_ordered_extent(ordered);
} else {
/* Screw you mmap */
-   ret = filemap_fdatawrite_range(inode-i_mapping,
-  lockstart,
-  lockend);
-   if (!ret  test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-BTRFS_I(inode)-runtime_flags))
-   ret = filemap_fdatawrite_range(inode-i_mapping,
-  lockstart,
-  lockend);
+   ret = btrfs_fdatawrite_range(inode, lockstart, lockend);
if (ret)
break;
ret = filemap_fdatawait_range(inode-i_mapping,
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index ac734ec..1401b1a 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -725,30

[PATCH 2/2 v2] Btrfs: add helper btrfs_fdatawrite_range

2014-10-10 Thread Filipe Manana

Add a helper to avoid duplicating, in so many places, the double
filemap_fdatawrite_range() call needed for inodes with async extents
(compressed writes).

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: Pass the right arguments to the new helper (v1 was missing unstaged changes).

 fs/btrfs/ctree.h|  1 +
 fs/btrfs/file.c | 39 ++-
 fs/btrfs/inode.c|  9 +
 fs/btrfs/ordered-data.c | 24 ++--
 4 files changed, 34 insertions(+), 39 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 089f6da..4e0ad8c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3896,6 +3896,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct 
inode *inode,
  struct page **pages, size_t num_pages,
  loff_t pos, size_t write_bytes,
  struct extent_state **cached);
+int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
 
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 82c7229..bbd474b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1676,6 +1676,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
loff_t pos)
 {
struct file *file = iocb-ki_filp;
+   struct inode *inode = file_inode(file);
ssize_t written;
ssize_t written_buffered;
loff_t endbyte;
@@ -1697,13 +1698,10 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
 * able to read what was just written.
 */
endbyte = pos + written_buffered - 1;
-   err = filemap_fdatawrite_range(file-f_mapping, pos, endbyte);
-   if (!err  test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-BTRFS_I(file_inode(file))-runtime_flags))
-   err = filemap_fdatawrite_range(file-f_mapping, pos, endbyte);
+   err = btrfs_fdatawrite_range(inode, pos, endbyte);
if (err)
goto out;
-   err = filemap_fdatawait_range(file-f_mapping, pos, endbyte);
+   err = filemap_fdatawait_range(inode-i_mapping, pos, endbyte);
if (err)
goto out;
written += written_buffered;
@@ -1864,10 +1862,7 @@ static int start_ordered_ops(struct inode *inode, loff_t 
start, loff_t end)
int ret;
 
atomic_inc(BTRFS_I(inode)-sync_writers);
-   ret = filemap_fdatawrite_range(inode-i_mapping, start, end);
-   if (!ret  test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-BTRFS_I(inode)-runtime_flags))
-   ret = filemap_fdatawrite_range(inode-i_mapping, start, end);
+   ret = btrfs_fdatawrite_range(inode, start, end);
atomic_dec(BTRFS_I(inode)-sync_writers);
 
return ret;
@@ -2820,3 +2815,29 @@ int btrfs_auto_defrag_init(void)
 
return 0;
 }
+
+int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
+{
+   int ret;
+
+   /*
+* So with compression we will find and lock a dirty page and clear the
+* first one as dirty, setup an async extent, and immediately return
+* with the entire range locked but with nobody actually marked with
+* writeback.  So we can't just filemap_write_and_wait_range() and
+* expect it to work since it will just kick off a thread to do the
+* actual work.  So we need to call filemap_fdatawrite_range _again_
+* since it will wait on the page lock, which won't be unlocked until
+* after the pages have been marked as writeback and so we're good to go
+* from there.  We have to do this otherwise we'll miss the ordered
+* extents and that results in badness.  Please Josef, do not think you
+* know better and pull this out at some point in the future, it is
+* right and you are wrong.
+*/
+   ret = filemap_fdatawrite_range(inode-i_mapping, start, end);
+   if (!ret  test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+BTRFS_I(inode)-runtime_flags))
+   ret = filemap_fdatawrite_range(inode-i_mapping, start, end);
+
+   return ret;
+}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 752ff18..be955481 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7052,14 +7052,7 @@ static int lock_extent_direct(struct inode *inode, u64 
lockstart, u64 lockend,
btrfs_put_ordered_extent(ordered);
} else {
/* Screw you mmap */
-   ret = filemap_fdatawrite_range(inode-i_mapping,
-  lockstart,
-  lockend);
-   if (!ret  test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-BTRFS_I(inode)-runtime_flags))
-   ret = filemap_fdatawrite_range(inode-i_mapping

[PATCH 1/2 v2] Btrfs: report error after failure inlining extent in compressed write path

2014-10-10 Thread Filipe Manana

If cow_file_range_inline() failed, when called from compress_file_range(),
we were tagging the locked page for writeback, ending its writeback and
unlocking it, but not marking it with an error nor setting AS_EIO in the
inode's mapping flags.

This made it impossible for a caller of filemap_fdatawrite_range (writepages)
or filemap_fdatawait_range() to know that an error happened. And the return
value of compress_file_range() is useless because it's returned to a workqueue
task and not to the task calling filemap_fdatawrite_range (writepages).

This change applies on top of the previous patchset starting at the patch
titled:

[1/5] Btrfs: set page and mapping error on compressed write failure

Which changed extent_clear_unlock_delalloc() to use SetPageError and
mapping_set_error().

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: Use PAGE_SET_ERROR only if ret < 0, obviously. Thanks btrfs/056.

 fs/btrfs/inode.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7635b1d..2b09425 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -527,7 +527,10 @@ cont:
if (ret = 0) {
unsigned long clear_flags = EXTENT_DELALLOC |
EXTENT_DEFRAG;
+   unsigned long page_error_op;
+
clear_flags |= (ret  0) ? EXTENT_DO_ACCOUNTING : 0;
+   page_error_op = ret  0 ? PAGE_SET_ERROR : 0;
 
/*
 * inline extent creation worked or returned error,
@@ -538,6 +541,7 @@ cont:
 clear_flags, PAGE_UNLOCK |
 PAGE_CLEAR_DIRTY |
 PAGE_SET_WRITEBACK |
+page_error_op |
 PAGE_END_WRITEBACK);
goto free_pages_out;
}
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 1/3] Btrfs: deal with convert_extent_bit errors to avoid fs corruption

2014-10-13 Thread Filipe Manana

When committing a transaction or a log, we look for btree extents that
need to be durably persisted by searching for ranges in a io tree that
have some bits set (EXTENT_DIRTY or EXTENT_NEW). We then attempt to clear
those bits and set the EXTENT_NEED_WAIT bit, with calls to the function
convert_extent_bit, and then start writeback for the extents.

That function however can return an error (at the moment only -ENOMEM
is possible, especially when it does GFP_ATOMIC allocation requests
through alloc_extent_state_atomic) - that means the ranges didn't get
the EXTENT_NEED_WAIT bit set (or at least not for the whole range),
which in turn means a call to btrfs_wait_marked_extents() won't find
those ranges for which we started writeback, causing a transaction
commit or a log commit to persist a new superblock without waiting
for the writeback of extents in that range to finish first.

Therefore if a crash happens after persisting the new superblock and
before writeback finishes, we have a superblock pointing to roots that
weren't fully persisted or roots that point to nodes or leafs that weren't
fully persisted, causing all sorts of unexpected/bad behaviour as we end up
reading garbage from disk or the content of some node/leaf from a past
generation that got cowed or deleted and is no longer valid (for this latter
case we end up getting error messages like "parent transid verify failed on
X wanted Y found Z" when reading btree nodes/leafs from disk).

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/transaction.c | 92 +-
 fs/btrfs/transaction.h |  2 --
 2 files changed, 76 insertions(+), 18 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8f1a408..cb673d4 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -76,6 +76,32 @@ void btrfs_put_transaction(struct btrfs_transaction 
*transaction)
}
 }
 
+static void clear_btree_io_tree(struct extent_io_tree *tree)
+{
+   spin_lock(tree-lock);
+   while (!RB_EMPTY_ROOT(tree-state)) {
+   struct rb_node *node;
+   struct extent_state *state;
+
+   node = rb_first(tree-state);
+   state = rb_entry(node, struct extent_state, rb_node);
+   rb_erase(state-rb_node, tree-state);
+   RB_CLEAR_NODE(state-rb_node);
+   /*
+* btree io trees aren't supposed to have tasks waiting for
+* changes in the flags of extent states ever.
+*/
+   ASSERT(!waitqueue_active(state-wq));
+   free_extent_state(state);
+   if (need_resched()) {
+   spin_unlock(tree-lock);
+   cond_resched();
+   spin_lock(tree-lock);
+   }
+   }
+   spin_unlock(tree-lock);
+}
+
 static noinline void switch_commit_roots(struct btrfs_transaction *trans,
 struct btrfs_fs_info *fs_info)
 {
@@ -89,6 +115,7 @@ static noinline void switch_commit_roots(struct 
btrfs_transaction *trans,
root-commit_root = btrfs_root_node(root);
if (is_fstree(root-objectid))
btrfs_unpin_free_ino(root);
+   clear_btree_io_tree(root-dirty_log_pages);
}
up_write(fs_info-commit_root_sem);
 }
@@ -827,17 +854,38 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
 
while (!find_first_extent_bit(dirty_pages, start, start, end,
  mark, cached_state)) {
-   convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
-  mark, cached_state, GFP_NOFS);
-   cached_state = NULL;
-   err = filemap_fdatawrite_range(mapping, start, end);
+   bool wait_writeback = false;
+
+   err = convert_extent_bit(dirty_pages, start, end,
+EXTENT_NEED_WAIT,
+mark, cached_state, GFP_NOFS);
+   /*
+* convert_extent_bit can return -ENOMEM, which is most of the
+* time a temporary error. So when it happens, ignore the error
+* and wait for writeback of this range to finish - because we
+* failed to set the bit EXTENT_NEED_WAIT for the range, a call
+* to btrfs_wait_marked_extents() would not know that writeback
+* for this range started and therefore wouldn't wait for it to
+* finish - we don't want to commit a superblock that points to
+* btree nodes/leafs for which writeback hasn't finished yet
+* (and without errors).
+* We cleanup any entries left in the io tree when committing
+* the transaction (through clear_btree_io_tree()).
+*/
+   if (err == -ENOMEM

[PATCH 3/3] Btrfs: avoid returning -ENOMEM in convert_extent_bit() too early

2014-10-13 Thread Filipe Manana

We try to allocate an extent state before acquiring the tree's spinlock
just in case we end up needing to split an existing extent state into two.
If that allocation failed, we would return -ENOMEM.
However, our only caller (transaction/log commit code) passes in an
extent state that was cached from a call to find_first_extent_bit() and
that has a very high chance to match exactly the input range (always true
for a transaction commit and very often, but not always, true for a log
commit) - in this case we end up not needing that initial extent state,
which is only used for an eventual split, at all. Therefore just don't
return -ENOMEM if we can't allocate the temporary extent state, since we
might not need it at all, and if we end up needing one, we'll allocate it
later anyway.

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/extent_io.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0d931b1..654ed3d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1066,13 +1066,21 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 
start, u64 end,
int err = 0;
u64 last_start;
u64 last_end;
+   bool first_iteration = true;
 
btrfs_debug_check_extent_io_range(tree, start, end);
 
 again:
if (!prealloc  (mask  __GFP_WAIT)) {
+   /*
+* Best effort, don't worry if extent state allocation fails
+* here for the first iteration. We might have a cached state
+* that matches exactly the target range, in which case no
+* extent state allocations are needed. We'll only know this
+* after locking the tree.
+*/
prealloc = alloc_extent_state(mask);
-   if (!prealloc)
+   if (!prealloc  !first_iteration)
return -ENOMEM;
}
 
@@ -1242,6 +1250,7 @@ search_again:
spin_unlock(tree-lock);
if (mask  __GFP_WAIT)
cond_resched();
+   first_iteration = false;
goto again;
 }
 
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 2/3] Btrfs: make find_first_extent_bit be able to cache any state

2014-10-13 Thread Filipe Manana

Right now the only caller of find_first_extent_bit() that is interested
in caching extent states (transaction or log commit), never gets an extent
state cached. This is because find_first_extent_bit() only caches states
that have at least one of the flags EXTENT_IOBITS or EXTENT_BOUNDARY, and
the transaction/log commit caller always passes a tree that never has
extent states with any of those flags (they can only have one of the
following flags: EXTENT_DIRTY, EXTENT_NEW or EXTENT_NEED_WAIT).

This change together with the following one in the patch series (titled
"Btrfs: avoid returning -ENOMEM in convert_extent_bit() too early") will
help significantly reduce the chances of calls to convert_extent_bit()
failing with -ENOMEM when called from the transaction/log commit code.

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/extent_io.c   | 16 
 fs/btrfs/transaction.c |  3 +++
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 420fe26..0d931b1 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -796,17 +796,25 @@ static void set_state_bits(struct extent_io_tree *tree,
state-state |= bits_to_set;
 }
 
-static void cache_state(struct extent_state *state,
-   struct extent_state **cached_ptr)
+static void cache_state_if_flags(struct extent_state *state,
+struct extent_state **cached_ptr,
+const u64 flags)
 {
if (cached_ptr  !(*cached_ptr)) {
-   if (state-state  (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
+   if (!flags || (state-state  flags)) {
*cached_ptr = state;
atomic_inc(state-refs);
}
}
 }
 
+static void cache_state(struct extent_state *state,
+   struct extent_state **cached_ptr)
+{
+   return cache_state_if_flags(state, cached_ptr,
+   EXTENT_IOBITS | EXTENT_BOUNDARY);
+}
+
 /*
  * set some bits on a range in the tree.  This may require allocations or
  * sleeping, so the gfp mask is used to indicate what is allowed.
@@ -1482,7 +1490,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, 
u64 start,
state = find_first_extent_bit_state(tree, start, bits);
 got_it:
if (state) {
-   cache_state(state, cached_state);
+   cache_state_if_flags(state, cached_state, 0);
*start_ret = state-start;
*end_ret = state-end;
ret = 0;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index cb673d4..396ae8b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -882,6 +882,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
werr = err;
else if (wait_writeback)
werr = filemap_fdatawait_range(mapping, start, end);
+   free_extent_state(cached_state);
cached_state = NULL;
cond_resched();
start = end + 1;
@@ -926,6 +927,8 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
err = filemap_fdatawait_range(mapping, start, end);
if (err)
werr = err;
+   free_extent_state(cached_state);
+   cached_state = NULL;
cond_resched();
start = end + 1;
}
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] Revert Btrfs: race free update of commit root for ro snapshots

2014-10-15 Thread Filipe Manana

This reverts commit 9c3b306e1c9e6be4be09e99a8fe2227d1005effc.

Switching only one commit root during a transaction is wrong because it leads
the fs into an inconsistent state. All commit roots should be switched at once,
at transaction commit time, otherwise backref walking can often miss important
references that were only accessible through the old commit root.
Plus, the root item for the snapshot's root wasn't getting updated, and the
next transaction commit was prevented from doing it.

This made several users get into random corruption issues after creation of
readonly snapshots.

A regression test for xfstests will follow soon.

Cc: sta...@vger.kernel.org # 3.17
Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/inode.c | 36 
 fs/btrfs/ioctl.c | 33 +
 2 files changed, 33 insertions(+), 36 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index fc9c043..d23362f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5261,42 +5261,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, 
struct dentry *dentry)
iput(inode);
inode = ERR_PTR(ret);
}
-   /*
-* If orphan cleanup did remove any orphans, it means the tree
-* was modified and therefore the commit root is not the same as
-* the current root anymore. This is a problem, because send
-* uses the commit root and therefore can see inode items that
-* don't exist in the current root anymore, and for example make
-* calls to btrfs_iget, which will do tree lookups based on the
-* current root and not on the commit root. Those lookups will
-* fail, returning a -ESTALE error, and making send fail with
-* that error. So make sure a send does not see any orphans we
-* have just removed, and that it will see the same inodes
-* regardless of whether a transaction commit happened before
-* it started (meaning that the commit root will be the same as
-* the current root) or not.
-*/
-   if (sub_root-node != sub_root-commit_root) {
-   u64 sub_flags = btrfs_root_flags(sub_root-root_item);
-
-   if (sub_flags  BTRFS_ROOT_SUBVOL_RDONLY) {
-   struct extent_buffer *eb;
-
-   /*
-* Assert we can't have races between dentry
-* lookup called through the snapshot creation
-* ioctl and the VFS.
-*/
-   ASSERT(mutex_is_locked(dir-i_mutex));
-
-   down_write(root-fs_info-commit_root_sem);
-   eb = sub_root-commit_root;
-   sub_root-commit_root =
-   btrfs_root_node(sub_root);
-   up_write(root-fs_info-commit_root_sem);
-   free_extent_buffer(eb);
-   }
-   }
}
 
return inode;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e732274..33c80f5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -713,6 +713,39 @@ static int create_snapshot(struct btrfs_root *root, struct 
inode *dir,
if (ret)
goto fail;
 
+   ret = btrfs_orphan_cleanup(pending_snapshot-snap);
+   if (ret)
+   goto fail;
+
+   /*
+* If orphan cleanup did remove any orphans, it means the tree was
+* modified and therefore the commit root is not the same as the
+* current root anymore. This is a problem, because send uses the
+* commit root and therefore can see inode items that don't exist
+* in the current root anymore, and for example make calls to
+* btrfs_iget, which will do tree lookups based on the current root
+* and not on the commit root. Those lookups will fail, returning a
+* -ESTALE error, and making send fail with that error. So make sure
+* a send does not see any orphans we have just removed, and that it
+* will see the same inodes regardless of whether a transaction
+* commit happened before it started (meaning that the commit root
+* will be the same as the current root) or not.
+*/
+   if (readonly  pending_snapshot-snap-node !=
+   pending_snapshot-snap-commit_root) {
+   trans = btrfs_join_transaction(pending_snapshot-snap);
+   if (IS_ERR(trans)  PTR_ERR(trans) != -ENOENT) {
+   ret = PTR_ERR(trans);
+   goto fail;
+   }
+   if (!IS_ERR(trans)) {
+   ret

[PATCH] fstests: btrfs: regression test for ro snapshot creation

2014-10-15 Thread Filipe Manana

Regression test for a btrfs issue where creation of readonly snapshots caused
the filesystem to get into an inconsistent state.

This regression was introduced in the 3.17 kernel and fixed by reverting the
following linux kernel commit:

 Btrfs: race free update of commit root for ro snapshots
 9c3b306e1c9e6be4be09e99a8fe2227d1005effc

Signed-off-by: Filipe Manana fdman...@suse.com
---
 tests/btrfs/078 | 85 +
 tests/btrfs/078.out |  2 ++
 tests/btrfs/group   |  1 +
 3 files changed, 88 insertions(+)
 create mode 100755 tests/btrfs/078
 create mode 100644 tests/btrfs/078.out

diff --git a/tests/btrfs/078 b/tests/btrfs/078
new file mode 100755
index 000..48de357
--- /dev/null
+++ b/tests/btrfs/078
@@ -0,0 +1,85 @@
+#! /bin/bash
+# FSQA Test No. 078
+#
+# Regression test for a btrfs issue where creation of readonly snapshots caused
+# the filesystem to get into an inconsistent state.
+#
+# This regression was introduced in the 3.17 kernel and fixed by reverting the
+# following linux kernel commit:
+#
+# Btrfs: race free update of commit root for ro snapshots
+# 9c3b306e1c9e6be4be09e99a8fe2227d1005effc
+#
+#---
+#
+# Copyright (C) 2014 SUSE Linux Products GmbH. All Rights Reserved.
+# Author: Filipe Manana fdman...@suse.com
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo QA output created by $seq
+
+tmp=`mktemp -d`
+status=1   # failure is the default!
+trap _cleanup; exit \$status 0 1 2 3 15
+
+_cleanup()
+{
+   rm -fr $tmp
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# real QA test starts here
+_need_to_be_root
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch
+
+rm -f $seqres.full
+
+workout()
+{
+   ops=$1
+   procs=$2
+   num_snapshots=$3
+
+   _scratch_mkfs  $seqres.full 21
+   _scratch_mount
+
+   snapshot_cmd=$BTRFS_UTIL_PROG subvolume snapshot -r $SCRATCH_MNT
+   snapshot_cmd=$snapshot_cmd $SCRATCH_MNT/snap_\`date +'%H_%M_%S_%N'\`
+   run_check $FSSTRESS_PROG -p $procs \
+   -x $snapshot_cmd -X $num_snapshots -d $SCRATCH_MNT -n $ops
+}
+
+ops=8000
+procs=4
+snapshots=100
+workout $ops $procs $snapshots
+
+# The fstests framework runs a file system check against the scratch device
+# automatically when a test case finishes (if the test calls _require_scratch).
+# That filesystem check (btrfsck, btrfs.fsck) failed reporting several fs
+# inconsistencies. Therefore there's no need to call _check_scratch_fs here.
+
+echo Silence is golden
+status=0
+exit
diff --git a/tests/btrfs/078.out b/tests/btrfs/078.out
new file mode 100644
index 000..b8acea8
--- /dev/null
+++ b/tests/btrfs/078.out
@@ -0,0 +1,2 @@
+QA output created by 078
+Silence is golden
diff --git a/tests/btrfs/group b/tests/btrfs/group
index 9adf862..40e7430 100644
--- a/tests/btrfs/group
+++ b/tests/btrfs/group
@@ -80,3 +80,4 @@
 075 auto quick subvol
 076 auto quick
 077 auto quick
+078 auto
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] Btrfs-progs: check, fix return value check of is_child_root()

2014-10-15 Thread Filipe Manana

The following commit:

btrfs-progs: fsck: remove unfriendly BUG_ON() for searching tree failure
f495a2ac66116f0a1b15e73380c8cbca6e0a4ca0

introduced a regression, detected through xfstests/btrfs/054, where
previously a negative return value (-1) was used to mean a particular
root didn't have any parent root, and now, after that change, a negative
value is also used to mean that an error happened. That change also made
the only caller of is_child_root() interpret any negative return value
as an error and therefore incorrectly made the caller leave with an
error, instead of jumping to its skip label.

Since the return value that means the root with id child_root_id doesn't
have any parent root isn't used by the only caller of is_child_root(),
just get rid of it and make is_child_root() return 0 if parent_root_id
isn't a parent of child_root_id, return 1 if it is, and a negative value
on error.

This affects only the 3.17 release candidates (3.16 and older releases
don't have this issue).

Signed-off-by: Filipe Manana fdman...@suse.com
---
 cmds-check.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/cmds-check.c b/cmds-check.c
index 99d1a94..002d3e9 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -901,7 +901,6 @@ static int is_child_root(struct btrfs_root *root, u64 
parent_root_id,
struct btrfs_path path;
struct btrfs_key key;
struct extent_buffer *leaf;
-   int has_parent = 0;
int ret;
 
btrfs_init_path(path);
@@ -939,8 +938,6 @@ static int is_child_root(struct btrfs_root *root, u64 
parent_root_id,
key.type != BTRFS_ROOT_BACKREF_KEY)
break;
 
-   has_parent = 1;
-
if (key.offset == parent_root_id) {
btrfs_release_path(path);
return 1;
@@ -952,7 +949,7 @@ out:
btrfs_release_path(path);
if (ret  0)
return ret;
-   return has_parent? 0 : -1;
+   return 0;
 }
 
 static int process_dir_item(struct btrfs_root *root,
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v2] Btrfs-progs: check, fix return value check of is_child_root()

2014-10-15 Thread Filipe Manana

The following commit:

btrfs-progs: fsck: remove unfriendly BUG_ON() for searching tree failure
f495a2ac66116f0a1b15e73380c8cbca6e0a4ca0

introduced a regression, detected through xfstests/btrfs/054, where
previously a negative return value (-1) was used to mean a particular
root didn't have any parent root, and now, after that change, a negative
value is also used to mean that an error happened. That change also made
the only caller of is_child_root() interpret any negative return value
as an error and therefore incorrectly made the caller leave with an
error, instead of continuing.

This affects only the 3.17 release candidates (3.16 and older releases
don't have this issue).

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: Made it return 2 (instead of -1) when the root child_root_id doesn't
have any parent roots, in order to behave exactly like the code
pre-commit f495a2ac66116f0a1b15e73380c8cbca6e0a4ca0.

 cmds-check.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/cmds-check.c b/cmds-check.c
index 99d1a94..310eb2a 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -895,6 +895,14 @@ static int leave_shared_node(struct btrfs_root *root,
return 0;
 }
 
+/*
+ * Returns:
+ * < 0 - on error
+ * 1   - if the root with id child_root_id is a child of root parent_root_id
+ * 0   - if the root child_root_id isn't a child of the root parent_root_id but
+ *   has other root(s) as parent(s)
+ * 2   - if the root child_root_id doesn't have any parent roots
+ */
 static int is_child_root(struct btrfs_root *root, u64 parent_root_id,
 u64 child_root_id)
 {
@@ -952,7 +960,7 @@ out:
btrfs_release_path(path);
if (ret  0)
return ret;
-   return has_parent? 0 : -1;
+   return has_parent ? 0 : 2;
 }
 
 static int process_dir_item(struct btrfs_root *root,
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] Btrfs-progs: check, ability to detect and fix outdated snapshot root items

2014-10-17 Thread Filipe Manana

This change adds code to detect and fix the issue introduced in the kernel
release 3.17, where creation of read-only snapshots led to a corrupted
filesystem if they were created at a moment when the source subvolume/snapshot
had orphan items. The issue was that the on-disk root items became incorrect,
referring to the pre orphan cleanup root node instead of the post orphan
cleanup root node.

A test filesystem can be generated with the test case recently submitted for
xfstests/fstests, which is essentially the following (bash script):

workout()
{
ops=$1
procs=$2
num_snapshots=$3

_scratch_mkfs  $seqres.full 21
_scratch_mount

snapshot_cmd=$BTRFS_UTIL_PROG subvolume snapshot -r $SCRATCH_MNT
snapshot_cmd=$snapshot_cmd $SCRATCH_MNT/snap_\`date +'%H_%M_%S_%N'\`
run_check $FSSTRESS_PROG -p $procs \
-x $snapshot_cmd -X $num_snapshots -d $SCRATCH_MNT -n $ops
}

ops=1
procs=4
snapshots=500
workout $ops $procs $snapshots

Example of btrfsck's (btrfs check) behaviour against such filesystem:

  $ btrfsck /dev/loop0
  root item for root 311, current bytenr 44630016, current gen 60, current 
level 1, new bytenr 44957696, new gen 61, new level 1
  root item for root 1480, current bytenr 1003569152, current gen 1271, current 
level 1, new bytenr 1004175360, new gen 1272, new level 1
  root item for root 1509, current bytenr 1037434880, current gen 1300, current 
level 1, new bytenr 1038467072, new gen 1301, new level 1
  root item for root 1562, current bytenr 33636352, current gen 1354, current 
level 1, new bytenr 3442, new gen 1355, new level 1
  root item for root 3094, current bytenr 1011712000, current gen 2935, current 
level 1, new bytenr 1008484352, new gen 2936, new level 1
  root item for root 3716, current bytenr 80805888, current gen 3578, current 
level 1, new bytenr 73515008, new gen 3579, new level 1
  root item for root 4085, current bytenr 714031104, current gen 3958, current 
level 1, new bytenr 716816384, new gen 3959, new level 1
  Found 7 roots with an outdated root item.
  Please run a filesystem check with the option --repair to fix them.

  $ echo $?
  1

  $ btrfsck --repair /dev/loop0
  enabling repair mode
  fixing root item for root 311, current bytenr 44630016, current gen 60, 
current level 1, new bytenr 44957696, new gen 61, new level 1
  fixing root item for root 1480, current bytenr 1003569152, current gen 1271, 
current level 1, new bytenr 1004175360, new gen 1272, new level 1
  fixing root item for root 1509, current bytenr 1037434880, current gen 1300, 
current level 1, new bytenr 1038467072, new gen 1301, new level 1
  fixing root item for root 1562, current bytenr 33636352, current gen 1354, 
current level 1, new bytenr 3442, new gen 1355, new level 1
  fixing root item for root 3094, current bytenr 1011712000, current gen 2935, 
current level 1, new bytenr 1008484352, new gen 2936, new level 1
  fixing root item for root 3716, current bytenr 80805888, current gen 3578, 
current level 1, new bytenr 73515008, new gen 3579, new level 1
  fixing root item for root 4085, current bytenr 714031104, current gen 3958, 
current level 1, new bytenr 716816384, new gen 3959, new level 1
  Fixed 7 roots.
  Checking filesystem on /dev/loop0
  UUID: 2186e9b9-c977-4a35-9c7b-69c6609d4620
  checking extents
  checking free space cache
  cache and super generation don't match, space cache will be invalidated
  checking fs roots
  checking csums
  checking root refs
  found 618537000 bytes used err is 0
  total csum bytes: 130824
  total tree bytes: 601620480
  total fs tree bytes: 580288512
  total extent tree bytes: 18464768
  btree space waste bytes: 136939144
  file data blocks allocated: 34150318080
   referenced 27815415808
  Btrfs v3.17-rc3-2-gbbe1dd8

  $ echo $?
  0

Signed-off-by: Filipe Manana fdman...@suse.com
---
 cmds-check.c   | 356 +
 disk-io.c  |   2 +
 extent-tree.c  |  22 +-
 tests/fsck-tests.sh|  15 +-
 tests/fsck-tests/006-bad_root_items_fs.tar.xz  | Bin 0 - 24980 bytes
 .../fsck-tests/007-bad_root_items_fs_skinny.tar.xz | Bin 0 - 26520 bytes
 utils.c|  21 ++
 utils.h|   2 +
 8 files changed, 395 insertions(+), 23 deletions(-)
 create mode 100644 tests/fsck-tests/006-bad_root_items_fs.tar.xz
 create mode 100644 tests/fsck-tests/007-bad_root_items_fs_skinny.tar.xz

diff --git a/cmds-check.c b/cmds-check.c
index 310eb2a..2a5f823 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -7198,6 +7198,345 @@ static int fill_csum_tree(struct btrfs_trans_handle 
*trans,
return ret;
 }
 
+struct root_item_info {
+   /* level of the root */
+   u8 level;
+   /* number of nodes at this level, must be 1

Re: [GIT PULL] Btrfs for stable (mostly 3.17)

2014-10-20 Thread Filipe Manana



On 10/20/2014 12:13 AM, Greg KH wrote:
 On Sun, Oct 19, 2014 at 09:55:11PM +0200, Greg KH wrote:
 On Sun, Oct 19, 2014 at 06:01:16AM -0400, Chris Mason wrote:
 Hi everyone,

 I've pulled out some of the btrfs commits from the merge window that
 we'd like to see in stable.  The full list of sha's from Linus is below,
 you can see 4 of them are only needed on 3.17

 2fad4e83e12591eb3bd213875b9edc2d18e93383
 0b4699dcb65c2cff793210b07f40b98c2d423a43 # v3.17
 12b894cb288d57292b01cf158177b6d5c89a6272
 78a017a2c92df9b571db0a55a016280f9019c65e
 4d1a40c66bed0b3fa43b9da5fbd5cbe332e4eccf
 e6c4efd87ab04e5ead363f24e6ac35ed3506d401 # v3.17
 f6acfd50110b335c7af636cf1fc8e55319cae5fc
 1d52c78afbbf80b58299e076a159617d6b42fe3c
 75bfb9aff45e44625260f52a5fd581b92ace3e62
 bbe9051441effce51c9a533d2c56440df64db2d7
 32be3a1ac6d09576c57063c6c350ca36eaebdbd3 # v3.17
 42383020beb1cfb05f5d330cc311931bc4917a97
 d37973082b453ba6b89ec07eb7b84305895d35e1 # v3.17

 I'm confused, the others not marked with a # v3.17 need to go on older
 kernels as well?
 
 I've picked up the ones that apply and build for the older stable
 kernels I maintain now, thanks for the list.

May I suggest porting the following commit to 3.14 too?

https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=766b5e5ae78dd04a93a275690a49e23d7dcb1f39

It fixes a data corruption issue for an incremental send. Particularly
important, IMHO, as the corruption happens silently (no errors returned
to user space nor any sort of warnings/errors in syslog, etc). It
affects only 3.14, and the change applies cleanly on 3.14.22.

Thanks

 
 greg k-h
 --
 To unsubscribe from this list: send the line unsubscribe linux-btrfs in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] Btrfs: ensure send always works on roots without orphans

2014-10-21 Thread Filipe Manana

Move the logic from the snapshot creation ioctl into send. This avoids
doing the transaction commit if send isn't used, and ensures that if
a crash/reboot happens after the transaction commit that created the
snapshot and before the transaction commit that switched the commit
root, send will not get a commit root that differs from the main root
(that has orphan items).

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/ioctl.c | 29 -
 fs/btrfs/send.c  | 49 +
 2 files changed, 49 insertions(+), 29 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 33c80f5..994c573 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -717,35 +717,6 @@ static int create_snapshot(struct btrfs_root *root, struct 
inode *dir,
if (ret)
goto fail;
 
-   /*
-* If orphan cleanup did remove any orphans, it means the tree was
-* modified and therefore the commit root is not the same as the
-* current root anymore. This is a problem, because send uses the
-* commit root and therefore can see inode items that don't exist
-* in the current root anymore, and for example make calls to
-* btrfs_iget, which will do tree lookups based on the current root
-* and not on the commit root. Those lookups will fail, returning a
-* -ESTALE error, and making send fail with that error. So make sure
-* a send does not see any orphans we have just removed, and that it
-* will see the same inodes regardless of whether a transaction
-* commit happened before it started (meaning that the commit root
-* will be the same as the current root) or not.
-*/
-   if (readonly  pending_snapshot-snap-node !=
-   pending_snapshot-snap-commit_root) {
-   trans = btrfs_join_transaction(pending_snapshot-snap);
-   if (IS_ERR(trans)  PTR_ERR(trans) != -ENOENT) {
-   ret = PTR_ERR(trans);
-   goto fail;
-   }
-   if (!IS_ERR(trans)) {
-   ret = btrfs_commit_transaction(trans,
-  pending_snapshot-snap);
-   if (ret)
-   goto fail;
-   }
-   }
-
inode = btrfs_lookup_dentry(dentry-d_parent-d_inode, dentry);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 874828d..804432d 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5507,6 +5507,51 @@ out:
return ret;
 }
 
+/*
+ * If orphan cleanup did remove any orphans from a root, it means the tree
+ * was modified and therefore the commit root is not the same as the current
+ * root anymore. This is a problem, because send uses the commit root and
+ * therefore can see inode items that don't exist in the current root anymore,
+ * and for example make calls to btrfs_iget, which will do tree lookups based
+ * on the current root and not on the commit root. Those lookups will fail,
+ * returning a -ESTALE error, and making send fail with that error. So make
+ * sure a send does not see any orphans we have just removed, and that it will
+ * see the same inodes regardless of whether a transaction commit happened
+ * before it started (meaning that the commit root will be the same as the
+ * current root) or not.
+ */
+static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
+{
+   int i;
+   struct btrfs_trans_handle *trans = NULL;
+
+again:
+   if (sctx-parent_root 
+   sctx-parent_root-node != sctx-parent_root-commit_root)
+   goto commit_trans;
+
+   for (i = 0; i  sctx-clone_roots_cnt; i++)
+   if (sctx-clone_roots[i].root-node !=
+   sctx-clone_roots[i].root-commit_root)
+   goto commit_trans;
+
+   if (trans)
+   return btrfs_end_transaction(trans, sctx-send_root);
+
+   return 0;
+
+commit_trans:
+   /* Use any root, all fs roots will get their commit roots updated. */
+   if (!trans) {
+   trans = btrfs_join_transaction(sctx-send_root);
+   if (IS_ERR(trans))
+   return PTR_ERR(trans);
+   goto again;
+   }
+
+   return btrfs_commit_transaction(trans, sctx-send_root);
+}
+
 static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
 {
spin_lock(root-root_item_lock);
@@ -5728,6 +5773,10 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user 
*arg_)
NULL);
sort_clone_roots = 1;
 
+   ret = ensure_commit_roots_uptodate(sctx);
+   if (ret)
+   goto out;
+
current-journal_info = BTRFS_SEND_TRANS_STUB;
ret = send_subvol(sctx);
current-journal_info = NULL;
-- 
1.9.1

--
To unsubscribe from this list

[PATCH] Btrfs: fix snapshot inconsistency after a file write followed by truncate

2014-10-21 Thread Filipe Manana

If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree
where the file truncation happened but the write operation didn't. This leaves a
gap between 2 file extent items of the inode, which makes btrfs' fsck complain
about it.

For example, if we perform the following file operations:

$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
  -c pwrite -S 0xaa -b 32K 0 32K \
  -c fsync \
  -c pwrite -S 0xbb -b 32770 16K 32770 \
  -c truncate 90123 \
  /mnt/foobar

and the snapshot creation ioctl was called just before the second write, we
can often get the following inode items in the snapshot's btree:

item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 
100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0

There's a file range, corresponding to the interval
[32K; ALIGN(16K + 32770, 4096)[, for which there's no file extent item covering
it. This is because the file write and file truncate operations both happened
right after the snapshot creation ioctl called btrfs_start_delalloc_inodes(),
which means we didn't start and wait for the ordered extent that matches the
write and, in btrfs_setsize(), we were able to call btrfs_cont_expand() before
being able to commit the current transaction in the snapshot creation ioctl. So
this made it possible to insert the hole file extent item in the source
subvolume (which represents the region added by the truncate) right before the
transaction commit from the snapshot creation ioctl.

Btrfs' fsck tool complains about such cases with a message like the following:

root 331 inode 257 errors 100, file extent discount

From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:

1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured

But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).

A test case for xfstests follows.

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/inode.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0d41741..c28b78f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4622,6 +4622,9 @@ static int btrfs_setsize(struct inode *inode, struct 
iattr *attr)
}
 
if (newsize  oldsize) {
+   ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+   if (ret)
+   return ret;
truncate_pagecache(inode, newsize);
ret = btrfs_cont_expand(inode, oldsize, newsize);
if (ret)
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] fstests: btrfs, add test for snapshoting after file write + truncate

2014-10-21 Thread Filipe Manana

Regression test for a btrfs issue where if right after the snapshot
creation ioctl started, a file write followed by a file truncate
happened, with both operations increasing the file's size, the created
snapshot would capture an inconsistent state of the file system tree.
That state reflected the file truncation but it didn't reflect the
write operation, and left a gap between two file extent items (and
that gap corresponded to the total or a partial area of the write
operation's range).

This issue was fixed by the following linux kernel patch:

Btrfs: fix snapshot inconsistency after a file write followed by truncate

Signed-off-by: Filipe Manana fdman...@suse.com
---
 tests/btrfs/080 | 152 
 tests/btrfs/080.out |   2 +
 tests/btrfs/group   |   1 +
 3 files changed, 155 insertions(+)
 create mode 100755 tests/btrfs/080
 create mode 100644 tests/btrfs/080.out

diff --git a/tests/btrfs/080 b/tests/btrfs/080
new file mode 100755
index 000..a73e534
--- /dev/null
+++ b/tests/btrfs/080
@@ -0,0 +1,152 @@
+#! /bin/bash
+# FSQA Test No. 080
+#
+# Regression test for a btrfs issue where if right after the snapshot creation
+# ioctl started, a file write followed by a file truncate happened, with both
+# operations increasing the file's size, the created snapshot would capture an
+# inconsistent state of the file system tree. That state reflected the file
+# truncation but it didn't reflect the write operation, and left a gap between
+# two file extent items (and that gap corresponded to the total or a partial
+# area of the write operation's range).
+#
+# This issue was fixed by the following linux kernel patch:
+#
+# Btrfs: fix snapshot inconsistency after a file write followed by truncate
+#
+#---
+#
+# Copyright (C) 2014 SUSE Linux Products GmbH. All Rights Reserved.
+# Author: Filipe Manana fdman...@suse.com
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo QA output created by $seq
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap _cleanup; exit \$status 0 1 2 3 15
+
+_cleanup()
+{
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# real QA test starts here
+_need_to_be_root
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch_nocheck
+
+rm -f $seqres.full
+
+create_snapshots()
+{
+   count=$1
+   ts=`date +'%H_%M_%S_%N'`
+
+   for ((i = 1; i = $count; i++)); do
+   _run_btrfs_util_prog subvolume snapshot -r \
+   $SCRATCH_MNT $SCRATCH_MNT/${ts}_snap_$i
+   done
+}
+
+create_file()
+{
+   name=$1
+
+   run_check $XFS_IO_PROG -f \
+   -c pwrite -S 0xaa -b 32K 0 32K \
+   -c fsync \
+   -c pwrite -S 0xbb -b 32770 16K 32770 \
+   -c truncate 90123 \
+   $SCRATCH_MNT/$name
+}
+
+workout()
+{
+   name=$1
+   snapshots=$2
+
+   create_file $name 
+   fpid=$!
+   create_snapshots $snapshots 
+   spid=$!
+   wait $fpid
+   create_ret=$?
+   wait $spid
+   snap_ret=$?
+   if [ $create_ret != 0 -o $snap_ret != 0 ]; then
+   _fail Failure creating file or snapshots, check $seqres.full 
for details
+   fi
+}
+
+# If the installed btrfs mkfs supports the no-holes feature, make sure the
+# created fs doesn't get that feature enabled. With it enabled, the below fsck
+# call wouldn't fail. This feature hasn't been enabled by default since it was
+# introduced, but be safe and explicitly disable it.
+_scratch_mkfs -O list-all 21 | grep -q '\bno\-holes\b'
+if [ $? -eq 0 ]; then
+   mkfs_options=-O ^no-holes
+fi
+_scratch_mkfs $mkfs_options $seqres.full 21
+
+_scratch_mount
+for ((i = 1; i = 100; i++)); do
+   workout foobar_$i 1
+done
+
+for f in $(find $SCRATCH_MNT -name 'foobar_*'); do
+   digest=`md5sum $f | cut -d ' ' -f 1`
+   case $digest in
+   d41d8cd98f00b204e9800998ecf8427e)
+   # ok, empty file
+   ;;
+   c28418534a020122aca59fd3ff9581b5)
+   # ok, only first write captured
+   ;;
+   cd0032da89254cdc498fda396e6a9b54)
+   # ok, only 2

[PATCH] fstests: btrfs, add regression test for clone ioctl

2014-10-21 Thread Filipe Manana

Regression test for a btrfs clone ioctl issue where races between
a clone operation and concurrent target file reads would result in
leaving stale data in the page cache. After the clone operation
finished, reading from the clone target file would return the old
and no longer valid data. This affected only buffered reads (i.e.
didn't affect direct IO reads).

This issue was fixed by the following linux kernel patch:

Btrfs: ensure readers see new data after a clone operation
(commit c125b8bff1d9f6c8c91ce4eb8bd5616058c7d510)

Signed-off-by: Filipe Manana fdman...@suse.com
---
 tests/btrfs/081 | 131 
 tests/btrfs/081.out |   4 ++
 tests/btrfs/group   |   1 +
 3 files changed, 136 insertions(+)
 create mode 100755 tests/btrfs/081
 create mode 100644 tests/btrfs/081.out

diff --git a/tests/btrfs/081 b/tests/btrfs/081
new file mode 100755
index 000..d2e3767
--- /dev/null
+++ b/tests/btrfs/081
@@ -0,0 +1,131 @@
+#! /bin/bash
+# FSQA Test No. 081
+#
+# Regression test for a btrfs clone ioctl issue where races between
+# a clone operation and concurrent target file reads would result in
+# leaving stale data in the page cache. After the clone operation
+# finished, reading from the clone target file would return the old
+# and no longer valid data. This affected only buffered reads (i.e.
+# didn't affect direct IO reads).
+#
+# This issue was fixed by the following linux kernel patch:
+#
+# Btrfs: ensure readers see new data after a clone operation
+# (commit c125b8bff1d9f6c8c91ce4eb8bd5616058c7d510)
+#
+#---
+#
+# Copyright (C) 2014 SUSE Linux Products GmbH. All Rights Reserved.
+# Author: Filipe Manana fdman...@suse.com
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo QA output created by $seq
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap _cleanup; exit \$status 0 1 2 3 15
+
+_cleanup()
+{
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# real QA test starts here
+_need_to_be_root
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch
+_require_btrfs_cloner
+
+rm -f $seqres.full
+
+num_extents=100
+extent_size=8192
+
+create_source_file()
+{
+   name=$1
+
+   # Create a file with $num_extents extents, each with a size of
+   # $extent_size bytes.
+   touch $SCRATCH_MNT/$name
+   for ((i = 0; i  $num_extents; i++)); do
+   off=$((i * $extent_size))
+   run_check $XFS_IO_PROG \
+   -c pwrite -S $i -b $extent_size $off $extent_size \
+   -c fsync $SCRATCH_MNT/$name
+   done
+}
+
+create_target_file()
+{
+   name=$1
+   file_size=$(($num_extents * $extent_size))
+
+   run_check $XFS_IO_PROG -f -c pwrite -S 0xff 0 $file_size \
+   -c fsync $SCRATCH_MNT/$name
+}
+
+reader_loop()
+{
+   name=$1
+
+   while true; do
+   cat $SCRATCH_MNT/$name  /dev/null
+   done
+}
+
+_scratch_mkfs $seqres.full 21
+_scratch_mount
+
+create_source_file foo
+create_target_file bar
+
+reader_loop bar 
+reader_pid=$!
+
+$CLONER_PROG -s 0 -d 0 -l $(($num_extents * $extent_size)) \
+   $SCRATCH_MNT/foo $SCRATCH_MNT/bar
+
+kill $reader_pid  /dev/null 21
+
+# Now both foo and bar should have exactly the same content.
+# This didn't use to be the case before the btrfs kernel fix mentioned
+# above. The clone ioctl was racy, as it removed bar's pages from the
+# page cache and only after it would update bar's metadata to point to
+# the same extents that foo's metadata points to - and this was done in
+# an unprotected way, so that a file read request done right after the
+# clone ioctl removed the pages from the page cache and before it updated
+# bar's metadata, would result in populating the page cache with stale
+# data. Therefore a file read after the clone operation finished would
+# not get the cloned data but it would get instead the old and no longer
+# valid data.
+md5sum $SCRATCH_MNT/foo | _filter_scratch
+md5sum $SCRATCH_MNT/bar | _filter_scratch
+
+# Validate the content of bar still matches foo's content even after
+# clearing all of bar's data from the page cache

[PATCH v2] fstests: btrfs, add test for snapshoting after file write + truncate

2014-10-26 Thread Filipe Manana

Regression test for a btrfs issue where if right after the snapshot
creation ioctl started, a file write followed by a file truncate
happened, with both operations increasing the file's size, the created
snapshot would capture an inconsistent state of the file system tree.
That state reflected the file truncation but it didn't reflect the
write operation, and left a gap between two file extent items (and
that gap corresponded to the total or a partial area of the write
operation's range).

This issue was fixed by the following linux kernel patch:

Btrfs: fix snapshot inconsistency after a file write followed by truncate

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: Added some background processes to cause some cpu load. This makes the
test fail always on environments with a non-debug kernel and where no
other significant load (other than the test itself) is running.

 tests/btrfs/080 | 169 
 tests/btrfs/080.out |   2 +
 tests/btrfs/group   |   1 +
 3 files changed, 172 insertions(+)
 create mode 100755 tests/btrfs/080
 create mode 100644 tests/btrfs/080.out

diff --git a/tests/btrfs/080 b/tests/btrfs/080
new file mode 100755
index 000..a5d3b38
--- /dev/null
+++ b/tests/btrfs/080
@@ -0,0 +1,169 @@
+#! /bin/bash
+# FSQA Test No. 080
+#
+# Regression test for a btrfs issue where if right after the snapshot creation
+# ioctl started, a file write followed by a file truncate happened, with both
+# operations increasing the file's size, the created snapshot would capture an
+# inconsistent state of the file system tree. That state reflected the file
+# truncation but it didn't reflect the write operation, and left a gap between
+# two file extent items (and that gap corresponded to the total or a partial
+# area of the write operation's range).
+#
+# This issue was fixed by the following linux kernel patch:
+#
+# Btrfs: fix snapshot inconsistency after a file write followed by truncate
+#
+#---
+#
+# Copyright (C) 2014 SUSE Linux Products GmbH. All Rights Reserved.
+# Author: Filipe Manana fdman...@suse.com
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo QA output created by $seq
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap _cleanup; exit \$status 0 1 2 3 15
+
+_cleanup()
+{
+   for p in ${cpu_stress_pids[*]}; do
+   kill $p  /dev/null
+   done
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# real QA test starts here
+_need_to_be_root
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch_nocheck
+
+rm -f $seqres.full
+
+create_snapshot()
+{
+   local ts=`date +'%H_%M_%S_%N'`
+
+   _run_btrfs_util_prog subvolume snapshot -r \
+   $SCRATCH_MNT $SCRATCH_MNT/${ts}_snap
+}
+
+create_file()
+{
+   local name=$1
+
+   run_check $XFS_IO_PROG -f \
+   -c pwrite -S 0xaa -b 32K 0 32K \
+   -c fsync \
+   -c pwrite -S 0xbb -b 32770 16K 32770 \
+   -c truncate 90123 \
+   $SCRATCH_MNT/$name
+}
+
+workout()
+{
+   local name=$1
+
+   create_file $name 
+   fpid=$!
+   create_snapshot 
+   spid=$!
+   wait $fpid
+   create_ret=$?
+   wait $spid
+   snap_ret=$?
+   if [ $create_ret != 0 -o $snap_ret != 0 ]; then
+   _fail Failure creating file or snapshot, check $seqres.full 
for details
+   fi
+}
+
+# If the installed btrfs mkfs supports the no-holes feature, make sure the
+# created fs doesn't get that feature enabled. With it enabled, the below fsck
+# call wouldn't fail. This feature hasn't been enabled by default since it was
+# introduced, but be safe and explicitly disable it.
+_scratch_mkfs -O list-all 21 | grep -q '\bno\-holes\b'
+if [ $? -eq 0 ]; then
+   mkfs_options=-O ^no-holes
+fi
+_scratch_mkfs $mkfs_options $seqres.full 21
+
+_scratch_mount
+
+# Run some background load in order to make the issue easier to trigger.
+# Specially needed when testing with non-debug kernels and there isn't
+# any other significant load on the test machine other than this test.
+num_cpus=`$here/src/feature -o`
+num_procs

[PATCH] Btrfs: fix invalid leaf slot access in btrfs_lookup_extent()

2014-10-27 Thread Filipe Manana

If we couldn't find our extent item, we accessed the current slot
(path->slots[0]) to check if it corresponds to an equivalent skinny
metadata item. However this slot could be beyond our last item in the
leaf (i.e. path->slots[0] >= btrfs_header_nritems(leaf)), in which case
we shouldn't process it.

Since btrfs_lookup_extent() is only used to find extent items for data
extents, fix this by removing completely the logic that looks up for an
equivalent skinny metadata item, since it can not exist.

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/extent-tree.c | 8 +---
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0d599ba..9141b2b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -710,7 +710,7 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
rcu_read_unlock();
 }
 
-/* simple helper to search for an existing extent at a given offset */
+/* simple helper to search for an existing data extent at a given offset */
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
 {
int ret;
@@ -726,12 +726,6 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 
start, u64 len)
key.type = BTRFS_EXTENT_ITEM_KEY;
ret = btrfs_search_slot(NULL, root-fs_info-extent_root, key, path,
0, 0);
-   if (ret  0) {
-   btrfs_item_key_to_cpu(path-nodes[0], key, path-slots[0]);
-   if (key.objectid == start 
-   key.type == BTRFS_METADATA_ITEM_KEY)
-   ret = 0;
-   }
btrfs_free_path(path);
return ret;
 }
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] Btrfs: fix race that makes btrfs_lookup_extent_info miss skinny extent items

2014-10-27 Thread Filipe Manana

We have a race that can lead us to miss skinny extent items in the function
btrfs_lookup_extent_info() when the skinny metadata feature is enabled.
So basically the sequence of steps is:

1) We search in the extent tree for the skinny extent, which returns > 0
   (not found);

2) We check the previous item in the returned leaf for a non-skinny extent,
   and we don't find it;

3) Because we didn't find the non-skinny extent in step 2), we release our
   path to search the extent tree again, but this time for a non-skinny
   extent key;

4) Right after we released our path in step 3), a skinny extent was inserted
   in the extent tree (delayed refs were run) - our second extent tree search
   will miss it, because it's not looking for a skinny extent;

5) After the second search returned (with ret > 0), we look for any delayed
   ref for our extent's bytenr (and we do it while holding a read lock on the
   leaf), but we won't find any, as such delayed ref had just run and completed
   after we released our path in step 3) before doing the second search.

Fix this by removing completely the path release and re-search logic. This is
safe, because if we search for a metadata item and we don't find it, we have the
guarantee that the returned leaf is the one where the item would be inserted,
and so path->slots[0] > 0 and path->slots[0] - 1 must be the slot where the
non-skinny extent item is if it exists. The only case where path->slots[0] is
zero is when there are no smaller keys in the tree (i.e. no left siblings for
our leaf), in which case the re-search logic isn't needed as well.

This race has been present since the introduction of skinny metadata (change
3173a18f70554fe7880bb2d85c7da566e364eb3c).

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/extent-tree.c | 8 
 1 file changed, 8 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9141b2b..2cedd06 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -780,7 +780,6 @@ search_again:
else
key.type = BTRFS_EXTENT_ITEM_KEY;
 
-again:
ret = btrfs_search_slot(trans, root-fs_info-extent_root,
key, path, 0, 0);
if (ret  0)
@@ -796,13 +795,6 @@ again:
key.offset == root-nodesize)
ret = 0;
}
-   if (ret) {
-   key.objectid = bytenr;
-   key.type = BTRFS_EXTENT_ITEM_KEY;
-   key.offset = root-nodesize;
-   btrfs_release_path(path);
-   goto again;
-   }
}
 
if (ret == 0) {
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v2] Btrfs: fix invalid leaf slot access in btrfs_lookup_extent()

2014-10-27 Thread Filipe Manana

If we couldn't find our extent item, we accessed the current slot
(path->slots[0]) to check if it corresponds to an equivalent skinny
metadata item. However this slot could be beyond our last item in the
leaf (i.e. path->slots[0] >= btrfs_header_nritems(leaf)), in which case
we shouldn't process it.

Since btrfs_lookup_extent() is only used to find extent items for data
extents, fix this by removing completely the logic that looks up for an
equivalent skinny metadata item, since it can not exist.

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: Renamed btrfs_lookup_extent() to btrfs_lookup_data_extent().

 fs/btrfs/ctree.h   |  2 +-
 fs/btrfs/extent-tree.c | 10 ++
 fs/btrfs/tree-log.c|  2 +-
 3 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index dd8b275..b72b358 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3276,7 +3276,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle 
*trans,
   struct btrfs_root *root, unsigned long count);
 int btrfs_async_run_delayed_refs(struct btrfs_root *root,
 unsigned long count, int wait);
-int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 struct btrfs_root *root, u64 bytenr,
 u64 offset, int metadata, u64 *refs, u64 *flags);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0d599ba..87c0b46f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -710,8 +710,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
rcu_read_unlock();
 }
 
-/* simple helper to search for an existing extent at a given offset */
-int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
+/* simple helper to search for an existing data extent at a given offset */
+int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len)
 {
int ret;
struct btrfs_key key;
@@ -726,12 +726,6 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 
start, u64 len)
key.type = BTRFS_EXTENT_ITEM_KEY;
ret = btrfs_search_slot(NULL, root-fs_info-extent_root, key, path,
0, 0);
-   if (ret  0) {
-   btrfs_item_key_to_cpu(path-nodes[0], key, path-slots[0]);
-   if (key.objectid == start 
-   key.type == BTRFS_METADATA_ITEM_KEY)
-   ret = 0;
-   }
btrfs_free_path(path);
return ret;
 }
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 2b26dad..6d58d72 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -672,7 +672,7 @@ static noinline int replay_one_extent(struct 
btrfs_trans_handle *trans,
 * is this extent already allocated in the extent
 * allocation tree?  If so, just add a reference
 */
-   ret = btrfs_lookup_extent(root, ins.objectid,
+   ret = btrfs_lookup_data_extent(root, ins.objectid,
ins.offset);
if (ret == 0) {
ret = btrfs_inc_extent_ref(trans, root,
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v2] Btrfs: fix snapshot inconsistency after a file write followed by truncate

2014-10-28 Thread Filipe Manana

If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree 
where
the file truncation happened but the write operation didn't. This leaves a gap
between 2 file extent items of the inode, which makes btrfs' fsck complain 
about it.

For example, if we perform the following file operations:

$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
  -c pwrite -S 0xaa -b 32K 0 32K \
  -c fsync \
  -c pwrite -S 0xbb -b 32770 16K 32770 \
  -c truncate 90123 \
  /mnt/foobar

and the snapshot creation ioctl was just called before the second write, we 
often
can get the following inode items in the snapshot's btree:

item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 
100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0

There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 
4096)[
for which there's no file extent item covering it. This is because the file 
write
and file truncate operations happened both right after the snapshot creation 
ioctl
called btrfs_start_delalloc_inodes(), which means we didn't start and wait for 
the
ordered extent that matches the write and, in btrfs_setsize(), we were able to 
call
btrfs_cont_expand() before being able to commit the current transaction in the
snapshot creation ioctl. So this made it possible to insert the hole file extent
item in the source subvolume (which represents the region added by the truncate)
right before the transaction commit from the snapshot creation ioctl.

Btrfs' fsck tool complains about such cases with a message like the following:

root 331 inode 257 errors 100, file extent discount

From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:

1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured

But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).

A test case for xfstests follows.

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: Use different approach to solve the problem. Don't start and wait for all
delalloc to finish after every expanding truncate, instead add an additional
flush at transaction commit time if we're doing a transaction commit that
creates snapshots.

 fs/btrfs/transaction.c | 59 ++
 1 file changed, 59 insertions(+)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 396ae8b..18c356e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1714,12 +1714,65 @@ static inline void btrfs_wait_delalloc_flush(struct 
btrfs_fs_info *fs_info)
btrfs_wait_ordered_roots(fs_info, -1);
 }
 
+static int
+start_pending_snapshot_roots_delalloc(struct btrfs_trans_handle *trans,
+ struct list_head *splice)
+{
+   struct btrfs_pending_snapshot *pending_snapshot;
+   int ret = 0;
+
+   if (btrfs_test_opt(trans-root, FLUSHONCOMMIT))
+   return 0;
+
+   spin_lock(trans-root-fs_info-trans_lock);
+   list_splice_init(trans-transaction-pending_snapshots, splice);
+   spin_unlock(trans-root-fs_info-trans_lock);
+
+   /*
+* Start again delalloc for the roots our pending snapshots are made
+* from. We did it before starting/joining a transaction and we do it
+* here again because new inode operations might have happened since
+* then and we want to make sure the snapshot captures a fully
+* consistent state of the source root tree. For example, if after the
+* first delalloc flush a write is made against an inode followed by
+* an expanding truncate, we want to make sure the snapshot captured
+* both the write and the truncation, and not just the truncation.
+* Here we shouldn't have much delalloc work to do, as the bulk of it
+* was done before and outside the transaction.
+*/
+   list_for_each_entry(pending_snapshot, splice

[PATCH v3] Btrfs: fix snapshot inconsistency after a file write followed by truncate

2014-10-29 Thread Filipe Manana

If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree 
where
the file truncation happened but the write operation didn't. This leaves a gap
between 2 file extent items of the inode, which makes btrfs' fsck complain 
about it.

For example, if we perform the following file operations:

$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
  -c pwrite -S 0xaa -b 32K 0 32K \
  -c fsync \
  -c pwrite -S 0xbb -b 32770 16K 32770 \
  -c truncate 90123 \
  /mnt/foobar

and the snapshot creation ioctl was just called before the second write, we 
often
can get the following inode items in the snapshot's btree:

item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 
100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0

There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 
4096)[
for which there's no file extent item covering it. This is because the file 
write
and file truncate operations happened both right after the snapshot creation 
ioctl
called btrfs_start_delalloc_inodes(), which means we didn't start and wait for 
the
ordered extent that matches the write and, in btrfs_setsize(), we were able to 
call
btrfs_cont_expand() before being able to commit the current transaction in the
snapshot creation ioctl. So this made it possible to insert the hole file extent
item in the source subvolume (which represents the region added by the truncate)
right before the transaction commit from the snapshot creation ioctl.

Btrfs' fsck tool complains about such cases with a message like the following:

root 331 inode 257 errors 100, file extent discount

From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:

1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured

But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).

A test case for xfstests follows.

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: Use different approach to solve the problem. Don't start and wait for all
delalloc to finish after every expanding truncate, instead add an additional
flush at transaction commit time if we're doing a transaction commit that
creates snapshots.

V3: Removed useless test condition in +wait_pending_snapshot_roots_delalloc().

 fs/btrfs/transaction.c | 59 ++
 1 file changed, 59 insertions(+)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 396ae8b..5e7f004 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1714,12 +1714,65 @@ static inline void btrfs_wait_delalloc_flush(struct 
btrfs_fs_info *fs_info)
btrfs_wait_ordered_roots(fs_info, -1);
 }
 
+static int
+start_pending_snapshot_roots_delalloc(struct btrfs_trans_handle *trans,
+ struct list_head *splice)
+{
+   struct btrfs_pending_snapshot *pending_snapshot;
+   int ret = 0;
+
+   if (btrfs_test_opt(trans-root, FLUSHONCOMMIT))
+   return 0;
+
+   spin_lock(trans-root-fs_info-trans_lock);
+   list_splice_init(trans-transaction-pending_snapshots, splice);
+   spin_unlock(trans-root-fs_info-trans_lock);
+
+   /*
+* Start again delalloc for the roots our pending snapshots are made
+* from. We did it before starting/joining a transaction and we do it
+* here again because new inode operations might have happened since
+* then and we want to make sure the snapshot captures a fully
+* consistent state of the source root tree. For example, if after the
+* first delalloc flush a write is made against an inode followed by
+* an expanding truncate, we want to make sure the snapshot captured
+* both the write and the truncation, and not just the truncation.
+* Here we shouldn't have much delalloc work to do, as the bulk of it
+* was done before and outside

[PATCH v4] Btrfs: fix snapshot inconsistency after a file write followed by truncate

2014-10-29 Thread Filipe Manana

If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree 
where
the file truncation happened but the write operation didn't. This leaves a gap
between 2 file extent items of the inode, which makes btrfs' fsck complain 
about it.

For example, if we perform the following file operations:

$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
  -c pwrite -S 0xaa -b 32K 0 32K \
  -c fsync \
  -c pwrite -S 0xbb -b 32770 16K 32770 \
  -c truncate 90123 \
  /mnt/foobar

and the snapshot creation ioctl was just called before the second write, we 
often
can get the following inode items in the snapshot's btree:

item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 
100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0

There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 
4096)[
for which there's no file extent item covering it. This is because the file 
write
and file truncate operations happened both right after the snapshot creation 
ioctl
called btrfs_start_delalloc_inodes(), which means we didn't start and wait for 
the
ordered extent that matches the write and, in btrfs_setsize(), we were able to 
call
btrfs_cont_expand() before being able to commit the current transaction in the
snapshot creation ioctl. So this made it possible to insert the hole file extent
item in the source subvolume (which represents the region added by the truncate)
right before the transaction commit from the snapshot creation ioctl.

Btrfs' fsck tool complains about such cases with a message like the following:

root 331 inode 257 errors 100, file extent discount

From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:

1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured

But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).

A test case for xfstests follows.

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: Use different approach to solve the problem. Don't start and wait for all
delalloc to finish after every expanding truncate, instead add an additional
flush at transaction commit time if we're doing a transaction commit that
creates snapshots.

V3: Removed useless test condition in wait_pending_snapshot_roots_delalloc().

V4: Use another approach that doesn't imply starting delalloc work and wait
for it to finish at transaction commit time.

 fs/btrfs/ctree.h   |  4 ++--
 fs/btrfs/extent-tree.c | 16 +---
 fs/btrfs/file.c| 10 +-
 fs/btrfs/inode.c   | 47 ---
 fs/btrfs/ioctl.c   |  7 ---
 5 files changed, 60 insertions(+), 24 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b72b358..36f82ba 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3427,8 +3427,8 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
 struct btrfs_fs_info *fs_info);
 int __get_raid_index(u64 flags);
-int btrfs_start_nocow_write(struct btrfs_root *root);
-void btrfs_end_nocow_write(struct btrfs_root *root);
+int btrfs_start_write_no_snapshoting(struct btrfs_root *root);
+void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 int level, int *slot);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a84e00d..9ba886c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -9657,12 +9657,14 @@ int btrfs_trim_fs(struct btrfs_root *root, struct 
fstrim_range *range)
 }
 
 /*
- * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(),
- * they are used to prevent the some tasks writing data into the page cache
- * by nocow before the subvolume is snapshoted, but flush the data into
- * the disk

[PATCH] Btrfs: fix freeing used extent after removing empty block group

2014-11-03 Thread Filipe Manana

Due to ignoring errors returned by clear_extent_bits (at the moment only
-ENOMEM is possible), we can end up freeing an extent that is actually in
use (i.e. return the extent to the free space cache).

The sequence of steps that lead to this:

1) Cleaner thread starts execution and calls btrfs_delete_unused_bgs(), with
   the goal of freeing empty block groups;

2) btrfs_delete_unused_bgs() finds an empty block group, joins the current
   transaction (or starts a new one if none is running) and attempts to
   clear the EXTENT_DIRTY bit for the block group's range from freed_extents[0]
   and freed_extents[1] (of which one corresponds to fs_info->pinned_extents);

3) Clearing the EXTENT_DIRTY bit (via clear_extent_bits()) fails with
   -ENOMEM, but such error is ignored and btrfs_delete_unused_bgs() proceeds
   to delete the block group and the respective chunk, while pinned_extents
   remains with that bit set for the whole (or a part of the) range covered
   by the block group;

4) Later while the transaction is still running, the chunk ends up being reused
   for a new block group (maybe for different purpose, data or metadata), and
   extents belonging to the new block group are allocated for file data or btree
   nodes/leafs;

5) The current transaction is committed, meaning that we unpinned one or more
   extents from the new block group (through btrfs_finish_extent_commit() and
   unpin_extent_range()) which are now being used for new file data or new
   metadata.
   And unpinning means we returned the extents to the free space cache of the
   new block group, which implies those extents can be used for future
   allocations while they're still in use.

Alternatively, we can hit a BUG_ON() when doing a lookup for a block group's 
cache
object in unpin_extent_range() if a new block group didn't end up being 
allocated for
the same chunk (step 4 above).

Fix this by not freeing the block group and chunk if we fail to clear the dirty 
bit.

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/extent-tree.c | 13 +++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9ba886c..744b580 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -9523,10 +9523,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info 
*fs_info)
 */
start = block_group-key.objectid;
end = start + block_group-key.offset - 1;
-   clear_extent_bits(fs_info-freed_extents[0], start, end,
+   ret = clear_extent_bits(fs_info-freed_extents[0], start, end,
  EXTENT_DIRTY, GFP_NOFS);
-   clear_extent_bits(fs_info-freed_extents[1], start, end,
+   if (ret) {
+   btrfs_set_block_group_rw(root, block_group);
+   goto end_trans;
+   }
+   ret = clear_extent_bits(fs_info-freed_extents[1], start, end,
  EXTENT_DIRTY, GFP_NOFS);
+   if (ret) {
+   btrfs_set_block_group_rw(root, block_group);
+   goto end_trans;
+   }
 
/* Reset pinned so btrfs_put_block_group doesn't complain */
block_group-pinned = 0;
@@ -9537,6 +9545,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info 
*fs_info)
 */
ret = btrfs_remove_chunk(trans, root,
 block_group-key.objectid);
+end_trans:
btrfs_end_transaction(trans, root);
 next:
btrfs_put_block_group(block_group);
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] Btrfs: avoid premature -ENOMEM in clear_extent_bit()

2014-11-03 Thread Filipe Manana

We try to allocate an extent state structure before acquiring the extent
state tree's spinlock as we might need a new one later and therefore avoid
doing later an atomic allocation while holding the tree's spinlock. However
we returned -ENOMEM if that initial non-atomic allocation failed, which is
a bit excessive since we might end up not needing the pre-allocated extent
state at all - for the case where the tree doesn't have any extent states
that cover the input range and also cover some other range. Therefore don't
return -ENOMEM if that pre-allocation fails.

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/extent_io.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 654ed3d..4ebabd2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -595,9 +595,14 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 
start, u64 end,
clear = 1;
 again:
if (!prealloc  (mask  __GFP_WAIT)) {
+   /*
+* Don't care for allocation failure here because we might end
+* up not needing the pre-allocated extent state at all, which
+* is the case if we only have in the tree extent states that
+* cover our input range and don't cover too any other range.
+* If we end up needing a new extent state we allocate it later.
+*/
prealloc = alloc_extent_state(mask);
-   if (!prealloc)
-   return -ENOMEM;
}
 
spin_lock(tree-lock);
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] Btrfs: fix race when cleaning unused block groups

2014-11-05 Thread Filipe Manana

We have a race while deleting unused block groups that causes extents written
by past generations/transactions to be rewritten by the current transaction
before that transaction is committed. The steps that lead to this issue:

1) At transaction N one or more block groups became unused and we added them
   to the list fs_info->unused_bgs;

2) While still at transaction N we write btree extents to block group X and the
   transaction is committed;

3) The cleaner kthread is woken up and calls btrfs_delete_unused_bgs() to go through
   the list fs_info->unused_bgs and remove unused block groups;

4) Transaction N + 1 starts;

5) At transaction N + 1, block group X becomes unused and is added to the list
   fs_info->unused_bgs - this implies delayed refs were run, so we had the
   following function calls: btrfs_run_delayed_refs() -> __btrfs_free_extent()
   -> update_block_group(). The update_block_group() function grabs the lock
   fs_info->unused_bgs_lock, adds block group X to fs_info->unused_bgs and
   releases that lock;

6) The cleaner kthread, while at btrfs_delete_unused_bgs(), sees block group X
   added by transaction N + 1 because it's doing a loop that finishes only when
   the list fs_info->unused_bgs is empty and locks and unlocks the spinlock
   fs_info->unused_bgs_lock on each iteration. So it deletes the block group
   and its respective chunk is released. Even if it didn't do the lock/unlock
   per iteration, it could still see block group X in the list, because the
   cleaner kthread might call btrfs_delete_unused_bgs() multiple times (for
   example if there are several snapshots to delete);

7) A new block group X' is created for data, and it's associated to the same 
chunk
   that block group X was associated to;

8) Extents from block group X' are allocated for file data and for example an 
fsync
   makes the file data be effectively written to disk;

9) A crash/reboot happens before transaction N + 1 is committed;

10) On the next mount, we will read extents from block group/chunk X but they no
   longer have valid btree nodes/leafs - they have instead file data, and 
therefore
   all sorts of errors will happen.

So fix this by ensuring the cleaner kthread can never delete a block group that
became unused in the current transaction, that is, only delete block groups that
were added to the unused_bgs list by past transactions.

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/ctree.h   | 1 +
 fs/btrfs/disk-io.c | 1 +
 fs/btrfs/extent-tree.c | 5 +++--
 fs/btrfs/transaction.c | 5 +
 4 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 36f82ba..a5e471a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1726,6 +1726,7 @@ struct btrfs_fs_info {
 
spinlock_t unused_bgs_lock;
struct list_head unused_bgs;
+   struct list_head unused_bgs_to_clean;
 
/* For btrfs to record security options */
struct security_mnt_opts security_opts;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2409718..702bbdf 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2243,6 +2243,7 @@ int open_ctree(struct super_block *sb,
INIT_LIST_HEAD(fs_info-space_info);
INIT_LIST_HEAD(fs_info-tree_mod_seq_list);
INIT_LIST_HEAD(fs_info-unused_bgs);
+   INIT_LIST_HEAD(fs_info-unused_bgs_to_clean);
btrfs_mapping_init(fs_info-mapping_tree);
btrfs_init_block_rsv(fs_info-global_block_rsv,
 BTRFS_BLOCK_RSV_GLOBAL);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 744b580..bc1c0b7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8858,6 +8858,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
up_write(info-commit_root_sem);
 
spin_lock(info-unused_bgs_lock);
+   list_splice_init(info-unused_bgs_to_clean, info-unused_bgs);
while (!list_empty(info-unused_bgs)) {
block_group = list_first_entry(info-unused_bgs,
   struct btrfs_block_group_cache,
@@ -9466,10 +9467,10 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info 
*fs_info)
return;
 
spin_lock(fs_info-unused_bgs_lock);
-   while (!list_empty(fs_info-unused_bgs)) {
+   while (!list_empty(fs_info-unused_bgs_to_clean)) {
u64 start, end;
 
-   block_group = list_first_entry(fs_info-unused_bgs,
+   block_group = list_first_entry(fs_info-unused_bgs_to_clean,
   struct btrfs_block_group_cache,
   bg_list);
space_info = block_group-space_info;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 396ae8b..86d7cf5 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1937,6 +1937,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle 
*trans

[PATCH] Btrfs: avoid crash when overflowing a dir_item

2014-11-07 Thread Filipe Manana

When attempting to insert a new dir_item, we were calling btrfs_extend_item()
without checking if the leaf has enough space to extend the item. This made
btrfs_extend_item() crash through a BUG() call.
Therefore do the check and return ENOSPC if the leaf doesn't have enough space.

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/dir-item.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index fc8df86..65bf60e 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -53,6 +53,8 @@ static struct btrfs_dir_item *insert_with_overflow(struct 
btrfs_trans_handle
di = btrfs_match_dir_item_name(root, path, name, name_len);
if (di)
return ERR_PTR(-EEXIST);
+   if (btrfs_leaf_free_space(root, path-nodes[0])  data_size)
+   return ERR_PTR(-ENOSPC);
btrfs_extend_item(root, path, data_size);
} else if (ret  0)
return ERR_PTR(ret);
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] Btrfs: make xattr replace operations atomic

2014-11-07 Thread Filipe Manana

Replacing a xattr consists of doing a lookup for its existing value, deleting
the current value from the respective leaf, releasing the search path and then
finally inserting the new value. This leaves a time window where readers (getxattr,
listxattrs) won't see any value for the xattr. Xattrs are used to store ACLs,
so this has security implications.

This change also fixes 2 other existing issues which were:

*) Deleting the old xattr value without verifying first if the new xattr will
   fit in the existing leaf item (in case multiple xattrs are packed in the
   same item due to name hash collision);

*) Returning -EEXIST when the flag XATTR_CREATE is given and the xattr doesn't
   exist but we have an existing item that packs multiple xattrs with
   the same name hash as the input xattr. In this case we should return ENOSPC.

A test case for xfstests follows soon.

Thanks to Alexandre Oliva for reporting the non-atomicity of the xattr replace
implementation.

Reported-by: Alexandre Oliva ol...@gnu.org
Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/ctree.h|   4 ++
 fs/btrfs/dir-item.c |  10 ++--
 fs/btrfs/xattr.c| 142 ++--
 3 files changed, 88 insertions(+), 68 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a5e471a..9a47dfe 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3687,6 +3687,10 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct 
btrfs_trans_handle *trans,
 int verify_dir_item(struct btrfs_root *root,
struct extent_buffer *leaf,
struct btrfs_dir_item *dir_item);
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+struct btrfs_path *path,
+const char *name,
+int name_len);
 
 /* orphan.c */
 int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 65bf60e..c14e682 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -21,10 +21,6 @@
 #include hash.h
 #include transaction.h
 
-static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root 
*root,
- struct btrfs_path *path,
- const char *name, int name_len);
-
 /*
  * insert a name into a directory, doing overflow properly if there is a hash
  * collision.  data_size indicates how big the item inserted should be.  On
@@ -385,9 +381,9 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct 
btrfs_trans_handle *trans,
  * this walks through all the entries in a dir item and finds one
  * for a specific name.
  */
-static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root 
*root,
- struct btrfs_path *path,
- const char *name, int name_len)
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+struct btrfs_path *path,
+const char *name, int name_len)
 {
struct btrfs_dir_item *dir_item;
unsigned long name_ptr;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index dcf2013..3c8fba1 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -91,7 +91,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
   struct inode *inode, const char *name,
   const void *value, size_t size, int flags)
 {
-   struct btrfs_dir_item *di;
+   struct btrfs_dir_item *di = NULL;
struct btrfs_root *root = BTRFS_I(inode)-root;
struct btrfs_path *path;
size_t name_len = strlen(name);
@@ -104,83 +104,103 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
 
-   if (flags  XATTR_REPLACE) {
-   di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), 
name,
-   name_len, -1);
-   if (IS_ERR(di)) {
-   ret = PTR_ERR(di);
-   goto out;
-   } else if (!di) {
+   if (!value  ((flags  XATTR_REPLACE) || !flags)) {
+   /*
+* We're deleting only a xattr (no replace).
+* Don't follow the path below because it could leave a leaf
+* empty.
+*/
+   di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode),
+   name, name_len, -1);
+   if (!di  (flags  XATTR_REPLACE))
ret = -ENODATA;
-   goto out;
-   }
-   ret = btrfs_delete_one_dir_name(trans, root, path, di);
-   if (ret)
-   goto out;
-   btrfs_release_path(path);
+   else if (di

[PATCH] fstests: add generic test to verify xattr replace operations are atomic

2014-11-07 Thread Filipe Manana

This test verifies that replacing a xattr's value is an atomic
operation. This is motivated by an issue in btrfs where replacing
a xattr's value wasn't an atomic operation, it consisted of
removing the old value and then inserting the new value in a
btree. This made readers (getxattr and listxattrs) get neither
the old nor the new value during a short time window.

Signed-off-by: Filipe Manana fdman...@suse.com
---
 tests/generic/326 | 107 ++
 tests/generic/326.out |   2 +
 tests/generic/group   |   1 +
 3 files changed, 110 insertions(+)
 create mode 100755 tests/generic/326
 create mode 100644 tests/generic/326.out

diff --git a/tests/generic/326 b/tests/generic/326
new file mode 100755
index 000..c110fc0
--- /dev/null
+++ b/tests/generic/326
@@ -0,0 +1,107 @@
+#! /bin/bash
+# FSQA Test No. 326
+#
+# Verify that replacing a xattr's value is an atomic operation.
+# This is motivated by an issue in btrfs where replacing a xattr's value
+# wasn't an atomic operation, it consisted of removing the old value and
+# then inserting the new value in a btree. This made readers (getxattr
+# and listxattrs) not getting neither the old nor the new value during
+# a short time window.
+#
+# The btrfs issue was fixed by the following linux kernel patch:
+#
+#Btrfs: make xattr replace operations atomic
+#
+#---
+#
+# Copyright (C) 2014 SUSE Linux Products GmbH. All Rights Reserved.
+# Author: Filipe Manana fdman...@suse.com
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo QA output created by $seq
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap _cleanup; exit \$status 0 1 2 3 15
+
+_cleanup()
+{
+   if [ ! -z $setter_pid ]; then
+   kill $setter_pid  /dev/null
+   fi
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/attr
+
+# real QA test starts here
+_need_to_be_root
+_supported_fs generic
+_supported_os Linux
+_require_scratch
+_require_attrs
+
+rm -f $seqres.full
+
+xattr_name=user.something
+xattr_value1=foobar
+xattr_value2=rabbit_hole
+
+set_xattr_loop()
+{
+   local name=$1
+
+   local cur_val=$xattr_value1
+   while true; do
+   $SETFATTR_PROG -n $xattr_name -v $cur_val $SCRATCH_MNT/$name
+   if [ $cur_val == $xattr_value1 ]; then
+   cur_val=$xattr_value2
+   else
+   cur_val=$xattr_value1
+   fi
+   done
+}
+
+_scratch_mkfs $seqres.full 21
+_scratch_mount
+
+test_file=test_xattr_replace
+touch $SCRATCH_MNT/$test_file
+$SETFATTR_PROG -n $xattr_name -v $xattr_value1 $SCRATCH_MNT/$test_file
+
+set_xattr_loop $test_file 
+setter_pid=$!
+
+for ((i = 0; i  1000; i++)); do
+   xattr_val=$($GETFATTR_PROG --absolute-names -n $xattr_name \
+   $SCRATCH_MNT/$test_file | grep $xattr_name= | cut -d '=' -f 2)
+   if [ $xattr_val != \$xattr_value1\ -a \
+   $xattr_val != \$xattr_value2\ ]; then
+   _fail Missing or unexpected xattr value: $xattr_val
+   fi
+done
+
+kill $setter_pid  /dev/null
+unset setter_pid
+
+echo Silence is golden
+status=0
+exit
diff --git a/tests/generic/326.out b/tests/generic/326.out
new file mode 100644
index 000..4ac0db5
--- /dev/null
+++ b/tests/generic/326.out
@@ -0,0 +1,2 @@
+QA output created by 326
+Silence is golden
diff --git a/tests/generic/group b/tests/generic/group
index 9c82a6f..01f442d 100644
--- a/tests/generic/group
+++ b/tests/generic/group
@@ -152,3 +152,4 @@
 323 auto aio stress
 324 auto fsr quick
 325 auto quick data log
+326 auto quick xattr
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

1 2 3 4 5 6 7 8 >

1 - 100 of 716 matches

Mail list logo