Re: [PATCH 2/2] Btrfs: fix memory leak of pending_snapshot-inherit

2013-02-07 Thread Arne Jansen
On 02/07/13 07:02, Miao Xie wrote:
 The argument inherit of btrfs_ioctl_snap_create_transid() was assigned
 to NULL during we created the snapshots, so we didn't free it though we
 called kfree() in the caller.
 
 But since we are sure the snapshot creation is done after the function -
 btrfs_ioctl_snap_create_transid() - completes, it is safe that we don't
 assign the pointer inherit to NULL, and just free it in the caller of
 btrfs_ioctl_snap_create_transid(). In this way, the code can become more
 readable.

NAK. The snapshot creation is triggered from btrfs_commit_transaction,
I don't want to implicitly rely on commit_transaction being called for
each snapshot created. I'm not even sure the async path really commits
the transaction.
The responsibility for the creation is passed to the pending_snapshot
data structure, and so should the responsibility for the inherit struct.

-Arne

 
 Reported-by: Alex Lyakas alex.bt...@zadarastorage.com
 Cc: Arne Jansen sensi...@gmx.net
 Signed-off-by: Miao Xie mi...@cn.fujitsu.com
 ---
  fs/btrfs/ioctl.c | 18 +++---
  1 file changed, 7 insertions(+), 11 deletions(-)
 
 diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
 index 02d3035..40f2fbf 100644
 --- a/fs/btrfs/ioctl.c
 +++ b/fs/btrfs/ioctl.c
 @@ -367,7 +367,7 @@ static noinline int create_subvol(struct btrfs_root *root,
 struct dentry *dentry,
 char *name, int namelen,
 u64 *async_transid,
 -   struct btrfs_qgroup_inherit **inherit)
 +   struct btrfs_qgroup_inherit *inherit)
  {
   struct btrfs_trans_handle *trans;
   struct btrfs_key key;
 @@ -401,8 +401,7 @@ static noinline int create_subvol(struct btrfs_root *root,
   if (IS_ERR(trans))
   return PTR_ERR(trans);
  
 - ret = btrfs_qgroup_inherit(trans, root-fs_info, 0, objectid,
 -inherit ? *inherit : NULL);
 + ret = btrfs_qgroup_inherit(trans, root-fs_info, 0, objectid, inherit);
   if (ret)
   goto fail;
  
 @@ -530,7 +529,7 @@ fail:
  
  static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
  char *name, int namelen, u64 *async_transid,
 -bool readonly, struct btrfs_qgroup_inherit **inherit)
 +bool readonly, struct btrfs_qgroup_inherit *inherit)
  {
   struct inode *inode;
   struct btrfs_pending_snapshot *pending_snapshot;
 @@ -549,10 +548,7 @@ static int create_snapshot(struct btrfs_root *root, 
 struct dentry *dentry,
   pending_snapshot-dentry = dentry;
   pending_snapshot-root = root;
   pending_snapshot-readonly = readonly;
 - if (inherit) {
 - pending_snapshot-inherit = *inherit;
 - *inherit = NULL;/* take responsibility to free it */
 - }
 + pending_snapshot-inherit = inherit;
  
   trans = btrfs_start_transaction(root-fs_info-extent_root, 6);
   if (IS_ERR(trans)) {
 @@ -692,7 +688,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
  char *name, int namelen,
  struct btrfs_root *snap_src,
  u64 *async_transid, bool readonly,
 -struct btrfs_qgroup_inherit **inherit)
 +struct btrfs_qgroup_inherit *inherit)
  {
   struct inode *dir  = parent-dentry-d_inode;
   struct dentry *dentry;
 @@ -1454,7 +1450,7 @@ out:
  static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
   char *name, unsigned long fd, int subvol,
   u64 *transid, bool readonly,
 - struct btrfs_qgroup_inherit **inherit)
 + struct btrfs_qgroup_inherit *inherit)
  {
   int namelen;
   int ret = 0;
 @@ -1563,7 +1559,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct 
 file *file,
  
   ret = btrfs_ioctl_snap_create_transid(file, vol_args-name,
 vol_args-fd, subvol, ptr,
 -   readonly, inherit);
 +   readonly, inherit);
  
   if (ret == 0  ptr 
   copy_to_user(arg +
 

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] Btrfs: fix memory leak of pending_snapshot-inherit

2013-02-07 Thread Alex Lyakas
Arne, Miao,
I also agree that it is better to move this responsibility to
create_pending_snapshot().

Alex.


On Thu, Feb 7, 2013 at 10:43 AM, Arne Jansen sensi...@gmx.net wrote:
 On 02/07/13 07:02, Miao Xie wrote:
 The argument inherit of btrfs_ioctl_snap_create_transid() was assigned
 to NULL during we created the snapshots, so we didn't free it though we
 called kfree() in the caller.

 But since we are sure the snapshot creation is done after the function -
 btrfs_ioctl_snap_create_transid() - completes, it is safe that we don't
 assign the pointer inherit to NULL, and just free it in the caller of
 btrfs_ioctl_snap_create_transid(). In this way, the code can become more
 readable.

 NAK. The snapshot creation is triggered from btrfs_commit_transaction,
 I don't want to implicitly rely on commit_transaction being called for
 each snapshot created. I'm not even sure the async path really commits
 the transaction.
 The responsibility for the creation is passed to the pending_snapshot
 data structure, and so should the responsibility for the inherit struct.

 -Arne


 Reported-by: Alex Lyakas alex.bt...@zadarastorage.com
 Cc: Arne Jansen sensi...@gmx.net
 Signed-off-by: Miao Xie mi...@cn.fujitsu.com
 ---
  fs/btrfs/ioctl.c | 18 +++---
  1 file changed, 7 insertions(+), 11 deletions(-)

 diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
 index 02d3035..40f2fbf 100644
 --- a/fs/btrfs/ioctl.c
 +++ b/fs/btrfs/ioctl.c
 @@ -367,7 +367,7 @@ static noinline int create_subvol(struct btrfs_root 
 *root,
 struct dentry *dentry,
 char *name, int namelen,
 u64 *async_transid,
 -   struct btrfs_qgroup_inherit **inherit)
 +   struct btrfs_qgroup_inherit *inherit)
  {
   struct btrfs_trans_handle *trans;
   struct btrfs_key key;
 @@ -401,8 +401,7 @@ static noinline int create_subvol(struct btrfs_root 
 *root,
   if (IS_ERR(trans))
   return PTR_ERR(trans);

 - ret = btrfs_qgroup_inherit(trans, root-fs_info, 0, objectid,
 -inherit ? *inherit : NULL);
 + ret = btrfs_qgroup_inherit(trans, root-fs_info, 0, objectid, inherit);
   if (ret)
   goto fail;

 @@ -530,7 +529,7 @@ fail:

  static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
  char *name, int namelen, u64 *async_transid,
 -bool readonly, struct btrfs_qgroup_inherit 
 **inherit)
 +bool readonly, struct btrfs_qgroup_inherit *inherit)
  {
   struct inode *inode;
   struct btrfs_pending_snapshot *pending_snapshot;
 @@ -549,10 +548,7 @@ static int create_snapshot(struct btrfs_root *root, 
 struct dentry *dentry,
   pending_snapshot-dentry = dentry;
   pending_snapshot-root = root;
   pending_snapshot-readonly = readonly;
 - if (inherit) {
 - pending_snapshot-inherit = *inherit;
 - *inherit = NULL;/* take responsibility to free it */
 - }
 + pending_snapshot-inherit = inherit;

   trans = btrfs_start_transaction(root-fs_info-extent_root, 6);
   if (IS_ERR(trans)) {
 @@ -692,7 +688,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
  char *name, int namelen,
  struct btrfs_root *snap_src,
  u64 *async_transid, bool readonly,
 -struct btrfs_qgroup_inherit **inherit)
 +struct btrfs_qgroup_inherit *inherit)
  {
   struct inode *dir  = parent-dentry-d_inode;
   struct dentry *dentry;
 @@ -1454,7 +1450,7 @@ out:
  static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
   char *name, unsigned long fd, int subvol,
   u64 *transid, bool readonly,
 - struct btrfs_qgroup_inherit **inherit)
 + struct btrfs_qgroup_inherit *inherit)
  {
   int namelen;
   int ret = 0;
 @@ -1563,7 +1559,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct 
 file *file,

   ret = btrfs_ioctl_snap_create_transid(file, vol_args-name,
 vol_args-fd, subvol, ptr,
 -   readonly, inherit);
 +   readonly, inherit);

   if (ret == 0  ptr 
   copy_to_user(arg +


--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] Btrfs: fix memory leak of pending_snapshot-inherit

2013-02-07 Thread Miao Xie
On Thu, 07 Feb 2013 09:43:47 +0100, Arne Jansen wrote:
 On 02/07/13 07:02, Miao Xie wrote:
 The argument inherit of btrfs_ioctl_snap_create_transid() was assigned
 to NULL during we created the snapshots, so we didn't free it though we
 called kfree() in the caller.

 But since we are sure the snapshot creation is done after the function -
 btrfs_ioctl_snap_create_transid() - completes, it is safe that we don't
 assign the pointer inherit to NULL, and just free it in the caller of
 btrfs_ioctl_snap_create_transid(). In this way, the code can become more
 readable.
 
 NAK. The snapshot creation is triggered from btrfs_commit_transaction,
 I don't want to implicitly rely on commit_transaction being called for
 each snapshot created. I'm not even sure the async path really commits
 the transaction.
 The responsibility for the creation is passed to the pending_snapshot
 data structure, and so should the responsibility for the inherit struct.

I don't agree with you.

We are sure the async path really commits the transaction because we pass 1
as the value of the third argument into btrfs_commit_transaction_async(). It
means we must wait for the completion of the current transaction. So Freeing
the inherit struct in the caller is safe.
 
Besides that, the pending_snapshot data structure is also allocated and freed
by the same function in fact, why not use this style for the inherit struct.
I think it is more readable. Assigning a pointer to be NULL and freeing it
in the caller is very strange for the people who reads the code. (It is also
the reason why I made the mistake at the beginning.)

So I think my patch is reasonable.

Thanks
Miao

 -Arne
 

 Reported-by: Alex Lyakas alex.bt...@zadarastorage.com
 Cc: Arne Jansen sensi...@gmx.net
 Signed-off-by: Miao Xie mi...@cn.fujitsu.com
 ---
  fs/btrfs/ioctl.c | 18 +++---
  1 file changed, 7 insertions(+), 11 deletions(-)

 diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
 index 02d3035..40f2fbf 100644
 --- a/fs/btrfs/ioctl.c
 +++ b/fs/btrfs/ioctl.c
 @@ -367,7 +367,7 @@ static noinline int create_subvol(struct btrfs_root 
 *root,
struct dentry *dentry,
char *name, int namelen,
u64 *async_transid,
 -  struct btrfs_qgroup_inherit **inherit)
 +  struct btrfs_qgroup_inherit *inherit)
  {
  struct btrfs_trans_handle *trans;
  struct btrfs_key key;
 @@ -401,8 +401,7 @@ static noinline int create_subvol(struct btrfs_root 
 *root,
  if (IS_ERR(trans))
  return PTR_ERR(trans);
  
 -ret = btrfs_qgroup_inherit(trans, root-fs_info, 0, objectid,
 -   inherit ? *inherit : NULL);
 +ret = btrfs_qgroup_inherit(trans, root-fs_info, 0, objectid, inherit);
  if (ret)
  goto fail;
  
 @@ -530,7 +529,7 @@ fail:
  
  static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 char *name, int namelen, u64 *async_transid,
 -   bool readonly, struct btrfs_qgroup_inherit **inherit)
 +   bool readonly, struct btrfs_qgroup_inherit *inherit)
  {
  struct inode *inode;
  struct btrfs_pending_snapshot *pending_snapshot;
 @@ -549,10 +548,7 @@ static int create_snapshot(struct btrfs_root *root, 
 struct dentry *dentry,
  pending_snapshot-dentry = dentry;
  pending_snapshot-root = root;
  pending_snapshot-readonly = readonly;
 -if (inherit) {
 -pending_snapshot-inherit = *inherit;
 -*inherit = NULL;/* take responsibility to free it */
 -}
 +pending_snapshot-inherit = inherit;
  
  trans = btrfs_start_transaction(root-fs_info-extent_root, 6);
  if (IS_ERR(trans)) {
 @@ -692,7 +688,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
 char *name, int namelen,
 struct btrfs_root *snap_src,
 u64 *async_transid, bool readonly,
 -   struct btrfs_qgroup_inherit **inherit)
 +   struct btrfs_qgroup_inherit *inherit)
  {
  struct inode *dir  = parent-dentry-d_inode;
  struct dentry *dentry;
 @@ -1454,7 +1450,7 @@ out:
  static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
  char *name, unsigned long fd, int subvol,
  u64 *transid, bool readonly,
 -struct btrfs_qgroup_inherit **inherit)
 +struct btrfs_qgroup_inherit *inherit)
  {
  int namelen;
  int ret = 0;
 @@ -1563,7 +1559,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct 
 file *file,
  
  ret = btrfs_ioctl_snap_create_transid(file, vol_args-name,
vol_args-fd, subvol, ptr,
 -  

Re: [PATCH 2/2] Btrfs: fix memory leak of pending_snapshot-inherit

2013-02-07 Thread Arne Jansen
On 02/07/13 10:28, Miao Xie wrote:
 On Thu, 07 Feb 2013 09:43:47 +0100, Arne Jansen wrote:
 On 02/07/13 07:02, Miao Xie wrote:
 The argument inherit of btrfs_ioctl_snap_create_transid() was assigned
 to NULL during we created the snapshots, so we didn't free it though we
 called kfree() in the caller.

 But since we are sure the snapshot creation is done after the function -
 btrfs_ioctl_snap_create_transid() - completes, it is safe that we don't
 assign the pointer inherit to NULL, and just free it in the caller of
 btrfs_ioctl_snap_create_transid(). In this way, the code can become more
 readable.

 NAK. The snapshot creation is triggered from btrfs_commit_transaction,
 I don't want to implicitly rely on commit_transaction being called for
 each snapshot created. I'm not even sure the async path really commits
 the transaction.
 The responsibility for the creation is passed to the pending_snapshot
 data structure, and so should the responsibility for the inherit struct.
 
 I don't agree with you.
 
 We are sure the async path really commits the transaction because we pass 1
 as the value of the third argument into btrfs_commit_transaction_async(). It
 means we must wait for the completion of the current transaction. So Freeing
 the inherit struct in the caller is safe.

I see your point. But speaking of readability, I have to trace quite a
lot of functions to see that even the async path waits for the snapshot
to be created. Which makes the name 'async' sort of pointless.
So from what I've read so far I _think_ your patch does the right thing.
Thanks for clearing that up.

-Arne

  
 Besides that, the pending_snapshot data structure is also allocated and freed
 by the same function in fact, why not use this style for the inherit struct.
 I think it is more readable. Assigning a pointer to be NULL and freeing it
 in the caller is very strange for the people who reads the code. (It is also
 the reason why I made the mistake at the beginning.)
 
 So I think my patch is reasonable.
 
 Thanks
 Miao
 
 -Arne


 Reported-by: Alex Lyakas alex.bt...@zadarastorage.com
 Cc: Arne Jansen sensi...@gmx.net
 Signed-off-by: Miao Xie mi...@cn.fujitsu.com
 ---
  fs/btrfs/ioctl.c | 18 +++---
  1 file changed, 7 insertions(+), 11 deletions(-)

 diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
 index 02d3035..40f2fbf 100644
 --- a/fs/btrfs/ioctl.c
 +++ b/fs/btrfs/ioctl.c
 @@ -367,7 +367,7 @@ static noinline int create_subvol(struct btrfs_root 
 *root,
   struct dentry *dentry,
   char *name, int namelen,
   u64 *async_transid,
 - struct btrfs_qgroup_inherit **inherit)
 + struct btrfs_qgroup_inherit *inherit)
  {
 struct btrfs_trans_handle *trans;
 struct btrfs_key key;
 @@ -401,8 +401,7 @@ static noinline int create_subvol(struct btrfs_root 
 *root,
 if (IS_ERR(trans))
 return PTR_ERR(trans);
  
 -   ret = btrfs_qgroup_inherit(trans, root-fs_info, 0, objectid,
 -  inherit ? *inherit : NULL);
 +   ret = btrfs_qgroup_inherit(trans, root-fs_info, 0, objectid, inherit);
 if (ret)
 goto fail;
  
 @@ -530,7 +529,7 @@ fail:
  
  static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
char *name, int namelen, u64 *async_transid,
 -  bool readonly, struct btrfs_qgroup_inherit **inherit)
 +  bool readonly, struct btrfs_qgroup_inherit *inherit)
  {
 struct inode *inode;
 struct btrfs_pending_snapshot *pending_snapshot;
 @@ -549,10 +548,7 @@ static int create_snapshot(struct btrfs_root *root, 
 struct dentry *dentry,
 pending_snapshot-dentry = dentry;
 pending_snapshot-root = root;
 pending_snapshot-readonly = readonly;
 -   if (inherit) {
 -   pending_snapshot-inherit = *inherit;
 -   *inherit = NULL;/* take responsibility to free it */
 -   }
 +   pending_snapshot-inherit = inherit;
  
 trans = btrfs_start_transaction(root-fs_info-extent_root, 6);
 if (IS_ERR(trans)) {
 @@ -692,7 +688,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
char *name, int namelen,
struct btrfs_root *snap_src,
u64 *async_transid, bool readonly,
 -  struct btrfs_qgroup_inherit **inherit)
 +  struct btrfs_qgroup_inherit *inherit)
  {
 struct inode *dir  = parent-dentry-d_inode;
 struct dentry *dentry;
 @@ -1454,7 +1450,7 @@ out:
  static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
 char *name, unsigned long fd, int subvol,
 u64 *transid, bool readonly,
 -   struct btrfs_qgroup_inherit **inherit)
 +   struct 

[RFC][PATCH] Btrfs: fix deadlock due to unsubmitted

2013-02-07 Thread Miao Xie
The deadlock problem happened when running fsstress(a test program in LTP).

Steps to reproduce:
 # mkfs.btrfs -b 100M partition
 # mount partition mnt
 # Path/fsstress -p 3 -n 1000 -d mnt

The reason is:
btrfs_direct_IO()
 |-do_direct_IO()
 |-get_page()
 |-get_blocks()
 |   |-btrfs_delalloc_resereve_space()
 |   |-btrfs_add_ordered_extent() ---  Add a new ordered extent
 |-dio_send_cur_page(page0) -- We didn't submit bio here
 |-get_page()
 |-get_blocks()
 |-btrfs_delalloc_resereve_space()
 |-flush_space()
 |-btrfs_start_ordered_extent()
 |-wait_event() -- Wait the completion of
the ordered extent that is
mentioned above

But because we didn't submit the bio that is mentioned above, the ordered
extent can not complete, we would wait for its completion forever.

There are two methods which can fix this deadlock problem:
1. submit the bio before we invoke get_blocks()
2. reserve the space before we do dio

Though the 1st is the simplest way, we need modify the code of VFS, and it
is likely to break contiguous requests, and introduce performance regression
for the other filesystems.

So we have to choose the 2nd way.

Signed-off-by: Miao Xie mi...@cn.fujitsu.com
Cc: Josef Bacik jba...@fusionio.com
---
 fs/btrfs/extent-tree.c |3 +-
 fs/btrfs/inode.c   |   81 ---
 2 files changed, 43 insertions(+), 41 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 85b8454..ca9afc4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4670,7 +4670,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, 
u64 num_bytes)
spin_lock(BTRFS_I(inode)-lock);
dropped = drop_outstanding_extent(inode);
 
-   to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+   if (num_bytes)
+   to_free = calc_csum_metadata_size(inode, num_bytes, 0);
spin_unlock(BTRFS_I(inode)-lock);
if (dropped  0)
to_free += btrfs_calc_trans_metadata_size(root, dropped);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ca7ace7..c5d829d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6004,16 +6004,15 @@ static int btrfs_get_blocks_direct(struct inode *inode, 
sector_t iblock,
u64 len = bh_result-b_size;
struct btrfs_trans_handle *trans;
int unlock_bits = EXTENT_LOCKED;
-   int ret;
+   int ret = 0;
 
if (create) {
-   ret = btrfs_delalloc_reserve_space(inode, len);
-   if (ret)
-   return ret;
+   spin_lock(BTRFS_I(inode)-lock);
+   BTRFS_I(inode)-outstanding_extents++;
+   spin_unlock(BTRFS_I(inode)-lock);
unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
-   } else {
+   } else
len = min_t(u64, len, root-sectorsize);
-   }
 
lockstart = start;
lockend = start + len - 1;
@@ -6025,14 +6024,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, 
sector_t iblock,
if (lock_extent_direct(inode, lockstart, lockend, cached_state, 
create))
return -ENOTBLK;
 
-   if (create) {
-   ret = set_extent_bit(BTRFS_I(inode)-io_tree, lockstart,
-lockend, EXTENT_DELALLOC, NULL,
-cached_state, GFP_NOFS);
-   if (ret)
-   goto unlock_err;
-   }
-
em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
@@ -6064,7 +6055,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, 
sector_t iblock,
if (!create  (em-block_start == EXTENT_MAP_HOLE ||
test_bit(EXTENT_FLAG_PREALLOC, em-flags))) {
free_extent_map(em);
-   ret = 0;
goto unlock_err;
}
 
@@ -6162,6 +6152,11 @@ unlock:
 */
if (start + len  i_size_read(inode))
i_size_write(inode, start + len);
+
+   ret = set_extent_bit(BTRFS_I(inode)-io_tree, lockstart,
+lockstart + len - 1, EXTENT_DELALLOC, NULL,
+cached_state, GFP_NOFS);
+   BUG_ON(ret);
}
 
/*
@@ -6170,24 +6165,9 @@ unlock:
 * aren't using if there is any left over space.
 */
if (lockstart  lockend) {
-   if (create  len  lockend - lockstart) {
-   clear_extent_bit(BTRFS_I(inode)-io_tree, lockstart,
-lockstart + len - 1,
-unlock_bits | EXTENT_DEFRAG, 1, 0,
-cached_state, 

Re: [RFC][PATCH] Btrfs: fix deadlock due to unsubmitted

2013-02-07 Thread Chris Mason
On Thu, Feb 07, 2013 at 03:12:07AM -0700, Miao Xie wrote:
 The deadlock problem happened when running fsstress(a test program in LTP).
 
 Steps to reproduce:
  # mkfs.btrfs -b 100M partition
  # mount partition mnt
  # Path/fsstress -p 3 -n 1000 -d mnt
 
 The reason is:
 btrfs_direct_IO()
  |-do_direct_IO()
  |-get_page()
  |-get_blocks()
  | |-btrfs_delalloc_resereve_space()
  | |-btrfs_add_ordered_extent() ---  Add a new ordered extent
  |-dio_send_cur_page(page0) --   We didn't submit bio 
 here
  |-get_page()
  |-get_blocks()
|-btrfs_delalloc_resereve_space()
|-flush_space()
|-btrfs_start_ordered_extent()
|-wait_event() -- Wait the completion of
   the ordered extent that is
   mentioned above
 
 But because we didn't submit the bio that is mentioned above, the ordered
 extent can not complete, we would wait for its completion forever.
 
 There are two methods which can fix this deadlock problem:
 1. submit the bio before we invoke get_blocks()
 2. reserve the space before we do dio
 
 Though the 1st is the simplest way, we need modify the code of VFS, and it
 is likely to break contiguous requests, and introduce performance regression
 for the other filesystems.
 
 So we have to choose the 2nd way.

The 3rd option is to have get_blocks return -EAGAIN to the direct-io.c
code and let the higher levels submit the bios they have built.

Josef will probably go for option #4, which is dropping the generic code
completely and doing it all ourselves.

But I do like your approach, it makes sense here.

-chris
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH] Btrfs: fix deadlock due to unsubmitted

2013-02-07 Thread Josef Bacik
On Thu, Feb 07, 2013 at 03:12:07AM -0700, Miao Xie wrote:
 The deadlock problem happened when running fsstress(a test program in LTP).
 
 Steps to reproduce:
  # mkfs.btrfs -b 100M partition
  # mount partition mnt
  # Path/fsstress -p 3 -n 1000 -d mnt
 
 The reason is:
 btrfs_direct_IO()
  |-do_direct_IO()
  |-get_page()
  |-get_blocks()
  | |-btrfs_delalloc_resereve_space()
  | |-btrfs_add_ordered_extent() ---  Add a new ordered extent
  |-dio_send_cur_page(page0) --   We didn't submit bio 
 here
  |-get_page()
  |-get_blocks()
|-btrfs_delalloc_resereve_space()
|-flush_space()
|-btrfs_start_ordered_extent()
|-wait_event() -- Wait the completion of
   the ordered extent that is
   mentioned above
 
 But because we didn't submit the bio that is mentioned above, the ordered
 extent can not complete, we would wait for its completion forever.
 
 There are two methods which can fix this deadlock problem:
 1. submit the bio before we invoke get_blocks()
 2. reserve the space before we do dio
 
 Though the 1st is the simplest way, we need modify the code of VFS, and it
 is likely to break contiguous requests, and introduce performance regression
 for the other filesystems.
 
 So we have to choose the 2nd way.

I ran into this a few weeks ago and as Chris said I opted for option 4, just
do all the direct io stuff ourselves and ditch the generic stuff.  This approach
works for now though so I'm good with it.

 
 Signed-off-by: Miao Xie mi...@cn.fujitsu.com
 Cc: Josef Bacik jba...@fusionio.com
 ---
  fs/btrfs/extent-tree.c |3 +-
  fs/btrfs/inode.c   |   81 ---
  2 files changed, 43 insertions(+), 41 deletions(-)
 
 diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
 index 85b8454..ca9afc4 100644
 --- a/fs/btrfs/extent-tree.c
 +++ b/fs/btrfs/extent-tree.c
 @@ -4670,7 +4670,8 @@ void btrfs_delalloc_release_metadata(struct inode 
 *inode, u64 num_bytes)
   spin_lock(BTRFS_I(inode)-lock);
   dropped = drop_outstanding_extent(inode);
  
 - to_free = calc_csum_metadata_size(inode, num_bytes, 0);
 + if (num_bytes)
 + to_free = calc_csum_metadata_size(inode, num_bytes, 0);
   spin_unlock(BTRFS_I(inode)-lock);
   if (dropped  0)
   to_free += btrfs_calc_trans_metadata_size(root, dropped);
 diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
 index ca7ace7..c5d829d 100644
 --- a/fs/btrfs/inode.c
 +++ b/fs/btrfs/inode.c
 @@ -6004,16 +6004,15 @@ static int btrfs_get_blocks_direct(struct inode 
 *inode, sector_t iblock,
   u64 len = bh_result-b_size;
   struct btrfs_trans_handle *trans;
   int unlock_bits = EXTENT_LOCKED;
 - int ret;
 + int ret = 0;
  
   if (create) {
 - ret = btrfs_delalloc_reserve_space(inode, len);
 - if (ret)
 - return ret;
 + spin_lock(BTRFS_I(inode)-lock);
 + BTRFS_I(inode)-outstanding_extents++;
 + spin_unlock(BTRFS_I(inode)-lock);
   unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
 - } else {
 + } else
   len = min_t(u64, len, root-sectorsize);
 - }
  
   lockstart = start;
   lockend = start + len - 1;
 @@ -6025,14 +6024,6 @@ static int btrfs_get_blocks_direct(struct inode 
 *inode, sector_t iblock,
   if (lock_extent_direct(inode, lockstart, lockend, cached_state, 
 create))
   return -ENOTBLK;
  
 - if (create) {
 - ret = set_extent_bit(BTRFS_I(inode)-io_tree, lockstart,
 -  lockend, EXTENT_DELALLOC, NULL,
 -  cached_state, GFP_NOFS);
 - if (ret)
 - goto unlock_err;
 - }
 -
   em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
   if (IS_ERR(em)) {
   ret = PTR_ERR(em);
 @@ -6064,7 +6055,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, 
 sector_t iblock,
   if (!create  (em-block_start == EXTENT_MAP_HOLE ||
   test_bit(EXTENT_FLAG_PREALLOC, em-flags))) {
   free_extent_map(em);
 - ret = 0;

This doesn't look right, this should be left here.

   goto unlock_err;
   }
  
 @@ -6162,6 +6152,11 @@ unlock:
*/
   if (start + len  i_size_read(inode))
   i_size_write(inode, start + len);
 +
 + ret = set_extent_bit(BTRFS_I(inode)-io_tree, lockstart,
 +  lockstart + len - 1, EXTENT_DELALLOC, NULL,
 +  cached_state, GFP_NOFS);
 + BUG_ON(ret);

Return the error if there is one, there's no reason to add new BUG_ON()'s.
Thanks,

Josef
--
To unsubscribe from this list: send the line 

Re: [PATCH 2/2, RFC] btrfs-progs: overhaul mkfs.btrfs -r option

2013-02-07 Thread Eric Sandeen
On 2/6/13 6:08 PM, David Sterba wrote:
 On Tue, Jan 29, 2013 at 02:41:08PM -0600, Eric Sandeen wrote:
 The manpage for the -r option simply says that
 it will copy the path specified to -r into the newly
 made filesystem.

 There's not a lot of reason to treat that option as differently
 as it is now - today it ignores discard, fs size, and mixed
 options, for example.  It also failed to check whether the
 target device was mounted before proceeding.  Etc...

 Rework things so that we really follow the same paths whether
 or not -r is specified, but with one special case for -r:

 * If the device does not exist, it will be created as
   a regular file of the minimum size to hold the -r path,
   or of size specified by the -b option.

 This also changes a little behavior; it does not pre-fill
 the new file with zeros, but allows it to be sparse, and does
 not truncate an existing device file.  If you want to start
 with an empty file, just don't point it at an existing
 file...
 
 Fixes numerous bugs and I find the changes in behaviour sane and
 reasonable. A quick test failed for me when the image file does not
 exist:
 
 $ rm image
 $ ./mkfs.btrfs -r . image
 ...
 not enough free space
 add_file_items failed
 unable to traverse_directory
 Making image is aborted.
 ret = -1
 mkfs.btrfs: mkfs.c:1533: main: Assertion `!(ret)' failed.
 
 seems that the estimated size is not sufficient.

ho hum, thanks for checking.  I guess I only tried it on a very small
source directory.  For now, figuring out btrfs space usage for a given
set of files is above my pay grade and experience, I'm afraid.  :(

-Eric

 'du' on the directory (without the image itself) gives me 59M, the image
 after failed mkfs has 102M, and it's empty when I try to mount it. Using
 the progs .git directory (18M) as sourcedir also failed, but it worked
 with man/ subdirectory (94K).
 
 david
 

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2, RFC] btrfs-progs: overhaul mkfs.btrfs -r option

2013-02-07 Thread Chris Mason
On Thu, Feb 07, 2013 at 08:52:43AM -0700, Eric Sandeen wrote:
 On 2/6/13 6:08 PM, David Sterba wrote:
  On Tue, Jan 29, 2013 at 02:41:08PM -0600, Eric Sandeen wrote:
  The manpage for the -r option simply says that
  it will copy the path specified to -r into the newly
  made filesystem.
 
  There's not a lot of reason to treat that option as differently
  as it is now - today it ignores discard, fs size, and mixed
  options, for example.  It also failed to check whether the
  target device was mounted before proceeding.  Etc...
 
  Rework things so that we really follow the same paths whether
  or not -r is specified, but with one special case for -r:
 
  * If the device does not exist, it will be created as
a regular file of the minimum size to hold the -r path,
or of size specified by the -b option.
 
  This also changes a little behavior; it does not pre-fill
  the new file with zeros, but allows it to be sparse, and does
  not truncate an existing device file.  If you want to start
  with an empty file, just don't point it at an existing
  file...
  
  Fixes numerous bugs and I find the changes in behaviour sane and
  reasonable. A quick test failed for me when the image file does not
  exist:
  
  $ rm image
  $ ./mkfs.btrfs -r . image
  ...
  not enough free space
  add_file_items failed
  unable to traverse_directory
  Making image is aborted.
  ret = -1
  mkfs.btrfs: mkfs.c:1533: main: Assertion `!(ret)' failed.
  
  seems that the estimated size is not sufficient.
 
 ho hum, thanks for checking.  I guess I only tried it on a very small
 source directory.  For now, figuring out btrfs space usage for a given
 set of files is above my pay grade and experience, I'm afraid.  :(

We should probably make a way to compact a given FS down to minimal
size.  Then we don't have to worry about size estimates at all.

In other words, two passes instead of one.

-chris
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: btrfs balance - hang/crash

2013-02-07 Thread Michael Schneider
o.k.,
I've deleted some files from my btrfs filesystem rebooted the machine
and was able to cancel the balance operation thereafter.

So far, everything seems to be working again.

-Mike

On Wed, Feb 6, 2013 at 11:26 PM, Michael Schneider
mike2.schnei...@googlemail.com wrote:
 # btrfs fi show
 failed to read /dev/sr0
 Label: 'bootssd'  uuid: ac89584c-c757-488e-8a2a-5ee5a5484dde
 Total devices 1 FS bytes used 34.98GB
 devid1 size 56.00GB used 41.07GB path /dev/dm-1

 Btrfs v0.19+

 # btrfs fi df /
 Data: total=35.00GB, used=33.34GB
 System, DUP: total=32.00MB, used=16.00KB
 System: total=4.00MB, used=0.00
 Metadata, DUP: total=3.00GB, used=1.63GB


 On Wed, Feb 6, 2013 at 11:21 PM, Hugo Mills h...@carfax.org.uk wrote:
 On Wed, Feb 06, 2013 at 11:12:14PM +0100, Michael Schneider wrote:
 Hi,

 my btrfs hangs when doing a balance operation.

 I'm using a 3.7.1 kernel from opensuse:  linux-opzz 3.7.1-2.10-m4 #11 SMP
 PREEMPT Fri Jan 11 18:04:04 CET 2013 x86_64 x86_64 x86_64 GNU/Linux
 and Btrfs v0.19+

 I did a scrub which completed without errors. Then I tried btrfs filesystem
 balance / which work fine for the first 23 of 46 chunks, then ist stopped
 processing further chunks, but contiued to consume large amounts of the CPU.
 The system logged filled quickly with messages like:

 [  347.237658] btrfs: block rsv returned -28
 [snip]
 Any idea what's going on?

You're running out of space. Can you post the output of:

 # btrfs fi df /mountpoint
 # btrfs fi show

 this will give us a bit more info about likely scenarios. Also, josef
 published a patch(*) in the last few hours, on this list, which may
 help deal with the issue.

Hugo.

 (*) Btrfs: rework the overcommit logic to be based on the total size

 --
 === Hugo Mills: hugo@... carfax.org.uk | darksatanic.net | lug.org.uk ===
   PGP key: 515C238D from wwwkeys.eu.pgp.net or http://www.carfax.org.uk
   --- What do you give the man who has everything? -- Penicillin is ---
  a good start...
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs: steal from global reserve if we are cleaning up orphans

2013-02-07 Thread Josef Bacik
Sometimes xfstest 83 will fail to remount the scratch device because we've
gotten ourselves so full that we cannot cleanup the orphan items.  In this
case check to see if we're doing the orphan cleanup and if we are allow us
to steal our reservation from the global block rsv.  With this patch I've
not been able to reproduce the failed mount problem.  Thanks,

Signed-off-by: Josef Bacik jba...@fusionio.com
---
 fs/btrfs/ctree.h   |5 +
 fs/btrfs/extent-tree.c |   10 ++
 fs/btrfs/inode.c   |5 -
 3 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 50d1be9..2b68f88 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1237,6 +1237,11 @@ struct seq_list {
u64 seq;
 };
 
+enum btrfs_orphan_cleanup_state {
+   ORPHAN_CLEANUP_STARTED  = 1,
+   ORPHAN_CLEANUP_DONE = 2,
+};
+
 /* fs_info */
 struct reloc_control;
 struct btrfs_device;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 53c2898..8dbb45c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4099,6 +4099,16 @@ again:
goto again;
 
 out:
+   if (ret == -ENOSPC 
+   unlikely(root-orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
+   struct btrfs_block_rsv *global_rsv =
+   root-fs_info-global_block_rsv;
+
+   if (block_rsv != global_rsv 
+   !btrfs_block_rsv_migrate(global_rsv, block_rsv,
+orig_bytes))
+   ret = 0;
+   }
if (flushing) {
spin_lock(space_info-lock);
space_info-flush = 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 44d95d1..73e9409 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2218,11 +2218,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
}
 }
 
-enum btrfs_orphan_cleanup_state {
-   ORPHAN_CLEANUP_STARTED  = 1,
-   ORPHAN_CLEANUP_DONE = 2,
-};
-
 /*
  * This is called in transaction commit time. If there are no orphan
  * files in the subvolume, it removes orphan item and frees block_rsv
-- 
1.7.7.6

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs: steal from global reserve if we are cleaning up orphans V2

2013-02-07 Thread Josef Bacik
Sometimes xfstest 83 will fail to remount the scratch device because we've
gotten ourselves so full that we cannot cleanup the orphan items.  In this
case check to see if we're doing the orphan cleanup and if we are allow us
to steal our reservation from the global block rsv.  With this patch I've
not been able to reproduce the failed mount problem.  Thanks,

Signed-off-by: Josef Bacik jba...@fusionio.com
---
V1-V2: use block_rsv_use_bytes not migrate since we call add after we succeed
in reserve_metadata_bytes.

 fs/btrfs/ctree.h   |5 +
 fs/btrfs/extent-tree.c |   11 +++
 fs/btrfs/inode.c   |5 -
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 50d1be9..2b68f88 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1237,6 +1237,11 @@ struct seq_list {
u64 seq;
 };
 
+enum btrfs_orphan_cleanup_state {
+   ORPHAN_CLEANUP_STARTED  = 1,
+   ORPHAN_CLEANUP_DONE = 2,
+};
+
 /* fs_info */
 struct reloc_control;
 struct btrfs_device;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 53c2898..b1534ba 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -102,6 +102,8 @@ static void dump_space_info(struct btrfs_space_info *info, 
u64 bytes,
int dump_block_groups);
 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
   u64 num_bytes, int reserve);
+static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
+  u64 num_bytes);
 
 static noinline int
 block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -4099,6 +4101,15 @@ again:
goto again;
 
 out:
+   if (ret == -ENOSPC 
+   unlikely(root-orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
+   struct btrfs_block_rsv *global_rsv =
+   root-fs_info-global_block_rsv;
+
+   if (block_rsv != global_rsv 
+   !block_rsv_use_bytes(global_rsv, orig_bytes))
+   ret = 0;
+   }
if (flushing) {
spin_lock(space_info-lock);
space_info-flush = 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 44d95d1..73e9409 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2218,11 +2218,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
}
 }
 
-enum btrfs_orphan_cleanup_state {
-   ORPHAN_CLEANUP_STARTED  = 1,
-   ORPHAN_CLEANUP_DONE = 2,
-};
-
 /*
  * This is called in transaction commit time. If there are no orphan
  * files in the subvolume, it removes orphan item and frees block_rsv
-- 
1.7.7.6

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs: cleanup orphan reservation if truncate fails

2013-02-07 Thread Josef Bacik
I noticed we were getting lots of warnings with xfstest 83 because we have
reservations outstanding.  This is because we moved the orphan add outside
of the truncate, but we don't actually cleanup our reservation if something
fails.  This fixes the problem and I no longer see warnings.  Thanks,

Signed-off-by: Josef Bacik jba...@fusionio.com
---
 fs/btrfs/inode.c |2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 73e9409..905297f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2538,6 +2538,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
goto out;
 
ret = btrfs_truncate(inode);
+   if (ret)
+   btrfs_orphan_del(NULL, inode);
} else {
nr_unlink++;
}
-- 
1.7.7.6

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC] Btrfs: Allow the compressed extent size limit to be modified v2

2013-02-07 Thread Mitch Harder
Provide for modification of the limit of compressed extent size
utilizing mount-time configuration settings.

The size of compressed extents was limited to 128K, which
leads to fragmentation of the extents (although the extents
themselves may still be located contiguously).  This limit is
put in place to ease the RAM required when spreading compression
across several CPUs, and to make sure the amount of IO required
to do a random read is reasonably small.

Signed-off-by: Mitch Harder mitch.har...@sabayonlinux.org
---
Changelog v1 - v2:
- Use more self-documenting variable name:
  compressed_extent_size - max_compressed_extent_kb
- Use #define BTRFS_DEFAULT_MAX_COMPR_EXTENTS instead of raw 128.
- Fix min calculation for nr_pages.
- Comment cleanup.
- Use more self-documenting mount option parameter:
  compressed_extent_size - max_compressed_extent_kb
- Fix formatting in btrfs_show_options.
---
 fs/btrfs/ctree.h  |  6 ++
 fs/btrfs/disk-io.c|  1 +
 fs/btrfs/inode.c  |  8 
 fs/btrfs/relocation.c |  7 ---
 fs/btrfs/super.c  | 20 +++-
 5 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 547b7b0..a62f20c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -191,6 +191,9 @@ static int btrfs_csum_sizes[] = { 4, 0 };
 /* ioprio of readahead is set to idle */
 #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
 
+/* Default value for maximum compressed extent size (kb) */
+#define BTRFS_DEFAULT_MAX_COMPR_EXTENTS128
+
 /*
  * The key defines the order in the tree, and so it also defines (optimal)
  * block layout.
@@ -1477,6 +1480,8 @@ struct btrfs_fs_info {
unsigned data_chunk_allocations;
unsigned metadata_ratio;
 
+   unsigned max_compressed_extent_kb;
+
void *bdev_holder;
 
/* private scrub information */
@@ -1829,6 +1834,7 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_CHECK_INTEGRITY(1  20)
 #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1  21)
 #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR   (1  22)
+#define BTRFS_MOUNT_COMPR_EXTENT_SIZE (1  23)
 
 #define btrfs_clear_opt(o, opt)((o) = ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)  ((o) |= BTRFS_MOUNT_##opt)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 830bc17..775e7ba 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2056,6 +2056,7 @@ int open_ctree(struct super_block *sb,
fs_info-trans_no_join = 0;
fs_info-free_chunk_space = 0;
fs_info-tree_mod_log = RB_ROOT;
+   fs_info-max_compressed_extent_kb = BTRFS_DEFAULT_MAX_COMPR_EXTENTS;
 
/* readahead state */
INIT_RADIX_TREE(fs_info-reada_tree, GFP_NOFS  ~__GFP_WAIT);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 148abeb..78fc6eb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -346,8 +346,8 @@ static noinline int compress_file_range(struct inode *inode,
unsigned long nr_pages_ret = 0;
unsigned long total_compressed = 0;
unsigned long total_in = 0;
-   unsigned long max_compressed = 128 * 1024;
-   unsigned long max_uncompressed = 128 * 1024;
+   unsigned long max_compressed = root-fs_info-max_compressed_extent_kb 
* 1024;
+   unsigned long max_uncompressed = 
root-fs_info-max_compressed_extent_kb * 1024;
int i;
int will_compress;
int compress_type = root-fs_info-compress_type;
@@ -361,7 +361,7 @@ static noinline int compress_file_range(struct inode *inode,
 again:
will_compress = 0;
nr_pages = (end  PAGE_CACHE_SHIFT) - (start  PAGE_CACHE_SHIFT) + 1;
-   nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
+   nr_pages = min(nr_pages, max_compressed / PAGE_CACHE_SIZE);
 
/*
 * we don't want to send crud past the end of i_size through
@@ -386,7 +386,7 @@ again:
 *
 * We also want to make sure the amount of IO required to do
 * a random read is reasonably small, so we limit the size of
-* a compressed extent to 128k.
+* a compressed extent.
 */
total_compressed = min(total_compressed, max_uncompressed);
num_bytes = (end - start + blocksize)  ~(blocksize - 1);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 300e09a..64bbc9e 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -144,7 +144,7 @@ struct tree_block {
unsigned int key_ready:1;
 };
 
-#define MAX_EXTENTS 128
+#define MAX_EXTENTS 512
 
 struct file_extent_cluster {
u64 start;
@@ -3055,6 +3055,7 @@ int relocate_data_extent(struct inode *inode, struct 
btrfs_key *extent_key,
 struct file_extent_cluster *cluster)
 {
int ret;
+   struct btrfs_fs_info *fs_info = BTRFS_I(inode)-root-fs_info;
 
if (cluster-nr  0  extent_key-objectid != cluster-end + 1) {
ret = 

Re: [RFC] Btrfs: Allow the compressed extent size limit to be modified v2

2013-02-07 Thread Zach Brown
On Thu, Feb 07, 2013 at 03:38:34PM -0600, Mitch Harder wrote:
 Provide for modification of the limit of compressed extent size
 utilizing mount-time configuration settings.
 
 The size of compressed extents was limited to 128K, which
 leads to fragmentation of the extents (although the extents
 themselves may still be located contiguously).  This limit is
 put in place to ease the RAM required when spreading compression
 across several CPUs, and to make sure the amount of IO required
 to do a random read is reasonably small.
 
 Signed-off-by: Mitch Harder mitch.har...@sabayonlinux.org
 ---
 Changelog v1 - v2:
 - Use more self-documenting variable name:
   compressed_extent_size - max_compressed_extent_kb
 - Use #define BTRFS_DEFAULT_MAX_COMPR_EXTENTS instead of raw 128.
 - Fix min calculation for nr_pages.
 - Comment cleanup.
 - Use more self-documenting mount option parameter:
   compressed_extent_size - max_compressed_extent_kb
 - Fix formatting in btrfs_show_options.

Cool, thanks for cleaning those up.  It looks a lot better now.

- z
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] Btrfs: Allow the compressed extent size limit to be modified v2

2013-02-07 Thread David Sterba
On Thu, Feb 07, 2013 at 03:38:34PM -0600, Mitch Harder wrote:
 --- a/fs/btrfs/relocation.c
 +++ b/fs/btrfs/relocation.c
 @@ -144,7 +144,7 @@ struct tree_block {
   unsigned int key_ready:1;
  };
  
 -#define MAX_EXTENTS 128
 +#define MAX_EXTENTS 512

Is this really related to compression? IIRC I've seen it only in context
of batch work in reloc, but not anywhere near compression. (I may be
wrong of course, just checking).

david
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/6] Btrfs-progs: move btrfs-select-super.c

2013-02-07 Thread Ian Kumlien
This patch moves
btrfs-select-super.c - cmds-rescue-super-ops.c

This is done in preparation to merge the select-super
functionality to btrfs.

The naming is because we will also merge
btrfs-dump-super.c from Josef Bacik

Signed-off-by: Ian Kumlien po...@demius.net
---
 btrfs-select-super.c = cmds-rescue-super-ops.c | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename btrfs-select-super.c = cmds-rescue-super-ops.c (100%)

diff --git a/btrfs-select-super.c b/cmds-rescue-super-ops.c
similarity index 100%
rename from btrfs-select-super.c
rename to cmds-rescue-super-ops.c
-- 
1.8.1.2

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/6] Btrfs-progs: Rename btrfsck.c - cmds-check.c

2013-02-07 Thread Ian Kumlien
In preparation for merging btrfsck functionality in to btrfs.

Signed-off-by: Ian Kumlien po...@demius.net
---
 btrfsck.c = cmds-check.c | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename btrfsck.c = cmds-check.c (100%)

diff --git a/btrfsck.c b/cmds-check.c
similarity index 100%
rename from btrfsck.c
rename to cmds-check.c
-- 
1.8.1.2

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Btrfs-progs: Merge btrfs-restore, btrfsck, btrfs-select-super, btrfs-dump-super and btrfs-debug-tree in to btrfs

2013-02-07 Thread Ian Kumlien
Hi,

This patch series moves some of the commands around to reflect that
they are now subcommands of btrfs.

As a stage in this we also add support for btrfs being called as 
btrfsck which yeilds the, now(?), starnard btrfs check or 
fsck.btrfs which is a noop to avoid complications with distributions.

We also merge Josef Baciks btrfs-dump-super in patch number 4.

This patchset has been rebased on top of David Sterbas latest WIP changes.

Comments? Ideas? Suggestions?

Please help me with all the help text and verify that it's correct enough ;)

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/6] Btrfs-progs: move debug-tree.c - cmds-rescue-debug-tree.c

2013-02-07 Thread Ian Kumlien
The btrfs-debug-tree functionality will be integrated
in to btrfs as btrfs rescue debug-tree

Signed-off-by: Ian Kumlien po...@demius.net
---
 debug-tree.c = cmds-rescue-debug-tree.c | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename debug-tree.c = cmds-rescue-debug-tree.c (100%)

diff --git a/debug-tree.c b/cmds-rescue-debug-tree.c
similarity index 100%
rename from debug-tree.c
rename to cmds-rescue-debug-tree.c
-- 
1.8.1.2

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 6/6] Btrfs-progs: add the rescue section to btrfs

2013-02-07 Thread Ian Kumlien
the btrfs command now lists:
btrfs rescue select-super -s number device
Select a superblock
btrfs rescue dump-super device
Dump a superblock to disk
btrfs rescue debug-tree [options] device
Debug the filesystem

btrfs-dump-super.c was imported in to cmds-rescue-super-ops.c

This patch integrates all the functionality...

cmds-rescue.c is used to glue cmds-rescue-debug-tree.c,
cmds-rescue-restore.c and cmds-rescue-super-ops.c together to
make the source files more managable.

Signed-off-by: Ian Kumlien po...@demius.net
---
 Makefile |  34 +---
 btrfs-dump-super.c   |  87 ---
 btrfs.c  |   2 +
 cmds-rescue-debug-tree.c |  44 ++--
 cmds-rescue-super-ops.c  | 103 +--
 cmds-rescue.c|  48 ++
 cmds-restore.c   |  42 +++
 commands.h   |   8 +++-
 8 files changed, 195 insertions(+), 173 deletions(-)
 delete mode 100644 btrfs-dump-super.c
 create mode 100644 cmds-rescue.c

diff --git a/Makefile b/Makefile
index 94541b2..7e2db76 100644
--- a/Makefile
+++ b/Makefile
@@ -8,7 +8,9 @@ objects = ctree.o disk-io.o radix-tree.o extent-tree.o 
print-tree.o \
  send-stream.o send-utils.o qgroup.o raid6.o
 cmds_objects = cmds-subvolume.o cmds-filesystem.o cmds-device.o cmds-scrub.o \
   cmds-inspect.o cmds-balance.o cmds-send.o cmds-receive.o \
-  cmds-quota.o cmds-qgroup.o cmds-replace.o cmds-check.o
+  cmds-quota.o cmds-qgroup.o cmds-replace.o cmds-check.o \
+  cmds-rescue.o cmds-rescue-super-ops.o \
+  cmds-rescue-debug-tree.o cmds-restore.o
 
 CHECKFLAGS= -D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ -Wbitwise \
-Wuninitialized -Wshadow -Wundef
@@ -17,8 +19,7 @@ DEPFLAGS = -Wp,-MMD,$(@D)/.$(@F).d,-MT,$@
 INSTALL = install
 prefix ?= /usr/local
 bindir = $(prefix)/bin
-LIBS=-luuid -lm
-RESTORE_LIBS=-lz -llzo2
+LIBS=-luuid -lm -lz -llzo2
 
 ifeq ($(origin V), command line)
   BUILD_VERBOSE = $(V)
@@ -35,10 +36,9 @@ endif
 
 MAKEOPTS = --no-print-directory Q=$(Q)
 
-progs = btrfsctl mkfs.btrfs btrfs-debug-tree btrfs-show btrfs-vol \
-   btrfs btrfs-map-logical btrfs-image btrfs-zero-log btrfs-convert \
-   btrfs-find-root btrfs-restore btrfstune btrfs-show-super \
-   btrfs-dump-super
+progs = btrfsctl mkfs.btrfs btrfs-show btrfs-vol btrfs \
+   btrfs-map-logical btrfs-image btrfs-zero-log btrfs-convert \
+   btrfs-find-root btrfstune \
 
 # Create all the static targets
 static_objects = $(patsubst %.o, %.static.o, $(objects))
@@ -95,10 +95,6 @@ btrfs-find-root: $(objects) find-root.o
@echo [LD] $@
$(Q)$(CC) $(CFLAGS) -o btrfs-find-root find-root.o $(objects) 
$(LDFLAGS) $(LIBS)
 
-btrfs-restore: $(objects) restore.o
-   @echo [LD] $@
-   $(Q)$(CC) $(CFLAGS) -o btrfs-restore restore.o $(objects) $(LDFLAGS) 
$(LIBS) $(RESTORE_LIBS)
-
 btrfsctl: $(objects) btrfsctl.o
@echo [LD] $@
$(Q)$(CC) $(CFLAGS) -o btrfsctl btrfsctl.o $(objects) $(LDFLAGS) $(LIBS)
@@ -115,10 +111,6 @@ mkfs.btrfs: $(objects) mkfs.o
@echo [LD] $@
$(Q)$(CC) $(CFLAGS) -o mkfs.btrfs $(objects) mkfs.o $(LDFLAGS) $(LIBS) 
-lblkid
 
-btrfs-debug-tree: $(objects) debug-tree.o
-   @echo [LD] $@
-   $(Q)$(CC) $(CFLAGS) -o btrfs-debug-tree $(objects) debug-tree.o 
$(LDFLAGS) $(LIBS)
-
 btrfs-zero-log: $(objects) btrfs-zero-log.o
@echo [LD] $@
$(Q)$(CC) $(CFLAGS) -o btrfs-zero-log $(objects) btrfs-zero-log.o 
$(LDFLAGS) $(LIBS)
@@ -127,14 +119,6 @@ btrfs-show-super: $(objects) btrfs-show-super.o
@echo [LD] $@
$(Q)$(CC) $(CFLAGS) -o btrfs-show-super $(objects) btrfs-show-super.o 
$(LDFLAGS) $(LIBS)
 
-btrfs-select-super: $(objects) btrfs-select-super.o
-   @echo [LD] $@
-   $(Q)$(CC) $(CFLAGS) -o btrfs-select-super $(objects) 
btrfs-select-super.o $(LDFLAGS) $(LIBS)
-
-btrfs-dump-super: $(objects) btrfs-dump-super.o
-   @echo [LD] $@
-   $(Q)$(CC) $(CFLAGS) -o btrfs-dump-super $(objects) btrfs-dump-super.o 
$(LDFLAGS) $(LIBS)
-
 btrfstune: $(objects) btrfstune.o
@echo [LD] $@
$(Q)$(CC) $(CFLAGS) -o btrfstune $(objects) btrfstune.o $(LDFLAGS) 
$(LIBS)
@@ -175,8 +159,8 @@ install-man:
 
 clean :
@echo Cleaning
-   $(Q)rm -f $(progs) cscope.out *.o .*.d btrfs-convert btrfs-image 
btrfs-select-super \
- btrfs-zero-log btrfstune btrfs-dump-super dir-test ioctl-test 
quick-test \
+   $(Q)rm -f $(progs) cscope.out *.o .*.d btrfs-convert btrfs-image \
+ btrfs-zero-log btrfstune dir-test ioctl-test quick-test \
  version.h
$(Q)$(MAKE) $(MAKEOPTS) -C man $@
 
diff --git a/btrfs-dump-super.c b/btrfs-dump-super.c
deleted file mode 100644
index 033140c..000
--- 

[PATCH 5/6] Btrfs-progs: restore.c - cmds-restore.c

2013-02-07 Thread Ian Kumlien
The btrfs-restore functionality will be integrated in
btrs as btrfs restore

Signed-off-by: Ian Kumlien po...@demius.net
---
 restore.c = cmds-restore.c | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename restore.c = cmds-restore.c (100%)

diff --git a/restore.c b/cmds-restore.c
similarity index 100%
rename from restore.c
rename to cmds-restore.c
-- 
1.8.1.2

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/6] Btrfs-progs: add btrfsck functionality to btrfs

2013-02-07 Thread Ian Kumlien
This patch includes the functionality of btrfs, it's
found as btrfs check however it makes the binary
behave differently depending on what it's run as.

btrfsck - will act like normal btrfsck
fsck.btrfs - noop

Signed-off-by: Ian Kumlien po...@demius.net
---
 Makefile |  8 ++-
 btrfs.c  | 68 
 cmds-check.c | 40 ---
 commands.h   |  3 +++
 4 files changed, 78 insertions(+), 41 deletions(-)

diff --git a/Makefile b/Makefile
index 2c2a500..94541b2 100644
--- a/Makefile
+++ b/Makefile
@@ -8,7 +8,7 @@ objects = ctree.o disk-io.o radix-tree.o extent-tree.o 
print-tree.o \
  send-stream.o send-utils.o qgroup.o raid6.o
 cmds_objects = cmds-subvolume.o cmds-filesystem.o cmds-device.o cmds-scrub.o \
   cmds-inspect.o cmds-balance.o cmds-send.o cmds-receive.o \
-  cmds-quota.o cmds-qgroup.o cmds-replace.o
+  cmds-quota.o cmds-qgroup.o cmds-replace.o cmds-check.o
 
 CHECKFLAGS= -D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ -Wbitwise \
-Wuninitialized -Wshadow -Wundef
@@ -35,7 +35,7 @@ endif
 
 MAKEOPTS = --no-print-directory Q=$(Q)
 
-progs = btrfsctl mkfs.btrfs btrfs-debug-tree btrfs-show btrfs-vol btrfsck \
+progs = btrfsctl mkfs.btrfs btrfs-debug-tree btrfs-show btrfs-vol \
btrfs btrfs-map-logical btrfs-image btrfs-zero-log btrfs-convert \
btrfs-find-root btrfs-restore btrfstune btrfs-show-super \
btrfs-dump-super
@@ -111,10 +111,6 @@ btrfs-show: $(objects) btrfs-show.o
@echo [LD] $@
$(Q)$(CC) $(CFLAGS) -o btrfs-show btrfs-show.o $(objects) $(LDFLAGS) 
$(LIBS)
 
-btrfsck: $(objects) btrfsck.o
-   @echo [LD] $@
-   $(Q)$(CC) $(CFLAGS) -o btrfsck btrfsck.o $(objects) $(LDFLAGS) $(LIBS)
-
 mkfs.btrfs: $(objects) mkfs.o
@echo [LD] $@
$(Q)$(CC) $(CFLAGS) -o mkfs.btrfs $(objects) mkfs.o $(LDFLAGS) $(LIBS) 
-lblkid
diff --git a/btrfs.c b/btrfs.c
index 7b0e50f..cf2f320 100644
--- a/btrfs.c
+++ b/btrfs.c
@@ -48,8 +48,13 @@ int prefixcmp(const char *str, const char *prefix)
return (unsigned char)*prefix - (unsigned char)*str;
 }
 
-static int parse_one_token(const char *arg, const struct cmd_group *grp,
-  const struct cmd_struct **cmd_ret)
+#define parse_one_token(arg, grp, cmd_ret) \
+   _parse_one_token((arg), (grp), (cmd_ret), 0)
+#define parse_one_exact_token(arg, grp, cmd_ret) \
+   _parse_one_token((arg), (grp), (cmd_ret), 1)
+
+static int _parse_one_token(const char *arg, const struct cmd_group *grp,
+  const struct cmd_struct **cmd_ret, int exact)
 {
const struct cmd_struct *cmd = grp-commands;
const struct cmd_struct *abbrev_cmd = NULL, *ambiguous_cmd = NULL;
@@ -80,12 +85,15 @@ static int parse_one_token(const char *arg, const struct 
cmd_group *grp,
return 0;
}
 
-   if (ambiguous_cmd)
-   return -2;
+   if (!exact)
+   {
+   if (ambiguous_cmd)
+   return -2;
 
-   if (abbrev_cmd) {
-   *cmd_ret = abbrev_cmd;
-   return 0;
+   if (abbrev_cmd) {
+   *cmd_ret = abbrev_cmd;
+   return 0;
+   }
}
 
return -1;
@@ -246,6 +254,7 @@ const struct cmd_group btrfs_cmd_group = {
{ balance, cmd_balance, NULL, balance_cmd_group, 0 },
{ device, cmd_device, NULL, device_cmd_group, 0 },
{ scrub, cmd_scrub, NULL, scrub_cmd_group, 0 },
+   { check, cmd_check, cmd_check_usage, NULL, 0 },
{ inspect-internal, cmd_inspect, NULL, inspect_cmd_group, 0 
},
{ send, cmd_send, cmd_send_usage, NULL, 0 },
{ receive, cmd_receive, cmd_receive_usage, NULL, 0 },
@@ -258,24 +267,47 @@ const struct cmd_group btrfs_cmd_group = {
},
 };
 
+static int cmd_dummy(int argc, char **argv)
+{
+   return 0;
+}
+
+/* change behaviour depending on what we're called */
+const struct cmd_group function_cmd_group = {
+   NULL, NULL,
+   {
+   { btrfsck, cmd_check, NULL, NULL, 0 },
+   { fsck.btrfs, cmd_dummy, NULL, NULL, 0 },
+   { 0, 0, 0, 0, 0 }
+   },
+};
+
 int main(int argc, char **argv)
 {
const struct cmd_struct *cmd;
+   char *called_as = strrchr(argv[0], '/');
+   if (called_as)
+   argv[0] = ++called_as;
 
crc32c_optimization_init();
 
-   argc--;
-   argv++;
-   handle_options(argc, argv);
-   if (argc  0) {
-   if (!prefixcmp(argv[0], --))
-   argv[0] += 2;
-   } else {
-   usage_command_group(btrfs_cmd_group, 0, 0);
-   exit(1);
-   }
+   /* if we have cmd, we're started as a sub command */
+   if 

Re: [RFC] Btrfs: Allow the compressed extent size limit to be modified v2

2013-02-07 Thread Mitch Harder
On Thu, Feb 7, 2013 at 6:28 PM, David Sterba d...@jikos.cz wrote:
 On Thu, Feb 07, 2013 at 03:38:34PM -0600, Mitch Harder wrote:
 --- a/fs/btrfs/relocation.c
 +++ b/fs/btrfs/relocation.c
 @@ -144,7 +144,7 @@ struct tree_block {
   unsigned int key_ready:1;
  };

 -#define MAX_EXTENTS 128
 +#define MAX_EXTENTS 512

 Is this really related to compression? IIRC I've seen it only in context
 of batch work in reloc, but not anywhere near compression. (I may be
 wrong of course, just checking).


When you defragment compressed extents, it will run through relocation.

If autodefrag is enabled, I found most everything I touched was
running through relocation.

It has been a while since I looked at the issue, but I think balancing
your data will also run through relocation.
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs: fix the deadlock between the transaction attach and commit

2013-02-07 Thread Miao Xie
Here is the whole story:
Trans_Attach_Task   Trans_Commit_Task
btrfs_commit_transaction()
 |-wait writers to be 1
btrfs_attach_transaction()   |
btrfs_commit_transaction()   |
 |   |-set trans_no_join to 1
 |   |  (close join transaction)
 |-btrfs_run_ordered_operations |
(Those ordered operations|
 are added when releasing|
 file)   |
 |-btrfs_join_transaction() |
|-wait_commit() |
 |-wait writers to be 1

Then these two tasks waited for each other.

As we know, btrfs_attach_transaction() is used to catch the current
transaction, and commit it, so if someone has committed the transaction,
it is unnecessary to join it and commit it, wait is the best choice
for it. In this way, we can fix the above problem.

Signed-off-by: Miao Xie mi...@cn.fujitsu.com
---
 fs/btrfs/transaction.c |8 
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f154946..7be9d5e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -285,6 +285,14 @@ static int may_wait_transaction(struct btrfs_root *root, 
int type)
if (type == TRANS_USERSPACE)
return 1;
 
+   /*
+* If we are ATTACH, it means we just want to catch the current
+* transaction and commit it. So if someone is committing the
+* current transaction now, it is very glad to wait it.
+*/
+   if (type == TRANS_ATTACH)
+   return 1;
+
if (type == TRANS_START 
!atomic_read(root-fs_info-open_ioctl_trans))
return 1;
-- 
1.6.5.2
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH V2 1/2] Btrfs: serialize unlocked dio reads with truncate

2013-02-07 Thread Miao Xie
Currently, we can do unlocked dio reads, but the following race
is possible:

dio_read_task   truncate_task
-btrfs_setattr()
-btrfs_direct_IO
-__blockdev_direct_IO
  -btrfs_get_block
  -btrfs_truncate()
 #alloc truncated blocks
 #to other inode
  -submit_io()
 #INFORMATION LEAK

In order to avoid this problem, we must serialize unlocked dio reads with
truncate. There are two approaches:
- use extent lock to protect the extent that we truncate
- use inode_dio_wait() to make sure the truncating task will wait for
  the read DIO.

If we use the 1st one, we will meet the endless truncation problem due to
the nonlocked read DIO after we implement the nonlocked write DIO. It is
because we still need invoke inode_dio_wait() avoid the race between write
DIO and truncation. By that time, we have to introduce

  btrfs_inode_{block, resume}_nolock_dio()

again. That is we have to implement this patch again, so I choose the 2nd
way to fix the problem.

Signed-off-by: Miao Xie mi...@cn.fujitsu.com
---
Changlog v1 - v2:
- Rebase the patch against the following one:
  [RFC][PATCH] Btrfs: fix deadlock due to unsubmitted
- Modify the changelog to explain why we don't choose the extent lock to
  fix the bug
---
 fs/btrfs/btrfs_inode.h |   19 +++
 fs/btrfs/inode.c   |   23 +--
 2 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 2a8c242..00e2601 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -40,6 +40,7 @@
 #define BTRFS_INODE_HAS_ASYNC_EXTENT   6
 #define BTRFS_INODE_NEEDS_FULL_SYNC7
 #define BTRFS_INODE_COPY_EVERYTHING8
+#define BTRFS_INODE_READDIO_NEED_LOCK  9
 
 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -216,4 +217,22 @@ static inline int btrfs_inode_in_log(struct inode *inode, 
u64 generation)
return 0;
 }
 
+/*
+ * Disable DIO read nolock optimization, so new dio readers will be forced
+ * to grab i_mutex. It is used to avoid the endless truncate due to
+ * nonlocked dio read.
+ */
+static inline void btrfs_inode_block_unlocked_dio(struct inode *inode)
+{
+   set_bit(BTRFS_INODE_READDIO_NEED_LOCK, BTRFS_I(inode)-runtime_flags);
+   smp_mb();
+}
+
+static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode)
+{
+   smp_mb__before_clear_bit();
+   clear_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+ BTRFS_I(inode)-runtime_flags);
+}
+
 #endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c5d829d..a49be05 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3832,6 +3832,12 @@ static int btrfs_setsize(struct inode *inode, struct 
iattr *attr)
 
/* we don't support swapfiles, so vmtruncate shouldn't fail */
truncate_setsize(inode, newsize);
+
+   /* Disable nonlocked read DIO to avoid the end less truncate */
+   btrfs_inode_block_unlocked_dio(inode);
+   inode_dio_wait(inode);
+   btrfs_inode_resume_unlocked_dio(inode);
+
ret = btrfs_truncate(inode);
if (ret  inode-i_nlink)
btrfs_orphan_del(NULL, inode);
@@ -6615,6 +6621,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
struct file *file = iocb-ki_filp;
struct inode *inode = file-f_mapping-host;
size_t count = 0;
+   int flags = 0;
+   bool wakeup = false;
ssize_t ret;
 
if (check_direct_IO(BTRFS_I(inode)-root, rw, iocb, iov,
@@ -6626,13 +6634,22 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb 
*iocb,
ret = btrfs_delalloc_reserve_space(inode, count);
if (ret)
return ret;
+   } else {
+   atomic_inc(inode-i_dio_count);
+   smp_mb__after_atomic_inc();
+   if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+ BTRFS_I(inode)-runtime_flags))) {
+   inode_dio_done(inode);
+   flags = DIO_LOCKING | DIO_SKIP_HOLES;
+   } else {
+   wakeup = true;
+   }
}
 
ret = __blockdev_direct_IO(rw, iocb, inode,
BTRFS_I(inode)-root-fs_info-fs_devices-latest_bdev,
iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
-   btrfs_submit_direct, 0);
-
+   btrfs_submit_direct, flags);
if (rw  WRITE) {
if (ret  0  ret != -EIOCBQUEUED)
btrfs_delalloc_release_space(inode, count);
@@ -6645,6 +6662,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
}
btrfs_delalloc_release_metadata(inode, 0);
}
+   if 

[PATCH V2 2/2] Btrfs: implement unlocked dio write

2013-02-07 Thread Miao Xie
This idea is from ext4. By this patch, we can make the dio write parallel,
and improve the performance. But because we can not update isize without
i_mutex, the unlocked dio write just can be done in front of the EOF.

We needn't worry about the race between dio write and truncate, because the
truncate need wait untill all the dio write end.

And we also needn't worry about the race between dio write and punch hole,
because we have extent lock to protect our operation.

I ran fio to test the performance of this feature.

== Hardware ==
CPU: Intel(R) Core(TM)2 Duo CPU E7500  @ 2.93GHz
Mem: 2GB
SSD: Intel X25-M 120GB (Test Partition: 60GB)

== config file ==
[global]
ioengine=psync
direct=1
bs=4k
size=32G
runtime=60
directory=/mnt/btrfs/
filename=testfile
group_reporting
thread

[file1]
numjobs=1 # 2 4
rw=randwrite

== result (KBps) ==
write   1   2   4
lock24936   24738   24726
nolock  24962   30866   32101

== result (iops) ==
write   1   2   4
lock623461846181
nolock  624077168025

Signed-off-by: Miao Xie mi...@cn.fujitsu.com
---
Changelog v1 - v2:
- don't do nolocked DIO write if it is beyond the EOF 
---
 fs/btrfs/inode.c |   35 +++
 1 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a49be05..2948123 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6622,28 +6622,36 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb 
*iocb,
struct inode *inode = file-f_mapping-host;
size_t count = 0;
int flags = 0;
-   bool wakeup = false;
+   bool wakeup = true;
+   bool relock = false;
ssize_t ret;
 
if (check_direct_IO(BTRFS_I(inode)-root, rw, iocb, iov,
offset, nr_segs))
return 0;
 
+   atomic_inc(inode-i_dio_count);
+   smp_mb__after_atomic_inc();
+
if (rw  WRITE) {
count = iov_length(iov, nr_segs);
+   /*
+* If the write DIO is beyond the EOF, we need update
+* the isize, but it is protected by i_mutex. So we can
+* not unlock the i_mutex at this case.
+*/
+   if (offset + count = inode-i_size) {
+   mutex_unlock(inode-i_mutex);
+   relock = true;
+   }
ret = btrfs_delalloc_reserve_space(inode, count);
if (ret)
-   return ret;
-   } else {
-   atomic_inc(inode-i_dio_count);
-   smp_mb__after_atomic_inc();
-   if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
- BTRFS_I(inode)-runtime_flags))) {
-   inode_dio_done(inode);
-   flags = DIO_LOCKING | DIO_SKIP_HOLES;
-   } else {
-   wakeup = true;
-   }
+   goto out;
+   } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+BTRFS_I(inode)-runtime_flags))) {
+   inode_dio_done(inode);
+   flags = DIO_LOCKING | DIO_SKIP_HOLES;
+   wakeup = false;
}
 
ret = __blockdev_direct_IO(rw, iocb, inode,
@@ -6662,8 +6670,11 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb 
*iocb,
}
btrfs_delalloc_release_metadata(inode, 0);
}
+out:
if (wakeup)
inode_dio_done(inode);
+   if (relock)
+   mutex_lock(inode-i_mutex);
 
return ret;
 }
-- 
1.6.5.2
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html